diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-25 15:29:11 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-25 15:29:11 +0200 |
commit | 11304728b6607bc2a8d41a640308f3379a25b933 (patch) | |
tree | c49fb64a64e1c39b889a40f48dcd44f87aaea7b1 /src/tokenizer.rs | |
parent | 9c18ff7858730f0c7782206129375c7efcb7d77f (diff) | |
download | markdown-rs-11304728b6607bc2a8d41a640308f3379a25b933.tar.gz markdown-rs-11304728b6607bc2a8d41a640308f3379a25b933.tar.bz2 markdown-rs-11304728b6607bc2a8d41a640308f3379a25b933.zip |
Improve performance w/ a single feed loop
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 187 |
1 files changed, 90 insertions, 97 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 931ffae..7ec0d91 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -173,7 +173,7 @@ struct InternalState { #[allow(clippy::struct_excessive_bools)] pub struct Tokenizer<'a> { /// Jump between line endings. - column_start: Vec<(usize, usize, usize)>, + column_start: Vec<(usize, usize, usize, usize)>, // First line. line_start: usize, /// Track whether a character is expected to be consumed, and whether it’s @@ -183,20 +183,20 @@ pub struct Tokenizer<'a> { consumed: bool, /// Track whether this tokenizer is done. drained: bool, + /// Current character code. + current: Code, + /// Previous character code. + pub previous: Code, + /// Current relative and absolute place in the file. + pub point: Point, /// Semantic labels of one or more codes in `codes`. pub events: Vec<Event>, /// Hierarchy of semantic labels. /// /// Tracked to make sure everything’s valid. pub stack: Vec<Token>, - /// Previous character code. - pub previous: Code, /// To do. pub map: EditMap, - /// Current character code. - current: Code, - /// Current relative and absolute place in the file. - pub point: Point, /// List of attached resolvers, which will be called when done feeding, /// to clean events. resolvers: Vec<Box<Resolver>>, @@ -204,6 +204,8 @@ pub struct Tokenizer<'a> { resolver_ids: Vec<String>, /// Shared parsing state across tokenizers. pub parse_state: &'a ParseState<'a>, + codes: Vec<Code>, + pub index: usize, /// Stack of label (start) that could form images and links. /// /// Used when tokenizing [text content][crate::content::text]. @@ -216,6 +218,8 @@ pub struct Tokenizer<'a> { /// /// Used when tokenizing [text content][crate::content::text]. pub media_list: Vec<Media>, + /// Current container state. + pub container: Option<ContainerState>, /// Whether we would be interrupting something. /// /// Used when tokenizing [flow content][crate::content::flow]. @@ -229,8 +233,6 @@ pub struct Tokenizer<'a> { /// The previous line was a paragraph, and this line’s containers did not /// match. pub lazy: bool, - /// Current container state. - pub container: Option<ContainerState>, } impl<'a> Tokenizer<'a> { @@ -248,14 +250,16 @@ impl<'a> Tokenizer<'a> { stack: vec![], events: vec![], parse_state, + codes: vec![], + index: 0, map: EditMap::new(), label_start_stack: vec![], label_start_list_loose: vec![], media_list: vec![], + container: None, interrupt: false, concrete: false, lazy: false, - container: None, // Assume about 10 resolvers. resolvers: Vec::with_capacity(10), resolver_ids: Vec::with_capacity(10), @@ -288,8 +292,12 @@ impl<'a> Tokenizer<'a> { } /// Define a jump between two places. - pub fn define_skip(&mut self, point: &Point) { - define_skip_impl(self, point.line, (point.column, point.offset, point.index)); + pub fn define_skip(&mut self, point: &Point, index: usize) { + define_skip_impl( + self, + point.line, + (point.column, point.offset, point.index, index), + ); } /// Define the current place as a jump between two places. @@ -297,7 +305,12 @@ impl<'a> Tokenizer<'a> { define_skip_impl( self, self.point.line, - (self.point.column, self.point.offset, self.point.index), + ( + self.point.column, + self.point.offset, + self.point.index, + self.index, + ), ); } @@ -307,10 +320,11 @@ impl<'a> Tokenizer<'a> { let at = self.point.line - self.line_start; if self.point.column == 1 && at != self.column_start.len() { - let (column, offset, index) = &self.column_start[at]; + let (column, offset, index_abs, index_rel) = &self.column_start[at]; self.point.column = *column; self.point.offset = *offset; - self.point.index = *index; + self.point.index = *index_abs; + self.index = *index_rel; } } @@ -326,6 +340,7 @@ impl<'a> Tokenizer<'a> { assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned"); self.point.index += 1; + self.index += 1; match code { Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { @@ -342,6 +357,7 @@ impl<'a> Tokenizer<'a> { self.point.column, self.point.offset, self.point.index, + self.index, )); } @@ -482,11 +498,13 @@ impl<'a> Tokenizer<'a> { ) -> Box<StateFn> { attempt_impl( state_fn, - |_code| false, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + None, + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { if matches!(state, State::Ok(_)) { - feed_impl(tokenizer, result.1, after) + tokenizer.index = result.1; + tokenizer.consumed = true; + State::Fn(Box::new(after)) } else { state } @@ -505,11 +523,12 @@ impl<'a> Tokenizer<'a> { ) -> Box<StateFn> { attempt_impl( state_fn, - until, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + Some(Box::new(until)), + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { + tokenizer.index = result.1; tokenizer.consumed = true; - feed_impl(tokenizer, result.1, done(state)) + State::Fn(done(state)) }, ) } @@ -532,16 +551,13 @@ impl<'a> Tokenizer<'a> { attempt_impl( state_fn, - |_code| false, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + None, + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { tokenizer.free(previous); - feed_twice_impl( - tokenizer, - result.0, - result.1, - done(matches!(state, State::Ok(_))), - ) + tokenizer.index = result.0; + tokenizer.consumed = true; + State::Fn(done(matches!(state, State::Ok(_)))) }, ) } @@ -566,9 +582,9 @@ impl<'a> Tokenizer<'a> { attempt_impl( state_fn, - |_code| false, - vec![], - |result: (&[Code], &[Code]), tokenizer: &mut Tokenizer, state| { + None, + self.index, + |result: (usize, usize), tokenizer: &mut Tokenizer, state| { let ok = matches!(state, State::Ok(_)); if !ok { @@ -577,12 +593,9 @@ impl<'a> Tokenizer<'a> { log::debug!("attempt: {:?}, at {:?}", ok, tokenizer.point); - feed_twice_impl( - tokenizer, - if ok { &[] } else { result.0 }, - result.1, - done(ok), - ) + tokenizer.index = result.1; + tokenizer.consumed = true; + State::Fn(done(ok)) }, ) } @@ -623,7 +636,7 @@ impl<'a> Tokenizer<'a> { /// markdown into the state machine, and normally pauses after feeding. pub fn push( &mut self, - codes: &[Code], + mut codes: Vec<Code>, start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, drain: bool, ) -> State { @@ -632,7 +645,9 @@ impl<'a> Tokenizer<'a> { // Let’s assume an event per character. self.events.reserve(codes.len()); - let mut result = feed_impl(self, codes, start); + self.codes.append(&mut codes); + + let mut result = feed_impl(self, start); if drain { let func = match result { @@ -667,41 +682,34 @@ impl<'a> Tokenizer<'a> { /// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check]. fn attempt_impl( state: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, - pause: impl Fn(Code) -> bool + 'static, - mut codes: Vec<Code>, - done: impl FnOnce((&[Code], &[Code]), &mut Tokenizer, State) -> State + 'static, + pause: Option<Box<dyn Fn(Code) -> bool + 'static>>, + start: usize, + done: impl FnOnce((usize, usize), &mut Tokenizer, State) -> State + 'static, ) -> Box<StateFn> { - Box::new(|tokenizer, code| { - if !codes.is_empty() && pause(tokenizer.previous) { - let after = if matches!(code, Code::None) { - vec![] - } else { - vec![code] - }; - - return done((&codes, &after), tokenizer, State::Fn(Box::new(state))); + Box::new(move |tokenizer, code| { + if let Some(ref func) = pause { + if tokenizer.index > start && func(tokenizer.previous) { + return done( + (start, tokenizer.index), + tokenizer, + State::Fn(Box::new(state)), + ); + } } let state = state(tokenizer, code); - match code { - Code::None => {} - _ => { - codes.push(code); - } - } - match state { State::Ok(back) => { + let stop = tokenizer.index - back; assert!( - back <= codes.len(), - "`back` must be smaller than or equal to `codes.len()`" + stop >= start, + "`back` must not result in an index smaller than `start`" ); - let remaining = codes.split_off(codes.len() - back); - done((&codes, &remaining), tokenizer, state) + done((start, stop), tokenizer, state) } - State::Nok => done((&[], &codes), tokenizer, state), - State::Fn(func) => State::Fn(attempt_impl(func, pause, codes, done)), + State::Nok => done((start, start), tokenizer, state), + State::Fn(func) => State::Fn(attempt_impl(func, pause, start, done)), } }) } @@ -709,28 +717,23 @@ fn attempt_impl( /// Feed a list of `codes` into `start`. fn feed_impl( tokenizer: &mut Tokenizer, - codes: &[Code], start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, ) -> State { let mut state = State::Fn(Box::new(start)); - let mut index = 0; tokenizer.consumed = true; - while index < codes.len() { - let code = codes[index]; + while tokenizer.index < tokenizer.codes.len() { + let code = tokenizer.codes[tokenizer.index]; match state { - State::Ok(back) => { - state = State::Ok((codes.len() - index) + back); + State::Ok(_) | State::Nok => { break; } - State::Nok => break, State::Fn(func) => { - log::debug!("main: passing: `{:?}` ({:?})", code, index); + log::debug!("main: passing: `{:?}` ({:?})", code, tokenizer.index); tokenizer.expect(code, false); state = func(tokenizer, code); - index += 1; } } } @@ -738,37 +741,27 @@ fn feed_impl( state } -/// Feed a list of `codes` into `start`. -fn feed_twice_impl( - tokenizer: &mut Tokenizer, - left: &[Code], - right: &[Code], - start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, -) -> State { - let res = feed_impl(tokenizer, left, start); - - match res { - State::Fn(func) => feed_impl(tokenizer, right, func), - State::Ok(back) => State::Ok(back + right.len()), - State::Nok => res, - } -} - /// Flush `start`: pass `eof`s to it until done. fn flush_impl( tokenizer: &mut Tokenizer, start: impl FnOnce(&mut Tokenizer, Code) -> State + 'static, ) -> State { let mut state = State::Fn(Box::new(start)); + let max = tokenizer.index; tokenizer.consumed = true; loop { match state { State::Ok(_) | State::Nok => break, State::Fn(func) => { - log::debug!("main: passing eof"); - tokenizer.expect(Code::None, false); - state = func(tokenizer, Code::None); + let code = if tokenizer.index < max { + tokenizer.codes[tokenizer.index] + } else { + Code::None + }; + log::debug!("main: flushing {:?}", code); + tokenizer.expect(code, false); + state = func(tokenizer, code); } } } @@ -785,7 +778,7 @@ fn flush_impl( /// /// This defines how much columns, offsets, and the `index` are increased when /// consuming a line ending. -fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize, usize)) { +fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize, usize, usize)) { log::debug!("position: define skip: {:?} -> ({:?})", line, info); let at = line - tokenizer.line_start; |