diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-30 16:35:13 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-30 16:35:13 +0200 |
commit | be62b2e29a61774100f676cfdd9b100cadf1905f (patch) | |
tree | 4349e259fc0150526dc32242b92d85218091fca5 /src/construct/partial_data.rs | |
parent | 13588776d65601a41ddfce85f618e8aaa55951cc (diff) | |
download | markdown-rs-be62b2e29a61774100f676cfdd9b100cadf1905f.tar.gz markdown-rs-be62b2e29a61774100f676cfdd9b100cadf1905f.tar.bz2 markdown-rs-be62b2e29a61774100f676cfdd9b100cadf1905f.zip |
Add support for trimming whitespace around string, text
This commit introduces trimming initial and final whitespace around the
whole string or text, or around line endings inside that string or text.
* Add `register_resolver_before`, to run resolvers earlier than others,
used for labels
* Add a resolver to merge `data` events, which are the most frequent tokens
  and can occur adjacently.
In `micromark-js` this sped up parsing a lot
* Fix a bug where a virtual space was not seen as an okay event
* Refactor to enable all turned off whitespace tests
Diffstat (limited to 'src/construct/partial_data.rs')
-rw-r--r-- | src/construct/partial_data.rs | 51 |
1 file changed, 49 insertions, 2 deletions
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index d83787a..9f99570 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -8,7 +8,8 @@ // To do: pass token types in? -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::edit_map::EditMap; /// At the beginning of data. /// @@ -39,7 +40,10 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnRe tokenizer.exit(TokenType::LineEnding); (State::Fn(Box::new(|t, c| at_break(t, c, stop))), None) } - _ if stop.contains(&code) => (State::Ok, Some(vec![code])), + _ if stop.contains(&code) => { + tokenizer.register_resolver("data".to_string(), Box::new(resolve)); + (State::Ok, Some(vec![code])) + } _ => { tokenizer.enter(TokenType::Data); data(tokenizer, code, stop) @@ -67,3 +71,46 @@ fn data(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult (State::Fn(Box::new(|t, c| data(t, c, stop))), None) } } + +/// Merge adjacent data events. +pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { + let mut edit_map = EditMap::new(); + let len = tokenizer.events.len(); + let mut index = 0; + + // Loop through events and merge adjacent data events. + while index < len { + let event = &tokenizer.events[index]; + + if event.event_type == EventType::Enter && event.token_type == TokenType::Data { + let exit_index = index + 1; + let mut exit_far_index = exit_index; + + // Find multiple `data` events. + while exit_far_index + 1 < len + && tokenizer.events[exit_far_index + 1].token_type == TokenType::Data + { + exit_far_index += 2; + } + + if exit_far_index > exit_index { + edit_map.add(exit_index, exit_far_index - exit_index, vec![]); + + // Change positional info. 
+ let exit_far = &tokenizer.events[exit_far_index]; + let point_end = exit_far.point.clone(); + let index_end = exit_far.index; + let exit = &mut tokenizer.events[exit_index]; + exit.point = point_end; + exit.index = index_end; + index = exit_far_index; + + continue; + } + } + + index += 1; + } + + edit_map.consume(&mut tokenizer.events) +} |