diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-14 18:57:28 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-14 18:57:28 +0200 |
commit | b00fafbdcba39e7e17144b07834702629b891062 (patch) | |
tree | 3351cc3ad2bb126d8a93e1ff6b1731bc00cb45c3 /src/subtokenize.rs | |
parent | 129ea34b18aaf7f5a01d404effbdc78cbbe67a74 (diff) | |
download | markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.gz markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.bz2 markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.zip |
Fix support for deep subtokenization
* Fix a couple of forgotten cases of line ending handling in html (text)
* Fix missing initial case for html (text) not having a `<` 😬
* Add line ending handling to `text` construct
Diffstat (limited to 'src/subtokenize.rs')
-rw-r--r-- | src/subtokenize.rs | 28 |
1 file changed, 19 insertions, 9 deletions
diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 35d7672..71a84e1 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -82,12 +82,12 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { let mut subindex = 0; // Index into subevents that starts the current slice. let mut last_start = 0; - // Counter into `ends`. + // Counter into `ends`: the linked token we are at. let mut end_index = 0; let mut index_opt: Option<usize> = Some(index); while subindex < tokenizer.events.len() { - let subevent = &tokenizer.events[subindex]; + let subevent = &mut tokenizer.events[subindex]; // Find the first event that starts after the end we’re looking // for. @@ -101,11 +101,26 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { index_opt = events[link].next; } + // If there is a `next` link in the subevents, we have to change + // its index to account for the shifted events. + // If it points to a next event, we also change the next event’s + // reference back to *this* event. + if let Some(next) = subevent.next { + // The `index` in `events` where the current link is, + // minus 2 events (the enter and exit) for each removed + // link. + let shift = index_opt.unwrap() - (end_index * 2); + + subevent.next = Some(next + shift); + let next_ev = &mut tokenizer.events[next]; + let previous = next_ev.previous.unwrap(); + next_ev.previous = Some(previous + shift); + } + subindex += 1; } - let link = index_opt.unwrap(); - link_to_info.insert(link, (index, last_start, subindex)); + link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex)); head_to_tokenizer.insert(index, tokenizer); } @@ -119,11 +134,6 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { // from each slice and slices from events? let mut index = events.len() - 1; - // To do: this is broken, because it can inject linked events, which point - // to their links through indices, and this messes with all indices. 
- // We should try walking front to end instead, keep a count of the shifted - // index. - // It’s a bit complex but should work? while index > 0 { let slice_opt = link_to_info.get(&index); |