diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-14 18:57:28 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-14 18:57:28 +0200 |
commit | b00fafbdcba39e7e17144b07834702629b891062 (patch) | |
tree | 3351cc3ad2bb126d8a93e1ff6b1731bc00cb45c3 /src/subtokenize.rs | |
parent | 129ea34b18aaf7f5a01d404effbdc78cbbe67a74 (diff) | |
download | markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.gz markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.bz2 markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.zip |
Fix support for deep subtokenization
* Fix a couple of forgotten cases of line ending handling in html (text)
* Fix missing initial case for html (text) not having a `<` 😬
* Add line ending handling to `text` construct
Diffstat (limited to 'src/subtokenize.rs')
-rw-r--r-- | src/subtokenize.rs | 28 |
1 file changed, 19 insertions, 9 deletions
diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 35d7672..71a84e1 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -82,12 +82,12 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { let mut subindex = 0; // Index into subevents that starts the current slice. let mut last_start = 0; - // Counter into `ends`. + // Counter into `ends`: the linked token we are at. let mut end_index = 0; let mut index_opt: Option<usize> = Some(index); while subindex < tokenizer.events.len() { - let subevent = &tokenizer.events[subindex]; + let subevent = &mut tokenizer.events[subindex]; // Find the first event that starts after the end we’re looking // for. @@ -101,11 +101,26 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { index_opt = events[link].next; } + // If there is a `next` link in the subevents, we have to change + // its index to account for the shifted events. + // If it points to a next event, we also change the next event’s + // reference back to *this* event. + if let Some(next) = subevent.next { + // The `index` in `events` where the current link is, + // minus 2 events (the enter and exit) for each removed + // link. + let shift = index_opt.unwrap() - (end_index * 2); + + subevent.next = Some(next + shift); + let next_ev = &mut tokenizer.events[next]; + let previous = next_ev.previous.unwrap(); + next_ev.previous = Some(previous + shift); + } + subindex += 1; } - let link = index_opt.unwrap(); - link_to_info.insert(link, (index, last_start, subindex)); + link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex)); head_to_tokenizer.insert(index, tokenizer); } @@ -119,11 +134,6 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { // from each slice and slices from events? let mut index = events.len() - 1; - // To do: this is broken, because it can inject linked events, which point - // to their links through indices, and this messes with all indices. 
- // We should try walking front to end instead, keep a count of the shifted - // index. - // It’s a bit complex but should work? while index > 0 { let slice_opt = link_to_info.get(&index); |