Add improved container exit injection

author: Titus Wormer <tituswormer@gmail.com> 2022-08-11 11:01:49 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-08-11 11:01:49 +0200
commit: 053a2603e4bd5ec9caf40617b52136e5ef3fcf0a (patch)
tree: 14719bc0759a3a9039e88368d3c10ace5075e906
parent: 30e5f806277d14d5dcab708ccd0ce07a4894c1f9 (diff)
download: markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.tar.gz
markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.tar.bz2
markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.zip
5 files changed, 137 insertions, 149 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 57ab40a..4a9ec36 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -479,48 +479,63 @@ fn on_enter_list(context: &mut CompileContext) {
         } else {
             balance -= 1;
 
-            // Blank line directly in list or directly in list item,
-            // but not a blank line after an empty list item.
             if balance < 3 && event.token_type == Token::BlankLineEnding {
-                let mut at_marker = false;
+                // Blank line directly after a prefix:
+                //
+                // ```markdown
+                // > | -␊
+                //      ^
+                //   |   a
+                // ```
+                let mut at_prefix = false;
+                // Blank line directly after a list item that is just a prefix:
+                //
+                // ```markdown
+                // > | -␊
+                //      ^
+                //   | - a
+                // ```
+                let mut at_empty_list_item = false;
+                // Blank line at block quote prefix:
+                //
+                // ```markdown
+                // > | * >␊
+                //        ^
+                //   | * a
+                // ```
+                let mut at_empty_block_quote = false;
 
-                if balance == 2 {
+                if balance == 1 {
                     let mut before = index - 2;
 
-                    if events[before].token_type == Token::SpaceOrTab {
-                        before -= 2;
-                    }
-
-                    if events[before].token_type == Token::ListItemPrefix {
-                        at_marker = true;
-                    }
-                }
+                    if events[before].token_type == Token::ListItem {
+                        before -= 1;
 
-                let mut at_empty_list_item = false;
-                let mut at_empty_block_quote = false;
+                        if events[before].token_type == Token::SpaceOrTab {
+                            before -= 2;
+                        }
 
-                if balance == 1 {
+                        if events[before].token_type == Token::BlockQuote
+                            && events[before - 1].token_type == Token::BlockQuotePrefix
+                        {
+                            at_empty_block_quote = true;
+                        } else if events[before].token_type == Token::ListItemPrefix {
+                            at_empty_list_item = true;
+                        }
+                    }
+                } else {
                     let mut before = index - 2;
 
                     if events[before].token_type == Token::SpaceOrTab {
                         before -= 2;
                     }
 
-                    if events[before].token_type == Token::ListItem
-                        && events[before - 1].token_type == Token::ListItemPrefix
-                    {
-                        at_empty_list_item = true;
-                    }
-
-                    if events[before].token_type == Token::ListItem
-                        && events[before - 1].token_type == Token::BlockQuote
-                        && events[before - 2].token_type == Token::BlockQuotePrefix
-                    {
-                        at_empty_block_quote = true;
+                    if events[before].token_type == Token::ListItemPrefix {
+                        at_prefix = true;
                     }
                 }
 
-                if !at_marker && !at_empty_list_item && !at_empty_block_quote {
+                if !at_prefix && !at_empty_list_item && !at_empty_block_quote {
                     loose = true;
                     break;
                 }
diff --git a/src/content/document.rs b/src/content/document.rs
index 73c9803..98f8a7d 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -56,7 +56,11 @@ enum Phase {
 pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
     let mut tokenizer = Tokenizer::new(point, parse_state);
 
-    let state = tokenizer.push(0, parse_state.bytes.len(), StateName::DocumentStart);
+    let state = tokenizer.push(
+        (0, 0),
+        (parse_state.bytes.len(), 0),
+        StateName::DocumentStart,
+    );
     tokenizer.flush(state, true);
 
     let mut index = 0;
@@ -347,8 +351,13 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
         _ => unreachable!("expected state name"),
     };
 
-    // To do: handle VS?
-    let state = child.push(child.point.index, tokenizer.point.index, name);
+    tokenizer.tokenize_state.document_exits.push(None);
+
+    let state = child.push(
+        (child.point.index, child.point.vs),
+        (tokenizer.point.index, tokenizer.point.vs),
+        name,
+    );
 
     let paragraph = matches!(state, State::Next(StateName::ParagraphInside))
         || (!child.events.is_empty()
@@ -408,69 +417,8 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
     }
 
     if !stack_close.is_empty() {
-        let mut inject_index = tokenizer.events.len();
-
-        // Move past the current data to find the last container start if we’re
-        // closing due to a potential lazy flow that was not lazy.
-        if *phase == Phase::After {
-            inject_index -= 2;
-        }
-
-        // Move past the container starts to find the last data if we’re
-        // closing due to a different container or lazy flow like above.
-        if *phase != Phase::Eof {
-            while inject_index > 0 {
-                let event = &tokenizer.events[inject_index - 1];
-
-                if event.token_type == Token::Data {
-                    break;
-                }
-
-                inject_index -= 1;
-            }
-        }
-
-        // Move past data starts that are just whitespace only without
-        // container starts.
-        while inject_index > 0 {
-            let event = &tokenizer.events[inject_index - 1];
-
-            if event.token_type == Token::Data {
-                if event.event_type == EventType::Exit {
-                    let slice = Slice::from_position(
-                        tokenizer.parse_state.bytes,
-                        &Position::from_exit_event(&tokenizer.events, inject_index - 1),
-                    );
-                    let bytes = slice.bytes;
-                    let mut whitespace = true;
-                    let mut index = 0;
-                    while index < bytes.len() {
-                        match bytes[index] {
-                            b'\t' | b'\n' | b'\r' | b' ' => index += 1,
-                            _ => {
-                                whitespace = false;
-                                break;
-                            }
-                        }
-                    }
-
-                    if !whitespace {
-                        break;
-                    }
-                }
-            } else {
-                break;
-            }
-
-            inject_index -= 1;
-        }
-
-        let ref_point = if inject_index == tokenizer.events.len() {
-            tokenizer.point.clone()
-        } else {
-            tokenizer.events[inject_index].point.clone()
-        };
-
+        let index = tokenizer.tokenize_state.document_exits.len()
+            - (if *phase == Phase::After { 2 } else { 1 });
         let mut exits = Vec::with_capacity(stack_close.len());
 
         while !stack_close.is_empty() {
@@ -483,7 +431,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
             exits.push(Event {
                 event_type: EventType::Exit,
                 token_type: token_type.clone(),
-                point: ref_point.clone(),
+                point: tokenizer.point.clone(),
                 link: None,
             });
 
@@ -503,18 +451,49 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
             debug_assert!(found, "expected to find container token to exit");
         }
 
-        tokenizer.map.add(inject_index, 0, exits);
+        if let Some(ref mut list) = tokenizer.tokenize_state.document_exits[index] {
+            list.append(&mut exits);
+        } else {
+            tokenizer.tokenize_state.document_exits[index] = Some(exits);
+        }
     }
 
     child.interrupt = false;
 }
 
-// Inject the container events.
+// Inject everything together.
 fn resolve(tokenizer: &mut Tokenizer) {
     let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
-    // To do: see if we can do this less.
-    tokenizer.map.consume(&mut tokenizer.events);
 
+    // First, add the container exits into `child`.
+    let mut child_index = 0;
+    let mut line = 0;
+
+    while child_index < child.events.len() {
+        let event = &child.events[child_index];
+
+        if event.event_type == EventType::Enter
+            && (event.token_type == Token::LineEnding || event.token_type == Token::BlankLineEnding)
+        {
+            if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
+                let mut exit_index = 0;
+                while exit_index < exits.len() {
+                    exits[exit_index].point = event.point.clone();
+                    exit_index += 1;
+                }
+
+                child.map.add(child_index, 0, exits);
+            }
+
+            line += 1;
+        }
+
+        child_index += 1;
+    }
+
+    child.map.consume(&mut child.events);
+
+    // Now, add all child events into our parent document tokenizer.
     divide_events(
         &mut tokenizer.map,
         &tokenizer.events,
@@ -522,43 +501,29 @@ fn resolve(tokenizer: &mut Tokenizer) {
         &mut child.events,
     );
 
-    tokenizer
-        .resolvers
-        .append(&mut child.resolvers.split_off(0));
-    tokenizer
-        .resolver_ids
-        .append(&mut child.resolver_ids.split_off(0));
-
-    // To do: see if we can do this less.
+    // Replace the flow data with actual events.
     tokenizer.map.consume(&mut tokenizer.events);
 
-    let mut index = 0;
-    let mut last_eol_enter: Option<usize> = None;
-    while index < tokenizer.events.len() {
-        let event = &tokenizer.events[index];
-
-        if event.event_type == EventType::Exit {
-            if event.token_type == Token::BlockQuote || event.token_type == Token::ListItem {
-                if let Some(inject) = last_eol_enter {
-                    let point = tokenizer.events[inject].point.clone();
-                    let mut clone = event.clone();
-                    clone.point = point;
-                    // Inject a fixed exit.
-                    tokenizer.map.add(inject, 0, vec![clone]);
-                    // Remove this exit.
-                    tokenizer.map.add(index, 1, vec![]);
-                }
-            } else if event.token_type == Token::LineEnding
-                || event.token_type == Token::BlankLineEnding
-            {
-                last_eol_enter = Some(index - 1);
-            } else {
-                last_eol_enter = None;
+    // Now, add some final container exits due to the EOF.
+    // We can’t inject them into the child earlier, as they are “outside” its
+    // linked data.
+    if line < tokenizer.tokenize_state.document_exits.len() {
+        if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
+            let mut exit_index = 0;
+            while exit_index < exits.len() {
+                exits[exit_index].point = tokenizer.point.clone();
+                exit_index += 1;
             }
-        }
 
-        index += 1;
+            tokenizer.events.append(&mut exits);
+        }
     }
 
-    tokenizer.map.consume(&mut tokenizer.events);
+    // Add the resolvers from child.
+    tokenizer
+        .resolvers
+        .append(&mut child.resolvers.split_off(0));
+    tokenizer
+        .resolver_ids
+        .append(&mut child.resolver_ids.split_off(0));
 }
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index e0465a0..3d923d3 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -94,9 +94,11 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool {
                         tokenizer.define_skip(enter.point.clone());
                     }
 
+                    let end = &events[index + 1].point;
+
                     state = tokenizer.push(
-                        enter.point.index,
-                        events[index + 1].point.index,
+                        (enter.point.index, enter.point.vs),
+                        (end.index, end.vs),
                         match state {
                             State::Next(func) => func,
                             _ => unreachable!("cannot be ok/nok"),
@@ -140,11 +142,12 @@ pub fn divide_events(
     let mut old_prev: Option<usize> = None;
 
     while subindex < child_events.len() {
+        let current = &child_events[subindex].point;
+        let end = &events[link_index + 1].point;
+
         // Find the first event that starts after the end we’re looking
         // for.
-        if child_events[subindex].event_type == EventType::Enter
-            && child_events[subindex].point.index >= events[link_index + 1].point.index
-        {
+        if current.index > end.index || (current.index == end.index && current.vs > end.vs) {
             slices.push((link_index, slice_start));
             slice_start = subindex;
             link_index = events[link_index].link.as_ref().unwrap().next.unwrap();
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7b8c9a5..3cdd2d3 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -17,7 +17,6 @@ use crate::content;
 use crate::parser::ParseState;
 use crate::token::{Token, VOID_TOKENS};
 use crate::util::edit_map::EditMap;
-use std::str;
 
 /// Embedded content type.
 #[derive(Debug, Clone, PartialEq)]
@@ -473,6 +472,8 @@ pub struct TokenizeState<'a> {
     /// To do.
     pub document_container_stack: Vec<ContainerState>,
     /// To do.
+    pub document_exits: Vec<Option<Vec<Event>>>,
+    /// To do.
     pub document_continued: usize,
     /// To do.
     pub document_paragraph_before: bool,
@@ -607,6 +608,7 @@ impl<'a> Tokenizer<'a> {
             tokenize_state: TokenizeState {
                 connect: false,
                 document_container_stack: vec![],
+                document_exits: vec![],
                 document_continued: 0,
                 document_paragraph_before: false,
                 document_data_index: None,
@@ -897,16 +899,18 @@ impl<'a> Tokenizer<'a> {
     /// This is set up to support repeatedly calling `feed`, and thus streaming
     /// markdown into the state machine, and normally pauses after feeding.
     // Note: if needed: accept `vs`?
-    pub fn push(&mut self, min: usize, max: usize, name: StateName) -> State {
+    pub fn push(&mut self, min: (usize, usize), max: (usize, usize), name: StateName) -> State {
         debug_assert!(!self.resolved, "cannot feed after drain");
+
         // debug_assert!(min >= self.point.index, "cannot move backwards");
-        if min > self.point.index {
-            self.move_to((min, 0));
+
+        if min.0 > self.point.index || (min.0 == self.point.index && min.1 > self.point.vs) {
+            self.move_to(min);
         }
 
         let mut state = State::Next(name);
 
-        while self.point.index < max {
+        while self.point.index < max.0 || (self.point.index == max.0 && self.point.vs < max.1) {
             match state {
                 State::Ok | State::Nok => {
                     if let Some(attempt) = self.attempts.pop() {
@@ -1080,14 +1084,7 @@ fn feed_action_impl(
             None
         };
 
-        log::debug!(
-            "feed:    `{:?}` to {:?}",
-            byte.map_or_else(
-                || "eof".to_string(),
-                |d| str::from_utf8(&[d]).unwrap().to_string()
-            ),
-            name
-        );
+        log::debug!("feed:    `{:?}` to {:?}", byte, name);
         tokenizer.expect(byte);
         call_impl(tokenizer, name)
     }
diff --git a/tests/misc_tabs.rs b/tests/misc_tabs.rs
index c5e5c43..da54e59 100644
--- a/tests/misc_tabs.rs
+++ b/tests/misc_tabs.rs
@@ -275,4 +275,12 @@ fn tabs_virtual_spaces() {
         "<pre><code> x\n</code></pre>\n",
         "should strip 3 spaces from an initial tab in fenced code if the opening fence is indented as such"
     );
+
+    assert_eq!(
+        micromark("-\ta\n\n\tb"),
+        "<ul>\n<li>\n<p>a</p>\n<p>\tb</p>\n</li>\n</ul>",
+        // To do: CM.js does not output the tab before `b`; check whether that makes sense.
+        // "<ul>\n<li>\n<p>a</p>\n<p>b</p>\n</li>\n</ul>",
+        "should support a part of a tab as a container, and the rest of a tab as flow"
+    );
 }
author	Titus Wormer <tituswormer@gmail.com>	2022-08-11 11:01:49 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-08-11 11:01:49 +0200
commit	053a2603e4bd5ec9caf40617b52136e5ef3fcf0a (patch)
tree	14719bc0759a3a9039e88368d3c10ace5075e906
parent	30e5f806277d14d5dcab708ccd0ce07a4894c1f9 (diff)
download	markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.tar.gz markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.tar.bz2 markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.zip