From 30e5f806277d14d5dcab708ccd0ce07a4894c1f9 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 10 Aug 2022 13:44:09 +0200 Subject: Refactor some code for document parsing --- src/construct/list.rs | 10 +- src/content/document.rs | 392 ++++++++++++++++++++++-------------------------- src/tokenizer.rs | 16 +- 3 files changed, 188 insertions(+), 230 deletions(-) (limited to 'src') diff --git a/src/construct/list.rs b/src/construct/list.rs index 36c1dac..d726c73 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -275,7 +275,9 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { prefix += 1; } - let container = tokenizer.container.as_mut().unwrap(); + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + container.blank_initial = blank; container.size = prefix; @@ -309,7 +311,8 @@ pub fn cont_start(tokenizer: &mut Tokenizer) -> State { /// | b /// ``` pub fn cont_blank(tokenizer: &mut Tokenizer) -> State { - let container = tokenizer.container.as_ref().unwrap(); + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; let size = container.size; if container.blank_initial { @@ -329,7 +332,8 @@ pub fn cont_blank(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { - let container = tokenizer.container.as_mut().unwrap(); + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; let size = container.size; container.blank_initial = false; diff --git a/src/content/document.rs b/src/content/document.rs index d47a31a..73c9803 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -105,29 +105,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.point.clone(), tokenizer.parse_state, ))); - tokenizer.tokenize_state.document_child_state = Some(State::Next(StateName::FlowStart)); + tokenizer.attempt( StateName::BomStart, - State::Next(StateName::DocumentLineStart), - State::Next(StateName::DocumentLineStart), + State::Next(StateName::DocumentContainerExistingBefore), + State::Next(StateName::DocumentContainerExistingBefore), ) } -/// Start of a line. -// -/// ```markdown -/// > | * a -/// ^ -/// > | > b -/// ^ -/// ``` -pub fn line_start(tokenizer: &mut Tokenizer) -> State { - tokenizer.tokenize_state.document_continued = 0; - // Containers would only be interrupting if we’ve continued. - tokenizer.interrupt = false; - State::Retry(StateName::DocumentContainerExistingBefore) -} - /// Before existing containers. 
// /// ```markdown @@ -140,20 +125,16 @@ pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State { if tokenizer.tokenize_state.document_continued < tokenizer.tokenize_state.document_container_stack.len() { - let container = tokenizer - .tokenize_state - .document_container_stack - .remove(tokenizer.tokenize_state.document_continued); - let name = match container.kind { - Container::BlockQuote => StateName::BlockQuoteContStart, - Container::ListItem => StateName::ListContStart, - }; + let container = &tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; - tokenizer.container = Some(container); tokenizer.attempt( - name, + match container.kind { + Container::BlockQuote => StateName::BlockQuoteContStart, + Container::ListItem => StateName::ListContStart, + }, State::Next(StateName::DocumentContainerExistingAfter), - State::Next(StateName::DocumentContainerExistingMissing), + State::Next(StateName::DocumentContainerNewBefore), ) } // Otherwise, check new containers. @@ -162,22 +143,6 @@ pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State { } } -/// At a missing, existing containers. -// -/// ```markdown -/// | * a -/// > | > b -/// ^ -/// ``` -pub fn container_existing_missing(tokenizer: &mut Tokenizer) -> State { - let container = tokenizer.container.take().unwrap(); - tokenizer - .tokenize_state - .document_container_stack - .insert(tokenizer.tokenize_state.document_continued, container); - State::Retry(StateName::DocumentContainerNewBefore) -} - /// After an existing container. // /// ```markdown @@ -186,11 +151,6 @@ pub fn container_existing_missing(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State { - let container = tokenizer.container.take().unwrap(); - tokenizer - .tokenize_state - .document_container_stack - .insert(tokenizer.tokenize_state.document_continued, container); tokenizer.tokenize_state.document_continued += 1; State::Retry(StateName::DocumentContainerExistingBefore) } @@ -209,33 +169,34 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State { if tokenizer.tokenize_state.document_continued == tokenizer.tokenize_state.document_container_stack.len() { - tokenizer.interrupt = tokenizer - .tokenize_state - .child_tokenizer - .as_ref() - .unwrap() - .interrupt; + let child = tokenizer.tokenize_state.child_tokenizer.as_ref().unwrap(); + + tokenizer.interrupt = child.interrupt; // …and if we’re in a concrete construct, new containers can’t “pierce” // into them. - if tokenizer - .tokenize_state - .child_tokenizer - .as_ref() - .unwrap() - .concrete - { + if child.concrete { return State::Retry(StateName::DocumentContainersAfter); } } // Check for a new container. // Block quote? - tokenizer.container = Some(ContainerState { - kind: Container::BlockQuote, - blank_initial: false, - size: 0, - }); + // Add a new container at the end of the stack. + let tail = tokenizer.tokenize_state.document_container_stack.len(); + tokenizer + .tokenize_state + .document_container_stack + .push(ContainerState { + kind: Container::BlockQuote, + blank_initial: false, + size: 0, + }); + // Swap the existing container with the new one. + tokenizer + .tokenize_state + .document_container_stack + .swap(tokenizer.tokenize_state.document_continued, tail); tokenizer.attempt( StateName::BlockQuoteStart, @@ -247,19 +208,34 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State { /// To do. 
pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State { // List item? - tokenizer.container = Some(ContainerState { + // We replace the empty block quote container for this new list one. + tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued] = ContainerState { kind: Container::ListItem, blank_initial: false, size: 0, - }); + }; tokenizer.attempt( StateName::ListStart, State::Next(StateName::DocumentContainerNewAfter), - State::Next(StateName::DocumentContainersAfter), + State::Next(StateName::DocumentContainerNewBeforeNotList), ) } +/// To do. +pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State { + // It wasn’t a new block quote or a list. + // Swap the new container (in the middle) with the existing one (at the end). + // Drop what was in the middle. + tokenizer + .tokenize_state + .document_container_stack + .swap_remove(tokenizer.tokenize_state.document_continued); + + State::Retry(StateName::DocumentContainersAfter) +} + /// After a new container. /// /// ```markdown @@ -269,7 +245,13 @@ pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State /// ^ /// ``` pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { - let container = tokenizer.container.take().unwrap(); + // It was a new block quote or a list. + // Swap the new container (in the middle) with the existing one (at the end). + // Take the new container. + let container = tokenizer + .tokenize_state + .document_container_stack + .swap_remove(tokenizer.tokenize_state.document_continued); // If we did not continue all existing containers, and there is a new one, // close the flow and those containers. @@ -279,13 +261,11 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { exit_containers(tokenizer, &Phase::Prefix); } - // Try another new container. tokenizer .tokenize_state .document_container_stack .push(container); tokenizer.tokenize_state.document_continued += 1; - tokenizer.tokenize_state.document_interrupt_before = false; tokenizer.interrupt = false; State::Retry(StateName::DocumentContainerNewBefore) } @@ -299,19 +279,18 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` pub fn containers_after(tokenizer: &mut Tokenizer) -> State { - if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer { - child.lazy = tokenizer.tokenize_state.document_continued - != tokenizer.tokenize_state.document_container_stack.len(); - child.interrupt = tokenizer.tokenize_state.document_interrupt_before; - child.define_skip(tokenizer.point.clone()); - } + let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); + + child.lazy = tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len(); + child.define_skip(tokenizer.point.clone()); match tokenizer.current { // Note: EOL is part of data. 
None => State::Retry(StateName::DocumentFlowEnd), Some(_) => { let current = tokenizer.events.len(); - let previous = tokenizer.tokenize_state.document_data_index.take(); + let previous = tokenizer.tokenize_state.document_data_index; if let Some(previous) = previous { tokenizer.events[previous].link.as_mut().unwrap().next = Some(current); } @@ -357,57 +336,38 @@ pub fn flow_inside(tokenizer: &mut Tokenizer) -> State { /// ^ ^ /// ``` pub fn flow_end(tokenizer: &mut Tokenizer) -> State { - let mut paragraph = false; - let mut interrupt = false; + let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); + let state = tokenizer + .tokenize_state + .document_child_state + .unwrap_or(State::Next(StateName::FlowStart)); - // We have new data. - // Note that everything except for a `null` is data. - if tokenizer.events.len() > 1 - && tokenizer.events[tokenizer.events.len() - 1].token_type == Token::Data - { - let position = Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); + let name = match state { + State::Next(name) => name, + _ => unreachable!("expected state name"), + }; - let state = tokenizer - .tokenize_state - .document_child_state - .take() - .unwrap_or(State::Next(StateName::FlowStart)); + // To do: handle VS? + let state = child.push(child.point.index, tokenizer.point.index, name); - let name = match state { - State::Next(name) => name, - _ => unreachable!("expected state name"), - }; + let paragraph = matches!(state, State::Next(StateName::ParagraphInside)) + || (!child.events.is_empty() + && child.events + [skip::opt_back(&child.events, child.events.len() - 1, &[Token::LineEnding])] + .token_type + == Token::Paragraph); - if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer { - // To do: handle VS? - // if position.start.vs > 0 { - // } - let state = child.push(position.start.index, position.end.index, name); - - interrupt = child.interrupt; - paragraph = matches!(state, State::Next(StateName::ParagraphInside)) - || (!child.events.is_empty() - && child.events[skip::opt_back( - &child.events, - child.events.len() - 1, - &[Token::LineEnding], - )] - .token_type - == Token::Paragraph); - - tokenizer.tokenize_state.document_child_state = Some(state); - - if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before { - tokenizer.tokenize_state.document_continued = - tokenizer.tokenize_state.document_container_stack.len(); - } + tokenizer.tokenize_state.document_child_state = Some(state); - if tokenizer.tokenize_state.document_continued - != tokenizer.tokenize_state.document_container_stack.len() - { - exit_containers(tokenizer, &Phase::After); - } - } + if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before { + tokenizer.tokenize_state.document_continued = + tokenizer.tokenize_state.document_container_stack.len(); + } + + if tokenizer.tokenize_state.document_continued + != tokenizer.tokenize_state.document_container_stack.len() + { + exit_containers(tokenizer, &Phase::After); } match tokenizer.current { @@ -418,9 +378,11 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { State::Ok } Some(_) => { + tokenizer.tokenize_state.document_continued = 0; tokenizer.tokenize_state.document_paragraph_before = paragraph; - tokenizer.tokenize_state.document_interrupt_before = interrupt; - State::Retry(StateName::DocumentLineStart) + // Containers would only be interrupting if we’ve continued. 
+ tokenizer.interrupt = false; + State::Retry(StateName::DocumentContainerExistingBefore) } } } @@ -432,124 +394,124 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) { .document_container_stack .split_off(tokenizer.tokenize_state.document_continued); - // So, we’re at the end of a line, but we need to close the *previous* line. - if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer { - if *phase != Phase::After { - let state = tokenizer - .tokenize_state - .document_child_state - .take() - .unwrap_or(State::Next(StateName::FlowStart)); + let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); - child.flush(state, false); - } - - if !stack_close.is_empty() { - let mut inject_index = tokenizer.events.len(); - - // Move past the current data to find the last container start if we’re - // closing due to a potential lazy flow that was not lazy. - if *phase == Phase::After { - inject_index -= 2; - } + // Flush if needed. + if *phase != Phase::After { + let state = tokenizer + .tokenize_state + .document_child_state + .take() + .unwrap_or(State::Next(StateName::FlowStart)); - // Move past the container starts to find the last data if we’re - // closing due to a different container or lazy flow like above. - if *phase == Phase::After || *phase == Phase::Prefix { - while inject_index > 0 { - let event = &tokenizer.events[inject_index - 1]; + child.flush(state, false); + } - if event.token_type == Token::Data { - break; - } + if !stack_close.is_empty() { + let mut inject_index = tokenizer.events.len(); - inject_index -= 1; - } - } + // Move past the current data to find the last container start if we’re + // closing due to a potential lazy flow that was not lazy. + if *phase == Phase::After { + inject_index -= 2; + } - // Move past data starts that are just whitespace only without - // container starts. + // Move past the container starts to find the last data if we’re + // closing due to a different container or lazy flow like above. + if *phase != Phase::Eof { while inject_index > 0 { let event = &tokenizer.events[inject_index - 1]; if event.token_type == Token::Data { - if event.event_type == EventType::Exit { - let slice = Slice::from_position( - tokenizer.parse_state.bytes, - &Position::from_exit_event(&tokenizer.events, inject_index - 1), - ); - let bytes = slice.bytes; - let mut whitespace = true; - let mut index = 0; - while index < bytes.len() { - match bytes[index] { - b'\t' | b'\n' | b'\r' | b' ' => index += 1, - _ => { - whitespace = false; - break; - } - } - } - - if !whitespace { - break; - } - } - } else { break; } inject_index -= 1; } + } - let ref_point = if inject_index == tokenizer.events.len() { - tokenizer.point.clone() + // Move past data starts that are just whitespace only without + // container starts. 
+ while inject_index > 0 { + let event = &tokenizer.events[inject_index - 1]; + + if event.token_type == Token::Data { + if event.event_type == EventType::Exit { + let slice = Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event(&tokenizer.events, inject_index - 1), + ); + let bytes = slice.bytes; + let mut whitespace = true; + let mut index = 0; + while index < bytes.len() { + match bytes[index] { + b'\t' | b'\n' | b'\r' | b' ' => index += 1, + _ => { + whitespace = false; + break; + } + } + } + + if !whitespace { + break; + } + } } else { - tokenizer.events[inject_index].point.clone() - }; + break; + } - let mut exits = Vec::with_capacity(stack_close.len()); + inject_index -= 1; + } - while !stack_close.is_empty() { - let container = stack_close.pop().unwrap(); - let token_type = match container.kind { - Container::BlockQuote => Token::BlockQuote, - Container::ListItem => Token::ListItem, - }; + let ref_point = if inject_index == tokenizer.events.len() { + tokenizer.point.clone() + } else { + tokenizer.events[inject_index].point.clone() + }; - exits.push(Event { - event_type: EventType::Exit, - token_type: token_type.clone(), - point: ref_point.clone(), - link: None, - }); + let mut exits = Vec::with_capacity(stack_close.len()); - let mut stack_index = tokenizer.stack.len(); - let mut found = false; + while !stack_close.is_empty() { + let container = stack_close.pop().unwrap(); + let token_type = match container.kind { + Container::BlockQuote => Token::BlockQuote, + Container::ListItem => Token::ListItem, + }; - while stack_index > 0 { - stack_index -= 1; + exits.push(Event { + event_type: EventType::Exit, + token_type: token_type.clone(), + point: ref_point.clone(), + link: None, + }); - if tokenizer.stack[stack_index] == token_type { - tokenizer.stack.remove(stack_index); - found = true; - break; - } - } + let mut stack_index = tokenizer.stack.len(); + let mut found = false; + + while stack_index > 0 { + stack_index -= 1; - debug_assert!(found, "expected to find container token to exit"); + if tokenizer.stack[stack_index] == token_type { + tokenizer.stack.remove(stack_index); + found = true; + break; + } } - tokenizer.map.add(inject_index, 0, exits); + debug_assert!(found, "expected to find container token to exit"); } + + tokenizer.map.add(inject_index, 0, exits); } - tokenizer.tokenize_state.document_interrupt_before = false; + child.interrupt = false; } // Inject the container events. fn resolve(tokenizer: &mut Tokenizer) { - let mut child = tokenizer.tokenize_state.child_tokenizer.take().unwrap(); + let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap(); // To do: see if we can do this less. tokenizer.map.consume(&mut tokenizer.events); diff --git a/src/tokenizer.rs b/src/tokenizer.rs index dff97dd..7b8c9a5 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -203,12 +203,11 @@ pub enum StateName { DestinationRawEscape, DocumentStart, - DocumentLineStart, DocumentContainerExistingBefore, DocumentContainerExistingAfter, - DocumentContainerExistingMissing, DocumentContainerNewBefore, DocumentContainerNewBeforeNotBlockQuote, + DocumentContainerNewBeforeNotList, DocumentContainerNewAfter, DocumentContainersAfter, DocumentFlowInside, @@ -476,8 +475,6 @@ pub struct TokenizeState<'a> { /// To do. pub document_continued: usize, /// To do. - pub document_interrupt_before: bool, - /// To do. pub document_paragraph_before: bool, /// To do. 
    pub document_data_index: Option<usize>,
@@ -575,8 +572,6 @@ pub struct Tokenizer<'a> {
    ///
    /// Used when tokenizing [text content][crate::content::text].
    pub media_list: Vec<Media>,
-    /// Current container state.
-    pub container: Option<ContainerState>,
    /// Whether we would be interrupting something.
    ///
    /// Used when tokenizing [flow content][crate::content::flow].
    pub interrupt: bool,
@@ -613,7 +608,6 @@ impl<'a> Tokenizer<'a> {
            connect: false,
            document_container_stack: vec![],
            document_continued: 0,
-            document_interrupt_before: false,
            document_paragraph_before: false,
            document_data_index: None,
            document_child_state: None,
@@ -647,7 +641,6 @@ impl<'a> Tokenizer<'a> {
            label_start_stack: vec![],
            label_start_list_loose: vec![],
            media_list: vec![],
-            container: None,
            interrupt: false,
            concrete: false,
            lazy: false,
@@ -1200,16 +1193,15 @@ fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State {
        StateName::DestinationRawEscape => construct::partial_destination::raw_escape,

        StateName::DocumentStart => content::document::start,
-        StateName::DocumentLineStart => content::document::line_start,
        StateName::DocumentContainerExistingBefore => content::document::container_existing_before,
        StateName::DocumentContainerExistingAfter => content::document::container_existing_after,
-        StateName::DocumentContainerExistingMissing => {
-            content::document::container_existing_missing
-        }
        StateName::DocumentContainerNewBefore => content::document::container_new_before,
        StateName::DocumentContainerNewBeforeNotBlockQuote => {
            content::document::container_new_before_not_block_quote
        }
+        StateName::DocumentContainerNewBeforeNotList => {
+            content::document::container_new_before_not_list
+        }
        StateName::DocumentContainerNewAfter => content::document::container_new_after,
        StateName::DocumentContainersAfter => content::document::containers_after,
        StateName::DocumentFlowEnd => content::document::flow_end,
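
The core move in `container_new_before`, `container_new_before_not_list`, and `container_new_after` is that the container being tried now lives *inside* `document_container_stack` instead of in the removed `tokenizer.container` field: the trial is pushed at the tail, swapped into slot `document_continued` (displacing the existing container to the tail), and later either taken out with `swap_remove` on success or discarded by the same call on failure — which also slides the displaced container back into its slot. A minimal self-contained sketch of that discipline; `Container`, `ContainerState`, and `try_new` here are illustrative stand-ins, not the crate's items:

```rust
#[derive(Debug, Clone, Copy, PartialEq)]
enum Container {
    BlockQuote,
    ListItem,
}

#[derive(Debug, Clone, PartialEq)]
struct ContainerState {
    kind: Container,
    blank_initial: bool,
    size: usize,
}

/// Try a new container at slot `continued` without taking the existing
/// entries out of the stack. Returns the trial container on success;
/// on failure the stack is left exactly as it was.
fn try_new(
    stack: &mut Vec<ContainerState>,
    continued: usize,
    trial: ContainerState,
    start_succeeds: bool,
) -> Option<ContainerState> {
    // Push the trial at the end, then swap it into the "current" slot,
    // displacing the existing container to the tail (no clones, O(1)).
    let tail = stack.len();
    stack.push(trial);
    stack.swap(continued, tail);

    if start_succeeds {
        // `swap_remove` takes the trial out of the middle and moves the
        // displaced existing container back into its slot.
        Some(stack.swap_remove(continued))
    } else {
        // Drop the trial; the displaced container slides back in place.
        stack.swap_remove(continued);
        None
    }
}

fn main() {
    let existing = ContainerState {
        kind: Container::BlockQuote,
        blank_initial: false,
        size: 2,
    };
    let trial = ContainerState {
        kind: Container::ListItem,
        blank_initial: false,
        size: 0,
    };
    let mut stack = vec![existing.clone()];

    // A failed start restores the stack untouched.
    assert_eq!(try_new(&mut stack, 0, trial.clone(), false), None);
    assert_eq!(stack, vec![existing]);

    // A successful start hands the new container back to be pushed.
    let new = try_new(&mut stack, 0, trial, true).unwrap();
    stack.push(new);
    assert_eq!(stack.len(), 2);
    assert_eq!(stack[1].kind, Container::ListItem);
}
```

When `document_continued` already equals the stack length (every existing container continued), the swap is a no-op and `swap_remove` simply pops the trial, so the same protocol covers both cases.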
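
The `tokenizer.rs` half of the patch only re-registers state names: each `StateName` variant maps to a state function through the big `match` in `call_impl`, and functions hand control around by returning `State::Next` (wait for more input) or `State::Retry` (re-dispatch immediately at the same position). A toy version of that trampoline, with invented names (`Ctx`, `LineStart`, `LineInside`) rather than the crate's several-hundred-arm dispatcher:

```rust
#[derive(Clone, Copy, Debug)]
enum StateName {
    LineStart,
    LineInside,
}

#[allow(dead_code)]
#[derive(Debug)]
enum State {
    Next(StateName),
    Retry(StateName),
    Ok,
    Nok,
}

struct Ctx {
    bytes: Vec<u8>,
    index: usize,
}

/// Stop at the end of input; otherwise re-dispatch immediately
/// without consuming anything.
fn line_start(ctx: &mut Ctx) -> State {
    if ctx.index >= ctx.bytes.len() {
        State::Ok
    } else {
        State::Retry(StateName::LineInside)
    }
}

/// Consume one byte, then schedule the next state.
fn line_inside(ctx: &mut Ctx) -> State {
    ctx.index += 1;
    State::Next(StateName::LineStart)
}

/// One `match` arm per state name, mirroring `call_impl`.
fn call(ctx: &mut Ctx, name: StateName) -> State {
    match name {
        StateName::LineStart => line_start(ctx),
        StateName::LineInside => line_inside(ctx),
    }
}

fn main() {
    let mut ctx = Ctx {
        bytes: b"abc".to_vec(),
        index: 0,
    };
    let mut state = State::Retry(StateName::LineStart);

    // Drive the machine. In the real tokenizer `Next` pauses until more
    // input is fed while `Retry` loops in place; here all bytes are
    // already available, so both mean "keep going".
    loop {
        state = match state {
            State::Retry(name) | State::Next(name) => call(&mut ctx, name),
            State::Ok | State::Nok => break,
        };
    }

    assert_eq!(ctx.index, 3);
}
```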