Add basic subtokenization, string content in fenced code

author: Titus Wormer <tituswormer@gmail.com> 2022-06-09 15:01:46 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-06-09 15:01:46 +0200
commit: 021d5f989ae41ae39a9b937b498141d9dc70d894 (patch)
tree: 8009a01d69cbd4f8200ffd34fc4031265b67406e
parent: 344c3db875056d4aec509f24fb2dbeaf7e2a14b6 (diff)
download: markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.tar.gz
markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.tar.bz2
markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.zip
8 files changed, 105 insertions, 52 deletions
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 693ffb5..6f94424 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -26,28 +26,17 @@ use crate::construct::{
     html_flow::start as html_flow, partial_whitespace::start as whitespace,
     thematic_break::start as thematic_break,
 };
-use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+use crate::subtokenize::subtokenize;
+use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
 use crate::util::get_span;
 
 /// Turn `codes` as the flow content type into events.
 // To do: remove this `allow` when all the content types are glued together.
 #[allow(dead_code)]
-pub fn flow(codes: &[Code]) -> Vec<Event> {
-    let mut tokenizer = Tokenizer::new();
-    let (state, remainder) = tokenizer.feed(codes, Box::new(start), true);
-
-    if let Some(ref x) = remainder {
-        if !x.is_empty() {
-            unreachable!("expected no final remainder {:?}", x);
-        }
-    }
-
-    match state {
-        State::Ok => {}
-        _ => unreachable!("expected final state to be `State::Ok`"),
-    }
-
-    tokenizer.events
+pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
+    let mut tokenizer = Tokenizer::new(point, index);
+    tokenizer.feed(codes, Box::new(start), true);
+    subtokenize(tokenizer.events, codes)
 }
 
 /// Before flow.
diff --git a/src/content/string.rs b/src/content/string.rs
index 1239a36..64f544b 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -13,26 +13,14 @@
 use crate::construct::{
     character_escape::start as character_escape, character_reference::start as character_reference,
 };
-use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
 
 /// Turn `codes` as the string content type into events.
 // To do: remove this `allow` when all the content types are glued together.
 #[allow(dead_code)]
-pub fn string(codes: &[Code]) -> Vec<Event> {
-    let mut tokenizer = Tokenizer::new();
-    let (state, remainder) = tokenizer.feed(codes, Box::new(before), true);
-
-    if let Some(ref x) = remainder {
-        if !x.is_empty() {
-            unreachable!("expected no final remainder {:?}", x);
-        }
-    }
-
-    match state {
-        State::Ok => {}
-        _ => unreachable!("expected final state to be `State::Ok`"),
-    }
-
+pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
+    let mut tokenizer = Tokenizer::new(point, index);
+    tokenizer.feed(codes, Box::new(before), true);
     tokenizer.events
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index 1624a22..cf0b05b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod constant;
 mod construct;
 mod content;
 mod parser;
+mod subtokenize;
 mod tokenizer;
 mod util;
 
diff --git a/src/parser.rs b/src/parser.rs
index e156e33..5648942 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,13 +2,21 @@
 // To do: this should start with `containers`, when they’re done.
 // To do: definitions and such will mean more data has to be passed around.
 use crate::content::flow::flow;
-use crate::tokenizer::{as_codes, Code, Event};
+use crate::tokenizer::{as_codes, Code, Event, Point};
 
 /// Turn a string of markdown into events.
 /// Passes the codes back so the compiler can access the source.
 pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) {
     let codes = as_codes(value);
     // To do: pass a reference to this around, and slices in the (back)feeding. Might be tough.
-    let events = flow(&codes);
+    let events = flow(
+        &codes,
+        Point {
+            line: 1,
+            column: 1,
+            offset: 0,
+        },
+        0,
+    );
     (events, codes)
 }
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
new file mode 100644
index 0000000..c1a8435
--- /dev/null
+++ b/src/subtokenize.rs
@@ -0,0 +1,67 @@
+use crate::content::string::string;
+use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::util::{slice_codes, Span};
+
+/// Expand the string chunks in `events` into their final events.
+///
+/// Walks `events` and replaces every `ChunkString` enter/exit pair with the
+/// events that the string content type (`string`) yields for the codes
+/// between the two events.
+/// The tokenizer for that content is created with the point and index of
+/// the `enter` event.
+/// All other events are kept as they are.
+///
+/// The `index` fields of the events are used as a span into `codes`, so
+/// `codes` is expected to be what `events` was tokenized from.
+///
+/// # Panics
+///
+/// Panics if the `enter` of a string chunk is not directly followed by its
+/// `exit`.
+pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> Vec<Event> {
+    let mut events = events;
+    let mut index = 0;
+
+    // `events` is spliced while walking it, so the position is tracked by hand.
+    while index < events.len() {
+        let event = &events[index];
+
+        if event.event_type == EventType::Enter && event.token_type == TokenType::ChunkString {
+            let exit = &events[index + 1];
+
+            assert_eq!(
+                exit.event_type,
+                EventType::Exit,
+                "expected `enter` of `{:?}` to be followed by an `exit` event",
+                event.token_type
+            );
+            assert_eq!(
+                exit.token_type, event.token_type,
+                "expected `exit` of `{:?}` to follow its `enter` event",
+                event.token_type
+            );
+
+            let subevents = string(
+                slice_codes(
+                    codes,
+                    &Span {
+                        start_index: event.index,
+                        end_index: exit.index,
+                    },
+                ),
+                event.point.clone(),
+                event.index,
+            );
+            let len = subevents.len();
+            // To do: recursion needed?
+            // Swap the chunk's two events for the new ones, and continue
+            // after them: inserted events are not scanned again.
+            events.splice(index..(index + 2), subevents);
+            index += len;
+        } else {
+            index += 1;
+        }
+    }
+
+    events
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index faee8d9..35e768e 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -177,16 +177,12 @@ pub struct Tokenizer {
 
 impl Tokenizer {
     /// Create a new tokenizer.
-    pub fn new() -> Tokenizer {
+    pub fn new(point: Point, index: usize) -> Tokenizer {
         Tokenizer {
             current: Code::None,
-            index: 0,
+            index,
             consumed: true,
-            point: Point {
-                line: 1,
-                column: 1,
-                offset: 0,
-            },
+            point,
             stack: vec![],
             events: vec![],
         }
@@ -499,6 +495,11 @@ impl Tokenizer {
             }
         }
 
+        match state {
+            State::Ok => {}
+            _ => unreachable!("expected final state to be `State::Ok`"),
+        }
+
         check_statefn_result((state, None))
     }
 }
diff --git a/src/util.rs b/src/util.rs
index 47359a3..5a916cd 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -165,12 +165,12 @@ pub fn get_span(events: &[Event], index: usize) -> Span {
     assert_eq!(
         exit.event_type,
         EventType::Exit,
-        "expected get_span to be called on `exit` event"
+        "expected `get_span` to be called on `exit` event"
     );
-    let mut start_index = index - 1;
+    let mut enter_index = index - 1;
 
     loop {
-        let enter = &events[start_index];
+        let enter = &events[enter_index];
         if enter.event_type == EventType::Enter && enter.token_type == token_type {
             return Span {
                 // start: enter.point.clone(),
@@ -181,7 +181,7 @@ pub fn get_span(events: &[Event], index: usize) -> Span {
             };
         }
 
-        start_index -= 1;
+        enter_index -= 1;
     }
 }
 
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
index 46fa9cb..6419f67 100644
--- a/tests/code_fenced.rs
+++ b/tests/code_fenced.rs
@@ -219,12 +219,11 @@ fn code_fenced() {
         "should support an eof in the prefix, in content"
     );
 
-    // To do: strings.
-    // assert_eq!(
-    //     micromark("```j\\+s&copy;"),
-    //     "<pre><code class=\"language-j+s©\"></code></pre>\n",
-    //     "should support character escapes and character references in info strings"
-    // );
+    assert_eq!(
+        micromark("```j\\+s&copy;"),
+        "<pre><code class=\"language-j+s©\"></code></pre>\n",
+        "should support character escapes and character references in info strings"
+    );
 
     assert_eq!(
       micromark("   ```\naaa\n    ```"),
author	Titus Wormer <tituswormer@gmail.com>	2022-06-09 15:01:46 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-06-09 15:01:46 +0200
commit	021d5f989ae41ae39a9b937b498141d9dc70d894 (patch)
tree	8009a01d69cbd4f8200ffd34fc4031265b67406e
parent	344c3db875056d4aec509f24fb2dbeaf7e2a14b6 (diff)
download	markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.tar.gz markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.tar.bz2 markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.zip