about summary refs log tree commit diff stats
path: root/src/tokenizer.rs
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-10 16:29:56 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-10 16:29:56 +0200
commit5133042973f31a3992f216e591d840bb491bfd45 (patch)
tree810a44ac1d98f65dd2eedd0d9e8387eac0753e25 /src/tokenizer.rs
parent021d5f989ae41ae39a9b937b498141d9dc70d894 (diff)
downloadmarkdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.gz
markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.bz2
markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.zip
Add proper support for subtokenization
- Add “content” content type - Add paragraph - Add skips - Add linked tokens
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r--src/tokenizer.rs40
1 file changed, 37 insertions, 3 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 35e768e..1746a19 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -12,6 +12,7 @@
//! [`check`]: Tokenizer::check
use crate::constant::TAB_SIZE;
+use std::collections::HashMap;
/// Semantic label of a span.
// To do: figure out how to share this so extensions can add their own stuff,
@@ -64,7 +65,10 @@ pub enum TokenType {
Content,
ContentChunk,
+ Paragraph,
+
ChunkString,
+ ChunkText,
}
/// Enum representing a character code.
@@ -101,7 +105,7 @@ pub struct Point {
}
/// Possible event types.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
pub enum EventType {
/// The start of something.
Enter,
@@ -110,12 +114,14 @@ pub enum EventType {
}
/// Something semantic happening somewhere.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct Event {
pub event_type: EventType,
pub token_type: TokenType,
pub point: Point,
pub index: usize,
+ pub previous: Option<usize>,
+ pub next: Option<usize>,
}
/// The essence of the state machine are functions: `StateFn`.
@@ -156,6 +162,7 @@ struct InternalState {
/// A tokenizer itself.
#[derive(Debug)]
pub struct Tokenizer {
+ column_start: HashMap<usize, usize>,
/// Track whether a character is expected to be consumed, and whether it’s
/// actually consumed
///
@@ -180,6 +187,7 @@ impl Tokenizer {
pub fn new(point: Point, index: usize) -> Tokenizer {
Tokenizer {
current: Code::None,
+ column_start: HashMap::new(),
index,
consumed: true,
point,
@@ -195,6 +203,28 @@ impl Tokenizer {
self.current = code;
}
+ pub fn define_skip(&mut self, point: &Point, index: usize) {
+ self.column_start.insert(point.line, point.column);
+ self.account_for_potential_skip();
+ log::debug!("position: define skip: `{:?}` ({:?})", point, index);
+ }
+
+ fn account_for_potential_skip(&mut self) {
+ println!("account?: {:?} {:?}", self.point, self.index);
+ match self.column_start.get(&self.point.line) {
+ None => {}
+ Some(next_column) => {
+ if self.point.column == 1 {
+ let col = *next_column;
+ self.point.column = col;
+ self.point.offset += col - 1;
+ self.index += col - 1;
+ println!("account! {:?} {:?}", self.point, self.index);
+ }
+ }
+ };
+ }
+
/// Consume the current character.
/// Each [`StateFn`][] is expected to call this to signal that this code is
/// used, or call a next `StateFn`.
@@ -215,7 +245,7 @@ impl Tokenizer {
} else {
1
};
- // To do: accountForPotentialSkip()
+ self.account_for_potential_skip();
log::debug!("position: after eol: `{:?}`", self.point);
}
Code::VirtualSpace => {
@@ -240,6 +270,8 @@ impl Tokenizer {
token_type: token_type.clone(),
point: self.point.clone(),
index: self.index,
+ previous: None,
+ next: None,
};
self.events.push(event);
@@ -270,6 +302,8 @@ impl Tokenizer {
token_type,
point,
index: self.index,
+ previous: None,
+ next: None,
};
self.events.push(event);