| | |
|---|---|
| author | 2022-06-10 16:29:56 +0200 |
| committer | 2022-06-10 16:29:56 +0200 |
| commit | 5133042973f31a3992f216e591d840bb491bfd45 (patch) |
| tree | 810a44ac1d98f65dd2eedd0d9e8387eac0753e25 /src |
| parent | 021d5f989ae41ae39a9b937b498141d9dc70d894 (diff) |
| download | markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.gz markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.bz2 markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.zip |
Add proper support for subtokenization
- Add “content” content type
- Add paragraph
- Add skips
- Add linked tokens
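
For readers skimming the patch below: the “linked tokens” idea is that one logical run of content is stored as several chunk events (one per line), each pointing at the previous and next chunk, so a subtokenizer can later re-parse the whole chain as a single stream. The following is a minimal sketch of that linking; the `previous`/`next` fields mirror the `Event` struct in the diff, but `collect_chunks` and the sample data are purely illustrative and not part of the patch.

```rust
// Illustrative sketch only: `previous`/`next` mirror the fields added to
// `Event` in `src/tokenizer.rs`; `collect_chunks` and the sample data are
// hypothetical, written here just to show how a chain of chunks is walked.
#[derive(Debug)]
struct Event {
    index: usize,            // offset into the character codes
    previous: Option<usize>, // earlier chunk of the same content, if any
    next: Option<usize>,     // later chunk of the same content, if any
}

/// Start at a head chunk (one with `previous == None`) and follow `next`
/// links, returning the event indices of the whole chain in order.
fn collect_chunks(events: &[Event], head: usize) -> Vec<usize> {
    assert!(events[head].previous.is_none(), "expected a head chunk");
    let mut chain = vec![head];
    let mut current = head;
    while let Some(next) = events[current].next {
        chain.push(next);
        current = next;
    }
    chain
}

fn main() {
    // Three chunk enters at indices 0, 2, and 4, linked the way
    // `content_continue` links them in the patch; indices 1 and 3 stand in
    // for unrelated events in between.
    let events = vec![
        Event { index: 0, previous: None, next: Some(2) },
        Event { index: 5, previous: None, next: None },
        Event { index: 6, previous: Some(0), next: Some(4) },
        Event { index: 11, previous: None, next: None },
        Event { index: 12, previous: Some(2), next: None },
    ];
    assert_eq!(collect_chunks(&events, 0), vec![0, 2, 4]);
}
```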
Diffstat (limited to '')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/compiler.rs | 23 |
| -rw-r--r-- | src/content/content.rs | 84 |
| -rw-r--r-- | src/content/flow.rs | 45 |
| -rw-r--r-- | src/content/mod.rs | 2 |
| -rw-r--r-- | src/content/string.rs | 42 |
| -rw-r--r-- | src/subtokenize.rs | 166 |
| -rw-r--r-- | src/tokenizer.rs | 40 |

7 files changed, 279 insertions, 123 deletions
```diff
diff --git a/src/compiler.rs b/src/compiler.rs
index 3632d29..05a56e1 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -38,7 +38,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
         match event.event_type {
             EventType::Enter => match token_type {
-                TokenType::Content => {
+                TokenType::Paragraph => {
                     buf_tail_mut(buffers).push("<p>".to_string());
                 }
                 TokenType::CodeIndented => {
@@ -62,7 +62,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                         ignore_encode = true;
                     }
                 }
-                TokenType::ContentChunk
+                TokenType::Content
                 | TokenType::AtxHeading
                 | TokenType::AtxHeadingSequence
                 | TokenType::AtxHeadingWhitespace
@@ -79,7 +79,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 | TokenType::HtmlFlowData
                 | TokenType::CodeFencedFence
                 | TokenType::CodeFencedFenceSequence
-                | TokenType::ChunkString
+                | TokenType::ChunkText
                 | TokenType::CodeFencedFenceWhitespace
                 | TokenType::Data
                 | TokenType::CharacterEscape
@@ -97,7 +97,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 }
             },
             EventType::Exit => match token_type {
-                TokenType::ThematicBreakSequence
+                TokenType::Content
+                | TokenType::ThematicBreakSequence
                 | TokenType::ThematicBreakWhitespace
                 | TokenType::CodeIndentedPrefixWhitespace
                 | TokenType::BlankLineEnding
@@ -120,7 +121,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     // last_was_tag = false;
                     buf_tail_mut(buffers).push(res);
                 }
-                TokenType::Content => {
+                TokenType::Paragraph => {
                     buf_tail_mut(buffers).push("</p>".to_string());
                 }
                 TokenType::CodeIndented | TokenType::CodeFenced => {
@@ -278,17 +279,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     character_reference_kind = None;
                 }
-                // To do: `ContentPhrasing` should be parsed as phrasing first.
                 // This branch below currently acts as the resulting `data` tokens.
-                // To do: initial and final whitespace should be handled in `text`.
-                TokenType::ContentChunk => {
-                    // last_was_tag = false;
-                    buf_tail_mut(buffers).push(encode(
-                        slice_serialize(codes, &get_span(events, index), false).trim(),
-                    ));
-                }
-                // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported.
-                TokenType::ChunkString | TokenType::Data | TokenType::CharacterEscapeValue => {
+                // To do: `ChunkText` does not belong here. Remove it when subtokenization is supported.
+                TokenType::ChunkText | TokenType::Data | TokenType::CharacterEscapeValue => {
                     // last_was_tag = false;
                     buf_tail_mut(buffers).push(encode(&slice_serialize(
                         codes,
diff --git a/src/content/content.rs b/src/content/content.rs
new file mode 100644
index 0000000..7bf692f
--- /dev/null
+++ b/src/content/content.rs
@@ -0,0 +1,84 @@
+//! The `content`, ahum, content type.
+//!
+//! **Content** is zero or more definitions, and then zero or one paragraph.
+//! It’s a weird one, and needed to make certain edge cases around definitions
+//! spec compliant.
+//! Definitions are unlike other things in markdown, in that they behave like
+//! **text** in that they can contain arbitrary line endings, but *have* to end
+//! at a line ending.
+//! If they end in something else, the whole definition instead is seen as a
+//! paragraph.
+//!
+//! The constructs found in content are:
+//!
+//! *   Definition
+//! *   Paragraph
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Before content.
+///
+/// ```markdown
+/// |[x]: y
+/// |asd
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            unreachable!("expected non-eol/eof");
+        }
+        _ => paragraph_initial(tokenizer, code)
+        // To do: definition.
+        // _ => tokenizer.attempt(definition, |ok| {
+        //     Box::new(if ok {
+        //         a
+        //     } else {
+        //         b
+        //     })
+        // })(tokenizer, code),
+    }
+}
+
+/// Before a paragraph.
+///
+/// ```markdown
+/// |asd
+/// ```
+fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            unreachable!("expected non-eol/eof");
+        }
+        _ => {
+            tokenizer.enter(TokenType::Paragraph);
+            tokenizer.enter(TokenType::ChunkText);
+            data(tokenizer, code)
+        }
+    }
+}
+
+/// In a line in a paragraph.
+///
+/// ```markdown
+/// |\&
+/// |qwe
+/// ```
+fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => {
+            tokenizer.exit(TokenType::ChunkText);
+            tokenizer.exit(TokenType::Paragraph);
+            (State::Ok, None)
+        }
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::ChunkText);
+            tokenizer.enter(TokenType::ChunkText);
+            (State::Fn(Box::new(data)), None)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(data)), None)
+        }
+    }
+}
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 6f94424..0d1bd22 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -31,8 +31,6 @@ use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Toke
 use crate::util::get_span;
 
 /// Turn `codes` as the flow content type into events.
-// To do: remove this `allow` when all the content types are glued together.
-#[allow(dead_code)]
 pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
     let mut tokenizer = Tokenizer::new(point, index);
     tokenizer.feed(codes, Box::new(start), true);
@@ -49,7 +47,7 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
 /// |    bravo
 /// |***
 /// ```
-fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
         _ => tokenizer.attempt(blank_line, |ok| {
@@ -168,7 +166,7 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         _ => {
             tokenizer.enter(TokenType::Content);
             tokenizer.enter(TokenType::ContentChunk);
-            content(tokenizer, code)
+            content(tokenizer, code, tokenizer.events.len() - 1)
         }
     }
 }
@@ -178,21 +176,26 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// al|pha
 /// ```
 // To do: lift limitations as documented above.
-fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult {
     match code {
-        Code::None => {
-            tokenizer.exit(TokenType::ContentChunk);
-            content_end(tokenizer, code)
-        }
+        Code::None => content_end(tokenizer, code),
         Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            tokenizer.exit(TokenType::ContentChunk);
-            tokenizer.check(continuation_construct, |ok| {
-                Box::new(if ok { content_continue } else { content_end })
+            tokenizer.check(continuation_construct, move |ok| {
+                Box::new(move |t, c| {
+                    if ok {
+                        content_continue(t, c, previous)
+                    } else {
+                        content_end(t, c)
+                    }
+                })
             })(tokenizer, code)
         }
         _ => {
             tokenizer.consume(code);
-            (State::Fn(Box::new(content)), None)
+            (
+                State::Fn(Box::new(move |t, c| content(t, c, previous))),
+                None,
+            )
         }
     }
 }
@@ -254,17 +257,21 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) ->
     }
 }
 
-fn content_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    // To do: should this be part of the content chunk?
-    // That’s what `micromark-js` does.
-    tokenizer.enter(TokenType::LineEnding);
+fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
     tokenizer.consume(code);
-    tokenizer.exit(TokenType::LineEnding);
+    tokenizer.exit(TokenType::ContentChunk);
     tokenizer.enter(TokenType::ContentChunk);
-    (State::Fn(Box::new(content)), None)
+    let next_index = tokenizer.events.len() - 1;
+    tokenizer.events[previous_index].next = Some(next_index);
+    tokenizer.events[next_index].previous = Some(previous_index);
+    (
+        State::Fn(Box::new(move |t, c| content(t, c, next_index))),
+        None,
+    )
 }
 
 fn content_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::ContentChunk);
     tokenizer.exit(TokenType::Content);
     after(tokenizer, code)
 }
diff --git a/src/content/mod.rs b/src/content/mod.rs
index d5771a3..4c0a7f4 100644
--- a/src/content/mod.rs
+++ b/src/content/mod.rs
@@ -1,4 +1,6 @@
 //! Content types found in markdown.
 
+#[allow(clippy::module_inception)]
+pub mod content;
 pub mod flow;
 pub mod string;
diff --git a/src/content/string.rs b/src/content/string.rs
index 64f544b..ff9e3fc 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -5,7 +5,7 @@
 //! It exists in things such as identifiers (media references, definitions),
 //! titles, URLs, code (fenced) info and meta parts.
 //!
-//! The constructs found in strin are:
+//! The constructs found in string are:
 //!
 //! *   [Character escape][crate::construct::character_escape]
 //! *   [Character reference][crate::construct::character_reference]
@@ -13,16 +13,7 @@
 use crate::construct::{
     character_escape::start as character_escape, character_reference::start as character_reference,
 };
-use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
-
-/// Turn `codes` as the string content type into events.
-// To do: remove this `allow` when all the content types are glued together.
-#[allow(dead_code)]
-pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
-    let mut tokenizer = Tokenizer::new(point, index);
-    tokenizer.feed(codes, Box::new(before), true);
-    tokenizer.events
-}
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 
 /// Before string.
 ///
@@ -33,33 +24,12 @@ pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
 /// |\&
 /// |qwe
 /// ```
-fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    match code {
-        Code::None => (State::Ok, None),
-        _ => tokenizer.attempt(character_reference, |ok| {
-            Box::new(if ok {
-                before
-            } else {
-                before_not_character_reference
-            })
-        })(tokenizer, code),
-    }
-}
-
-/// Before string, not at a character reference.
-///
-/// Assume character escape.
-///
-/// ```markdown
-/// |\&
-/// |qwe
-/// ```
-fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
-        _ => tokenizer.attempt(character_escape, |ok| {
+        _ => tokenizer.attempt_2(character_reference, character_escape, |ok| {
             Box::new(if ok {
-                before
+                start
             } else {
                 before_not_character_escape
             })
@@ -98,7 +68,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         // To do: somehow get these markers from constructs.
         Code::Char('&' | '\\') => {
             tokenizer.exit(TokenType::Data);
-            before(tokenizer, code)
+            start(tokenizer, code)
         }
         _ => {
             tokenizer.consume(code);
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index c1a8435..adf843f 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -1,66 +1,132 @@
-use crate::content::string::string;
-use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::content::content::start as content;
+use crate::content::string::start as string;
+use crate::tokenizer::{
+    Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer,
+};
 use crate::util::{slice_codes, Span};
+use std::collections::HashMap;
 
 pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> Vec<Event> {
     let mut events = events;
     let mut index = 0;
-
-    // println!("before");
-    // while index < events.len() {
-    //     let event = &events[index];
-    //     println!(
-    //         "ev1: {:?} {:?} {:?}",
-    //         event.event_type, event.token_type, index
-    //     );
-    //     index += 1;
-    // }
-    //
-    // index = 0;
-    //
-    // println!("change");
+    // Map of first chunks its tokenizer.
+    let mut head_to_tokenizer: HashMap<usize, Tokenizer> = HashMap::new();
+    // Map of chunks to their head and corresponding range of events.
+    let mut link_to_info: HashMap<usize, (usize, usize, usize)> = HashMap::new();
 
     while index < events.len() {
         let event = &events[index];
 
-        // println!(
-        //     "ev2: {:?} {:?} {:?}",
-        //     event.event_type, event.token_type, index
-        // );
+        // Find each first opening chunk.
+        if (event.token_type == TokenType::ChunkString
+                || event.token_type == TokenType::ContentChunk) &&
+            event.event_type == EventType::Enter &&
+            // No need to enter linked events again.
+            event.previous == None
+        {
+            // Index into `events` pointing to a chunk.
+            let mut index_opt: Option<usize> = Some(index);
+            // Subtokenizer.
+            let mut tokenizer = Tokenizer::new(event.point.clone(), event.index);
+            // Substate.
+            let mut result: StateFnResult = (
+                State::Fn(Box::new(if event.token_type == TokenType::ContentChunk {
+                    content
+                } else {
+                    string
+                })),
+                None,
+            );
+            // Indices into `codes` of each end of chunk.
+            let mut ends: Vec<usize> = vec![];
 
-        if event.event_type == EventType::Enter && event.token_type == TokenType::ChunkString {
-            let exit = &events[index + 1];
+            // Loop through chunks to pass them in order to the subtokenizer.
+            while let Some(index_ptr) = index_opt {
+                let enter = &events[index_ptr];
+                let span = Span {
+                    start_index: enter.index,
+                    end_index: events[index_ptr + 1].index,
+                };
+                ends.push(span.end_index);
 
-            assert_eq!(
-                exit.event_type,
-                EventType::Exit,
-                "expected `enter` of `{:?}` to be follow by an `exit` event",
-                event.token_type
-            );
-            assert_eq!(
-                exit.token_type, event.token_type,
-                "expected `exit` of `{:?}` to follow its `enter` event",
-                event.token_type
-            );
+                if enter.previous != None {
+                    tokenizer.define_skip(&enter.point, span.start_index);
+                }
 
-            let subevents = string(
-                slice_codes(
-                    codes,
-                    &Span {
-                        start_index: event.index,
-                        end_index: exit.index,
-                    },
-                ),
-                event.point.clone(),
-                event.index,
-            );
-            let len = subevents.len();
-            // To do: recursion needed?
-            events.splice(index..(index + 2), subevents);
-            index += len;
-        } else {
-            index += 1;
+                let func: Box<StateFn> = match result.0 {
+                    State::Fn(func) => func,
+                    _ => unreachable!("cannot be ok/nok"),
+                };
+
+                result = tokenizer.feed(slice_codes(codes, &span), func, enter.next == None);
+
+                if let Some(ref x) = result.1 {
+                    if !x.is_empty() {
+                        // To do: handle?
+                        unreachable!("subtokenize:remainder {:?}", x);
+                    }
+                }
+
+                index_opt = enter.next;
+            }
+
+            // Now, loop through all subevents (and `ends`), to figure out
+            // which parts belong where.
+            // Current index.
+            let mut subindex = 0;
+            // Index into subevents that starts the current slice.
+            let mut last_start = 0;
+            // Counter into `ends`.
+            let mut end_index = 0;
+            let mut index_opt: Option<usize> = Some(index);
+
+            while subindex < tokenizer.events.len() {
+                let subevent = &tokenizer.events[subindex];
+
+                // Find the first event that starts after the end we’re looking
+                // for.
+                // To do: is this logic correct?
+                if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] {
+                    let link = index_opt.unwrap();
+                    link_to_info.insert(link, (index, last_start, subindex));
+
+                    last_start = subindex;
+                    end_index += 1;
+                    index_opt = events[link].next;
+                }
+
+                subindex += 1;
+            }
+
+            let link = index_opt.unwrap();
+            link_to_info.insert(link, (index, last_start, subindex));
+            head_to_tokenizer.insert(index, tokenizer);
         }
+
+        index += 1;
+    }
+
+    // Now that we fed everything into a tokenizer, and we know which parts
+    // belong where, the final task is to splice the events from each
+    // tokenizer into the current events.
+    // To do: instead of splicing, it might be possible to create a new `events`
+    // from each slice and slices from events?
+    let mut index = events.len() - 1;
+
+    while index > 0 {
+        let slice_opt = link_to_info.get(&index);
+
+        if let Some(slice) = slice_opt {
+            let (head, start, end) = *slice;
+            // If there’s a slice at this index, it must also point to a head,
+            // and that head must have a tokenizer.
+            let tokenizer = head_to_tokenizer.get(&head).unwrap();
+
+            // To do: figure out a way that moves instead of clones?
+            events.splice(index..(index + 2), tokenizer.events[start..end].to_vec());
+        }
+
+        index -= 1;
     }
 
     events
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 35e768e..1746a19 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -12,6 +12,7 @@
 //! [`check`]: Tokenizer::check
 
 use crate::constant::TAB_SIZE;
+use std::collections::HashMap;
 
 /// Semantic label of a span.
 // To do: figure out how to share this so extensions can add their own stuff,
@@ -64,7 +65,10 @@ pub enum TokenType {
     Content,
     ContentChunk,
 
+    Paragraph,
+
     ChunkString,
+    ChunkText,
 }
 
 /// Enum representing a character code.
@@ -101,7 +105,7 @@ pub struct Point {
 }
 
 /// Possible event types.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 pub enum EventType {
     /// The start of something.
     Enter,
@@ -110,12 +114,14 @@
 }
 
 /// Something semantic happening somewhere.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct Event {
     pub event_type: EventType,
     pub token_type: TokenType,
     pub point: Point,
     pub index: usize,
+    pub previous: Option<usize>,
+    pub next: Option<usize>,
 }
 
 /// The essence of the state machine are functions: `StateFn`.
@@ -156,6 +162,7 @@ struct InternalState {
 /// A tokenizer itself.
 #[derive(Debug)]
 pub struct Tokenizer {
+    column_start: HashMap<usize, usize>,
     /// Track whether a character is expected to be consumed, and whether it’s
     /// actually consumed
     ///
@@ -180,6 +187,7 @@ impl Tokenizer {
     pub fn new(point: Point, index: usize) -> Tokenizer {
         Tokenizer {
             current: Code::None,
+            column_start: HashMap::new(),
             index,
             consumed: true,
             point,
@@ -195,6 +203,28 @@ impl Tokenizer {
         self.current = code;
     }
 
+    pub fn define_skip(&mut self, point: &Point, index: usize) {
+        self.column_start.insert(point.line, point.column);
+        self.account_for_potential_skip();
+        log::debug!("position: define skip: `{:?}` ({:?})", point, index);
+    }
+
+    fn account_for_potential_skip(&mut self) {
+        println!("account?: {:?} {:?}", self.point, self.index);
+        match self.column_start.get(&self.point.line) {
+            None => {}
+            Some(next_column) => {
+                if self.point.column == 1 {
+                    let col = *next_column;
+                    self.point.column = col;
+                    self.point.offset += col - 1;
+                    self.index += col - 1;
+                    println!("account! {:?} {:?}", self.point, self.index);
+                }
+            }
+        };
+    }
+
     /// Consume the current character.
     /// Each [`StateFn`][] is expected to call this to signal that this code is
     /// used, or call a next `StateFn`.
@@ -215,7 +245,7 @@ impl Tokenizer {
                 } else {
                     1
                 };
-                // To do: accountForPotentialSkip()
+                self.account_for_potential_skip();
                 log::debug!("position: after eol: `{:?}`", self.point);
             }
             Code::VirtualSpace => {
@@ -240,6 +270,8 @@ impl Tokenizer {
             token_type: token_type.clone(),
             point: self.point.clone(),
             index: self.index,
+            previous: None,
+            next: None,
         };
 
         self.events.push(event);
@@ -270,6 +302,8 @@ impl Tokenizer {
             token_type,
             point,
             index: self.index,
+            previous: None,
+            next: None,
         };
 
         self.events.push(event);
```
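
The skip bookkeeping added to `Tokenizer` above (`define_skip` and `account_for_potential_skip`) exists because linked chunks on later lines do not start at column 1: the outer flow tokenizer has already consumed a prefix, so after each line ending the subtokenizer has to jump its point and code index forward to the recorded column. The following is a rough standalone sketch of that adjustment, with simplified stand-in types; `Position` replaces the crate's `Point`, and the concrete numbers are invented for the example.

```rust
// Standalone sketch of the column-skip idea; `Position` is a stand-in for the
// crate's `Point`, and all concrete numbers are invented for the example.
use std::collections::HashMap;

#[derive(Debug)]
struct Position {
    line: usize,
    column: usize,
    offset: usize,
}

struct Skips {
    /// Map of line number to the column where parsing resumes on that line.
    column_start: HashMap<usize, usize>,
}

impl Skips {
    /// Record that `line` effectively starts at `column` (a prefix before it
    /// was already consumed by the outer tokenizer).
    fn define(&mut self, line: usize, column: usize) {
        self.column_start.insert(line, column);
    }

    /// After crossing a line ending: if we are at column 1 of a line with a
    /// recorded skip, jump the point and the code index past the prefix.
    fn apply(&self, point: &mut Position, index: &mut usize) {
        if point.column == 1 {
            if let Some(&col) = self.column_start.get(&point.line) {
                point.column = col;
                point.offset += col - 1;
                *index += col - 1;
            }
        }
    }
}

fn main() {
    let mut skips = Skips { column_start: HashMap::new() };
    // Say line 2 of the document really starts at column 3 for this content.
    skips.define(2, 3);

    let mut point = Position { line: 2, column: 1, offset: 10 };
    let mut index = 10;
    skips.apply(&mut point, &mut index);

    assert_eq!((point.column, point.offset, index), (3, 12, 12));
}
```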
