From b00fafbdcba39e7e17144b07834702629b891062 Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Tue, 14 Jun 2022 18:57:28 +0200
Subject: Fix support for deep subtokenization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

*   Fix a couple of forgotten cases of line ending handling in html (text)
*   Fix missing initial check in html (text) for input not starting with `<` 😬
*   Add line ending handling to `text` construct
---
 src/construct/html_text.rs |  24 +++++-----
 src/content/text.rs        |  21 ++++++---
 src/subtokenize.rs         |  28 ++++++++----
 tests/html_flow.rs         |  11 +++--
 tests/html_text.rs         | 108 +++++++++++++++++++++------------------------
 5 files changed, 102 insertions(+), 90 deletions(-)
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index 95fb8c3..c118006 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -58,10 +58,14 @@ use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer
 /// a |<x> b
 /// ```
 pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    tokenizer.enter(TokenType::HtmlText);
-    tokenizer.enter(TokenType::HtmlTextData);
-    tokenizer.consume(code);
-    (State::Fn(Box::new(open)), None)
+    if Code::Char('<') == code {
+        tokenizer.enter(TokenType::HtmlText);
+        tokenizer.enter(TokenType::HtmlTextData);
+        tokenizer.consume(code);
+        (State::Fn(Box::new(open)), None)
+    } else {
+        (State::Nok, None)
+    }
 }
 
 /// After `<`, before a tag name or other stuff.
@@ -582,9 +586,9 @@ pub fn tag_open_attribute_value_quoted(
 pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),
-        Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
-            tag_open_between(tokenizer, code)
-        }
+        Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
         Code::Char(_) => {
             tokenizer.consume(code);
             (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
@@ -603,9 +607,9 @@ pub fn tag_open_attribute_value_quoted_after(
     code: Code,
 ) -> StateFnResult {
     match code {
-        Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>' | '/') => {
-            tag_open_between(tokenizer, code)
-        }
+        Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code),
         _ => (State::Nok, None),
     }
 }
diff --git a/src/content/text.rs b/src/content/text.rs
index 73c2d55..433d030 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -52,12 +52,19 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// |qwe
 /// ```
 fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    if let Code::None = code {
-        (State::Ok, None)
-    } else {
-        tokenizer.enter(TokenType::Data);
-        tokenizer.consume(code);
-        (State::Fn(Box::new(in_data)), None)
+    match code {
+        Code::None => (State::Ok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (State::Fn(Box::new(start)), None)
+        }
+        _ => {
+            tokenizer.enter(TokenType::Data);
+            tokenizer.consume(code);
+            (State::Fn(Box::new(in_data)), None)
+        }
     }
 }
 
@@ -73,7 +80,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
             (State::Ok, None)
         }
         // To do: somehow get these markers from constructs.
-        Code::Char('&' | '\\' | '<') => {
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '\\' | '<') => {
             tokenizer.exit(TokenType::Data);
             start(tokenizer, code)
         }
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index 35d7672..71a84e1 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -82,12 +82,12 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
             let mut subindex = 0;
             // Index into subevents that starts the current slice.
             let mut last_start = 0;
-            // Counter into `ends`.
+            // Counter into `ends`: the linked token we are at.
             let mut end_index = 0;
             let mut index_opt: Option<usize> = Some(index);
 
             while subindex < tokenizer.events.len() {
-                let subevent = &tokenizer.events[subindex];
+                let subevent = &mut tokenizer.events[subindex];
 
                 // Find the first event that starts after the end we’re looking
                 // for.
@@ -101,11 +101,26 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
                     index_opt = events[link].next;
                 }
 
+                // If there is a `next` link in the subevents, we have to change
+                // its index to account for the shifted events.
+                // If it points to a next event, we also change the next event’s
+                // reference back to *this* event.
+                if let Some(next) = subevent.next {
+                    // The `index` in `events` where the current link is,
+                    // minus 2 events (the enter and exit) for each removed
+                    // link.
+                    let shift = index_opt.unwrap() - (end_index * 2);
+
+                    subevent.next = Some(next + shift);
+                    let next_ev = &mut tokenizer.events[next];
+                    let previous = next_ev.previous.unwrap();
+                    next_ev.previous = Some(previous + shift);
+                }
+
                 subindex += 1;
             }
 
-            let link = index_opt.unwrap();
-            link_to_info.insert(link, (index, last_start, subindex));
+            link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex));
             head_to_tokenizer.insert(index, tokenizer);
         }
 
@@ -119,11 +134,6 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
     // from each slice and slices from events?
     let mut index = events.len() - 1;
 
-    // To do: this is broken, because it can inject linked events, which point
-    // to their links through indices, and this messes with all indices.
-    // We should try walking front to end instead, keep a count of the shifted
-    // index.
-    // It’s a bit complex but should work?
     while index > 0 {
         let slice_opt = link_to_info.get(&index);
 
diff --git a/tests/html_flow.rs b/tests/html_flow.rs
index 49a6ea8..6445af3 100644
--- a/tests/html_flow.rs
+++ b/tests/html_flow.rs
@@ -116,12 +116,11 @@ p {color:blue;}
         "should support an eof directly after a raw tag name"
     );
 
-    // To do: line endings in html text.
-    // assert_eq!(
-    //     micromark_with_options("</script\nmore", DANGER),
-    //     "<p>&lt;/script\nmore</p>",
-    //     "should not support a raw closing tag"
-    // );
+    assert_eq!(
+        micromark_with_options("</script\nmore", DANGER),
+        "<p>&lt;/script\nmore</p>",
+        "should not support a raw closing tag"
+    );
 
     assert_eq!(
         micromark_with_options("<script/", DANGER),
diff --git a/tests/html_text.rs b/tests/html_text.rs
index 6ec387b..1f85ac4 100644
--- a/tests/html_text.rs
+++ b/tests/html_text.rs
@@ -26,22 +26,20 @@ fn html_text() {
         "should support self-closing tags"
     );
 
-    // To do: line endings.
-    // assert_eq!(
-    //     micromark_with_options("<a  /><b2\ndata=\"foo\" >", DANGER),
-    //     "<p><a  /><b2\ndata=\"foo\" ></p>",
-    //     "should support whitespace in tags"
-    // );
+    assert_eq!(
+        micromark_with_options("<a  /><b2\ndata=\"foo\" >", DANGER),
+        "<p><a  /><b2\ndata=\"foo\" ></p>",
+        "should support whitespace in tags"
+    );
 
-    // To do: line endings.
-    // assert_eq!(
-    //     micromark_with_options(
-    //         "<a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 />",
-    //         DANGER
-    //     ),
-    //     "<p><a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 /></p>",
-    //     "should support attributes on tags"
-    // );
+    assert_eq!(
+        micromark_with_options(
+            "<a foo=\"bar\" bam = 'baz <em>\"</em>'\n_boolean zoop:33=zoop:33 />",
+            DANGER
+        ),
+        "<p><a foo=\"bar\" bam = 'baz <em>\"</em>'\n_boolean zoop:33=zoop:33 /></p>",
+        "should support attributes on tags"
+    );
 
     assert_eq!(
         micromark_with_options("Foo <responsive-image src=\"foo.jpg\" />", DANGER),
@@ -67,12 +65,11 @@ fn html_text() {
         "should not support nonconforming attribute values"
     );
 
-    // To do: line endings.
-    // assert_eq!(
-    //     micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER),
-    //     "<p>&lt; a&gt;&lt;\nfoo&gt;&lt;bar/ &gt;\n&lt;foo bar=baz\nbim!bop /&gt;</p>",
-    //     "should not support nonconforming whitespace"
-    // );
+    assert_eq!(
+        micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER),
+        "<p>&lt; a&gt;&lt;\nfoo&gt;&lt;bar/ &gt;\n&lt;foo bar=baz\nbim!bop /&gt;</p>",
+        "should not support nonconforming whitespace"
+    );
 
     assert_eq!(
         micromark_with_options("<a href='bar'title=title>", DANGER),
@@ -92,12 +89,11 @@ fn html_text() {
         "should not support closing tags w/ attributes"
     );
 
-    // To do: line endings.
-    //     assert_eq!(
-    //         micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER),
-    //         "<p>foo <!-- this is a\ncomment - with hyphen --></p>",
-    //         "should support comments"
-    //     );
+    assert_eq!(
+        micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER),
+        "<p>foo <!-- this is a\ncomment - with hyphen --></p>",
+        "should support comments"
+    );
 
     assert_eq!(
         micromark_with_options("foo <!-- not a comment -- two hyphens -->", DANGER),
@@ -384,12 +380,11 @@ fn html_text() {
         "should not support eof in unquoted attribute value"
     );
 
-    // To do: line endings.
-    // assert_eq!(
-    //     micromark_with_options("foo <a b=\nasd>", DANGER),
-    //     "<p>foo <a b=\nasd></p>",
-    //     "should support an eol before an attribute value"
-    // );
+    assert_eq!(
+        micromark_with_options("foo <a b=\nasd>", DANGER),
+        "<p>foo <a b=\nasd></p>",
+        "should support an eol before an attribute value"
+    );
 
     assert_eq!(
 micromark_with_options("<x> a", DANGER),
@@ -403,32 +398,29 @@ micromark_with_options("<x> a", DANGER),
         "should support an EOF before an attribute value"
     );
 
-    // To do: line endings.
-    // assert_eq!(
-    //     micromark_with_options("a <!b\nc>", DANGER),
-    //     "<p>a <!b\nc></p>",
-    //     "should support an EOL in a declaration"
-    // );
-    // To do: line endings.
-    // assert_eq!(
-    //     micromark_with_options("a <![CDATA[\n]]>", DANGER),
-    //     "<p>a <![CDATA[\n]]></p>",
-    //     "should support an EOL in cdata"
-    // );
+    assert_eq!(
+        micromark_with_options("a <!b\nc>", DANGER),
+        "<p>a <!b\nc></p>",
+        "should support an EOL in a declaration"
+    );
+    assert_eq!(
+        micromark_with_options("a <![CDATA[\n]]>", DANGER),
+        "<p>a <![CDATA[\n]]></p>",
+        "should support an EOL in cdata"
+    );
 
-    // To do: line endings.
-    // // Note: cmjs parses this differently.
-    // // See: <https://github.com/commonmark/commonmark.js/issues/196>
+    // Note: cmjs parses this differently.
+    // See: <https://github.com/commonmark/commonmark.js/issues/196>
+    assert_eq!(
+        micromark_with_options("a <?\n?>", DANGER),
+        "<p>a <?\n?></p>",
+        "should support an EOL in an instruction"
+    );
+
+    // To do: extensions.
     // assert_eq!(
-    //     micromark_with_options("a <?\n?>", DANGER),
-    //     "<p>a <?\n?></p>",
-    //     "should support an EOL in an instruction"
+    //     micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}),
+    //     "<p>a &lt;x&gt;</p>",
+    //     "should support turning off html (text)"
     // );
-
-    //     // To do: extensions.
-    //     // assert_eq!(
-    //     //     micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}),
-    //     //     "<p>a &lt;x&gt;</p>",
-    //     //     "should support turning off html (text)"
-    //     // );
 }
-- 
cgit