From b00fafbdcba39e7e17144b07834702629b891062 Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Tue, 14 Jun 2022 18:57:28 +0200
Subject: Fix support for deep subtokenization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Fix a couple of forgotten cases of line ending handling in html (text)
* Fix missing initial case for html (text) not having a `<` 😬
* Add line ending handling to `text` construct
---
src/construct/html_text.rs | 24 +++++-----
src/content/text.rs | 21 ++++++---
src/subtokenize.rs | 28 ++++++++----
tests/html_flow.rs | 11 +++--
tests/html_text.rs | 108 +++++++++++++++++++++------------------------
5 files changed, 102 insertions(+), 90 deletions(-)
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index 95fb8c3..c118006 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -58,10 +58,14 @@ use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer
/// a | b
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.enter(TokenType::HtmlText);
- tokenizer.enter(TokenType::HtmlTextData);
- tokenizer.consume(code);
- (State::Fn(Box::new(open)), None)
+ if Code::Char('<') == code {
+ tokenizer.enter(TokenType::HtmlText);
+ tokenizer.enter(TokenType::HtmlTextData);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(open)), None)
+ } else {
+ (State::Nok, None)
+ }
}
/// After `<`, before a tag name or other stuff.
@@ -582,9 +586,9 @@ pub fn tag_open_attribute_value_quoted(
pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
- tag_open_between(tokenizer, code)
- }
+ Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
Code::Char(_) => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
@@ -603,9 +607,9 @@ pub fn tag_open_attribute_value_quoted_after(
code: Code,
) -> StateFnResult {
match code {
- Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>' | '/') => {
- tag_open_between(tokenizer, code)
- }
+ Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code),
_ => (State::Nok, None),
}
}
diff --git a/src/content/text.rs b/src/content/text.rs
index 73c2d55..433d030 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -52,12 +52,19 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// |qwe
/// ```
fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- if let Code::None = code {
- (State::Ok, None)
- } else {
- tokenizer.enter(TokenType::Data);
- tokenizer.consume(code);
- (State::Fn(Box::new(in_data)), None)
+ match code {
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(start)), None)
+ }
+ _ => {
+ tokenizer.enter(TokenType::Data);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(in_data)), None)
+ }
}
}
@@ -73,7 +80,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
(State::Ok, None)
}
// To do: somehow get these markers from constructs.
- Code::Char('&' | '\\' | '<') => {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '\\' | '<') => {
tokenizer.exit(TokenType::Data);
start(tokenizer, code)
}
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index 35d7672..71a84e1 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -82,12 +82,12 @@ pub fn subtokenize(events: Vec, codes: &[Code]) -> (Vec, bool) {
let mut subindex = 0;
// Index into subevents that starts the current slice.
let mut last_start = 0;
- // Counter into `ends`.
+ // Counter into `ends`: the linked token we are at.
let mut end_index = 0;
let mut index_opt: Option = Some(index);
while subindex < tokenizer.events.len() {
- let subevent = &tokenizer.events[subindex];
+ let subevent = &mut tokenizer.events[subindex];
// Find the first event that starts after the end we’re looking
// for.
@@ -101,11 +101,26 @@ pub fn subtokenize(events: Vec, codes: &[Code]) -> (Vec, bool) {
index_opt = events[link].next;
}
+ // If there is a `next` link in the subevents, we have to change
+ // its index to account for the shifted events.
+ // If it points to a next event, we also change the next event’s
+ // reference back to *this* event.
+ if let Some(next) = subevent.next {
+ // The `index` in `events` where the current link is,
+ // minus 2 events (the enter and exit) for each removed
+ // link.
+ let shift = index_opt.unwrap() - (end_index * 2);
+
+ subevent.next = Some(next + shift);
+ let next_ev = &mut tokenizer.events[next];
+ let previous = next_ev.previous.unwrap();
+ next_ev.previous = Some(previous + shift);
+ }
+
subindex += 1;
}
- let link = index_opt.unwrap();
- link_to_info.insert(link, (index, last_start, subindex));
+ link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex));
head_to_tokenizer.insert(index, tokenizer);
}
@@ -119,11 +134,6 @@ pub fn subtokenize(events: Vec, codes: &[Code]) -> (Vec, bool) {
// from each slice and slices from events?
let mut index = events.len() - 1;
- // To do: this is broken, because it can inject linked events, which point
- // to their links through indices, and this messes with all indices.
- // We should try walking front to end instead, keep a count of the shifted
- // index.
- // It’s a bit complex but should work?
while index > 0 {
let slice_opt = link_to_info.get(&index);
diff --git a/tests/html_flow.rs b/tests/html_flow.rs
index 49a6ea8..6445af3 100644
--- a/tests/html_flow.rs
+++ b/tests/html_flow.rs
@@ -116,12 +116,11 @@ p {color:blue;}
"should support an eof directly after a raw tag name"
);
- // To do: line endings in html text.
- // assert_eq!(
- // micromark_with_options("</script\nmore", DANGER),
- // "<p>&lt;/script\nmore</p>",
- // "should not support a raw closing tag"
- // );
+ assert_eq!(
+ micromark_with_options("</script\nmore", DANGER),
+ "<p>&lt;/script\nmore</p>",
+ "should not support a raw closing tag"
+ );
assert_eq!(
micromark_with_options("", DANGER),
- // "
",
- // "should support whitespace in tags"
- // );
+ assert_eq!(
+ micromark_with_options("", DANGER),
+ "
",
+ "should support whitespace in tags"
+ );
- // To do: line endings.
- // assert_eq!(
- // micromark_with_options(
- // "\"\"\n_boolean zoop:33=zoop:33 />",
- // DANGER
- // ),
- // "\"\"\n_boolean zoop:33=zoop:33 />
",
- // "should support attributes on tags"
- // );
+ assert_eq!(
+ micromark_with_options(
+ "",
+ DANGER
+ ),
+ "
",
+ "should support attributes on tags"
+ );
assert_eq!(
micromark_with_options("Foo ", DANGER),
@@ -67,12 +65,11 @@ fn html_text() {
"should not support nonconforming attribute values"
);
- // To do: line endings.
- // assert_eq!(
- // micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER),
- // "<p>&lt; a&gt;&lt;\nfoo&gt;&lt;bar/ &gt;\n&lt;foo bar=baz\nbim!bop /&gt;</p>",
- // "should not support nonconforming whitespace"
- // );
+ assert_eq!(
+ micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER),
+ "<p>&lt; a&gt;&lt;\nfoo&gt;&lt;bar/ &gt;\n&lt;foo bar=baz\nbim!bop /&gt;</p>",
+ "should not support nonconforming whitespace"
+ );
assert_eq!(
micromark_with_options("", DANGER),
@@ -92,12 +89,11 @@ fn html_text() {
"should not support closing tags w/ attributes"
);
- // To do: line endings.
- // assert_eq!(
- // micromark_with_options("foo ", DANGER),
- // "foo
",
- // "should support comments"
- // );
+ assert_eq!(
+ micromark_with_options("foo ", DANGER),
+ "foo
",
+ "should support comments"
+ );
assert_eq!(
micromark_with_options("foo ", DANGER),
@@ -384,12 +380,11 @@ fn html_text() {
"should not support eof in unquoted attribute value"
);
- // To do: line endings.
- // assert_eq!(
- // micromark_with_options("foo ", DANGER),
- // "foo
",
- // "should support an eol before an attribute value"
- // );
+ assert_eq!(
+ micromark_with_options("foo ", DANGER),
+ "foo
",
+ "should support an eol before an attribute value"
+ );
assert_eq!(
micromark_with_options(" a", DANGER),
@@ -403,32 +398,29 @@ micromark_with_options(" a", DANGER),
"should support an EOF before an attribute value"
);
- // To do: line endings.
- // assert_eq!(
- // micromark_with_options("a ", DANGER),
- // "a
",
- // "should support an EOL in a declaration"
- // );
- // To do: line endings.
- // assert_eq!(
- // micromark_with_options("a ", DANGER),
- // "a
",
- // "should support an EOL in cdata"
- // );
+ assert_eq!(
+ micromark_with_options("a ", DANGER),
+ "a
",
+ "should support an EOL in a declaration"
+ );
+ assert_eq!(
+ micromark_with_options("a ", DANGER),
+ "a
",
+ "should support an EOL in cdata"
+ );
- // To do: line endings.
- // // Note: cmjs parses this differently.
- // // See:
+ // Note: cmjs parses this differently.
+ // See:
+ assert_eq!(
+ micromark_with_options("a \n?>", DANGER),
+ "a \n?>
",
+ "should support an EOL in an instruction"
+ );
+
+ // To do: extensions.
// assert_eq!(
- // micromark_with_options("a \n?>", DANGER),
- // "a \n?>
",
- // "should support an EOL in an instruction"
+ // micromark_with_options("a ", {extensions: [{disable: {null: ["htmlText"]}}]}),
+ // "a <x>
",
+ // "should support turning off html (text)"
// );
-
- // // To do: extensions.
- // // assert_eq!(
- // // micromark_with_options("a ", {extensions: [{disable: {null: ["htmlText"]}}]}),
- // // "a <x>
",
- // // "should support turning off html (text)"
- // // );
}
--
cgit