diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-14 18:57:28 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-14 18:57:28 +0200 |
commit | b00fafbdcba39e7e17144b07834702629b891062 (patch) | |
tree | 3351cc3ad2bb126d8a93e1ff6b1731bc00cb45c3 | |
parent | 129ea34b18aaf7f5a01d404effbdc78cbbe67a74 (diff) | |
download | markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.gz markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.bz2 markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.zip |
Fix support for deep subtokenization
* Fix a couple of forgotten line ending handling cases in html (text)
* Fix missing initial case for html (text) not having a `<` 😬
* Add line ending handling to `text` construct
-rw-r--r-- | src/construct/html_text.rs | 24 | ||||
-rw-r--r-- | src/content/text.rs | 21 | ||||
-rw-r--r-- | src/subtokenize.rs | 28 | ||||
-rw-r--r-- | tests/html_flow.rs | 11 | ||||
-rw-r--r-- | tests/html_text.rs | 108 |
5 files changed, 102 insertions, 90 deletions
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 95fb8c3..c118006 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -58,10 +58,14 @@ use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer /// a |<x> b /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.enter(TokenType::HtmlText); - tokenizer.enter(TokenType::HtmlTextData); - tokenizer.consume(code); - (State::Fn(Box::new(open)), None) + if Code::Char('<') == code { + tokenizer.enter(TokenType::HtmlText); + tokenizer.enter(TokenType::HtmlTextData); + tokenizer.consume(code); + (State::Fn(Box::new(open)), None) + } else { + (State::Nok, None) + } } /// After `<`, before a tag name or other stuff. @@ -582,9 +586,9 @@ pub fn tag_open_attribute_value_quoted( pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => { - tag_open_between(tokenizer, code) - } + Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code), Code::Char(_) => { tokenizer.consume(code); (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None) @@ -603,9 +607,9 @@ pub fn tag_open_attribute_value_quoted_after( code: Code, ) -> StateFnResult { match code { - Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>' | '/') => { - tag_open_between(tokenizer, code) - } + Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code), _ => (State::Nok, None), } } diff --git a/src/content/text.rs b/src/content/text.rs index 73c2d55..433d030 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -52,12 +52,19 @@ pub fn 
start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |qwe /// ``` fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - if let Code::None = code { - (State::Ok, None) - } else { - tokenizer.enter(TokenType::Data); - tokenizer.consume(code); - (State::Fn(Box::new(in_data)), None) + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(start)), None) + } + _ => { + tokenizer.enter(TokenType::Data); + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } } } @@ -73,7 +80,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { (State::Ok, None) } // To do: somehow get these markers from constructs. - Code::Char('&' | '\\' | '<') => { + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '\\' | '<') => { tokenizer.exit(TokenType::Data); start(tokenizer, code) } diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 35d7672..71a84e1 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -82,12 +82,12 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { let mut subindex = 0; // Index into subevents that starts the current slice. let mut last_start = 0; - // Counter into `ends`. + // Counter into `ends`: the linked token we are at. let mut end_index = 0; let mut index_opt: Option<usize> = Some(index); while subindex < tokenizer.events.len() { - let subevent = &tokenizer.events[subindex]; + let subevent = &mut tokenizer.events[subindex]; // Find the first event that starts after the end we’re looking // for. @@ -101,11 +101,26 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { index_opt = events[link].next; } + // If there is a `next` link in the subevents, we have to change + // its index to account for the shifted events. 
+ // If it points to a next event, we also change the next event’s + // reference back to *this* event. + if let Some(next) = subevent.next { + // The `index` in `events` where the current link is, + // minus 2 events (the enter and exit) for each removed + // link. + let shift = index_opt.unwrap() - (end_index * 2); + + subevent.next = Some(next + shift); + let next_ev = &mut tokenizer.events[next]; + let previous = next_ev.previous.unwrap(); + next_ev.previous = Some(previous + shift); + } + subindex += 1; } - let link = index_opt.unwrap(); - link_to_info.insert(link, (index, last_start, subindex)); + link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex)); head_to_tokenizer.insert(index, tokenizer); } @@ -119,11 +134,6 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { // from each slice and slices from events? let mut index = events.len() - 1; - // To do: this is broken, because it can inject linked events, which point - // to their links through indices, and this messes with all indices. - // We should try walking front to end instead, keep a count of the shifted - // index. - // It’s a bit complex but should work? while index > 0 { let slice_opt = link_to_info.get(&index); diff --git a/tests/html_flow.rs b/tests/html_flow.rs index 49a6ea8..6445af3 100644 --- a/tests/html_flow.rs +++ b/tests/html_flow.rs @@ -116,12 +116,11 @@ p {color:blue;} "should support an eof directly after a raw tag name" ); - // To do: line endings in html text. 
- // assert_eq!( - // micromark_with_options("</script\nmore", DANGER), - // "<p></script\nmore</p>", - // "should not support a raw closing tag" - // ); + assert_eq!( + micromark_with_options("</script\nmore", DANGER), + "<p></script\nmore</p>", + "should not support a raw closing tag" + ); assert_eq!( micromark_with_options("<script/", DANGER), diff --git a/tests/html_text.rs b/tests/html_text.rs index 6ec387b..1f85ac4 100644 --- a/tests/html_text.rs +++ b/tests/html_text.rs @@ -26,22 +26,20 @@ fn html_text() { "should support self-closing tags" ); - // To do: line endings. - // assert_eq!( - // micromark_with_options("<a /><b2\ndata=\"foo\" >", DANGER), - // "<p><a /><b2\ndata=\"foo\" ></p>", - // "should support whitespace in tags" - // ); + assert_eq!( + micromark_with_options("<a /><b2\ndata=\"foo\" >", DANGER), + "<p><a /><b2\ndata=\"foo\" ></p>", + "should support whitespace in tags" + ); - // To do: line endings. - // assert_eq!( - // micromark_with_options( - // "<a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 />", - // DANGER - // ), - // "<p><a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 /></p>", - // "should support attributes on tags" - // ); + assert_eq!( + micromark_with_options( + "<a foo=\"bar\" bam = 'baz <em>\"</em>'\n_boolean zoop:33=zoop:33 />", + DANGER + ), + "<p><a foo=\"bar\" bam = 'baz <em>\"</em>'\n_boolean zoop:33=zoop:33 /></p>", + "should support attributes on tags" + ); assert_eq!( micromark_with_options("Foo <responsive-image src=\"foo.jpg\" />", DANGER), @@ -67,12 +65,11 @@ fn html_text() { "should not support nonconforming attribute values" ); - // To do: line endings. 
- // assert_eq!( - // micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER), - // "<p>< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop /></p>", - // "should not support nonconforming whitespace" - // ); + assert_eq!( + micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER), + "<p>< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop /></p>", + "should not support nonconforming whitespace" + ); assert_eq!( micromark_with_options("<a href='bar'title=title>", DANGER), @@ -92,12 +89,11 @@ fn html_text() { "should not support closing tags w/ attributes" ); - // To do: line endings. - // assert_eq!( - // micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER), - // "<p>foo <!-- this is a\ncomment - with hyphen --></p>", - // "should support comments" - // ); + assert_eq!( + micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER), + "<p>foo <!-- this is a\ncomment - with hyphen --></p>", + "should support comments" + ); assert_eq!( micromark_with_options("foo <!-- not a comment -- two hyphens -->", DANGER), @@ -384,12 +380,11 @@ fn html_text() { "should not support eof in unquoted attribute value" ); - // To do: line endings. - // assert_eq!( - // micromark_with_options("foo <a b=\nasd>", DANGER), - // "<p>foo <a b=\nasd></p>", - // "should support an eol before an attribute value" - // ); + assert_eq!( + micromark_with_options("foo <a b=\nasd>", DANGER), + "<p>foo <a b=\nasd></p>", + "should support an eol before an attribute value" + ); assert_eq!( micromark_with_options("<x> a", DANGER), @@ -403,32 +398,29 @@ micromark_with_options("<x> a", DANGER), "should support an EOF before an attribute value" ); - // To do: line endings. - // assert_eq!( - // micromark_with_options("a <!b\nc>", DANGER), - // "<p>a <!b\nc></p>", - // "should support an EOL in a declaration" - // ); - // To do: line endings. 
- // assert_eq!( - // micromark_with_options("a <![CDATA[\n]]>", DANGER), - // "<p>a <![CDATA[\n]]></p>", - // "should support an EOL in cdata" - // ); + assert_eq!( + micromark_with_options("a <!b\nc>", DANGER), + "<p>a <!b\nc></p>", + "should support an EOL in a declaration" + ); + assert_eq!( + micromark_with_options("a <![CDATA[\n]]>", DANGER), + "<p>a <![CDATA[\n]]></p>", + "should support an EOL in cdata" + ); - // To do: line endings. - // // Note: cmjs parses this differently. - // // See: <https://github.com/commonmark/commonmark.js/issues/196> + // Note: cmjs parses this differently. + // See: <https://github.com/commonmark/commonmark.js/issues/196> + assert_eq!( + micromark_with_options("a <?\n?>", DANGER), + "<p>a <?\n?></p>", + "should support an EOL in an instruction" + ); + + // To do: extensions. // assert_eq!( - // micromark_with_options("a <?\n?>", DANGER), - // "<p>a <?\n?></p>", - // "should support an EOL in an instruction" + // micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}), + // "<p>a <x></p>", + // "should support turning off html (text)" // ); - - // // To do: extensions. - // // assert_eq!( - // // micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}), - // // "<p>a <x></p>", - // // "should support turning off html (text)" - // // ); } |