diff options
| author | 2022-06-14 18:57:28 +0200 | |
|---|---|---|
| committer | 2022-06-14 18:57:28 +0200 | |
| commit | b00fafbdcba39e7e17144b07834702629b891062 (patch) | |
| tree | 3351cc3ad2bb126d8a93e1ff6b1731bc00cb45c3 | |
| parent | 129ea34b18aaf7f5a01d404effbdc78cbbe67a74 (diff) | |
| download | markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.gz markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.tar.bz2 markdown-rs-b00fafbdcba39e7e17144b07834702629b891062.zip | |
Fix support for deep subtokenization
*   Fix a couple of forgotten cases of line ending handling in html (text)
*   Fix missing initial case for html (text) not having a `<` 😬
*   Add line ending handling to `text` construct
| -rw-r--r-- | src/construct/html_text.rs | 24 | ||||
| -rw-r--r-- | src/content/text.rs | 21 | ||||
| -rw-r--r-- | src/subtokenize.rs | 28 | ||||
| -rw-r--r-- | tests/html_flow.rs | 11 | ||||
| -rw-r--r-- | tests/html_text.rs | 108 | 
5 files changed, 102 insertions(+), 90 deletions(-)
| diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 95fb8c3..c118006 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -58,10 +58,14 @@ use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer  /// a |<x> b  /// ```  pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { -    tokenizer.enter(TokenType::HtmlText); -    tokenizer.enter(TokenType::HtmlTextData); -    tokenizer.consume(code); -    (State::Fn(Box::new(open)), None) +    if Code::Char('<') == code { +        tokenizer.enter(TokenType::HtmlText); +        tokenizer.enter(TokenType::HtmlTextData); +        tokenizer.consume(code); +        (State::Fn(Box::new(open)), None) +    } else { +        (State::Nok, None) +    }  }  /// After `<`, before a tag name or other stuff. @@ -582,9 +586,9 @@ pub fn tag_open_attribute_value_quoted(  pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code {          Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => { -            tag_open_between(tokenizer, code) -        } +        Code::CarriageReturnLineFeed +        | Code::VirtualSpace +        | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),          Code::Char(_) => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None) @@ -603,9 +607,9 @@ pub fn tag_open_attribute_value_quoted_after(      code: Code,  ) -> StateFnResult {      match code { -        Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>' | '/') => { -            tag_open_between(tokenizer, code) -        } +        Code::CarriageReturnLineFeed +        | Code::VirtualSpace +        | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => 
tag_open_between(tokenizer, code),          _ => (State::Nok, None),      }  } diff --git a/src/content/text.rs b/src/content/text.rs index 73c2d55..433d030 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -52,12 +52,19 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// |qwe  /// ```  fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { -    if let Code::None = code { -        (State::Ok, None) -    } else { -        tokenizer.enter(TokenType::Data); -        tokenizer.consume(code); -        (State::Fn(Box::new(in_data)), None) +    match code { +        Code::None => (State::Ok, None), +        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +            tokenizer.enter(TokenType::LineEnding); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::LineEnding); +            (State::Fn(Box::new(start)), None) +        } +        _ => { +            tokenizer.enter(TokenType::Data); +            tokenizer.consume(code); +            (State::Fn(Box::new(in_data)), None) +        }      }  } @@ -73,7 +80,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {              (State::Ok, None)          }          // To do: somehow get these markers from constructs. -        Code::Char('&' | '\\' | '<') => { +        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '\\' | '<') => {              tokenizer.exit(TokenType::Data);              start(tokenizer, code)          } diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 35d7672..71a84e1 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -82,12 +82,12 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {              let mut subindex = 0;              // Index into subevents that starts the current slice.              let mut last_start = 0; -            // Counter into `ends`. +            // Counter into `ends`: the linked token we are at.             
 let mut end_index = 0;              let mut index_opt: Option<usize> = Some(index);              while subindex < tokenizer.events.len() { -                let subevent = &tokenizer.events[subindex]; +                let subevent = &mut tokenizer.events[subindex];                  // Find the first event that starts after the end we’re looking                  // for. @@ -101,11 +101,26 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {                      index_opt = events[link].next;                  } +                // If there is a `next` link in the subevents, we have to change +                // its index to account for the shifted events. +                // If it points to a next event, we also change the next event’s +                // reference back to *this* event. +                if let Some(next) = subevent.next { +                    // The `index` in `events` where the current link is, +                    // minus 2 events (the enter and exit) for each removed +                    // link. +                    let shift = index_opt.unwrap() - (end_index * 2); + +                    subevent.next = Some(next + shift); +                    let next_ev = &mut tokenizer.events[next]; +                    let previous = next_ev.previous.unwrap(); +                    next_ev.previous = Some(previous + shift); +                } +                  subindex += 1;              } -            let link = index_opt.unwrap(); -            link_to_info.insert(link, (index, last_start, subindex)); +            link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex));              head_to_tokenizer.insert(index, tokenizer);          } @@ -119,11 +134,6 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {      // from each slice and slices from events?      
let mut index = events.len() - 1; -    // To do: this is broken, because it can inject linked events, which point -    // to their links through indices, and this messes with all indices. -    // We should try walking front to end instead, keep a count of the shifted -    // index. -    // It’s a bit complex but should work?      while index > 0 {          let slice_opt = link_to_info.get(&index); diff --git a/tests/html_flow.rs b/tests/html_flow.rs index 49a6ea8..6445af3 100644 --- a/tests/html_flow.rs +++ b/tests/html_flow.rs @@ -116,12 +116,11 @@ p {color:blue;}          "should support an eof directly after a raw tag name"      ); -    // To do: line endings in html text. -    // assert_eq!( -    //     micromark_with_options("</script\nmore", DANGER), -    //     "<p></script\nmore</p>", -    //     "should not support a raw closing tag" -    // ); +    assert_eq!( +        micromark_with_options("</script\nmore", DANGER), +        "<p></script\nmore</p>", +        "should not support a raw closing tag" +    );      assert_eq!(          micromark_with_options("<script/", DANGER), diff --git a/tests/html_text.rs b/tests/html_text.rs index 6ec387b..1f85ac4 100644 --- a/tests/html_text.rs +++ b/tests/html_text.rs @@ -26,22 +26,20 @@ fn html_text() {          "should support self-closing tags"      ); -    // To do: line endings. -    // assert_eq!( -    //     micromark_with_options("<a  /><b2\ndata=\"foo\" >", DANGER), -    //     "<p><a  /><b2\ndata=\"foo\" ></p>", -    //     "should support whitespace in tags" -    // ); +    assert_eq!( +        micromark_with_options("<a  /><b2\ndata=\"foo\" >", DANGER), +        "<p><a  /><b2\ndata=\"foo\" ></p>", +        "should support whitespace in tags" +    ); -    // To do: line endings. 
-    // assert_eq!( -    //     micromark_with_options( -    //         "<a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 />", -    //         DANGER -    //     ), -    //     "<p><a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 /></p>", -    //     "should support attributes on tags" -    // ); +    assert_eq!( +        micromark_with_options( +            "<a foo=\"bar\" bam = 'baz <em>\"</em>'\n_boolean zoop:33=zoop:33 />", +            DANGER +        ), +        "<p><a foo=\"bar\" bam = 'baz <em>\"</em>'\n_boolean zoop:33=zoop:33 /></p>", +        "should support attributes on tags" +    );      assert_eq!(          micromark_with_options("Foo <responsive-image src=\"foo.jpg\" />", DANGER), @@ -67,12 +65,11 @@ fn html_text() {          "should not support nonconforming attribute values"      ); -    // To do: line endings. -    // assert_eq!( -    //     micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER), -    //     "<p>< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop /></p>", -    //     "should not support nonconforming whitespace" -    // ); +    assert_eq!( +        micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER), +        "<p>< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop /></p>", +        "should not support nonconforming whitespace" +    );      assert_eq!(          micromark_with_options("<a href='bar'title=title>", DANGER), @@ -92,12 +89,11 @@ fn html_text() {          "should not support closing tags w/ attributes"      ); -    // To do: line endings. 
-    //     assert_eq!( -    //         micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER), -    //         "<p>foo <!-- this is a\ncomment - with hyphen --></p>", -    //         "should support comments" -    //     ); +    assert_eq!( +        micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER), +        "<p>foo <!-- this is a\ncomment - with hyphen --></p>", +        "should support comments" +    );      assert_eq!(          micromark_with_options("foo <!-- not a comment -- two hyphens -->", DANGER), @@ -384,12 +380,11 @@ fn html_text() {          "should not support eof in unquoted attribute value"      ); -    // To do: line endings. -    // assert_eq!( -    //     micromark_with_options("foo <a b=\nasd>", DANGER), -    //     "<p>foo <a b=\nasd></p>", -    //     "should support an eol before an attribute value" -    // ); +    assert_eq!( +        micromark_with_options("foo <a b=\nasd>", DANGER), +        "<p>foo <a b=\nasd></p>", +        "should support an eol before an attribute value" +    );      assert_eq!(  micromark_with_options("<x> a", DANGER), @@ -403,32 +398,29 @@ micromark_with_options("<x> a", DANGER),          "should support an EOF before an attribute value"      ); -    // To do: line endings. -    // assert_eq!( -    //     micromark_with_options("a <!b\nc>", DANGER), -    //     "<p>a <!b\nc></p>", -    //     "should support an EOL in a declaration" -    // ); -    // To do: line endings. 
-    // assert_eq!( -    //     micromark_with_options("a <![CDATA[\n]]>", DANGER), -    //     "<p>a <![CDATA[\n]]></p>", -    //     "should support an EOL in cdata" -    // ); +    assert_eq!( +        micromark_with_options("a <!b\nc>", DANGER), +        "<p>a <!b\nc></p>", +        "should support an EOL in a declaration" +    ); +    assert_eq!( +        micromark_with_options("a <![CDATA[\n]]>", DANGER), +        "<p>a <![CDATA[\n]]></p>", +        "should support an EOL in cdata" +    ); -    // To do: line endings. -    // // Note: cmjs parses this differently. -    // // See: <https://github.com/commonmark/commonmark.js/issues/196> +    // Note: cmjs parses this differently. +    // See: <https://github.com/commonmark/commonmark.js/issues/196> +    assert_eq!( +        micromark_with_options("a <?\n?>", DANGER), +        "<p>a <?\n?></p>", +        "should support an EOL in an instruction" +    ); + +    // To do: extensions.      // assert_eq!( -    //     micromark_with_options("a <?\n?>", DANGER), -    //     "<p>a <?\n?></p>", -    //     "should support an EOL in an instruction" +    //     micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}), +    //     "<p>a <x></p>", +    //     "should support turning off html (text)"      // ); - -    //     // To do: extensions. -    //     // assert_eq!( -    //     //     micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}), -    //     //     "<p>a <x></p>", -    //     //     "should support turning off html (text)" -    //     // );  } | 
