diff options
-rw-r--r-- | Untitled.txt | 3 | ||||
-rw-r--r-- | src/compiler.rs | 10 | ||||
-rw-r--r-- | src/construct/html_text.rs | 480 | ||||
-rw-r--r-- | src/construct/mod.rs | 1 | ||||
-rw-r--r-- | src/content/content.rs | 17 | ||||
-rw-r--r-- | src/content/text.rs | 14 | ||||
-rw-r--r-- | src/subtokenize.rs | 12 | ||||
-rw-r--r-- | src/tokenizer.rs | 34 | ||||
-rw-r--r-- | tests/html_flow.rs | 11 | ||||
-rw-r--r-- | tests/html_text.rs | 434 |
10 files changed, 995 insertions, 21 deletions
diff --git a/Untitled.txt b/Untitled.txt index cc1576f..e796b86 100644 --- a/Untitled.txt +++ b/Untitled.txt @@ -1 +1,4 @@ micromark.js: unquoted: is `completeAttributeValueUnquoted`s case for `completeAttributeNameAfter` missing a `/`?. I’ve added it here. +micromark.js: `]` case in cdata_end does not need to consume, it can defer to `cdata_close`, which should save 1 line +micromark.js: should `tagOpenAttributeValueUnquoted` also support a slash? +micromark.js: `atLineEnding` in html (text) should always eat arbitrary whitespace? code (indented) has no effect on html (text)? diff --git a/src/compiler.rs b/src/compiler.rs index c451887..619bbe5 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -78,6 +78,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St ignore_encode = true; } } + TokenType::HtmlText => { + if options.allow_dangerous_html { + ignore_encode = true; + } + } TokenType::Content | TokenType::AtxHeading | TokenType::AtxHeadingSequence @@ -93,6 +98,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::BlankLineWhitespace | TokenType::Whitespace | TokenType::HtmlFlowData + | TokenType::HtmlTextData | TokenType::CodeFencedFence | TokenType::CodeFencedFenceSequence | TokenType::CodeFencedFenceWhitespace @@ -131,10 +137,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::CharacterReferenceMarkerSemi | TokenType::Autolink | TokenType::AutolinkMarker => {} - TokenType::HtmlFlow => { + TokenType::HtmlFlow | TokenType::HtmlText => { ignore_encode = false; } - TokenType::HtmlFlowData => { + TokenType::HtmlFlowData | TokenType::HtmlTextData => { let slice = slice_serialize(codes, &get_span(events, index), false); let res = if ignore_encode { slice } else { encode(&slice) }; diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs new file mode 100644 index 0000000..da5a018 --- /dev/null +++ b/src/construct/html_text.rs @@ 
-0,0 +1,480 @@
+//! HTML (text) is a construct that occurs in the text content type:
+//! opening and closing tags, comments, instructions, declarations, and CDATA.
+//!
+//! Every function here is one state of the state machine.
+//! A state looks at a single `Code` and either consumes it and returns the
+//! next state, hands the code to another state, or fails with `State::Nok`.
+
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer};
+
+/// Start of HTML (text).
+///
+/// Only a less than sign (`<`) can open HTML (text).
+/// Any other code fails right away, without entering tokens or consuming
+/// anything, so that the caller can try something else.
+///
+/// ```markdown
+/// a |<x> b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('<') => {
+            tokenizer.enter(TokenType::HtmlText);
+            tokenizer.enter(TokenType::HtmlTextData);
+            tokenizer.consume(code);
+            (State::Fn(Box::new(open)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<`, at the code that decides what kind of HTML this is:
+/// `!` (comment, CDATA, or declaration), `/` (closing tag), `?`
+/// (instruction), or an ASCII letter (opening tag).
+///
+/// ```markdown
+/// a <|x> b
+/// a <|!doctype> b
+/// a <|/x> b
+/// a <|?php?> b
+/// ```
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('!') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(declaration_open)), None)
+        }
+        Code::Char('/') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_close_start)), None)
+        }
+        Code::Char('?') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(instruction)), None)
+        }
+        Code::Char(ch) if ch.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!`, at the start of a comment (`-`), CDATA (`[`), or a
+/// declaration (an ASCII letter).
+///
+/// ```markdown
+/// a <!|--xxx--> b
+/// a <!|[CDATA[>&<]]> b
+/// a <!|doctype> b
+/// ```
+pub fn declaration_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(comment_open)), None)
+        }
+        Code::Char('[') => {
+            tokenizer.consume(code);
+            // The characters that must follow `<![`, matched one by one (and
+            // case-sensitively) by `cdata_open`.
+            let buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    cdata_open(tokenizer, code, buffer, 0)
+                })),
+                None,
+            )
+        }
+        Code::Char(ch) if ch.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(declaration)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!-`, where only a second dash continues the comment opening.
+///
+/// ```markdown
+/// a <!-|-xxx--> b
+/// ```
+pub fn comment_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(comment_start)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!--`, at the first code of the comment.
+///
+/// A comment cannot start with `>` or `->`, and cannot end at an eof.
+///
+/// ```markdown
+/// a <!--|xxx--> b
+/// ```
+pub fn comment_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None | Code::Char('>')) {
+        return (State::Nok, None);
+    }
+
+    if matches!(code, Code::Char('-')) {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(comment_start_dash)), None);
+    }
+
+    comment(tokenizer, code)
+}
+
+/// After `<!---`, a dash directly at the start of a comment.
+///
+/// An eof or `>` fails here (a comment cannot start with `->`); every other
+/// code is handled as regular comment content.
+///
+/// ```markdown
+/// a <!---|xxx--> b
+/// ```
+pub fn comment_start_dash(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None | Code::Char('>')) {
+        (State::Nok, None)
+    } else {
+        comment(tokenizer, code)
+    }
+}
+
+/// In a comment.
+///
+/// An eof fails, a line ending is handed to `at_line_ending`, a dash might
+/// start the closing sequence, and anything else is consumed as content.
+///
+/// ```markdown
+/// a <!--|xxx--> b
+/// ```
+pub fn comment(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None) {
+        return (State::Nok, None);
+    }
+
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(comment));
+    }
+
+    // Both remaining cases consume the code; they only differ in what is
+    // expected next.
+    let next: Box<StateFn> = if matches!(code, Code::Char('-')) {
+        Box::new(comment_close)
+    } else {
+        Box::new(comment)
+    };
+    tokenizer.consume(code);
+    (State::Fn(next), None)
+}
+
+/// In a comment, after a dash.
+///
+/// A second dash means the comment has to end now (only `>` is accepted by
+/// `end`); anything else is regular comment content again.
+///
+/// ```markdown
+/// a <!--xxx-|-> b
+/// ```
+pub fn comment_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char('-') = code {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(end)), None)
+    } else {
+        comment(tokenizer, code)
+    }
+}
+
+/// After `<![`, matching the codes in `buffer` one at a time.
+///
+/// `index` is the position in `buffer` that `code` has to equal.
+/// Once the last character of `buffer` is consumed, CDATA content starts; a
+/// mismatch fails.
+///
+/// ```markdown
+/// a <![|CDATA[>&<]]> b
+/// ```
+pub fn cdata_open(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    buffer: Vec<char>,
+    index: usize,
+) -> StateFnResult {
+    let is_expected = matches!(code, Code::Char(ch) if ch == buffer[index]);
+
+    if !is_expected {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+    let next_index = index + 1;
+
+    if next_index == buffer.len() {
+        return (State::Fn(Box::new(cdata)), None);
+    }
+
+    (
+        State::Fn(Box::new(move |tokenizer, code| {
+            cdata_open(tokenizer, code, buffer, next_index)
+        })),
+        None,
+    )
+}
+
+/// In CDATA, after `<![CDATA[`.
+///
+/// An eof fails, a line ending is handed to `at_line_ending`, `]` might start
+/// the closing sequence, and anything else is consumed as content.
+///
+/// ```markdown
+/// a <![CDATA[|>&<]]> b
+/// ```
+pub fn cdata(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None) {
+        return (State::Nok, None);
+    }
+
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(cdata));
+    }
+
+    // A `]` might start the closing `]]>`; anything else is just content.
+    let next: Box<StateFn> = if matches!(code, Code::Char(']')) {
+        Box::new(cdata_close)
+    } else {
+        Box::new(cdata)
+    };
+    tokenizer.consume(code);
+    (State::Fn(next), None)
+}
+
+/// In CDATA, after one `]`.
+///
+/// A second `]` is consumed and moves on to `cdata_end`; anything else is
+/// regular CDATA content again.
+///
+/// ```markdown
+/// a <![CDATA[>&<]|]> b
+/// ```
+pub fn cdata_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char(']') = code {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(cdata_end)), None)
+    } else {
+        cdata(tokenizer, code)
+    }
+}
+
+/// In CDATA, after `]]`.
+///
+/// Nothing is consumed here: `>` is handed to `end`, another `]` back to
+/// `cdata_close`, and everything else to `cdata`.
+///
+/// ```markdown
+/// a <![CDATA[>&<]]|> b
+/// ```
+pub fn cdata_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let handler: fn(&mut Tokenizer, Code) -> StateFnResult = match code {
+        Code::Char('>') => end,
+        Code::Char(']') => cdata_close,
+        _ => cdata,
+    };
+
+    handler(tokenizer, code)
+}
+
+/// In a declaration, after `<!` and an ASCII letter.
+///
+/// An eof or `>` is handed to `end` (which fails on the eof), a line ending
+/// to `at_line_ending`, and anything else is consumed as content.
+///
+/// ```markdown
+/// a <!a|b> b
+/// ```
+pub fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None | Code::Char('>')) {
+        return end(tokenizer, code);
+    }
+
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(declaration));
+    }
+
+    tokenizer.consume(code);
+    (State::Fn(Box::new(declaration)), None)
+}
+
+/// In an instruction, after `<?`.
+///
+/// An eof fails, a line ending is handed to `at_line_ending`, `?` might start
+/// the closing sequence, and anything else is consumed as content.
+///
+/// ```markdown
+/// a <?|php?> b
+/// ```
+pub fn instruction(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None) {
+        return (State::Nok, None);
+    }
+
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(instruction));
+    }
+
+    let next: Box<StateFn> = if matches!(code, Code::Char('?')) {
+        Box::new(instruction_close)
+    } else {
+        Box::new(instruction)
+    };
+    tokenizer.consume(code);
+    (State::Fn(next), None)
+}
+
+/// In an instruction, after `?`.
+///
+/// `>` is handed to `end`; anything else is regular instruction content.
+///
+/// ```markdown
+/// a <?php?|> b
+/// ```
+pub fn instruction_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char('>') = code {
+        end(tokenizer, code)
+    } else {
+        instruction(tokenizer, code)
+    }
+}
+
+/// After `</`, at the first code of a closing tag name, which has to be an
+/// ASCII letter.
+///
+/// ```markdown
+/// a </|x> b
+/// ```
+pub fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let starts_name = matches!(code, Code::Char(ch) if ch.is_ascii_alphabetic());
+
+    if starts_name {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(tag_close)), None)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// In a closing tag name, after `</` and an ASCII letter.
+///
+/// Dashes and ASCII alphanumerics continue the name; anything else is handed
+/// to `tag_close_between`.
+///
+/// ```markdown
+/// a </x|> b
+/// a </x|y> b
+/// ```
+pub fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let in_name = match code {
+        Code::Char('-') => true,
+        Code::Char(ch) => ch.is_ascii_alphanumeric(),
+        _ => false,
+    };
+
+    if in_name {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(tag_close)), None)
+    } else {
+        tag_close_between(tokenizer, code)
+    }
+}
+
+/// In a closing tag, after the tag name.
+///
+/// Line endings, spaces, and tabs are allowed before the closing `>`;
+/// anything else is handed to `end`.
+///
+/// ```markdown
+/// a </x| > b
+/// a </x|> b
+/// ```
+pub fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(tag_close_between));
+    }
+
+    if matches!(code, Code::VirtualSpace | Code::Char('\t' | ' ')) {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(tag_close_between)), None);
+    }
+
+    end(tokenizer, code)
+}
+
+/// In an opening tag name, after `<` and an ASCII letter.
+///
+/// Dashes and ASCII alphanumerics continue the name.
+/// Whitespace, a line ending, `/`, or `>` is handed to `tag_open_between`;
+/// anything else fails.
+///
+/// ```markdown
+/// a <x|> b
+/// a <x|y> b
+/// ```
+pub fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let in_name = match code {
+        Code::Char('-') => true,
+        Code::Char(ch) => ch.is_ascii_alphanumeric(),
+        _ => false,
+    };
+
+    if in_name {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(tag_open)), None);
+    }
+
+    let ends_name = matches!(
+        code,
+        Code::CarriageReturnLineFeed
+            | Code::VirtualSpace
+            | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>')
+    );
+
+    if ends_name {
+        tag_open_between(tokenizer, code)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// In an opening tag, between its parts: after the tag name, an attribute
+/// name, or an attribute value.
+///
+/// Line endings, spaces, and tabs are skipped, `/` has to be followed by the
+/// closing `>`, `:`, `_`, or an ASCII letter starts an attribute name, and
+/// anything else is handed to `end`.
+///
+/// ```markdown
+/// a <x| y> b
+/// a <x |y="z"> b
+/// a <x |/> b
+/// ```
+pub fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(tag_open_between));
+    }
+
+    // Every code that is consumed here, together with the state it leads to.
+    // Codes that are not consumed are handed to `end` as they are.
+    let next: Box<StateFn> = match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => Box::new(tag_open_between),
+        Code::Char('/') => Box::new(end),
+        Code::Char(':' | '_') => Box::new(tag_open_attribute_name),
+        Code::Char(ch) if ch.is_ascii_alphabetic() => Box::new(tag_open_attribute_name),
+        _ => return end(tokenizer, code),
+    };
+
+    tokenizer.consume(code);
+    (State::Fn(next), None)
+}
+
+/// In an attribute name.
+///
+/// `-`, `.`, `:`, `_`, and ASCII alphanumerics continue the name; anything
+/// else is handed to `tag_open_attribute_name_after`.
+///
+/// ```markdown
+/// a <x :|> b
+/// a <x _|> b
+/// a <x a|> b
+/// ```
+pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let in_name = matches!(code, Code::Char('-' | '.' | ':' | '_'))
+        || matches!(code, Code::Char(ch) if ch.is_ascii_alphanumeric());
+
+    if in_name {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(tag_open_attribute_name)), None)
+    } else {
+        tag_open_attribute_name_after(tokenizer, code)
+    }
+}
+
+/// After an attribute name, before an attribute initializer, the end of the
+/// tag, or whitespace.
+///
+/// Line endings, spaces, and tabs are skipped, `=` starts an initializer, and
+/// anything else is handed to `tag_open_between`.
+///
+/// ```markdown
+/// a <x a|> b
+/// a <x a|=b> b
+/// a <x a|="c"> b
+/// ```
+pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(tag_open_attribute_name_after));
+    }
+
+    if matches!(code, Code::VirtualSpace | Code::Char('\t' | ' ')) {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(tag_open_attribute_name_after)), None);
+    }
+
+    if matches!(code, Code::Char('=')) {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(tag_open_attribute_value_before)), None);
+    }
+
+    tag_open_between(tokenizer, code)
+}
+
+/// After `=` in an attribute, before the attribute value.
+///
+/// Line endings, spaces, and tabs are skipped.
+/// An eof, `<`, `=`, `>`, or a grave accent cannot start a value; a quote
+/// starts a quoted value and anything else an unquoted value.
+///
+/// ```markdown
+/// a <x a=|b> b
+/// a <x a=|"c"> b
+/// ```
+pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(tag_open_attribute_value_before))
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_before)), None)
+        }
+        Code::Char(marker) if marker == '"' || marker == '\'' => {
+            tokenizer.consume(code);
+            // Remember which quote opened the value: only the same quote
+            // closes it.
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    tag_open_attribute_value_quoted(tokenizer, code, marker)
+                })),
+                None,
+            )
+        }
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
+        }
+    }
+}
+
+/// In a quoted attribute value.
+///
+/// `marker` is the quote (`"` or `'`) that opened the value.
+/// An eof fails, a line ending is handed to `at_line_ending`, `marker` closes
+/// the value, and anything else is consumed as part of the value.
+///
+/// ```markdown
+/// a <x a="|c"> b
+/// a <x a='|c'> b
+/// ```
+pub fn tag_open_attribute_value_quoted(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    marker: char,
+) -> StateFnResult {
+    match code {
+        Code::None => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => at_line_ending(
+            tokenizer,
+            code,
+            Box::new(move |tokenizer, code| {
+                tag_open_attribute_value_quoted(tokenizer, code, marker)
+            }),
+        ),
+        Code::Char(ch) if ch == marker => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(tag_open_attribute_value_quoted_after)),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    tag_open_attribute_value_quoted(tokenizer, code, marker)
+                })),
+                None,
+            )
+        }
+    }
+}
+
+/// After a quoted attribute value.
+///
+/// Whitespace, a line ending, `/`, or `>` has to follow directly; it is
+/// handed to `tag_open_between` without being consumed here.
+/// Anything else (such as the start of another attribute) fails.
+///
+/// ```markdown
+/// a <x a="c"|> b
+/// a <x a="c"| d> b
+/// ```
+pub fn tag_open_attribute_value_quoted_after(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        // `\r` and `\n` arrive as `Code::Char`, not only as
+        // `Code::CarriageReturnLineFeed`, so they are listed as well:
+        // `tag_open_between` knows how to handle all line endings.
+        Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code),
+        _ => (State::Nok, None),
+    }
+}
+
+/// In an unquoted attribute value.
+///
+/// An eof, `"`, `'`, `<`, `=`, or a grave accent fails; whitespace, a line
+/// ending, or `>` ends the value and is handed to `tag_open_between`;
+/// anything else is consumed as part of the value.
+///
+/// ```markdown
+/// a <x a=b|c> b
+/// ```
+pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),
+        // `\r` and `\n` arrive as `Code::Char`, not only as
+        // `Code::CarriageReturnLineFeed`, so they are listed as well.
+        // Without them a line ending would be swallowed into the value
+        // instead of ending it.
+        Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\r' | '\n' | '\t' | ' ' | '>') => tag_open_between(tokenizer, code),
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
+        }
+    }
+}
+
+/// At a line ending inside HTML (text).
+///
+/// Closes the current data token, emits the line ending as its own token,
+/// and continues in `after_line_ending`.
+/// `return_state` is the state to resume in once the line ending (and any
+/// whitespace after it) is dealt with.
+///
+/// Callers must only pass a line ending; any other code is a bug and panics.
+///
+/// ```markdown
+/// a <!--a|
+/// b--> b
+/// ```
+// We can’t have blank lines in content, so no need to worry about empty
+// tokens.
+pub fn at_line_ending(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            tokenizer.exit(TokenType::HtmlTextData);
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (
+                State::Fn(Box::new(|t, c| after_line_ending(t, c, return_state))),
+                None,
+            )
+        }
+        _ => unreachable!("expected line ending"),
+    }
+}
+
+/// After a line ending inside HTML (text).
+///
+/// Attempts whitespace (as a `TokenType::Whitespace` token) at the start of
+/// the new line.
+/// Whether or not that attempt succeeds, parsing continues in
+/// `after_line_ending_prefix`.
+///
+/// ```markdown
+/// a <!--a
+/// |b--> b
+/// ```
+pub fn after_line_ending(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    tokenizer.attempt(
+        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+        |_ok| Box::new(|t, c| after_line_ending_prefix(t, c, return_state)),
+    )(tokenizer, code)
+}
+
+/// After a line ending and the optional whitespace that follows it.
+///
+/// Opens a new data token (the previous one was closed in `at_line_ending`)
+/// and resumes in `return_state` with the current code.
+///
+/// ```markdown
+/// a <!--a
+/// |b--> b
+/// ```
+pub fn after_line_ending_prefix(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    tokenizer.enter(TokenType::HtmlTextData);
+    return_state(tokenizer, code)
+}
+
+/// At the closing `>` of HTML (text).
+///
+/// Consumes the `>`, closes the data and HTML (text) tokens, and succeeds;
+/// any other code fails.
+///
+/// ```markdown
+/// a <x|> b
+/// ```
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // Only a greater than sign can finish the construct.
+    if let Code::Char('>') = code {
+        tokenizer.consume(code);
+        tokenizer.exit(TokenType::HtmlTextData);
+        tokenizer.exit(TokenType::HtmlText);
+        (State::Ok, None)
+    } else {
+        (State::Nok, None)
+    }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 0bc8746..31d9f6d 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -8,5 +8,6 @@ pub mod code_fenced; pub mod code_indented; pub mod heading_atx; pub mod html_flow; +pub mod html_text; pub mod partial_whitespace; pub mod thematic_break; diff --git a/src/content/content.rs b/src/content/content.rs index 7bf692f..4660fbe 100644 --- a/src/content/content.rs +++ b/src/content/content.rs @@ -52,7 +52,7 @@ fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { _ => { tokenizer.enter(TokenType::Paragraph); tokenizer.enter(TokenType::ChunkText); - data(tokenizer, code) + data(tokenizer, code, tokenizer.events.len() - 1) } } } @@ -63,7 +63,7 @@ fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |\& /// |qwe /// ``` -fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn data(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult { match code { Code::None => { tokenizer.exit(TokenType::ChunkText); @@ -74,11 +74,20 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); tokenizer.exit(TokenType::ChunkText); tokenizer.enter(TokenType::ChunkText); - (State::Fn(Box::new(data)), None) + let next_index = tokenizer.events.len() - 1; + tokenizer.events[previous_index].next = Some(next_index); + tokenizer.events[next_index].previous = Some(previous_index); + ( + State::Fn(Box::new(move |t, c| data(t, c, next_index))), + None, + ) } _ => { tokenizer.consume(code); - (State::Fn(Box::new(data)), None) + ( + State::Fn(Box::new(move |t, c| data(t, c, previous_index))), + None, + ) } } } diff --git
a/src/content/text.rs b/src/content/text.rs index a7b40e7..3db82f5 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -7,7 +7,7 @@ //! //! * [Autolink][crate::construct::autolink] //! * Attention -//! * HTML (text) +//! * [HTML (text)][crate::construct::html_text] //! * Hard break escape //! * Code (text) //! * Line ending @@ -18,7 +18,7 @@ use crate::construct::{ autolink::start as autolink, character_escape::start as character_escape, - character_reference::start as character_reference, + character_reference::start as character_reference, html_text::start as html_text, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -34,9 +34,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), - _ => tokenizer.attempt_3(character_reference, character_escape, autolink, |ok| { - Box::new(if ok { start } else { before_data }) - })(tokenizer, code), + _ => tokenizer.attempt_4( + character_reference, + character_escape, + autolink, + html_text, + |ok| Box::new(if ok { start } else { before_data }), + )(tokenizer, code), } } diff --git a/src/subtokenize.rs b/src/subtokenize.rs index d72eb69..ee826b8 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -36,10 +36,10 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { let mut result: StateFnResult = ( State::Fn(Box::new(if event.token_type == TokenType::ChunkContent { content - } else if event.token_type == TokenType::ChunkText { - text - } else { + } else if event.token_type == TokenType::ChunkString { string + } else { + text })), None, ); @@ -49,6 +49,7 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { // Loop through chunks to pass them in order to the subtokenizer. 
while let Some(index_ptr) = index_opt { let enter = &events[index_ptr]; + assert_eq!(enter.event_type, EventType::Enter); let span = Span { start_index: enter.index, end_index: events[index_ptr + 1].index, @@ -119,6 +120,11 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { // from each slice and slices from events? let mut index = events.len() - 1; + // To do: this is broken, because it can inject linked events, which point + // to their links through indices, and this messes with all indices. + // We should try walking front to end instead, keep a count of the shifted + // index. + // It’s a bit complex but should work? while index > 0 { let slice_opt = link_to_info.get(&index); diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 4c1caa4..8a2f477 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -58,6 +58,9 @@ pub enum TokenType { HtmlFlow, HtmlFlowData, + HtmlText, + HtmlTextData, + ThematicBreak, ThematicBreakSequence, ThematicBreakWhitespace, @@ -420,7 +423,14 @@ impl Tokenizer { b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, done: impl FnOnce(bool) -> Box<StateFn> + 'static, ) -> Box<StateFn> { - self.call_multiple(false, Some(Box::new(a)), Some(Box::new(b)), None, done) + self.call_multiple( + false, + Some(Box::new(a)), + Some(Box::new(b)), + None, + None, + done, + ) } pub fn attempt_3( @@ -435,6 +445,25 @@ impl Tokenizer { Some(Box::new(a)), Some(Box::new(b)), Some(Box::new(c)), + None, + done, + ) + } + + pub fn attempt_4( + &mut self, + a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + done: impl FnOnce(bool) -> Box<StateFn> + 'static, + ) -> Box<StateFn> { + self.call_multiple( + false, + Some(Box::new(a)), + Some(Box::new(b)), + Some(Box::new(c)), + Some(Box::new(d)), done, ) } @@ 
-445,6 +474,7 @@ impl Tokenizer { a: Option<Box<StateFn>>, b: Option<Box<StateFn>>, c: Option<Box<StateFn>>, + d: Option<Box<StateFn>>, done: impl FnOnce(bool) -> Box<StateFn> + 'static, ) -> Box<StateFn> { if let Some(head) = a { @@ -453,7 +483,7 @@ impl Tokenizer { done(ok) } else { Box::new(move |tokenizer: &mut Tokenizer, code| { - tokenizer.call_multiple(check, b, c, None, done)(tokenizer, code) + tokenizer.call_multiple(check, b, c, d, None, done)(tokenizer, code) }) } }; diff --git a/tests/html_flow.rs b/tests/html_flow.rs index 6445af3..49a6ea8 100644 --- a/tests/html_flow.rs +++ b/tests/html_flow.rs @@ -116,11 +116,12 @@ p {color:blue;} "should support an eof directly after a raw tag name" ); - assert_eq!( - micromark_with_options("</script\nmore", DANGER), - "<p></script\nmore</p>", - "should not support a raw closing tag" - ); + // To do: line endings in html text. + // assert_eq!( + // micromark_with_options("</script\nmore", DANGER), + // "<p></script\nmore</p>", + // "should not support a raw closing tag" + // ); assert_eq!( micromark_with_options("<script/", DANGER), diff --git a/tests/html_text.rs b/tests/html_text.rs new file mode 100644 index 0000000..6ec387b --- /dev/null +++ b/tests/html_text.rs @@ -0,0 +1,434 @@
+extern crate micromark;
+use micromark::{micromark, micromark_with_options, CompileOptions};
+
+/// Options that let raw HTML through, so that supported HTML (text) shows up
+/// verbatim in the output while unsupported input shows up encoded.
+const DANGER: &CompileOptions = &CompileOptions {
+    allow_dangerous_html: true,
+    allow_dangerous_protocol: false,
+};
+
+/// HTML (text): input that forms valid inline HTML is passed through as is
+/// (with `DANGER`); everything else is treated as text, in which `<`, `>`,
+/// and `"` are encoded as `&lt;`, `&gt;`, and `&quot;`.
+#[test]
+fn html_text() {
+    assert_eq!(
+        micromark("a <b> c"),
+        "<p>a &lt;b&gt; c</p>",
+        "should encode dangerous html by default"
+    );
+
+    assert_eq!(
+        micromark_with_options("<a><bab><c2c>", DANGER),
+        "<p><a><bab><c2c></p>",
+        "should support opening tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("<a/><b2/>", DANGER),
+        "<p><a/><b2/></p>",
+        "should support self-closing tags"
+    );
+
+    // To do: line endings.
+    // assert_eq!(
+    //     micromark_with_options("<a /><b2\ndata=\"foo\" >", DANGER),
+    //     "<p><a /><b2\ndata=\"foo\" ></p>",
+    //     "should support whitespace in tags"
+    // );
+
+    // To do: line endings.
+    // assert_eq!(
+    //     micromark_with_options(
+    //         "<a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 />",
+    //         DANGER
+    //     ),
+    //     "<p><a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 /></p>",
+    //     "should support attributes on tags"
+    // );
+
+    assert_eq!(
+        micromark_with_options("Foo <responsive-image src=\"foo.jpg\" />", DANGER),
+        "<p>Foo <responsive-image src=\"foo.jpg\" /></p>",
+        "should support non-html tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("<33> <__>", DANGER),
+        "<p>&lt;33&gt; &lt;__&gt;</p>",
+        "should not support nonconforming tag names"
+    );
+
+    assert_eq!(
+        micromark_with_options("<a h*#ref=\"hi\">", DANGER),
+        "<p>&lt;a h*#ref=&quot;hi&quot;&gt;</p>",
+        "should not support nonconforming attribute names"
+    );
+
+    assert_eq!(
+        micromark_with_options("<a href=\"hi'> <a href=hi'>", DANGER),
+        "<p>&lt;a href=&quot;hi'&gt; &lt;a href=hi'&gt;</p>",
+        "should not support nonconforming attribute values"
+    );
+
+    // To do: line endings.
+    // assert_eq!(
+    //     micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER),
+    //     "<p>&lt; a&gt;&lt;\nfoo&gt;&lt;bar/ &gt;\n&lt;foo bar=baz\nbim!bop /&gt;</p>",
+    //     "should not support nonconforming whitespace"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<a href='bar'title=title>", DANGER),
+        "<p>&lt;a href='bar'title=title&gt;</p>",
+        "should not support missing whitespace"
+    );
+
+    assert_eq!(
+        micromark_with_options("</a></foo >", DANGER),
+        "<p></a></foo ></p>",
+        "should support closing tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("</a href=\"foo\">", DANGER),
+        "<p>&lt;/a href=&quot;foo&quot;&gt;</p>",
+        "should not support closing tags w/ attributes"
+    );
+
+    // To do: line endings.
+    // assert_eq!(
+    //     micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER),
+    //     "<p>foo <!-- this is a\ncomment - with hyphen --></p>",
+    //     "should support comments"
+    // );
+
+    assert_eq!(
+        micromark_with_options("foo <!-- not a comment -- two hyphens -->", DANGER),
+        "<p>foo &lt;!-- not a comment -- two hyphens --&gt;</p>",
+        "should not support comments w/ two dashes inside"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!--> foo -->", DANGER),
+        "<p>foo &lt;!--&gt; foo --&gt;</p>",
+        "should not support nonconforming comments (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!-- foo--->", DANGER),
+        "<p>foo &lt;!-- foo---&gt;</p>",
+        "should not support nonconforming comments (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <?php echo $a; ?>", DANGER),
+        "<p>foo <?php echo $a; ?></p>",
+        "should support instructions"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!ELEMENT br EMPTY>", DANGER),
+        "<p>foo <!ELEMENT br EMPTY></p>",
+        "should support declarations"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <![CDATA[>&<]]>", DANGER),
+        "<p>foo <![CDATA[>&<]]></p>",
+        "should support cdata"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a href=\"&ouml;\">", DANGER),
+        "<p>foo <a href=\"&ouml;\"></p>",
+        "should support (ignore) character references"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a href=\"\\*\">", DANGER),
+        "<p>foo <a href=\"\\*\"></p>",
+        "should not support character escapes (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<a href=\"\\\"\">", DANGER),
+        "<p>&lt;a href=&quot;&quot;&quot;&gt;</p>",
+        "should not support character escapes (2)"
+    );
+
+    // Extra:
+    assert_eq!(
+        micromark_with_options("foo <!1>", DANGER),
+        "<p>foo &lt;!1&gt;</p>",
+        "should not support non-comment, non-cdata, and non-named declaration"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!-not enough!-->", DANGER),
+        "<p>foo &lt;!-not enough!--&gt;</p>",
+        "should not support comments w/ not enough dashes"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!---ok-->", DANGER),
+        "<p>foo <!---ok--></p>",
+        "should support comments that start w/ a dash, if it’s not followed by a greater than"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!--->", DANGER),
+        "<p>foo &lt;!---&gt;</p>",
+        "should not support comments that start w/ `->`"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!-- -> -->", DANGER),
+        "<p>foo <!-- -> --></p>",
+        "should support `->` in a comment"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!--", DANGER),
+        "<p>foo &lt;!--</p>",
+        "should not support eof in a comment (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!--a", DANGER),
+        "<p>foo &lt;!--a</p>",
+        "should not support eof in a comment (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!--a-", DANGER),
+        "<p>foo &lt;!--a-</p>",
+        "should not support eof in a comment (3)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!--a--", DANGER),
+        "<p>foo &lt;!--a--</p>",
+        "should not support eof in a comment (4)"
+    );
+
+    // Note: cmjs parses this differently.
+    // See: <https://github.com/commonmark/commonmark.js/issues/193>
+    assert_eq!(
+        micromark_with_options("foo <![cdata[]]>", DANGER),
+        "<p>foo &lt;![cdata[]]&gt;</p>",
+        "should not support lowercase “cdata”"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <![CDATA", DANGER),
+        "<p>foo &lt;![CDATA</p>",
+        "should not support eof in a CDATA (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <![CDATA[", DANGER),
+        "<p>foo &lt;![CDATA[</p>",
+        "should not support eof in a CDATA (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <![CDATA[]", DANGER),
+        "<p>foo &lt;![CDATA[]</p>",
+        "should not support eof in a CDATA (3)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <![CDATA[]]", DANGER),
+        "<p>foo &lt;![CDATA[]]</p>",
+        "should not support eof in a CDATA (4)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <![CDATA[asd", DANGER),
+        "<p>foo &lt;![CDATA[asd</p>",
+        "should not support eof in a CDATA (5)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <![CDATA[]]]]>", DANGER),
+        "<p>foo <![CDATA[]]]]></p>",
+        "should support end-like constructs in CDATA"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <!doctype", DANGER),
+        "<p>foo &lt;!doctype</p>",
+        "should not support eof in declarations"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <?php", DANGER),
+        "<p>foo &lt;?php</p>",
+        "should not support eof in instructions (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <?php?", DANGER),
+        "<p>foo &lt;?php?</p>",
+        "should not support eof in instructions (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <???>", DANGER),
+        "<p>foo <???></p>",
+        "should support question marks in instructions"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo </3>", DANGER),
+        "<p>foo &lt;/3&gt;</p>",
+        "should not support closing tags that don’t start w/ alphas"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo </a->", DANGER),
+        "<p>foo </a-></p>",
+        "should support dashes in closing tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo </a >", DANGER),
+        "<p>foo </a ></p>",
+        "should support whitespace after closing tag names"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo </a!>", DANGER),
+        "<p>foo &lt;/a!&gt;</p>",
+        "should not support other characters after closing tag names"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a->", DANGER),
+        "<p>foo <a-></p>",
+        "should support dashes in opening tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a >", DANGER),
+        "<p>foo <a ></p>",
+        "should support whitespace after opening tag names"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a!>", DANGER),
+        "<p>foo &lt;a!&gt;</p>",
+        "should not support other characters after opening tag names"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a !>", DANGER),
+        "<p>foo &lt;a !&gt;</p>",
+        "should not support other characters in opening tags (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b!>", DANGER),
+        "<p>foo &lt;a b!&gt;</p>",
+        "should not support other characters in opening tags (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b/>", DANGER),
+        "<p>foo <a b/></p>",
+        "should support a self-closing slash after an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b>", DANGER),
+        "<p>foo <a b></p>",
+        "should support a greater than after an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b=<>", DANGER),
+        "<p>foo &lt;a b=&lt;&gt;</p>",
+        "should not support less than to start an unquoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b=>>", DANGER),
+        "<p>foo &lt;a b=&gt;&gt;</p>",
+        "should not support greater than to start an unquoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b==>", DANGER),
+        "<p>foo &lt;a b==&gt;</p>",
+        "should not support equals to to start an unquoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b=`>", DANGER),
+        "<p>foo &lt;a b=`&gt;</p>",
+        "should not support grave accent to start an unquoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b=\"asd", DANGER),
+        "<p>foo &lt;a b=&quot;asd</p>",
+        "should not support eof in double quoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b='asd", DANGER),
+        "<p>foo &lt;a b='asd</p>",
+        "should not support eof in single quoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("foo <a b=asd", DANGER),
+        "<p>foo &lt;a b=asd</p>",
+        "should not support eof in unquoted attribute value"
+    );
+
+    // To do: line endings.
+    // assert_eq!(
+    //     micromark_with_options("foo <a b=\nasd>", DANGER),
+    //     "<p>foo <a b=\nasd></p>",
+    //     "should support an eol before an attribute value"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<x> a", DANGER),
+        "<p><x> a</p>",
+        "should support starting a line w/ a tag if followed by anything other than an eol (after optional space/tabs)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<span foo=", DANGER),
+        "<p>&lt;span foo=</p>",
+        "should support an EOF before an attribute value"
+    );
+
+    // To do: line endings.
+    // assert_eq!(
+    //     micromark_with_options("a <!b\nc>", DANGER),
+    //     "<p>a <!b\nc></p>",
+    //     "should support an EOL in a declaration"
+    // );
+    // To do: line endings.
+    // assert_eq!(
+    //     micromark_with_options("a <![CDATA[\n]]>", DANGER),
+    //     "<p>a <![CDATA[\n]]></p>",
+    //     "should support an EOL in cdata"
+    // );
+
+    // To do: line endings.
+    // // Note: cmjs parses this differently.
+    // // See: <https://github.com/commonmark/commonmark.js/issues/196>
+    // assert_eq!(
+    //     micromark_with_options("a <?\n?>", DANGER),
+    //     "<p>a <?\n?></p>",
+    //     "should support an EOL in an instruction"
+    // );
+
+    // // To do: extensions.
+    // // assert_eq!(
+    // //     micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}),
+    // //     "<p>a &lt;x&gt;</p>",
+    // //     "should support turning off html (text)"
+    // // );
+} |