aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 18:42:36 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 18:42:36 +0200
commitef644f4def7d5cad3fb5307ec5e00fc7b0b025ff (patch)
tree1d284b657d2cade8e3d4e60db09750c768bbc76f
parent06b4ff3531874c95ec07b8440de526795408ef86 (diff)
downloadmarkdown-rs-ef644f4def7d5cad3fb5307ec5e00fc7b0b025ff.tar.gz
markdown-rs-ef644f4def7d5cad3fb5307ec5e00fc7b0b025ff.tar.bz2
markdown-rs-ef644f4def7d5cad3fb5307ec5e00fc7b0b025ff.zip
Add basic html (text)
* Add all states for html (text) * Fix to link paragraph tokens together * Add note about uncovered bug where linking paragraph tokens together doesn’t work 😅
-rw-r--r--Untitled.txt3
-rw-r--r--src/compiler.rs10
-rw-r--r--src/construct/html_text.rs480
-rw-r--r--src/construct/mod.rs1
-rw-r--r--src/content/content.rs17
-rw-r--r--src/content/text.rs14
-rw-r--r--src/subtokenize.rs12
-rw-r--r--src/tokenizer.rs34
-rw-r--r--tests/html_flow.rs11
-rw-r--r--tests/html_text.rs434
10 files changed, 995 insertions, 21 deletions
diff --git a/Untitled.txt b/Untitled.txt
index cc1576f..e796b86 100644
--- a/Untitled.txt
+++ b/Untitled.txt
@@ -1 +1,4 @@
micromark.js: unquoted: is `completeAttributeValueUnquoted`s case for `completeAttributeNameAfter` missing a `/`?. I’ve added it here.
+micromark.js: `]` case in cdata_end does not need to consume, it can defer to `cdata_close`, which should save 1 line
+micromark.js: should `tagOpenAttributeValueUnquoted` also support a slash?
+micromark.js: `atLineEnding` in html (text) should always eat arbitrary whitespace? code (indented) has no effect on html (text)?
diff --git a/src/compiler.rs b/src/compiler.rs
index c451887..619bbe5 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -78,6 +78,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
ignore_encode = true;
}
}
+ TokenType::HtmlText => {
+ if options.allow_dangerous_html {
+ ignore_encode = true;
+ }
+ }
TokenType::Content
| TokenType::AtxHeading
| TokenType::AtxHeadingSequence
@@ -93,6 +98,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::BlankLineWhitespace
| TokenType::Whitespace
| TokenType::HtmlFlowData
+ | TokenType::HtmlTextData
| TokenType::CodeFencedFence
| TokenType::CodeFencedFenceSequence
| TokenType::CodeFencedFenceWhitespace
@@ -131,10 +137,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::CharacterReferenceMarkerSemi
| TokenType::Autolink
| TokenType::AutolinkMarker => {}
- TokenType::HtmlFlow => {
+ TokenType::HtmlFlow | TokenType::HtmlText => {
ignore_encode = false;
}
- TokenType::HtmlFlowData => {
+ TokenType::HtmlFlowData | TokenType::HtmlTextData => {
let slice = slice_serialize(codes, &get_span(events, index), false);
let res = if ignore_encode { slice } else { encode(&slice) };
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
new file mode 100644
index 0000000..da5a018
--- /dev/null
+++ b/src/construct/html_text.rs
@@ -0,0 +1,480 @@
+//! To do.
+
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer};
+
+/// Start of HTML (text)
+///
+/// ```markdown
+/// a |<x> b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::HtmlText);
+ tokenizer.enter(TokenType::HtmlTextData);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(open)), None)
+}
+
+/// To do.
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('!') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(declaration_open)), None)
+ }
+ Code::Char('/') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_close_start)), None)
+ }
+ Code::Char('?') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(instruction)), None)
+ }
+ Code::Char(char) if char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// To do.
+pub fn declaration_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('-') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(comment_open)), None)
+ }
+ Code::Char('[') => {
+ tokenizer.consume(code);
+ let buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ cdata_open(tokenizer, code, buffer, 0)
+ })),
+ None,
+ )
+ }
+ Code::Char(char) if char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(declaration)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// To do.
+pub fn comment_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('-') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(comment_start)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// To do.
+pub fn comment_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('>') => (State::Nok, None),
+ Code::Char('-') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(comment_start_dash)), None)
+ }
+ _ => comment(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn comment_start_dash(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('>') => (State::Nok, None),
+ _ => comment(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn comment(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(comment))
+ }
+ Code::Char('-') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(comment_close)), None)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(comment)), None)
+ }
+ }
+}
+
+/// To do.
+pub fn comment_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('-') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(end)), None)
+ }
+ _ => comment(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn cdata_open(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ buffer: Vec<char>,
+ index: usize,
+) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == buffer[index] => {
+ tokenizer.consume(code);
+
+ if index + 1 == buffer.len() {
+ (State::Fn(Box::new(cdata)), None)
+ } else {
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ cdata_open(tokenizer, code, buffer, index + 1)
+ })),
+ None,
+ )
+ }
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// To do.
+pub fn cdata(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(cdata))
+ }
+ Code::Char(']') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(cdata_close)), None)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(cdata)), None)
+ }
+ }
+}
+
+/// To do.
+pub fn cdata_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(']') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(cdata_end)), None)
+ }
+ _ => cdata(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn cdata_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => end(tokenizer, code),
+ Code::Char(']') => cdata_close(tokenizer, code),
+ _ => cdata(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('>') => end(tokenizer, code),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(declaration))
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(declaration)), None)
+ }
+ }
+}
+
+/// To do.
+pub fn instruction(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(instruction))
+ }
+ Code::Char('?') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(instruction_close)), None)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(instruction)), None)
+ }
+ }
+}
+
+/// To do.
+pub fn instruction_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => end(tokenizer, code),
+ _ => instruction(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_close)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// To do.
+pub fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_close)), None)
+ }
+ _ => tag_close_between(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(tag_close_between))
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_close_between)), None)
+ }
+ _ => end(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open)), None)
+ }
+
+ Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
+ _ => (State::Nok, None),
+ }
+}
+
+/// To do.
+pub fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(tag_open_between))
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_between)), None)
+ }
+ Code::Char('/') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(end)), None)
+ }
+ Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_attribute_name)), None)
+ }
+ _ => end(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char)
+ if char == '-'
+ || char == '.'
+ || char == ':'
+ || char == '_'
+ || char.is_ascii_alphanumeric() =>
+ {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_attribute_name)), None)
+ }
+ _ => tag_open_attribute_name_after(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(tag_open_attribute_name_after))
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_attribute_name_after)), None)
+ }
+ Code::Char('=') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_attribute_value_before)), None)
+ }
+ _ => tag_open_between(tokenizer, code),
+ }
+}
+
+/// To do.
+pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ at_line_ending(tokenizer, code, Box::new(tag_open_attribute_value_before))
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_attribute_value_before)), None)
+ }
+ Code::Char(char) if char == '"' || char == '\'' => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ tag_open_attribute_value_quoted(tokenizer, code, char)
+ })),
+ None,
+ )
+ }
+ Code::Char(_) => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
+ }
+ }
+}
+
+/// To do.
+pub fn tag_open_attribute_value_quoted(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ marker: char,
+) -> StateFnResult {
+ match code {
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => at_line_ending(
+ tokenizer,
+ code,
+ Box::new(move |tokenizer, code| {
+ tag_open_attribute_value_quoted(tokenizer, code, marker)
+ }),
+ ),
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(tag_open_attribute_value_quoted_after)),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ tag_open_attribute_value_quoted(tokenizer, code, marker)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// To do.
+pub fn tag_open_attribute_value_quoted_after(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>' | '/') => {
+ tag_open_between(tokenizer, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// To do.
+pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>') => {
+ tag_open_between(tokenizer, code)
+ }
+ Code::Char(_) => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
+ }
+ }
+}
+
+/// To do.
+// We can’t have blank lines in content, so no need to worry about empty
+// tokens.
+pub fn at_line_ending(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ return_state: Box<StateFn>,
+) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.exit(TokenType::HtmlTextData);
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(|t, c| after_line_ending(t, c, return_state))),
+ None,
+ )
+ }
+ _ => unreachable!("expected line ending"),
+ }
+}
+
+pub fn after_line_ending(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ return_state: Box<StateFn>,
+) -> StateFnResult {
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(|t, c| after_line_ending_prefix(t, c, return_state)),
+ )(tokenizer, code)
+}
+
+pub fn after_line_ending_prefix(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ return_state: Box<StateFn>,
+) -> StateFnResult {
+ tokenizer.enter(TokenType::HtmlTextData);
+ return_state(tokenizer, code)
+}
+
+/// To do.
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::HtmlTextData);
+ tokenizer.exit(TokenType::HtmlText);
+ (State::Ok, None)
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 0bc8746..31d9f6d 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -8,5 +8,6 @@ pub mod code_fenced;
pub mod code_indented;
pub mod heading_atx;
pub mod html_flow;
+pub mod html_text;
pub mod partial_whitespace;
pub mod thematic_break;
diff --git a/src/content/content.rs b/src/content/content.rs
index 7bf692f..4660fbe 100644
--- a/src/content/content.rs
+++ b/src/content/content.rs
@@ -52,7 +52,7 @@ fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
_ => {
tokenizer.enter(TokenType::Paragraph);
tokenizer.enter(TokenType::ChunkText);
- data(tokenizer, code)
+ data(tokenizer, code, tokenizer.events.len() - 1)
}
}
}
@@ -63,7 +63,7 @@ fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// |\&
/// |qwe
/// ```
-fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn data(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
match code {
Code::None => {
tokenizer.exit(TokenType::ChunkText);
@@ -74,11 +74,20 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.consume(code);
tokenizer.exit(TokenType::ChunkText);
tokenizer.enter(TokenType::ChunkText);
- (State::Fn(Box::new(data)), None)
+ let next_index = tokenizer.events.len() - 1;
+ tokenizer.events[previous_index].next = Some(next_index);
+ tokenizer.events[next_index].previous = Some(previous_index);
+ (
+ State::Fn(Box::new(move |t, c| data(t, c, next_index))),
+ None,
+ )
}
_ => {
tokenizer.consume(code);
- (State::Fn(Box::new(data)), None)
+ (
+ State::Fn(Box::new(move |t, c| data(t, c, previous_index))),
+ None,
+ )
}
}
}
diff --git a/src/content/text.rs b/src/content/text.rs
index a7b40e7..3db82f5 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -7,7 +7,7 @@
//!
//! * [Autolink][crate::construct::autolink]
//! * Attention
-//! * HTML (text)
+//! * [HTML (text)][crate::construct::html-text]
//! * Hard break escape
//! * Code (text)
//! * Line ending
@@ -18,7 +18,7 @@
use crate::construct::{
autolink::start as autolink, character_escape::start as character_escape,
- character_reference::start as character_reference,
+ character_reference::start as character_reference, html_text::start as html_text,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -34,9 +34,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
- _ => tokenizer.attempt_3(character_reference, character_escape, autolink, |ok| {
- Box::new(if ok { start } else { before_data })
- })(tokenizer, code),
+ _ => tokenizer.attempt_4(
+ character_reference,
+ character_escape,
+ autolink,
+ html_text,
+ |ok| Box::new(if ok { start } else { before_data }),
+ )(tokenizer, code),
}
}
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index d72eb69..ee826b8 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -36,10 +36,10 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
let mut result: StateFnResult = (
State::Fn(Box::new(if event.token_type == TokenType::ChunkContent {
content
- } else if event.token_type == TokenType::ChunkText {
- text
- } else {
+ } else if event.token_type == TokenType::ChunkString {
string
+ } else {
+ text
})),
None,
);
@@ -49,6 +49,7 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
// Loop through chunks to pass them in order to the subtokenizer.
while let Some(index_ptr) = index_opt {
let enter = &events[index_ptr];
+ assert_eq!(enter.event_type, EventType::Enter);
let span = Span {
start_index: enter.index,
end_index: events[index_ptr + 1].index,
@@ -119,6 +120,11 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
// from each slice and slices from events?
let mut index = events.len() - 1;
+ // To do: this is broken, because it can inject linked events, which point
+ // to their links through indices, and this messes with all indices.
+ // We should try walking front to end instead, keep a count of the shifted
+ // index.
+ // It’s a bit complex but should work?
while index > 0 {
let slice_opt = link_to_info.get(&index);
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 4c1caa4..8a2f477 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -58,6 +58,9 @@ pub enum TokenType {
HtmlFlow,
HtmlFlowData,
+ HtmlText,
+ HtmlTextData,
+
ThematicBreak,
ThematicBreakSequence,
ThematicBreakWhitespace,
@@ -420,7 +423,14 @@ impl Tokenizer {
b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
done: impl FnOnce(bool) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
- self.call_multiple(false, Some(Box::new(a)), Some(Box::new(b)), None, done)
+ self.call_multiple(
+ false,
+ Some(Box::new(a)),
+ Some(Box::new(b)),
+ None,
+ None,
+ done,
+ )
}
pub fn attempt_3(
@@ -435,6 +445,25 @@ impl Tokenizer {
Some(Box::new(a)),
Some(Box::new(b)),
Some(Box::new(c)),
+ None,
+ done,
+ )
+ }
+
+ pub fn attempt_4(
+ &mut self,
+ a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+ ) -> Box<StateFn> {
+ self.call_multiple(
+ false,
+ Some(Box::new(a)),
+ Some(Box::new(b)),
+ Some(Box::new(c)),
+ Some(Box::new(d)),
done,
)
}
@@ -445,6 +474,7 @@ impl Tokenizer {
a: Option<Box<StateFn>>,
b: Option<Box<StateFn>>,
c: Option<Box<StateFn>>,
+ d: Option<Box<StateFn>>,
done: impl FnOnce(bool) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
if let Some(head) = a {
@@ -453,7 +483,7 @@ impl Tokenizer {
done(ok)
} else {
Box::new(move |tokenizer: &mut Tokenizer, code| {
- tokenizer.call_multiple(check, b, c, None, done)(tokenizer, code)
+ tokenizer.call_multiple(check, b, c, d, None, done)(tokenizer, code)
})
}
};
diff --git a/tests/html_flow.rs b/tests/html_flow.rs
index 6445af3..49a6ea8 100644
--- a/tests/html_flow.rs
+++ b/tests/html_flow.rs
@@ -116,11 +116,12 @@ p {color:blue;}
"should support an eof directly after a raw tag name"
);
- assert_eq!(
- micromark_with_options("</script\nmore", DANGER),
- "<p>&lt;/script\nmore</p>",
- "should not support a raw closing tag"
- );
+ // To do: line endings in html text.
+ // assert_eq!(
+ // micromark_with_options("</script\nmore", DANGER),
+ // "<p>&lt;/script\nmore</p>",
+ // "should not support a raw closing tag"
+ // );
assert_eq!(
micromark_with_options("<script/", DANGER),
diff --git a/tests/html_text.rs b/tests/html_text.rs
new file mode 100644
index 0000000..6ec387b
--- /dev/null
+++ b/tests/html_text.rs
@@ -0,0 +1,434 @@
+extern crate micromark;
+use micromark::{micromark, micromark_with_options, CompileOptions};
+
+const DANGER: &CompileOptions = &CompileOptions {
+ allow_dangerous_html: true,
+ allow_dangerous_protocol: false,
+};
+
+#[test]
+fn html_text() {
+ assert_eq!(
+ micromark("a <b> c"),
+ "<p>a &lt;b&gt; c</p>",
+ "should encode dangerous html by default"
+ );
+
+ assert_eq!(
+ micromark_with_options("<a><bab><c2c>", DANGER),
+ "<p><a><bab><c2c></p>",
+ "should support opening tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("<a/><b2/>", DANGER),
+ "<p><a/><b2/></p>",
+ "should support self-closing tags"
+ );
+
+ // To do: line endings.
+ // assert_eq!(
+ // micromark_with_options("<a /><b2\ndata=\"foo\" >", DANGER),
+ // "<p><a /><b2\ndata=\"foo\" ></p>",
+ // "should support whitespace in tags"
+ // );
+
+ // To do: line endings.
+ // assert_eq!(
+ // micromark_with_options(
+ // "<a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 />",
+ // DANGER
+ // ),
+ // "<p><a foo=\"bar\" bam = \"baz <em>\"</em>\"\n_boolean zoop:33=zoop:33 /></p>",
+ // "should support attributes on tags"
+ // );
+
+ assert_eq!(
+ micromark_with_options("Foo <responsive-image src=\"foo.jpg\" />", DANGER),
+ "<p>Foo <responsive-image src=\"foo.jpg\" /></p>",
+ "should support non-html tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("<33> <__>", DANGER),
+ "<p>&lt;33&gt; &lt;__&gt;</p>",
+ "should not support nonconforming tag names"
+ );
+
+ assert_eq!(
+ micromark_with_options("<a h*#ref=\"hi\">", DANGER),
+ "<p>&lt;a h*#ref=&quot;hi&quot;&gt;</p>",
+ "should not support nonconforming attribute names"
+ );
+
+ assert_eq!(
+ micromark_with_options("<a href=\"hi'> <a href=hi'>", DANGER),
+ "<p>&lt;a href=&quot;hi'&gt; &lt;a href=hi'&gt;</p>",
+ "should not support nonconforming attribute values"
+ );
+
+ // To do: line endings.
+ // assert_eq!(
+ // micromark_with_options("< a><\nfoo><bar/ >\n<foo bar=baz\nbim!bop />", DANGER),
+ // "<p>&lt; a&gt;&lt;\nfoo&gt;&lt;bar/ &gt;\n&lt;foo bar=baz\nbim!bop /&gt;</p>",
+ // "should not support nonconforming whitespace"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<a href='bar'title=title>", DANGER),
+ "<p>&lt;a href='bar'title=title&gt;</p>",
+ "should not support missing whitespace"
+ );
+
+ assert_eq!(
+ micromark_with_options("</a></foo >", DANGER),
+ "<p></a></foo ></p>",
+ "should support closing tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("</a href=\"foo\">", DANGER),
+ "<p>&lt;/a href=&quot;foo&quot;&gt;</p>",
+ "should not support closing tags w/ attributes"
+ );
+
+ // To do: line endings.
+ // assert_eq!(
+ // micromark_with_options("foo <!-- this is a\ncomment - with hyphen -->", DANGER),
+ // "<p>foo <!-- this is a\ncomment - with hyphen --></p>",
+ // "should support comments"
+ // );
+
+ assert_eq!(
+ micromark_with_options("foo <!-- not a comment -- two hyphens -->", DANGER),
+ "<p>foo &lt;!-- not a comment -- two hyphens --&gt;</p>",
+ "should not support comments w/ two dashes inside"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!--> foo -->", DANGER),
+ "<p>foo &lt;!--&gt; foo --&gt;</p>",
+ "should not support nonconforming comments (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!-- foo--->", DANGER),
+ "<p>foo &lt;!-- foo---&gt;</p>",
+ "should not support nonconforming comments (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <?php echo $a; ?>", DANGER),
+ "<p>foo <?php echo $a; ?></p>",
+ "should support instructions"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!ELEMENT br EMPTY>", DANGER),
+ "<p>foo <!ELEMENT br EMPTY></p>",
+ "should support declarations"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <![CDATA[>&<]]>", DANGER),
+ "<p>foo <![CDATA[>&<]]></p>",
+ "should support cdata"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a href=\"&ouml;\">", DANGER),
+ "<p>foo <a href=\"&ouml;\"></p>",
+ "should support (ignore) character references"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a href=\"\\*\">", DANGER),
+ "<p>foo <a href=\"\\*\"></p>",
+ "should not support character escapes (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<a href=\"\\\"\">", DANGER),
+ "<p>&lt;a href=&quot;&quot;&quot;&gt;</p>",
+ "should not support character escapes (2)"
+ );
+
+ // Extra:
+ assert_eq!(
+ micromark_with_options("foo <!1>", DANGER),
+ "<p>foo &lt;!1&gt;</p>",
+ "should not support non-comment, non-cdata, and non-named declaration"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!-not enough!-->", DANGER),
+ "<p>foo &lt;!-not enough!--&gt;</p>",
+ "should not support comments w/ not enough dashes"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!---ok-->", DANGER),
+ "<p>foo <!---ok--></p>",
+ "should support comments that start w/ a dash, if it’s not followed by a greater than"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!--->", DANGER),
+ "<p>foo &lt;!---&gt;</p>",
+ "should not support comments that start w/ `->`"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!-- -> -->", DANGER),
+ "<p>foo <!-- -> --></p>",
+ "should support `->` in a comment"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!--", DANGER),
+ "<p>foo &lt;!--</p>",
+ "should not support eof in a comment (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!--a", DANGER),
+ "<p>foo &lt;!--a</p>",
+ "should not support eof in a comment (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!--a-", DANGER),
+ "<p>foo &lt;!--a-</p>",
+ "should not support eof in a comment (3)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!--a--", DANGER),
+ "<p>foo &lt;!--a--</p>",
+ "should not support eof in a comment (4)"
+ );
+
+ // Note: cmjs parses this differently.
+ // See: <https://github.com/commonmark/commonmark.js/issues/193>
+ assert_eq!(
+ micromark_with_options("foo <![cdata[]]>", DANGER),
+ "<p>foo &lt;![cdata[]]&gt;</p>",
+ "should not support lowercase “cdata”"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <![CDATA", DANGER),
+ "<p>foo &lt;![CDATA</p>",
+ "should not support eof in a CDATA (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <![CDATA[", DANGER),
+ "<p>foo &lt;![CDATA[</p>",
+ "should not support eof in a CDATA (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <![CDATA[]", DANGER),
+ "<p>foo &lt;![CDATA[]</p>",
+ "should not support eof in a CDATA (3)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <![CDATA[]]", DANGER),
+ "<p>foo &lt;![CDATA[]]</p>",
+ "should not support eof in a CDATA (4)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <![CDATA[asd", DANGER),
+ "<p>foo &lt;![CDATA[asd</p>",
+ "should not support eof in a CDATA (5)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <![CDATA[]]]]>", DANGER),
+ "<p>foo <![CDATA[]]]]></p>",
+ "should support end-like constructs in CDATA"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <!doctype", DANGER),
+ "<p>foo &lt;!doctype</p>",
+ "should not support eof in declarations"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <?php", DANGER),
+ "<p>foo &lt;?php</p>",
+ "should not support eof in instructions (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <?php?", DANGER),
+ "<p>foo &lt;?php?</p>",
+ "should not support eof in instructions (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <???>", DANGER),
+ "<p>foo <???></p>",
+ "should support question marks in instructions"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo </3>", DANGER),
+ "<p>foo &lt;/3&gt;</p>",
+ "should not support closing tags that don’t start w/ alphas"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo </a->", DANGER),
+ "<p>foo </a-></p>",
+ "should support dashes in closing tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo </a >", DANGER),
+ "<p>foo </a ></p>",
+ "should support whitespace after closing tag names"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo </a!>", DANGER),
+ "<p>foo &lt;/a!&gt;</p>",
+ "should not support other characters after closing tag names"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a->", DANGER),
+ "<p>foo <a-></p>",
+ "should support dashes in opening tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a >", DANGER),
+ "<p>foo <a ></p>",
+ "should support whitespace after opening tag names"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a!>", DANGER),
+ "<p>foo &lt;a!&gt;</p>",
+ "should not support other characters after opening tag names"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a !>", DANGER),
+ "<p>foo &lt;a !&gt;</p>",
+ "should not support other characters in opening tags (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b!>", DANGER),
+ "<p>foo &lt;a b!&gt;</p>",
+ "should not support other characters in opening tags (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b/>", DANGER),
+ "<p>foo <a b/></p>",
+ "should support a self-closing slash after an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b>", DANGER),
+ "<p>foo <a b></p>",
+ "should support a greater than after an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b=<>", DANGER),
+ "<p>foo &lt;a b=&lt;&gt;</p>",
+ "should not support less than to start an unquoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b=>>", DANGER),
+ "<p>foo &lt;a b=&gt;&gt;</p>",
+ "should not support greater than to start an unquoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b==>", DANGER),
+ "<p>foo &lt;a b==&gt;</p>",
+ "should not support equals to to start an unquoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b=`>", DANGER),
+ "<p>foo &lt;a b=`&gt;</p>",
+ "should not support grave accent to start an unquoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b=\"asd", DANGER),
+ "<p>foo &lt;a b=&quot;asd</p>",
+ "should not support eof in double quoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b='asd", DANGER),
+ "<p>foo &lt;a b='asd</p>",
+ "should not support eof in single quoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("foo <a b=asd", DANGER),
+ "<p>foo &lt;a b=asd</p>",
+ "should not support eof in unquoted attribute value"
+ );
+
+ // To do: line endings.
+ // assert_eq!(
+ // micromark_with_options("foo <a b=\nasd>", DANGER),
+ // "<p>foo <a b=\nasd></p>",
+ // "should support an eol before an attribute value"
+ // );
+
+ assert_eq!(
+micromark_with_options("<x> a", DANGER),
+"<p><x> a</p>",
+"should support starting a line w/ a tag if followed by anything other than an eol (after optional space/tabs)"
+);
+
+ assert_eq!(
+ micromark_with_options("<span foo=", DANGER),
+ "<p>&lt;span foo=</p>",
+ "should support an EOF before an attribute value"
+ );
+
+ // To do: line endings.
+ // assert_eq!(
+ // micromark_with_options("a <!b\nc>", DANGER),
+ // "<p>a <!b\nc></p>",
+ // "should support an EOL in a declaration"
+ // );
+ // To do: line endings.
+ // assert_eq!(
+ // micromark_with_options("a <![CDATA[\n]]>", DANGER),
+ // "<p>a <![CDATA[\n]]></p>",
+ // "should support an EOL in cdata"
+ // );
+
+ // To do: line endings.
+ // // Note: cmjs parses this differently.
+ // // See: <https://github.com/commonmark/commonmark.js/issues/196>
+ // assert_eq!(
+ // micromark_with_options("a <?\n?>", DANGER),
+ // "<p>a <?\n?></p>",
+ // "should support an EOL in an instruction"
+ // );
+
+ // // To do: extensions.
+ // // assert_eq!(
+ // // micromark_with_options("a <x>", {extensions: [{disable: {null: ["htmlText"]}}]}),
+ // // "<p>a &lt;x&gt;</p>",
+ // // "should support turning off html (text)"
+ // // );
+}