Diffstat:
 src/compiler.rs            |  10
 src/construct/html_text.rs | 480
 src/construct/mod.rs       |   1
 src/content/content.rs     |  17
 src/content/text.rs        |  14
 src/subtokenize.rs         |  12
 src/tokenizer.rs           |  34
 7 files changed, 552 insertions(+), 16 deletions(-)
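For context: with this change, inline tags, comments, instructions, declarations, and CDATA sections become recognizable inside text. Assuming the compiler keeps encoding HTML by default (see the src/compiler.rs hunk below), behavior would be roughly:

```markdown
a <b>c</b> d
```

yielding `<p>a &lt;b&gt;c&lt;/b&gt; d</p>` by default, and `<p>a <b>c</b> d</p>` when `allow_dangerous_html` is set.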
diff --git a/src/compiler.rs b/src/compiler.rs
index c451887..619bbe5 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -78,6 +78,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String {
                 ignore_encode = true;
             }
         }
+        TokenType::HtmlText => {
+            if options.allow_dangerous_html {
+                ignore_encode = true;
+            }
+        }
         TokenType::Content
         | TokenType::AtxHeading
         | TokenType::AtxHeadingSequence
@@ -93,6 +98,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String {
         | TokenType::BlankLineWhitespace
         | TokenType::Whitespace
         | TokenType::HtmlFlowData
+        | TokenType::HtmlTextData
        | TokenType::CodeFencedFence
         | TokenType::CodeFencedFenceSequence
         | TokenType::CodeFencedFenceWhitespace
@@ -131,10 +137,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String {
         | TokenType::CharacterReferenceMarkerSemi
         | TokenType::Autolink
         | TokenType::AutolinkMarker => {}
-        TokenType::HtmlFlow => {
+        TokenType::HtmlFlow | TokenType::HtmlText => {
             ignore_encode = false;
         }
-        TokenType::HtmlFlowData => {
+        TokenType::HtmlFlowData | TokenType::HtmlTextData => {
             let slice = slice_serialize(codes, &get_span(events, index), false);
             let res = if ignore_encode { slice } else { encode(&slice) };
 
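A sketch of how the `allow_dangerous_html` toggle above would be exercised. Only `CompileOptions` and its `allow_dangerous_html` field appear in this diff; the `micromark_with_options` entry point and a `Default` impl for the options are assumptions:

```rust
// Hypothetical usage; the entry-point name and the `Default` impl are
// assumptions, not part of this commit.
use micromark::{micromark_with_options, CompileOptions};

fn main() {
    let options = CompileOptions {
        allow_dangerous_html: true,
        ..Default::default()
    };
    // With the flag set, `HtmlTextData` slices skip `encode` above.
    assert_eq!(
        micromark_with_options("a <i>b</i> c", &options),
        "<p>a <i>b</i> c</p>"
    );
}
```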
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
new file mode 100644
index 0000000..da5a018
--- /dev/null
+++ b/src/construct/html_text.rs
@@ -0,0 +1,480 @@
+//! HTML (text): inline tags, comments, instructions, declarations, and
+//! CDATA sections in text, such as `a <b> c`.
+
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer};
+
+/// Start of HTML (text).
+///
+/// ```markdown
+/// a |<x> b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.enter(TokenType::HtmlText);
+    tokenizer.enter(TokenType::HtmlTextData);
+    tokenizer.consume(code);
+    (State::Fn(Box::new(open)), None)
+}
+
+/// After `<`, before a tag name, `!`, `/`, or `?`.
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('!') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(declaration_open)), None)
+        }
+        Code::Char('/') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_close_start)), None)
+        }
+        Code::Char('?') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(instruction)), None)
+        }
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!`, at the start of a comment, CDATA section, or declaration.
+pub fn declaration_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(comment_open)), None)
+        }
+        Code::Char('[') => {
+            tokenizer.consume(code);
+            let buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    cdata_open(tokenizer, code, buffer, 0)
+                })),
+                None,
+            )
+        }
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(declaration)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!-`, expecting the second `-` of a comment opening.
+pub fn comment_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(comment_start)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!--`: a comment cannot start with `>`.
+pub fn comment_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('>') => (State::Nok, None),
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(comment_start_dash)), None)
+        }
+        _ => comment(tokenizer, code),
+    }
+}
+
+/// After `<!---`: a comment cannot start with `->` either.
+pub fn comment_start_dash(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('>') => (State::Nok, None),
+        _ => comment(tokenizer, code),
+    }
+}
+
+/// Inside a comment.
+pub fn comment(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(comment))
+        }
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(comment_close)), None)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(comment)), None)
+        }
+    }
+}
+
+/// Inside a comment, after `-`, expecting another `-` before `>`.
+pub fn comment_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(end)), None)
+        }
+        _ => comment(tokenizer, code),
+    }
+}
+
+/// After `<![`, consuming the rest of `CDATA[` one character at a time.
+pub fn cdata_open(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    buffer: Vec<char>,
+    index: usize,
+) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == buffer[index] => {
+            tokenizer.consume(code);
+
+            if index + 1 == buffer.len() {
+                (State::Fn(Box::new(cdata)), None)
+            } else {
+                (
+                    State::Fn(Box::new(move |tokenizer, code| {
+                        cdata_open(tokenizer, code, buffer, index + 1)
+                    })),
+                    None,
+                )
+            }
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside a CDATA section.
+pub fn cdata(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(cdata))
+        }
+        Code::Char(']') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(cdata_close)), None)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(cdata)), None)
+        }
+    }
+}
+
+/// Inside a CDATA section, after `]`.
+pub fn cdata_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(']') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(cdata_end)), None)
+        }
+        _ => cdata(tokenizer, code),
+    }
+}
+
+/// Inside a CDATA section, after `]]`.
+pub fn cdata_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => end(tokenizer, code),
+        Code::Char(']') => cdata_close(tokenizer, code),
+        _ => cdata(tokenizer, code),
+    }
+}
+
+/// Inside a declaration.
+pub fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('>') => end(tokenizer, code),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(declaration))
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(declaration)), None)
+        }
+    }
+}
+
+/// Inside an instruction.
+pub fn instruction(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(instruction))
+        }
+        Code::Char('?') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(instruction_close)), None)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(instruction)), None)
+        }
+    }
+}
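`cdata_open` above threads the expected `CDATA[` characters through boxed closures, one state per character. The same matching, reduced to a standalone sketch (names hypothetical, not code from this commit):

```rust
// Match a fixed sequence one character at a time, mirroring how
// `cdata_open` carries `buffer` and `index` from state to state.
fn match_sequence(input: &mut impl Iterator<Item = char>, expected: &[char]) -> bool {
    for &want in expected {
        match input.next() {
            Some(char) if char == want => {}
            _ => return false,
        }
    }
    true
}

fn main() {
    let expected = ['C', 'D', 'A', 'T', 'A', '['];
    assert!(match_sequence(&mut "CDATA[x".chars(), &expected));
    assert!(!match_sequence(&mut "CDAT4[x".chars(), &expected));
}
```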
+
+/// Inside an instruction, after `?`, expecting `>`.
+pub fn instruction_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => end(tokenizer, code),
+        _ => instruction(tokenizer, code),
+    }
+}
+
+/// After `</`, at the start of a closing tag name.
+pub fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_close)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside a closing tag name.
+pub fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_close)), None)
+        }
+        _ => tag_close_between(tokenizer, code),
+    }
+}
+
+/// Inside a closing tag, after the tag name.
+pub fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(tag_close_between))
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_close_between)), None)
+        }
+        _ => end(tokenizer, code),
+    }
+}
+
+/// Inside an opening tag name.
+pub fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open)), None)
+        }
+        Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside an opening tag, after the tag name or an attribute.
+pub fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(tag_open_between))
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_between)), None)
+        }
+        Code::Char('/') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(end)), None)
+        }
+        Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_name)), None)
+        }
+        _ => end(tokenizer, code),
+    }
+}
+
+/// Inside an attribute name.
+pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char)
+            if char == '-'
+                || char == '.'
+                || char == ':'
+                || char == '_'
+                || char.is_ascii_alphanumeric() =>
+        {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_name)), None)
+        }
+        _ => tag_open_attribute_name_after(tokenizer, code),
+    }
+}
+
+/// After an attribute name: before an initializer (`=`), another
+/// attribute, or the end of the tag.
+pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(tag_open_attribute_name_after))
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_name_after)), None)
+        }
+        Code::Char('=') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_before)), None)
+        }
+        _ => tag_open_between(tokenizer, code),
+    }
+}
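The attribute-name states above boil down to two character classes. As standalone predicates (a sketch, not code from this commit):

```rust
// First character of an attribute name, per `tag_open_between` above.
fn is_attribute_name_start(char: char) -> bool {
    char == ':' || char == '_' || char.is_ascii_alphabetic()
}

// Subsequent characters, per `tag_open_attribute_name` above.
fn is_attribute_name(char: char) -> bool {
    matches!(char, '-' | '.' | ':' | '_') || char.is_ascii_alphanumeric()
}

fn main() {
    assert!(is_attribute_name_start(':'));
    assert!(!is_attribute_name_start('-'));
    assert!(is_attribute_name('-'));
}
```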
+
+/// Before an attribute value.
+pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            at_line_ending(tokenizer, code, Box::new(tag_open_attribute_value_before))
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_before)), None)
+        }
+        Code::Char(char) if char == '"' || char == '\'' => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    tag_open_attribute_value_quoted(tokenizer, code, char)
+                })),
+                None,
+            )
+        }
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
+        }
+    }
+}
+
+/// Inside a quoted attribute value, where `marker` is the opening quote.
+pub fn tag_open_attribute_value_quoted(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    marker: char,
+) -> StateFnResult {
+    match code {
+        Code::None => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => at_line_ending(
+            tokenizer,
+            code,
+            Box::new(move |tokenizer, code| {
+                tag_open_attribute_value_quoted(tokenizer, code, marker)
+            }),
+        ),
+        Code::Char(char) if char == marker => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(tag_open_attribute_value_quoted_after)),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    tag_open_attribute_value_quoted(tokenizer, code, marker)
+                })),
+                None,
+            )
+        }
+    }
+}
+
+/// After a quoted attribute value: only whitespace, `/`, or `>` may follow.
+pub fn tag_open_attribute_value_quoted_after(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>' | '/') => {
+            tag_open_between(tokenizer, code)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside an unquoted attribute value.
+pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | ' ' | '>') => {
+            tag_open_between(tokenizer, code)
+        }
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
+        }
+    }
+}
+
+/// At a line ending: close the data token, consume the ending, and
+/// resume `return_state` after optional indentation.
+// We can’t have blank lines in content, so no need to worry about empty
+// tokens.
+pub fn at_line_ending(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            tokenizer.exit(TokenType::HtmlTextData);
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (
+                State::Fn(Box::new(|t, c| after_line_ending(t, c, return_state))),
+                None,
+            )
+        }
+        _ => unreachable!("expected line ending"),
+    }
+}
+
+/// Directly after a line ending: eat optional whitespace.
+pub fn after_line_ending(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    tokenizer.attempt(
+        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+        |_ok| Box::new(|t, c| after_line_ending_prefix(t, c, return_state)),
+    )(tokenizer, code)
+}
+
+/// After a line ending and optional whitespace: reopen the data token
+/// and resume.
+pub fn after_line_ending_prefix(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    tokenizer.enter(TokenType::HtmlTextData);
+    return_state(tokenizer, code)
+}
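The three line-ending states above split the data token around the ending, so markup spanning lines stays one `HtmlText` containing several `HtmlTextData` pieces. Roughly, for the input below (token names from this diff, the rendering is illustrative):

```text
a <x
y="z"> b

HtmlText
├─ HtmlTextData "<x"
├─ LineEnding   "\n"
└─ HtmlTextData "y=\"z\">"
```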
+
+/// At the `>` that finishes the markup.
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::HtmlTextData);
+            tokenizer.exit(TokenType::HtmlText);
+            (State::Ok, None)
+        }
+        _ => (State::Nok, None),
+    }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 0bc8746..31d9f6d 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -8,5 +8,6 @@ pub mod code_fenced;
 pub mod code_indented;
 pub mod heading_atx;
 pub mod html_flow;
+pub mod html_text;
 pub mod partial_whitespace;
 pub mod thematic_break;
diff --git a/src/content/content.rs b/src/content/content.rs
index 7bf692f..4660fbe 100644
--- a/src/content/content.rs
+++ b/src/content/content.rs
@@ -52,7 +52,7 @@ fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         _ => {
             tokenizer.enter(TokenType::Paragraph);
             tokenizer.enter(TokenType::ChunkText);
-            data(tokenizer, code)
+            data(tokenizer, code, tokenizer.events.len() - 1)
         }
     }
 }
@@ -63,7 +63,7 @@
 /// |\&
 /// |qwe
 /// ```
-fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn data(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
     match code {
         Code::None => {
             tokenizer.exit(TokenType::ChunkText);
@@ -74,11 +74,20 @@
             tokenizer.consume(code);
             tokenizer.exit(TokenType::ChunkText);
             tokenizer.enter(TokenType::ChunkText);
-            (State::Fn(Box::new(data)), None)
+            let next_index = tokenizer.events.len() - 1;
+            tokenizer.events[previous_index].next = Some(next_index);
+            tokenizer.events[next_index].previous = Some(previous_index);
+            (
+                State::Fn(Box::new(move |t, c| data(t, c, next_index))),
+                None,
+            )
         }
         _ => {
             tokenizer.consume(code);
-            (State::Fn(Box::new(data)), None)
+            (
+                State::Fn(Box::new(move |t, c| data(t, c, previous_index))),
+                None,
+            )
         }
     }
 }
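The src/content/content.rs change above links consecutive `ChunkText` events through `previous`/`next` indices so `subtokenize` can walk them in order. The linking itself, as a standalone sketch (the `Event` struct is simplified from the real one):

```rust
// Consecutive chunk events point at each other through indices into
// the event list, forming an intrusive doubly-linked list.
#[derive(Default)]
struct Event {
    previous: Option<usize>,
    next: Option<usize>,
}

fn link(events: &mut [Event], previous_index: usize, next_index: usize) {
    events[previous_index].next = Some(next_index);
    events[next_index].previous = Some(previous_index);
}

fn main() {
    let mut events = vec![Event::default(), Event::default(), Event::default()];
    link(&mut events, 0, 2); // chunk enter at 0 links to the next enter at 2
    assert_eq!(events[0].next, Some(2));
    assert_eq!(events[2].previous, Some(0));
}
```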
diff --git a/src/content/text.rs b/src/content/text.rs
index a7b40e7..3db82f5 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -7,7 +7,7 @@
 //!
 //! * [Autolink][crate::construct::autolink]
 //! * Attention
-//! * HTML (text)
+//! * [HTML (text)][crate::construct::html_text]
 //! * Hard break escape
 //! * Code (text)
 //! * Line ending
@@ -18,7 +18,7 @@
 
 use crate::construct::{
     autolink::start as autolink, character_escape::start as character_escape,
-    character_reference::start as character_reference,
+    character_reference::start as character_reference, html_text::start as html_text,
 };
 use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 
@@ -34,9 +34,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
-        _ => tokenizer.attempt_3(character_reference, character_escape, autolink, |ok| {
-            Box::new(if ok { start } else { before_data })
-        })(tokenizer, code),
+        _ => tokenizer.attempt_4(
+            character_reference,
+            character_escape,
+            autolink,
+            html_text,
+            |ok| Box::new(if ok { start } else { before_data }),
+        )(tokenizer, code),
     }
 }
 
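With `html_text` added to the chain, `start` above now tries four constructs in order and falls back to `before_data`. Illustrative inputs each branch would claim (assuming CommonMark semantics):

```markdown
a &amp; b        (character reference)
a \* b           (character escape)
a <https://x> b  (autolink)
a <b>c</b> d     (HTML (text))
```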
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index d72eb69..ee826b8 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -36,10 +36,10 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
             let mut result: StateFnResult = (
                 State::Fn(Box::new(if event.token_type == TokenType::ChunkContent {
                     content
-                } else if event.token_type == TokenType::ChunkText {
-                    text
-                } else {
+                } else if event.token_type == TokenType::ChunkString {
                     string
+                } else {
+                    text
                 })),
                 None,
             );
@@ -49,6 +49,7 @@
             // Loop through chunks to pass them in order to the subtokenizer.
             while let Some(index_ptr) = index_opt {
                 let enter = &events[index_ptr];
+                assert_eq!(enter.event_type, EventType::Enter);
                 let span = Span {
                     start_index: enter.index,
                     end_index: events[index_ptr + 1].index,
@@ -119,6 +120,11 @@
     // from each slice and slices from events?
     let mut index = events.len() - 1;
 
+    // To do: this is broken, because it can inject linked events, which point
+    // to their links through indices, and this messes with all indices.
+    // We should try walking front to end instead, keeping a count of the
+    // shifted indices.
+    // It’s a bit complex but should work?
     while index > 0 {
         let slice_opt = link_to_info.get(&index);
 
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 4c1caa4..8a2f477 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -58,6 +58,9 @@ pub enum TokenType {
     HtmlFlow,
     HtmlFlowData,
 
+    HtmlText,
+    HtmlTextData,
+
     ThematicBreak,
     ThematicBreakSequence,
     ThematicBreakWhitespace,
@@ -420,7 +423,14 @@
         b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
         done: impl FnOnce(bool) -> Box<StateFn> + 'static,
     ) -> Box<StateFn> {
-        self.call_multiple(false, Some(Box::new(a)), Some(Box::new(b)), None, done)
+        self.call_multiple(
+            false,
+            Some(Box::new(a)),
+            Some(Box::new(b)),
+            None,
+            None,
+            done,
+        )
     }
 
     pub fn attempt_3(
@@ -435,6 +445,25 @@
             Some(Box::new(a)),
             Some(Box::new(b)),
             Some(Box::new(c)),
+            None,
+            done,
+        )
+    }
+
+    pub fn attempt_4(
+        &mut self,
+        a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+    ) -> Box<StateFn> {
+        self.call_multiple(
+            false,
+            Some(Box::new(a)),
+            Some(Box::new(b)),
+            Some(Box::new(c)),
+            Some(Box::new(d)),
             done,
         )
     }
@@ -445,6 +474,7 @@
         a: Option<Box<StateFn>>,
         b: Option<Box<StateFn>>,
         c: Option<Box<StateFn>>,
+        d: Option<Box<StateFn>>,
         done: impl FnOnce(bool) -> Box<StateFn> + 'static,
     ) -> Box<StateFn> {
         if let Some(head) = a {
@@ -453,7 +483,7 @@
                 done(ok)
             } else {
                 Box::new(move |tokenizer: &mut Tokenizer, code| {
-                    tokenizer.call_multiple(check, b, c, None, done)(tokenizer, code)
+                    tokenizer.call_multiple(check, b, c, d, None, done)(tokenizer, code)
                 })
             }
         };
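`call_multiple` above recurses through the optional state functions, trying each until one succeeds. The control flow, reduced to a standalone sketch (hypothetical, without the tokenizer's rewind machinery):

```rust
// Conceptual sketch of the `attempt_*`/`call_multiple` fallback chain:
// try each parser in order and stop at the first success. The real
// version threads tokenizer state and rewinds input between attempts.
fn call_multiple(attempts: Vec<Box<dyn Fn(&str) -> Option<i32>>>, input: &str) -> Option<i32> {
    for attempt in attempts {
        if let Some(value) = attempt(input) {
            return Some(value);
        }
    }
    None
}

fn main() {
    let attempts: Vec<Box<dyn Fn(&str) -> Option<i32>>> = vec![
        Box::new(|s| s.parse().ok()),                          // e.g. `character_reference`
        Box::new(|s| if s == "<x>" { Some(0) } else { None }), // e.g. `html_text`
    ];
    assert_eq!(call_multiple(attempts, "<x>"), Some(0));
}
```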