aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 18:42:36 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 18:42:36 +0200
commitef644f4def7d5cad3fb5307ec5e00fc7b0b025ff (patch)
tree1d284b657d2cade8e3d4e60db09750c768bbc76f /src/construct
parent06b4ff3531874c95ec07b8440de526795408ef86 (diff)
downloadmarkdown-rs-ef644f4def7d5cad3fb5307ec5e00fc7b0b025ff.tar.gz
markdown-rs-ef644f4def7d5cad3fb5307ec5e00fc7b0b025ff.tar.bz2
markdown-rs-ef644f4def7d5cad3fb5307ec5e00fc7b0b025ff.zip
Add basic html (text)
* Add all states for html (text) * Fix to link paragraph tokens together * Add note about uncovered bug where linking paragraph tokens together doesn’t work 😅
Diffstat (limited to 'src/construct')
-rw-r--r--src/construct/html_text.rs480
-rw-r--r--src/construct/mod.rs1
2 files changed, 481 insertions, 0 deletions
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
new file mode 100644
index 0000000..da5a018
--- /dev/null
+++ b/src/construct/html_text.rs
@@ -0,0 +1,480 @@
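+//! HTML (text): tokenizer states for inline HTML (tags, comments, CDATA,
+//! declarations, and instructions). Documentation is still to be completed.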
+
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer};
+
+/// Start of HTML (text).
+///
+/// ```markdown
+/// a |<x> b
+/// ```
+///
+/// Opens an `HtmlText` token with a nested `HtmlTextData` token, consumes the
+/// current code, and hands over to [`open`].
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.enter(TokenType::HtmlText);
+    tokenizer.enter(TokenType::HtmlTextData);
+    tokenizer.consume(code);
+
+    let next = State::Fn(Box::new(open));
+    (next, None)
+}
+
+/// After the code consumed by [`start`]: decide which kind of HTML follows.
+///
+/// ```markdown
+/// a <|x> b
+/// a <|!x> b
+/// a <|/x> b
+/// a <|?x?> b
+/// ```
+///
+/// `!` continues in [`declaration_open`], `/` in [`tag_close_start`], `?` in
+/// [`instruction`], and an ASCII letter in [`tag_open`]. Any other code is
+/// rejected with `State::Nok` without being consumed.
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let next = match code {
+        Code::Char('!') => State::Fn(Box::new(declaration_open)),
+        Code::Char('/') => State::Fn(Box::new(tag_close_start)),
+        Code::Char('?') => State::Fn(Box::new(instruction)),
+        Code::Char(letter) if letter.is_ascii_alphabetic() => State::Fn(Box::new(tag_open)),
+        _ => return (State::Nok, None),
+    };
+
+    // Every accepted branch consumes the current code before moving on.
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// After `<!`: a comment, a CDATA section, or a declaration.
+///
+/// ```markdown
+/// a <!|--x--> b
+/// a <!|[CDATA[x]]> b
+/// a <!|x> b
+/// ```
+///
+/// `-` continues in [`comment_open`]; `[` continues in [`cdata_open`], which
+/// is given the characters `CDATA[` to match one by one; an ASCII letter
+/// continues in [`declaration`]. Any other code yields `State::Nok`.
+pub fn declaration_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let next = match code {
+        Code::Char('-') => State::Fn(Box::new(comment_open)),
+        Code::Char('[') => {
+            // Remaining characters that must follow `<![`.
+            let expected = vec!['C', 'D', 'A', 'T', 'A', '['];
+            State::Fn(Box::new(move |t, c| cdata_open(t, c, expected, 0)))
+        }
+        Code::Char(letter) if letter.is_ascii_alphabetic() => State::Fn(Box::new(declaration)),
+        _ => return (State::Nok, None),
+    };
+
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// After `<!-`: the second dash of a comment opening is required.
+///
+/// ```markdown
+/// a <!-|-x--> b
+/// ```
+///
+/// A `-` is consumed and the state moves to [`comment_start`]; anything else
+/// yields `State::Nok`.
+pub fn comment_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if !matches!(code, Code::Char('-')) {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+    (State::Fn(Box::new(comment_start)), None)
+}
+
+/// After `<!--`: first code of the comment content.
+///
+/// ```markdown
+/// a <!--|x--> b
+/// ```
+///
+/// End of input and `>` are rejected here. A `-` is consumed and checked by
+/// [`comment_start_dash`]; every other code is passed on to [`comment`]
+/// without being consumed by this state.
+pub fn comment_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None | Code::Char('>')) {
+        return (State::Nok, None);
+    }
+
+    if matches!(code, Code::Char('-')) {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(comment_start_dash)), None);
+    }
+
+    comment(tokenizer, code)
+}
+
+/// After `<!---`: a dash directly following the comment opening.
+///
+/// ```markdown
+/// a <!---|x--> b
+/// ```
+///
+/// End of input and `>` are rejected; any other code is handled by
+/// [`comment`].
+pub fn comment_start_dash(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None | Code::Char('>')) {
+        (State::Nok, None)
+    } else {
+        comment(tokenizer, code)
+    }
+}
+
+/// Inside a comment.
+///
+/// ```markdown
+/// a <!--|x--> b
+/// ```
+///
+/// End of input yields `State::Nok`. Line endings are delegated to
+/// [`at_line_ending`], which resumes in this state afterwards. A `-` is
+/// consumed and may start the closing sequence ([`comment_close`]); every
+/// other code is consumed and the state stays in `comment`.
+pub fn comment(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => return (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            return at_line_ending(tokenizer, code, Box::new(comment));
+        }
+        _ => {}
+    }
+
+    let next = if matches!(code, Code::Char('-')) {
+        State::Fn(Box::new(comment_close))
+    } else {
+        State::Fn(Box::new(comment))
+    };
+
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// In a comment, after one dash.
+///
+/// ```markdown
+/// a <!--x-|-> b
+/// ```
+///
+/// A second `-` is consumed and [`end`] then expects the closing `>`. Any
+/// other code is handed back to [`comment`].
+pub fn comment_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::Char('-')) {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(end)), None)
+    } else {
+        comment(tokenizer, code)
+    }
+}
+
+/// After `<![`: match the expected opening characters one at a time.
+///
+/// ```markdown
+/// a <![|CDATA[x]]> b
+/// ```
+///
+/// `buffer` holds the characters still expected and `index` is the position
+/// to compare against the current code. On a match the code is consumed; once
+/// the last character of `buffer` has matched, the state moves to [`cdata`],
+/// otherwise this state is re-entered with `index + 1`. A mismatch (or a
+/// non-character code) yields `State::Nok`.
+///
+/// # Panics
+///
+/// Indexing `buffer[index]` panics if `index` is out of bounds while the
+/// current code is a character.
+pub fn cdata_open(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    buffer: Vec<char>,
+    index: usize,
+) -> StateFnResult {
+    let is_expected = match code {
+        Code::Char(ch) => ch == buffer[index],
+        _ => false,
+    };
+
+    if !is_expected {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+
+    let next_index = index + 1;
+    if next_index == buffer.len() {
+        (State::Fn(Box::new(cdata)), None)
+    } else {
+        (
+            State::Fn(Box::new(move |t, c| cdata_open(t, c, buffer, next_index))),
+            None,
+        )
+    }
+}
+
+/// Inside a CDATA section.
+///
+/// ```markdown
+/// a <![CDATA[|x]]> b
+/// ```
+///
+/// End of input yields `State::Nok`. Line endings are delegated to
+/// [`at_line_ending`], which resumes in this state afterwards. A `]` is
+/// consumed and may start the closing sequence ([`cdata_close`]); every other
+/// code is consumed and the state stays in `cdata`.
+pub fn cdata(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => return (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            return at_line_ending(tokenizer, code, Box::new(cdata));
+        }
+        _ => {}
+    }
+
+    let next = if matches!(code, Code::Char(']')) {
+        State::Fn(Box::new(cdata_close))
+    } else {
+        State::Fn(Box::new(cdata))
+    };
+
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// In CDATA, after one `]`.
+///
+/// ```markdown
+/// a <![CDATA[x]|]> b
+/// ```
+///
+/// A second `]` is consumed and the state moves to [`cdata_end`]; any other
+/// code is handed back to [`cdata`].
+pub fn cdata_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::Char(']')) {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(cdata_end)), None)
+    } else {
+        cdata(tokenizer, code)
+    }
+}
+
+/// In CDATA, after `]]`.
+///
+/// ```markdown
+/// a <![CDATA[x]]|> b
+/// ```
+///
+/// `>` is handled by [`end`]. A further `]` goes back through
+/// [`cdata_close`], which consumes it and returns here. Any other code is
+/// handed back to [`cdata`].
+pub fn cdata_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::Char('>')) {
+        end(tokenizer, code)
+    } else if matches!(code, Code::Char(']')) {
+        cdata_close(tokenizer, code)
+    } else {
+        cdata(tokenizer, code)
+    }
+}
+
+/// Inside a declaration.
+///
+/// ```markdown
+/// a <!x|> b
+/// ```
+///
+/// End of input and `>` are passed to [`end`] (which only accepts `>`). Line
+/// endings are delegated to [`at_line_ending`], which resumes in this state
+/// afterwards. Every other code is consumed and the state stays in
+/// `declaration`.
+pub fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::None | Code::Char('>')) {
+        return end(tokenizer, code);
+    }
+
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(declaration));
+    }
+
+    tokenizer.consume(code);
+    (State::Fn(Box::new(declaration)), None)
+}
+
+/// Inside an instruction.
+///
+/// ```markdown
+/// a <?|x?> b
+/// ```
+///
+/// End of input yields `State::Nok`. Line endings are delegated to
+/// [`at_line_ending`], which resumes in this state afterwards. A `?` is
+/// consumed and may start the closing sequence ([`instruction_close`]); every
+/// other code is consumed and the state stays in `instruction`.
+pub fn instruction(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => return (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            return at_line_ending(tokenizer, code, Box::new(instruction));
+        }
+        _ => {}
+    }
+
+    let next = if matches!(code, Code::Char('?')) {
+        State::Fn(Box::new(instruction_close))
+    } else {
+        State::Fn(Box::new(instruction))
+    };
+
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// In an instruction, after `?`.
+///
+/// ```markdown
+/// a <?x?|> b
+/// ```
+///
+/// `>` is handled by [`end`]; any other code is handed back to
+/// [`instruction`].
+pub fn instruction_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::Char('>')) {
+        end(tokenizer, code)
+    } else {
+        instruction(tokenizer, code)
+    }
+}
+
+/// After `</`: the first character of a closing tag name.
+///
+/// ```markdown
+/// a </|x> b
+/// ```
+///
+/// An ASCII letter is consumed and the state moves to [`tag_close`]; anything
+/// else yields `State::Nok`.
+pub fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let starts_name = match code {
+        Code::Char(ch) => ch.is_ascii_alphabetic(),
+        _ => false,
+    };
+
+    if starts_name {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(tag_close)), None)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// Inside a closing tag name.
+///
+/// ```markdown
+/// a </x|> b
+/// ```
+///
+/// `-` and ASCII alphanumerics are consumed and the state stays in
+/// `tag_close`; any other code is passed to [`tag_close_between`].
+pub fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let continues_name = match code {
+        Code::Char(ch) => ch == '-' || ch.is_ascii_alphanumeric(),
+        _ => false,
+    };
+
+    if continues_name {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(tag_close)), None)
+    } else {
+        tag_close_between(tokenizer, code)
+    }
+}
+
+/// In a closing tag, after the tag name.
+///
+/// ```markdown
+/// a </x| > b
+/// ```
+///
+/// Line endings are delegated to [`at_line_ending`], which resumes in this
+/// state afterwards. Tabs, spaces, and virtual spaces are consumed. Any other
+/// code is passed to [`end`].
+pub fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(tag_close_between));
+    }
+
+    if matches!(code, Code::VirtualSpace | Code::Char('\t' | ' ')) {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(tag_close_between)), None);
+    }
+
+    end(tokenizer, code)
+}
+
+/// Inside an opening tag name.
+///
+/// ```markdown
+/// a <x|> b
+/// ```
+///
+/// `-` and ASCII alphanumerics are consumed and the state stays in
+/// `tag_open`. Line endings, tabs, spaces, virtual spaces, `/`, and `>` are
+/// passed to [`tag_open_between`]. Any other code yields `State::Nok`.
+pub fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let continues_name = match code {
+        Code::Char(ch) => ch == '-' || ch.is_ascii_alphanumeric(),
+        _ => false,
+    };
+
+    if continues_name {
+        tokenizer.consume(code);
+        return (State::Fn(Box::new(tag_open)), None);
+    }
+
+    let ends_name = matches!(
+        code,
+        Code::CarriageReturnLineFeed
+            | Code::VirtualSpace
+            | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>')
+    );
+
+    if ends_name {
+        tag_open_between(tokenizer, code)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// In an opening tag, after the tag name or after an attribute.
+///
+/// ```markdown
+/// a <x| y> b
+/// ```
+///
+/// Line endings are delegated to [`at_line_ending`], which resumes in this
+/// state afterwards. Tabs, spaces, and virtual spaces are consumed. A `/` is
+/// consumed and [`end`] then expects `>`. `:`, `_`, or an ASCII letter is
+/// consumed as the start of an attribute name
+/// ([`tag_open_attribute_name`]). Any other code is passed to [`end`].
+pub fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(tag_open_between));
+    }
+
+    let next = match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => State::Fn(Box::new(tag_open_between)),
+        Code::Char('/') => State::Fn(Box::new(end)),
+        Code::Char(':' | '_') => State::Fn(Box::new(tag_open_attribute_name)),
+        Code::Char(ch) if ch.is_ascii_alphabetic() => {
+            State::Fn(Box::new(tag_open_attribute_name))
+        }
+        _ => return end(tokenizer, code),
+    };
+
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// Inside an attribute name.
+///
+/// ```markdown
+/// a <x y|> b
+/// ```
+///
+/// `-`, `.`, `:`, `_`, and ASCII alphanumerics are consumed and the state
+/// stays here; any other code is passed to
+/// [`tag_open_attribute_name_after`].
+pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let continues_name = match code {
+        Code::Char(ch) => matches!(ch, '-' | '.' | ':' | '_') || ch.is_ascii_alphanumeric(),
+        _ => false,
+    };
+
+    if continues_name {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(tag_open_attribute_name)), None)
+    } else {
+        tag_open_attribute_name_after(tokenizer, code)
+    }
+}
+
+/// After an attribute name: whitespace, a value initializer, or whatever
+/// comes next in the tag.
+///
+/// ```markdown
+/// a <x y|=z> b
+/// ```
+///
+/// Line endings are delegated to [`at_line_ending`], which resumes in this
+/// state afterwards. Tabs, spaces, and virtual spaces are consumed. `=` is
+/// consumed and the state moves to [`tag_open_attribute_value_before`]. Any
+/// other code is passed to [`tag_open_between`].
+pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        return at_line_ending(tokenizer, code, Box::new(tag_open_attribute_name_after));
+    }
+
+    let next = match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            State::Fn(Box::new(tag_open_attribute_name_after))
+        }
+        Code::Char('=') => State::Fn(Box::new(tag_open_attribute_value_before)),
+        _ => return tag_open_between(tokenizer, code),
+    };
+
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// After `=`, before an attribute value.
+///
+/// ```markdown
+/// a <x y=|z> b
+/// a <x y=|"z"> b
+/// ```
+///
+/// End of input, `<`, `=`, `>`, and `` ` `` yield `State::Nok`. Line endings
+/// are delegated to [`at_line_ending`], which resumes in this state
+/// afterwards. Tabs, spaces, and virtual spaces are consumed. A `"` or `'` is
+/// consumed and remembered as the marker for
+/// [`tag_open_attribute_value_quoted`]. Any other character is consumed as the
+/// start of an unquoted value ([`tag_open_attribute_value_unquoted`]).
+pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('<' | '=' | '>' | '`') => return (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            return at_line_ending(tokenizer, code, Box::new(tag_open_attribute_value_before));
+        }
+        _ => {}
+    }
+
+    // Only whitespace and ordinary characters remain; all of them are consumed.
+    let next = match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            State::Fn(Box::new(tag_open_attribute_value_before))
+        }
+        Code::Char(quote) if matches!(quote, '"' | '\'') => State::Fn(Box::new(move |t, c| {
+            tag_open_attribute_value_quoted(t, c, quote)
+        })),
+        _ => State::Fn(Box::new(tag_open_attribute_value_unquoted)),
+    };
+
+    tokenizer.consume(code);
+    (next, None)
+}
+
+/// Inside a quoted attribute value.
+///
+/// ```markdown
+/// a <x y="|z"> b
+/// ```
+///
+/// `marker` is the quote character that opened the value. End of input yields
+/// `State::Nok`. Line endings are delegated to [`at_line_ending`], which
+/// resumes in this state (with the same `marker`) afterwards. The matching
+/// quote is consumed and the state moves to
+/// [`tag_open_attribute_value_quoted_after`]; every other code is consumed and
+/// the state stays here.
+pub fn tag_open_attribute_value_quoted(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    marker: char,
+) -> StateFnResult {
+    match code {
+        Code::None => return (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            return at_line_ending(
+                tokenizer,
+                code,
+                Box::new(move |t, c| tag_open_attribute_value_quoted(t, c, marker)),
+            );
+        }
+        _ => {}
+    }
+
+    let closes_value = match code {
+        Code::Char(ch) => ch == marker,
+        _ => false,
+    };
+
+    tokenizer.consume(code);
+
+    if closes_value {
+        (
+            State::Fn(Box::new(tag_open_attribute_value_quoted_after)),
+            None,
+        )
+    } else {
+        (
+            State::Fn(Box::new(move |t, c| {
+                tag_open_attribute_value_quoted(t, c, marker)
+            })),
+            None,
+        )
+    }
+}
+
+/// After the closing quote of a quoted attribute value.
+///
+/// ```markdown
+/// a <x y="z"|> b
+/// ```
+///
+/// A quoted value must be followed by whitespace, a line ending, `/`, or `>`;
+/// those codes are passed to [`tag_open_between`] without being consumed
+/// here. Any other code yields `State::Nok`.
+pub fn tag_open_attribute_value_quoted_after(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        // Bare `\r` and `\n` are accepted as well as CRLF, matching `tag_open`;
+        // `tag_open_between` turns them into proper line-ending tokens.
+        Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code),
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside an unquoted attribute value.
+///
+/// ```markdown
+/// a <x y=z|> b
+/// ```
+///
+/// End of input, `"`, `'`, `<`, `=`, and `` ` `` yield `State::Nok`.
+/// Whitespace, line endings, and `>` end the value and are passed to
+/// [`tag_open_between`] without being consumed here. Every other character is
+/// consumed and the state stays here.
+pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),
+        // Bare `\r` and `\n` must end the value too; otherwise they would fall
+        // through to the arm below and be consumed as part of the value instead
+        // of becoming a line-ending token in `tag_open_between`.
+        Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\r' | '\n' | '\t' | ' ' | '>') => tag_open_between(tokenizer, code),
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
+        }
+    }
+}
+
+/// At a line ending inside HTML (text).
+///
+/// Closes the current `HtmlTextData` token, wraps the line ending in its own
+/// `LineEnding` token, and continues in [`after_line_ending`], carrying
+/// `return_state` along so the interrupted state can be resumed.
+///
+/// # Panics
+///
+/// Panics (via `unreachable!`) when `code` is not a line ending; callers are
+/// expected to check that first.
+// We can’t have blank lines in content, so no need to worry about empty
+// tokens.
+pub fn at_line_ending(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    if !matches!(code, Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')) {
+        unreachable!("expected line ending");
+    }
+
+    tokenizer.exit(TokenType::HtmlTextData);
+
+    tokenizer.enter(TokenType::LineEnding);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::LineEnding);
+
+    let next = State::Fn(Box::new(|t, c| after_line_ending(t, c, return_state)));
+    (next, None)
+}
+
+/// After a line ending inside HTML (text): try to take leading whitespace.
+///
+/// Attempts the `whitespace` partial with `TokenType::Whitespace`. The result
+/// of the attempt is ignored: either way the next state is
+/// [`after_line_ending_prefix`], which receives `return_state`.
+pub fn after_line_ending(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    return_state: Box<StateFn>,
+) -> StateFnResult {
+    let attempt_prefix = tokenizer.attempt(
+        |t, c| whitespace(t, c, TokenType::Whitespace),
+        |_ok| Box::new(|t, c| after_line_ending_prefix(t, c, return_state)),
+    );
+
+    attempt_prefix(tokenizer, code)
+}
+
+/// After a line ending and the optional whitespace attempt that follows it.
+///
+/// Opens a new `HtmlTextData` token (the previous one was closed in
+/// `at_line_ending`) and resumes the interrupted state by calling
+/// `return_state` with the current code.
+pub fn after_line_ending_prefix(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ return_state: Box<StateFn>,
+) -> StateFnResult {
+ tokenizer.enter(TokenType::HtmlTextData);
+ return_state(tokenizer, code)
+}
+
+/// At the expected end of HTML (text).
+///
+/// ```markdown
+/// a <x|> b
+/// ```
+///
+/// A `>` is consumed, the `HtmlTextData` and `HtmlText` tokens are closed,
+/// and the construct succeeds with `State::Ok`. Any other code yields
+/// `State::Nok`.
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if !matches!(code, Code::Char('>')) {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::HtmlTextData);
+    tokenizer.exit(TokenType::HtmlText);
+    (State::Ok, None)
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 0bc8746..31d9f6d 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -8,5 +8,6 @@ pub mod code_fenced;
pub mod code_indented;
pub mod heading_atx;
pub mod html_flow;
+pub mod html_text;
pub mod partial_whitespace;
pub mod thematic_break;