aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct/html_flow.rs
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-08 15:52:16 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-08 15:52:16 +0200
commit4c06c8554c35887f8f5147783953b2b7e7c2327f (patch)
tree1b2463848a3ae4c645f7f1a325877ee829ab65c5 /src/construct/html_flow.rs
downloadmarkdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.tar.gz
markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.tar.bz2
markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.zip
.
Diffstat (limited to 'src/construct/html_flow.rs')
-rw-r--r--src/construct/html_flow.rs1068
1 files changed, 1068 insertions, 0 deletions
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
new file mode 100644
index 0000000..b7d5570
--- /dev/null
+++ b/src/construct/html_flow.rs
@@ -0,0 +1,1068 @@
+//! HTML (flow) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete
+//!
+//! ; Note: closing tag name needs to match opening tag name.
+//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ]
+//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ]
+//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ]
+//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ]
+//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ]
+//! basic ::= '<' [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ]
+//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional )
+//!
+//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive.
+//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive.
+//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>'
+//! closing_tag ::= '</' tag_name whitespace_optional '>'
+//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric )
+//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ]
+//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric )
+//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`')
+//!
+//! whitespace ::= 1*space_or_tab
+//! whitespace_optional ::= [ whitespace ]
+//! line ::= code - eol
+//! eol ::= '\r' | '\r\n' | '\n'
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! The grammar for HTML in markdown does not resemble the rules of parsing
+//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
+//! spec][html-parsing].
+//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?)
+//! attempt to parse an XML-like language.
+//! By extension, another notable property of the grammar is that it can
+//! result in invalid HTML, in that it allows things that wouldn’t work or
+//! wouldn’t work well in HTML, such as mismatched tags.
+//!
+//! Because the **basic** and **complete** productions in the grammar form with
+//! a tag, followed by more stuff, and stop at a blank line, it is possible to
+//! interleave (a word for switching between languages) markdown and HTML
+//! together, by placing the opening and closing tags on their own lines,
+//! with blank lines between them and markdown.
+//! For example:
+//!
+//! ```markdown
+//! <div>This is a <code>div</code> but *this* is not emphasis.</div>
+//!
+//! <div>
+//!
+//! This is a paragraph in a `div` and *this* is emphasis.
+//!
+//! </div>
+//! ```
+//!
+//! The **complete** production of HTML (flow) is not allowed to interrupt
+//! content.
+//! That means that a blank line is needed between a paragraph and such HTML.
+//! However, HTML (text) has a similar production, which will typically kick in
+//! instead.
+//!
+//! The list of tag names allowed in the **raw** production are defined in
+//! [`HTML_RAW_NAMES`][html_raw_names].
+//! This production exists because there are a few cases where markdown
+//! *inside* some elements, and hence interleaving, does not make sense.
+//!
+//! The list of tag names allowed in the **basic** production are defined in
+//! [`HTML_BLOCK_NAMES`][html_block_names].
+//! This production exists because there are a few cases where we can decide
+//! early that something is going to be a flow (block) element instead of a
+//! phrasing (inline) element.
+//! We *can* interrupt and don’t have to care too much about it being
+//! well-formed.
+//!
+//! ## References
+//!
+//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js)
+//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+//!
+//! [html_raw_names]: crate::constant::HTML_RAW_NAMES
+//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
+//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
+//!
+//! <!-- To do: link stuff -->
+
+use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX};
+use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of HTML (flow); the variants map to the productions in the module BNF.
+#[derive(Debug, Clone, PartialEq)]
+enum Kind {
+ /// Not yet known: only `<` (and perhaps part of a tag name) was seen.
+ Unknown,
+ /// Raw production, such as `<script>` (condition 1).
+ Raw,
+ /// Comment production, such as `<!---->` (condition 2).
+ Comment,
+ /// Instruction production, such as `<?php?>` (condition 3).
+ Instruction,
+ /// Declaration production, such as `<!doctype>` (condition 4).
+ Declaration,
+ /// CDATA production, such as `<![CDATA[]]>` (condition 5).
+ Cdata,
+ /// Basic production, such as `<div` (condition 6).
+ Basic,
+ /// Complete production, such as `<x>` (condition 7).
+ Complete,
+}
+
+/// Type of quote, if we’re in an attribute, in complete (condition 7).
+#[derive(Debug, Clone, PartialEq)]
+enum QuoteKind {
+ /// Not in a quoted attribute.
+ None,
+ /// In a double quoted (`"`) attribute.
+ Double,
+ /// In a single quoted (`'`) attribute.
+ Single,
+}
+
+/// State threaded through the state functions that parse HTML (flow).
+#[derive(Debug, Clone)]
+struct Info {
+ /// Kind of HTML (flow); starts out as `Kind::Unknown`.
+ kind: Kind,
+ /// Whether this is a start tag (`<` not followed by `/`).
+ start_tag: bool,
+ /// Depending on `kind`: the tag name characters collected so far, or the
+ /// characters that are expected next (`CDATA[` when opening CDATA).
+ buffer: Vec<char>,
+ /// Position in `buffer` of the next expected character.
+ index: usize,
+ /// Current quote, when in a double or single quoted attribute value.
+ quote: QuoteKind,
+}
+
+// To do: mark as concrete (block quotes or lists can’t “pierce” into HTML).
+
+/// Start of HTML (flow).
+///
+/// Opens the `HtmlFlow` and `HtmlFlowData` tokens, attempts the whitespace
+/// partial, and continues in `before` whatever the outcome of that attempt.
+///
+/// ```markdown
+/// |<x />
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    /// The whitespace partial, tokenized as `Whitespace`.
+    fn leading_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+        whitespace(tokenizer, code, TokenType::Whitespace)
+    }
+
+    tokenizer.enter(TokenType::HtmlFlow);
+    tokenizer.enter(TokenType::HtmlFlowData);
+    tokenizer.attempt(leading_whitespace, |_ok| Box::new(before))(tokenizer, code)
+}
+
+/// At the `<` that must open HTML (flow), after optional whitespace.
+///
+/// Anything other than `<` is rejected.
+///
+/// ```markdown
+/// |<x />
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if code != Code::Char('<') {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+
+    // Fresh state: nothing is known about the kind of HTML yet.
+    let info = Info {
+        kind: Kind::Unknown,
+        start_tag: false,
+        buffer: vec![],
+        index: 0,
+        quote: QuoteKind::None,
+    };
+
+    (
+        State::Fn(Box::new(|tokenizer, code| open(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// Right after `<`: decide between a declaration-like construct (`!`), a
+/// closing tag (`/`), an instruction (`?`), or an opening tag name.
+///
+/// ```markdown
+/// <|x />
+/// <|!doctype />
+/// <|!--xxx--/>
+/// ```
+fn open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let mut info = info;
+
+    // Pick the state that handles what comes after the current character.
+    let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = match code {
+        Code::Char('!') => declaration_start,
+        Code::Char('/') => tag_close_start,
+        Code::Char('?') => {
+            info.kind = Kind::Instruction;
+            // Although this is an instruction and not a declaration, we’re
+            // on a `?` right now, so a `>` directly after it closes, which is
+            // what the declaration state looks for.
+            continuation_declaration_inside
+        }
+        Code::Char(ch) if ch.is_ascii_alphabetic() => {
+            // First character of an opening tag name: not consumed here, the
+            // tag name state takes it.
+            info.start_tag = true;
+            return tag_name(tokenizer, info, code);
+        }
+        _ => return (State::Nok, None),
+    };
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(move |tokenizer, code| next(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// Right after `<!`: a comment (`-`), CDATA (`[`), or a declaration (ASCII
+/// letter) can follow; anything else is rejected.
+///
+/// ```markdown
+/// <!|doctype />
+/// <!|--xxx--/>
+/// <!|[CDATA[>&<]]>
+/// ```
+fn declaration_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let mut info = info;
+
+    let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = match code {
+        Code::Char('-') => {
+            info.kind = Kind::Comment;
+            comment_open_inside
+        }
+        Code::Char('[') => {
+            info.kind = Kind::Cdata;
+            // The characters that must follow `<![`, matched one by one.
+            info.buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
+            info.index = 0;
+            cdata_open_inside
+        }
+        Code::Char(ch) if ch.is_ascii_alphabetic() => {
+            info.kind = Kind::Declaration;
+            continuation_declaration_inside
+        }
+        _ => return (State::Nok, None),
+    };
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(move |tokenizer, code| next(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// After `<!-`: the second `-` of the comment opening is required.
+///
+/// ```markdown
+/// <!-|-xxx--/>
+/// ```
+fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    if code != Code::Char('-') {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            continuation_declaration_inside(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// After `<![`: match the expected `CDATA[` one character at a time, using
+/// `info.buffer` as the expectation and `info.index` as the cursor.
+///
+/// ```markdown
+/// <![|CDATA[>&<]]>
+/// <![CD|ATA[>&<]]>
+/// <![CDA|TA[>&<]]>
+/// <![CDAT|A[>&<]]>
+/// <![CDATA|[>&<]]>
+/// ```
+fn cdata_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let as_expected = matches!(code, Code::Char(ch) if ch == info.buffer[info.index]);
+
+    if !as_expected {
+        return (State::Nok, None);
+    }
+
+    let mut info = info;
+    info.index += 1;
+    tokenizer.consume(code);
+
+    if info.index != info.buffer.len() {
+        // More of `CDATA[` is still expected.
+        return (
+            State::Fn(Box::new(|tokenizer, code| {
+                cdata_open_inside(tokenizer, info, code)
+            })),
+            None,
+        );
+    }
+
+    // The whole opening sequence matched: the expectation is no longer needed.
+    info.buffer.clear();
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            continuation(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// After `</`: a closing tag name must start with an ASCII letter, which is
+/// consumed and stored as the first character of the name.
+///
+/// ```markdown
+/// </|x>
+/// ```
+fn tag_close_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    if let Code::Char(ch) = code {
+        if ch.is_ascii_alphabetic() {
+            let mut info = info;
+            tokenizer.consume(code);
+            info.buffer.push(ch);
+            return (
+                State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, info, code))),
+                None,
+            );
+        }
+    }
+
+    (State::Nok, None)
+}
+
+/// Inside a tag name.
+///
+/// Name characters (`-` and ASCII alphanumerics) are collected in
+/// `info.buffer`.
+/// At the end of the name (eof, eol, whitespace, `/`, or `>`), the lowercased
+/// name decides whether this is raw, basic, or complete HTML.
+///
+/// ```markdown
+/// <a|b>
+/// </a|b>
+/// ```
+fn tag_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let mut info = info;
+
+    match code {
+        Code::Char(ch) if ch == '-' || ch.is_ascii_alphanumeric() => {
+            tokenizer.consume(code);
+            info.buffer.push(ch);
+            (
+                State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, info, code))),
+                None,
+            )
+        }
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
+            let at_slash = matches!(code, Code::Char('/'));
+            let lowercased = info.buffer.iter().collect::<String>().to_lowercase();
+            // Raw names only count on start tags that are not followed by a slash.
+            let is_raw =
+                !at_slash && info.start_tag && HTML_RAW_NAMES.contains(&lowercased.as_str());
+
+            if is_raw {
+                info.kind = Kind::Raw;
+                info.buffer.clear();
+                return continuation(tokenizer, info, code);
+            }
+
+            if HTML_BLOCK_NAMES.contains(&lowercased.as_str()) {
+                info.kind = Kind::Basic;
+                info.buffer.clear();
+
+                if !at_slash {
+                    return continuation(tokenizer, info, code);
+                }
+
+                tokenizer.consume(code);
+                return (
+                    State::Fn(Box::new(|tokenizer, code| {
+                        basic_self_closing(tokenizer, info, code)
+                    })),
+                    None,
+                );
+            }
+
+            info.kind = Kind::Complete;
+
+            // To do: do not support complete HTML when interrupting.
+            if info.start_tag {
+                complete_attribute_name_before(tokenizer, info, code)
+            } else {
+                complete_closing_tag_after(tokenizer, info, code)
+            }
+        }
+        Code::Char(_) => (State::Nok, None),
+    }
+}
+
+/// After the `/` that follows a basic tag name: only `>` is allowed.
+///
+/// ```markdown
+/// <div/|>
+/// ```
+fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    if code != Code::Char('>') {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            continuation(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// After the tag name of a complete closing tag: whitespace is eaten, and
+/// anything else is handed to `complete_end`.
+///
+/// ```markdown
+/// <x/|>
+/// </x/|>
+/// ```
+fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let at_whitespace = matches!(code, Code::VirtualSpace | Code::Char('\t' | ' '));
+
+    if !at_whitespace {
+        return complete_end(tokenizer, info, code);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            complete_closing_tag_after(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// Where an attribute name may start.
+///
+/// Reached after a complete tag name and again after each attribute.
+/// Whitespace is eaten, `/` must be followed by the end of the tag, `:`, `_`,
+/// or an ASCII letter starts an attribute name, and anything else is handed
+/// to `complete_end`.
+///
+/// ```markdown
+/// <x |/>
+/// <x |:asd>
+/// <x |_asd>
+/// <x |asd>
+/// <x | >
+/// <x |>
+/// ```
+fn complete_attribute_name_before(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = match code {
+        Code::Char('/') => complete_end,
+        Code::Char(ch) if matches!(ch, ':' | '_') || ch.is_ascii_alphabetic() => {
+            complete_attribute_name
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => complete_attribute_name_before,
+        _ => return complete_end(tokenizer, info, code),
+    };
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(move |tokenizer, code| next(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// Inside an attribute name: `-`, `.`, `:`, `_`, and ASCII alphanumerics
+/// continue the name, anything else ends it.
+///
+/// ```markdown
+/// <x :|>
+/// <x _|>
+/// <x a|>
+/// ```
+fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let in_name = matches!(
+        code,
+        Code::Char(ch) if matches!(ch, '-' | '.' | ':' | '_') || ch.is_ascii_alphanumeric()
+    );
+
+    if !in_name {
+        return complete_attribute_name_after(tokenizer, info, code);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            complete_attribute_name(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// After an attribute name: `=` starts an initializer, whitespace is eaten,
+/// and anything else is treated as the place before a next attribute name.
+///
+/// ```markdown
+/// <x a|>
+/// <x a|=b>
+/// <x a|="c">
+/// ```
+fn complete_attribute_name_after(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = match code {
+        Code::Char('=') => complete_attribute_value_before,
+        Code::VirtualSpace | Code::Char('\t' | ' ') => complete_attribute_name_after,
+        _ => return complete_attribute_name_before(tokenizer, info, code),
+    };
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(move |tokenizer, code| next(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// After `=`, before an attribute value.
+///
+/// Whitespace is eaten, a quote opens a quoted value (and is remembered in
+/// `info.quote`), eof, `<`, `=`, `>`, and `` ` `` are rejected, and anything
+/// else is handled as an unquoted value.
+///
+/// ```markdown
+/// <x a=|b>
+/// <x a=|"c">
+/// ```
+fn complete_attribute_value_before(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    let quote = match code {
+        Code::None | Code::Char('<' | '=' | '>' | '`') => return (State::Nok, None),
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            return (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_value_before(tokenizer, info, code)
+                })),
+                None,
+            );
+        }
+        Code::Char('"') => QuoteKind::Double,
+        Code::Char('\'') => QuoteKind::Single,
+        _ => return complete_attribute_value_unquoted(tokenizer, info, code),
+    };
+
+    // Only the two quote arms get here.
+    tokenizer.consume(code);
+    let mut info = info;
+    info.quote = quote;
+
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            complete_attribute_value_quoted(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// Inside a quoted attribute value: the matching quote closes it, eof and
+/// line endings are rejected, and everything else is part of the value.
+///
+/// ```markdown
+/// <x a="|">
+/// <x a='|'>
+/// ```
+fn complete_attribute_value_quoted(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    if matches!(
+        code,
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+    ) {
+        return (State::Nok, None);
+    }
+
+    // The character that closes the value that is currently open.
+    let closing = match info.quote {
+        QuoteKind::Double => '"',
+        _ => '\'',
+    };
+
+    let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = if code == Code::Char(closing) {
+        complete_attribute_value_quoted_after
+    } else {
+        complete_attribute_value_quoted
+    };
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(move |tokenizer, code| next(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// Inside an unquoted attribute value, which runs until eof, eol, whitespace,
+/// or one of `"`, `'`, `/`, `<`, `=`, `>`, and `` ` ``.
+///
+/// ```markdown
+/// <x a=b|c>
+/// ```
+fn complete_attribute_value_unquoted(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    let at_end = matches!(
+        code,
+        Code::None
+            | Code::CarriageReturnLineFeed
+            | Code::VirtualSpace
+            | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`')
+    );
+
+    if at_end {
+        return complete_attribute_name_after(tokenizer, info, code);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            complete_attribute_value_unquoted(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// Directly after the closing quote of an attribute value: whitespace, `/`,
+/// or `>` must follow.
+///
+/// ```markdown
+/// <x a="b"|>
+/// ```
+fn complete_attribute_value_quoted_after(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    if matches!(code, Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>')) {
+        complete_attribute_name_before(tokenizer, info, code)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// At a place in a complete tag where only the closing `>` is allowed.
+///
+/// ```markdown
+/// <x a="b"|>
+/// ```
+fn complete_end(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    if code != Code::Char('>') {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            complete_after(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// After the `>` of a complete tag: only whitespace may follow on the line.
+///
+/// ```markdown
+/// <x>|
+/// ```
+fn complete_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    if matches!(
+        code,
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+    ) {
+        return continuation(tokenizer, info, code);
+    }
+
+    if !matches!(code, Code::VirtualSpace | Code::Char('\t' | ' ')) {
+        // Something other than whitespace after the tag: not complete HTML.
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            complete_after(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// Inside the continuation of any kind of HTML.
+///
+/// Watches for the character that could start the closing sequence of the
+/// current kind, and for line endings; everything else is data.
+///
+/// ```markdown
+/// <!--x|xx-->
+/// ```
+fn continuation(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let at_eol = matches!(
+        code,
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+    );
+
+    if at_eol && (info.kind == Kind::Basic || info.kind == Kind::Complete) {
+        // Basic and complete HTML are closed by a blank line: look ahead.
+        return tokenizer.check(blank_line_before, |ok| {
+            let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = if ok {
+                continuation_close
+            } else {
+                continuation_at_line_ending
+            };
+            Box::new(move |tokenizer, code| next(tokenizer, info, code))
+        })(tokenizer, code);
+    }
+
+    if at_eol || matches!(code, Code::None) {
+        return continuation_at_line_ending(tokenizer, info, code);
+    }
+
+    // A possible start of the closing sequence of the current kind.
+    let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = match code {
+        Code::Char('-') if info.kind == Kind::Comment => continuation_comment_inside,
+        Code::Char('<') if info.kind == Kind::Raw => continuation_raw_tag_open,
+        Code::Char('>') if info.kind == Kind::Declaration => continuation_close,
+        Code::Char('?') if info.kind == Kind::Instruction => continuation_declaration_inside,
+        Code::Char(']') if info.kind == Kind::Cdata => continuation_character_data_inside,
+        _ => continuation,
+    };
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(move |tokenizer, code| next(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// In continuation, at an eol or eof: closes the data of the current line
+/// and moves on to `html_continue_start`.
+///
+/// ```markdown
+/// <x>|
+/// ```
+fn continuation_at_line_ending(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    // The data token never spans a line ending.
+    tokenizer.exit(TokenType::HtmlFlowData);
+
+    html_continue_start(tokenizer, info, code)
+}
+
+/// In continuation, between lines.
+///
+/// At eof the HTML is done, line endings get their own token, and anything
+/// else starts the data of a next line.
+///
+/// ```markdown
+/// <x>|
+/// asd
+/// ```
+fn html_continue_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    if matches!(code, Code::None) {
+        tokenizer.exit(TokenType::HtmlFlow);
+        return (State::Ok, Some(vec![code]));
+    }
+
+    // To do: do not allow lazy lines.
+    if matches!(
+        code,
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+    ) {
+        tokenizer.enter(TokenType::LineEnding);
+        tokenizer.consume(code);
+        tokenizer.exit(TokenType::LineEnding);
+        return (
+            State::Fn(Box::new(|tokenizer, code| {
+                html_continue_start(tokenizer, info, code)
+            })),
+            None,
+        );
+    }
+
+    tokenizer.enter(TokenType::HtmlFlowData);
+    continuation(tokenizer, info, code)
+}
+
+/// In comment continuation, after one `-`: a second `-` means a closing `>`
+/// may follow, anything else is regular continuation.
+///
+/// ```markdown
+/// <!--xxx-|->
+/// ```
+fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let second_dash = code == Code::Char('-') && info.kind == Kind::Comment;
+
+    if !second_dash {
+        return continuation(tokenizer, info, code);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            continuation_declaration_inside(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// In raw continuation, after `<`, expecting a `/`.
+///
+/// On `/`, the tag name buffer is reset before the closing tag name is
+/// collected, so that each `</` starts a fresh candidate name.
+/// Without the reset, the letters of a failed candidate (such as `</scr`)
+/// stay in the buffer and combine with a later one (such as `</ipt>`) into a
+/// raw tag name, which would wrongly close the HTML.
+/// Anything other than `/` is regular continuation.
+///
+/// ```markdown
+/// <script>console.log(1)<|/script>
+/// ```
+fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('/') => {
+            tokenizer.consume(code);
+            let mut clone = info;
+            // Drop whatever an earlier, failed `</…` attempt collected.
+            clone.buffer.clear();
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_raw_end_tag(tokenizer, clone, code)
+                })),
+                None,
+            )
+        }
+        _ => continuation(tokenizer, info, code),
+    }
+}
+
+/// In raw continuation, after `</`, collecting a closing tag name.
+///
+/// ASCII letters are collected in `info.buffer` while it holds fewer than
+/// `HTML_RAW_SIZE_MAX` characters.
+/// On `>`, the buffer is emptied, and the HTML closes if the lowercased name
+/// is one of `HTML_RAW_NAMES`.
+/// Everything else is regular continuation.
+///
+/// ```markdown
+/// <script>console.log(1)</|script>
+/// <script>console.log(1)</s|cript>
+/// <script>console.log(1)</script|>
+/// ```
+fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let mut info = info;
+
+    if let Code::Char(ch) = code {
+        if ch == '>' {
+            // Draining takes the collected name and leaves the buffer empty.
+            let name = info.buffer.drain(..).collect::<String>().to_lowercase();
+
+            if !HTML_RAW_NAMES.contains(&name.as_str()) {
+                return continuation(tokenizer, info, code);
+            }
+
+            tokenizer.consume(code);
+            return (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_close(tokenizer, info, code)
+                })),
+                None,
+            );
+        }
+
+        if ch.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX {
+            tokenizer.consume(code);
+            info.buffer.push(ch);
+            return (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_raw_end_tag(tokenizer, info, code)
+                })),
+                None,
+            );
+        }
+    }
+
+    continuation(tokenizer, info, code)
+}
+
+/// In CDATA continuation, after one `]`: a second `]` means a closing `>`
+/// may follow, anything else is regular continuation.
+///
+/// ```markdown
+/// <![CDATA[>&<]|]>
+/// ```
+fn continuation_character_data_inside(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    if code != Code::Char(']') {
+        return continuation(tokenizer, info, code);
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            continuation_declaration_inside(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// At a place where a `>` closes the HTML.
+///
+/// Shared by declarations, instructions, comments, and CDATA.
+/// In comments, extra dashes are allowed before the `>`.
+/// Anything else is regular continuation.
+///
+/// ```markdown
+/// <!--|>
+/// <?ab?|>
+/// <?|>
+/// <!q|>
+/// <!--ab--|>
+/// <!--ab--|->
+/// <!--ab---|>
+/// <![CDATA[>&<]]|>
+/// ```
+fn continuation_declaration_inside(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    let next: fn(&mut Tokenizer, Info, Code) -> StateFnResult = match code {
+        Code::Char('>') => continuation_close,
+        Code::Char('-') if info.kind == Kind::Comment => continuation_declaration_inside,
+        _ => return continuation(tokenizer, info, code),
+    };
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(move |tokenizer, code| next(tokenizer, info, code))),
+        None,
+    )
+}
+
+/// After the closing sequence: the rest of the line still belongs to the
+/// HTML, which ends at the eol or eof.
+///
+/// ```markdown
+/// <!doctype>|
+/// ```
+fn continuation_close(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let at_end = matches!(
+        code,
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+    );
+
+    if at_end {
+        tokenizer.exit(TokenType::HtmlFlowData);
+        tokenizer.exit(TokenType::HtmlFlow);
+        // The eol or eof itself is handed back, not consumed.
+        return (State::Ok, Some(vec![code]));
+    }
+
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            continuation_close(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// At a line ending that may be followed by a blank line: the line ending is
+/// consumed in its own token and the blank line construct takes over.
+///
+/// ```markdown
+/// <div>|
+///
+/// ```
+fn blank_line_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.enter(TokenType::LineEnding);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::LineEnding);
+
+    let next = State::Fn(Box::new(blank_line));
+    (next, None)
+}