diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-08 15:52:16 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-08 15:52:16 +0200 |
commit | 4c06c8554c35887f8f5147783953b2b7e7c2327f (patch) | |
tree | 1b2463848a3ae4c645f7f1a325877ee829ab65c5 /src/construct/html_flow.rs | |
download | markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.tar.gz markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.tar.bz2 markdown-rs-4c06c8554c35887f8f5147783953b2b7e7c2327f.zip |
.
Diffstat (limited to 'src/construct/html_flow.rs')
-rw-r--r-- | src/construct/html_flow.rs | 1068 |
1 files changed, 1068 insertions, 0 deletions
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs new file mode 100644 index 0000000..b7d5570 --- /dev/null +++ b/src/construct/html_flow.rs @@ -0,0 +1,1068 @@ +//! HTML (flow) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete +//! +//! ; Note: closing tag name need to match opening tag name. +//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ] +//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ] +//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ] +//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ] +//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ] +//! basic ::= '< [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ] +//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional ) +//! +//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive. +//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive. +//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>' +//! closing_tag ::= '</' tag_name whitespace_optional '>' +//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) +//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] +//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) +//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`') +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ space_or_tab ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! The grammar for HTML in markdown does not resemble the rules of parsing +//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML +//! spec][html-parsing]. +//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?) +//! attempt to parse an XML-like language. +//! By extension, another notable property of the grammar is that it can +//! result in invalid HTML, in that it allows things that wouldn’t work or +//! wouldn’t work well in HTML, such as mismatched tags. +//! +//! Because the **basic** and **complete** productions in the grammar form with +//! a tag, followed by more stuff, and stop at a blank line, it is possible to +//! interleave (a word for switching between languages) markdown and HTML +//! together, by placing the opening and closing tags on their own lines, +//! with blank lines between them and markdown. +//! For example: +//! +//! ```markdown +//! <div>This is a <code>div</code> but *this* is not emphasis.</div> +//! +//! <div> +//! +//! This is a paragraph in a `div` and *this* is emphasis. +//! +//! </div> +//! ``` +//! +//! The **complete** production of HTML (flow) is not allowed to interrupt +//! content. +//! That means that a blank line is needed between a paragraph and it. +//! However, HTML (text) has a similar production, which will typically kick-in +//! instead. +//! +//! The list of tag names allowed in the **raw** production are defined in +//! [`HTML_RAW_NAMES`][html_raw_names]. +//! This production exists because there are a few cases where markdown +//! *inside* some elements, and hence interleaving, does not make sense. +//! +//! The list of tag names allowed in the **basic** production are defined in +//! [`HTML_BLOCK_NAMES`][html_block_names]. +//! This production exists because there are a few cases where we can decide +//! early that something is going to be a flow (block) element instead of a +//! phrasing (inline) element. +//! We *can* interrupt and don’t have to care too much about it being +//! well-formed. +//! +//! ## References +//! +//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js) +//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) +//! +//! [html_raw_names]: crate::constant::HTML_RAW_NAMES +//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES +//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +//! +//! <!-- To do: link stuff --> + +use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; +use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of HTML (flow). +#[derive(Debug, Clone, PartialEq)] +enum Kind { + /// Not yet known. + Unknown, + /// Symbol for `<script>` (condition 1). + Raw, + /// Symbol for `<!---->` (condition 2). + Comment, + /// Symbol for `<?php?>` (condition 3). + Instruction, + /// Symbol for `<!doctype>` (condition 4). + Declaration, + /// Symbol for `<![CDATA[]]>` (condition 5). + Cdata, + /// Symbol for `<div` (condition 6). + Basic, + /// Symbol for `<x>` (condition 7). + Complete, +} + +/// Type of quote, if we’re in an attribure, in complete (condition 7). +#[derive(Debug, Clone, PartialEq)] +enum QuoteKind { + /// Not in a quoted attribute. + None, + /// In a double quoted (`"`) attribute. + Double, + /// In a single quoted (`"`) attribute. + Single, +} + +/// State needed to parse HTML (flow). +#[derive(Debug, Clone)] +struct Info { + /// Kind of HTML (flow). + kind: Kind, + /// Whether this is a start tag (`<` not followed by `/`). + start_tag: bool, + /// Used depending on `kind` to either collect all parsed characters, or to + /// store expected characters. + buffer: Vec<char>, + /// `index` into `buffer` when expecting certain characters. + index: usize, + /// Current quote, when in a double or single quoted attribute value. + quote: QuoteKind, +} + +// To do: mark as concrete (block quotes or lists can’t “pierce” into HTML). + +/// Start of HTML (flow), before optional whitespace. +/// +/// ```markdown +/// |<x /> +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::HtmlFlow); + tokenizer.enter(TokenType::HtmlFlowData); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(before), + )(tokenizer, code) +} + +/// After optional whitespace, before `<`. +/// +/// ```markdown +/// |<x /> +/// ``` +fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if Code::Char('<') == code { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + open( + tokenizer, + Info { + kind: Kind::Unknown, + start_tag: false, + buffer: vec![], + index: 0, + quote: QuoteKind::None, + }, + code, + ) + })), + None, + ) + } else { + (State::Nok, None) + } +} + +/// After `<`, before a tag name or other stuff. +/// +/// ```markdown +/// <|x /> +/// <|!doctype /> +/// <|!--xxx--/> +/// ``` +fn open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('!') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + declaration_start(tokenizer, info, code) + })), + None, + ) + } + Code::Char('/') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + tag_close_start(tokenizer, info, code) + })), + None, + ) + } + Code::Char('?') => { + // To do: life times. + let mut clone = info; + clone.kind = Kind::Instruction; + tokenizer.consume(code); + // While we’re in an instruction instead of a declaration, we’re on a `?` + // right now, so we do need to search for `>`, similar to declarations. + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, clone, code) + })), + None, + ) + } + Code::Char(char) if char.is_ascii_alphabetic() => { + // To do: life times. + let mut clone = info; + clone.start_tag = true; + tag_name(tokenizer, clone, code) + } + _ => (State::Nok, None), + } +} + +/// After `<!`, so inside a declaration, comment, or CDATA. +/// +/// ```markdown +/// <!|doctype /> +/// <!|--xxx--/> +/// <!|[CDATA[>&<]]> +/// ``` +fn declaration_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') => { + tokenizer.consume(code); + let mut clone = info; + clone.kind = Kind::Comment; + ( + State::Fn(Box::new(|tokenizer, code| { + comment_open_inside(tokenizer, clone, code) + })), + None, + ) + } + Code::Char('[') => { + tokenizer.consume(code); + let mut clone = info; + clone.kind = Kind::Cdata; + clone.buffer = vec!['C', 'D', 'A', 'T', 'A', '[']; + clone.index = 0; + ( + State::Fn(Box::new(|tokenizer, code| { + cdata_open_inside(tokenizer, clone, code) + })), + None, + ) + } + Code::Char(char) if char.is_ascii_alphabetic() => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.kind = Kind::Declaration; + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, clone, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After `<!-`, inside a comment, before another `-`. +/// +/// ```markdown +/// <!-|-xxx--/> +/// ``` +fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After `<![`, inside CDATA, expecting `CDATA[`. +/// +/// ```markdown +/// <![|CDATA[>&<]]> +/// <![CD|ATA[>&<]]> +/// <![CDA|TA[>&<]]> +/// <![CDAT|A[>&<]]> +/// <![CDATA|[>&<]]> +/// ``` +fn cdata_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char == info.buffer[info.index] => { + let mut clone = info; + clone.index += 1; + tokenizer.consume(code); + + if clone.index == clone.buffer.len() { + clone.buffer.clear(); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation(tokenizer, clone, code) + })), + None, + ) + } else { + ( + State::Fn(Box::new(|tokenizer, code| { + cdata_open_inside(tokenizer, clone, code) + })), + None, + ) + } + } + _ => (State::Nok, None), + } +} + +/// After `</`, in a closing tag, before a tag name. +/// +/// ```markdown +/// </|x> +/// ``` +fn tag_close_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char.is_ascii_alphabetic() => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.buffer.push(char); + ( + State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// In a tag name. +/// +/// ```markdown +/// <a|b> +/// </a|b> +/// ``` +fn tag_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => { + let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); + let name = tag_name_buffer.as_str(); + let slash = if let Code::Char(char) = code { + char == '/' + } else { + false + }; + + if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) { + // To do: life times. + let mut clone = info; + clone.kind = Kind::Raw; + clone.buffer.clear(); + continuation(tokenizer, clone, code) + } else if HTML_BLOCK_NAMES.contains(&name) { + // To do: life times. + let mut clone = info; + clone.kind = Kind::Basic; + clone.buffer.clear(); + + if slash { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + basic_self_closing(tokenizer, clone, code) + })), + None, + ) + } else { + continuation(tokenizer, clone, code) + } + } else { + // To do: life times. + let mut clone = info; + clone.kind = Kind::Complete; + + // To do: do not support complete HTML when interrupting. + if clone.start_tag { + complete_attribute_name_before(tokenizer, clone, code) + } else { + complete_closing_tag_after(tokenizer, clone, code) + } + } + } + Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { + tokenizer.consume(code); + let mut clone = info; + clone.buffer.push(char); + ( + State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))), + None, + ) + } + Code::Char(_) => (State::Nok, None), + } +} + +/// After a closing slash of a basic tag name. +/// +/// ```markdown +/// <div/|> +/// ``` +fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation(tokenizer, info, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After a closing slash of a complete tag name. +/// +/// ```markdown +/// <x/|> +/// </x/|> +/// ``` +fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_closing_tag_after(tokenizer, info, code) + })), + None, + ) + } + _ => complete_end(tokenizer, info, code), + } +} + +/// At a place where an attribute name would be valid. +/// +/// At first, this state is used after a complete tag name, after whitespace, +/// where it expects optional attributes or the end of the tag. +/// It is also reused after attributes, when expecting more optional +/// attributes. +/// +/// ```markdown +/// <x |/> +/// <x |:asd> +/// <x |_asd> +/// <x |asd> +/// <x | > +/// <x |> +/// ``` +fn complete_attribute_name_before( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char('/') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_end(tokenizer, info, code) + })), + None, + ) + } + Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name(tokenizer, info, code) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name_before(tokenizer, info, code) + })), + None, + ) + } + _ => complete_end(tokenizer, info, code), + } +} + +/// In an attribute name. +/// +/// ```markdown +/// <x :|> +/// <x _|> +/// <x a|> +/// ``` +fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char(char) + if char == '-' + || char == '.' + || char == ':' + || char == '_' + || char.is_ascii_alphanumeric() => + { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name(tokenizer, info, code) + })), + None, + ) + } + _ => complete_attribute_name_after(tokenizer, info, code), + } +} + +/// After an attribute name, before an attribute initializer, the end of the +/// tag, or whitespace. +/// +/// ```markdown +/// <x a|> +/// <x a|=b> +/// <x a|="c"> +/// ``` +fn complete_attribute_name_after( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char('=') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_before(tokenizer, info, code) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name_after(tokenizer, info, code) + })), + None, + ) + } + _ => complete_attribute_name_before(tokenizer, info, code), + } +} + +/// Before an unquoted, double quoted, or single quoted attribute value, +/// allowing whitespace. +/// +/// ```markdown +/// <x a=|b> +/// <x a=|"c"> +/// ``` +fn complete_attribute_value_before( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None), + Code::Char(char) if char == '"' || char == '\'' => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.quote = if char == '"' { + QuoteKind::Double + } else { + QuoteKind::Single + }; + + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_quoted(tokenizer, clone, code) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_before(tokenizer, info, code) + })), + None, + ) + } + _ => complete_attribute_value_unquoted(tokenizer, info, code), + } +} + +/// In a double or single quoted attribute value. +/// +/// ```markdown +/// <x a="|"> +/// <x a='|'> +/// ``` +fn complete_attribute_value_quoted( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + let marker = if info.quote == QuoteKind::Double { + '"' + } else { + '\'' + }; + + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_quoted_after(tokenizer, info, code) + })), + None, + ) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_quoted(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// In an unquoted attribute value. +/// +/// ```markdown +/// <x a=b|c> +/// ``` +fn complete_attribute_value_unquoted( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => { + complete_attribute_name_after(tokenizer, info, code) + } + Code::Char(_) => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_unquoted(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// After a double or single quoted attribute value, before whitespace or the +/// end of the tag. +/// +/// ```markdown +/// <x a="b"|> +/// ``` +fn complete_attribute_value_quoted_after( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => { + complete_attribute_name_before(tokenizer, info, code) + } + _ => (State::Nok, None), + } +} + +/// In certain circumstances of a complete tag where only an `>` is allowed. +/// +/// ```markdown +/// <x a="b"|> +/// ``` +fn complete_end(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_after(tokenizer, info, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After `>` in a complete tag. +/// +/// ```markdown +/// <x>| +/// ``` +fn complete_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + continuation(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_after(tokenizer, info, code) + })), + None, + ) + } + Code::Char(_) => (State::Nok, None), + } +} + +/// Inside continuation of any HTML kind. +/// +/// ```markdown +/// <!--x|xx--> +/// ``` +fn continuation(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_comment_inside(tokenizer, info, code) + })), + None, + ) + } + Code::Char('<') if info.kind == Kind::Raw => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_tag_open(tokenizer, info, code) + })), + None, + ) + } + Code::Char('>') if info.kind == Kind::Declaration => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, info, code) + })), + None, + ) + } + Code::Char('?') if info.kind == Kind::Instruction => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + Code::Char(']') if info.kind == Kind::Cdata => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_character_data_inside(tokenizer, info, code) + })), + None, + ) + } + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') + if info.kind == Kind::Basic || info.kind == Kind::Complete => + { + let clone = info; + + tokenizer.check(blank_line_before, |ok| { + if ok { + Box::new(|tokenizer, code| continuation_close(tokenizer, clone, code)) + } else { + Box::new(|tokenizer, code| continuation_at_line_ending(tokenizer, clone, code)) + } + })(tokenizer, code) + } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + continuation_at_line_ending(tokenizer, info, code) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// In continuation, before an eol or eof. +/// +/// ```markdown +/// <x>| +/// ``` +fn continuation_at_line_ending(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::HtmlFlowData); + html_continue_start(tokenizer, info, code) +} + +/// In continuation, after an eol. +/// +/// ```markdown +/// <x>| +/// asd +/// ``` +fn html_continue_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None => { + tokenizer.exit(TokenType::HtmlFlow); + (State::Ok, Some(vec![code])) + } + // To do: do not allow lazy lines. + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + html_continue_start(tokenizer, info, code) + })), + None, + ) + } + _ => { + tokenizer.enter(TokenType::HtmlFlowData); + continuation(tokenizer, info, code) + } + } +} + +/// In comment continuation, after one `-`, expecting another. +/// +/// ```markdown +/// <!--xxx-|-> +/// ``` +fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In raw continuation, after `<`, expecting a `/`. +/// +/// ```markdown +/// <script>console.log(1)<|/script> +/// ``` +fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('/') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_end_tag(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In raw continuation, after `</`, expecting or inside a raw tag name. +/// +/// ```markdown +/// <script>console.log(1)</|script> +/// <script>console.log(1)</s|cript> +/// <script>console.log(1)</script|> +/// ``` +fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); + // To do: life times. + let mut clone = info; + clone.buffer.clear(); + + if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, clone, code) + })), + None, + ) + } else { + continuation(tokenizer, clone, code) + } + } + Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.buffer.push(char); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_end_tag(tokenizer, clone, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In cdata continuation, after `]`, expecting `]>`. +/// +/// ```markdown +/// <![CDATA[>&<]|]> +/// ``` +fn continuation_character_data_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char(']') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In declaration or instruction continuation, waiting for `>` to close it. +/// +/// ```markdown +/// <!--|> +/// <?ab?|> +/// <?|> +/// <!q|> +/// <!--ab--|> +/// <!--ab--|-> +/// <!--ab---|> +/// <![CDATA[>&<]]|> +/// ``` +fn continuation_declaration_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, info, code) + })), + None, + ) + } + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In closed continuation: everything we get until the eol/eof is part of it. +/// +/// ```markdown +/// <!doctype>| +/// ``` +fn continuation_close(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::HtmlFlowData); + tokenizer.exit(TokenType::HtmlFlow); + (State::Ok, Some(vec![code])) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// Before a line ending, expecting a blank line. +/// +/// ```markdown +/// <div>| +/// +/// ``` +fn blank_line_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(blank_line)), None) +} |