//! HTML (flow) occurs in the [flow][] content type.
//!
//! ## Grammar
//!
//! HTML (flow) forms with the following BNF
//! (see [construct][crate::construct] for character groups):
//!
//! ```bnf
//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete
//!
//! ; Note: closing tag name does not need to match opening tag name.
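//! raw ::= '<' raw_tag_name [[space_or_tab *line | '>' *line] eol] *(*line eol) ['</' raw_tag_name *line]
//! comment ::= '<!--' [*'-' '>' *line | *line *(eol *line) ['-->' *line]]
//! instruction ::= '<?' ['>' *line | *line *(eol *line) ['?>' *line]]
//! declaration ::= '<!' ascii_alphabetic *line *(eol *line) ['>' *line]
//! cdata ::= '<![CDATA[' *line *(eol *line) [']]>' *line]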
//! basic ::= '<' ['/'] basic_tag_name [['/'] '>' *line *(eol 1*line)]
//! complete ::= (opening_tag | closing_tag) [*space_or_tab *(eol 1*line)]
//!
//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive.
//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive.
//! opening_tag ::= '<' tag_name *(space_or_tab_eol attribute) [[space_or_tab_eol] '/'] [space_or_tab_eol] '>'
//! closing_tag ::= '</' tag_name [space_or_tab_eol] '>'
//! tag_name ::= ascii_alphabetic *('-' | ascii_alphanumeric)
//! attribute ::= attribute_name [[space_or_tab_eol] '=' [space_or_tab_eol] attribute_value]
//! attribute_name ::= (':' | '_' | ascii_alphabetic) *('-' | '.' | ':' | '_' | ascii_alphanumeric)
//! attribute_value ::= '"' *(line - '"') '"' | "'" *(line - "'") "'" | 1*(text - '"' - "'" - '/' - '<' - '=' - '>' - '`')
//! ```
//!
//! As this construct occurs in flow, like all flow constructs, it must be
//! followed by an eol (line ending) or eof (end of file).
//!
//! The grammar for HTML in markdown does not follow the rules of parsing
//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
//! spec][html_parsing].
//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?)
//! attempt to parse an XML-like language.
//! By extension, another notable property of the grammar is that it can
//! result in invalid HTML, in that it allows things that wouldn’t work or
//! wouldn’t work well in HTML, such as mismatched tags.
//!
//! Interestingly, most of the productions above have a clear opening and
//! closing condition (raw, comment, instruction, declaration, cdata), but the
//! closing condition does not need to be satisfied.
//! In this case, the parser never has to backtrack.
//!
//! Because the **basic** and **complete** productions in the grammar form with
//! a tag, followed by more stuff, and stop at a blank line, it is possible to
//! interleave (a word for switching between languages) markdown and HTML
//! together, by placing the opening and closing tags on their own lines,
//! with blank lines between them and markdown.
//! For example:
//!
//! ```markdown
//! <div>This is <code>code</code> but this is not *emphasis*.</div>
//!
//! <div>
//!
//! This is a paragraph in a `div` and with `code` and *emphasis*.
//!
//! </div>
//! ```
//!
//! The **complete** production of HTML (flow) is not allowed to interrupt
//! content.
//! That means that a blank line is needed between a [paragraph][] and it.
//! However, [HTML (text)][html_text] has a similar production, which will
//! typically kick-in instead.
//!
//! The list of tag names allowed in the **raw** production is defined in
//! [`HTML_RAW_NAMES`][html_raw_names].
//! This production exists because there are a few cases where markdown
//! *inside* some elements, and hence interleaving, does not make sense.
//!
//! The list of tag names allowed in the **basic** production is defined in
//! [`HTML_BLOCK_NAMES`][html_block_names].
//! This production exists because there are a few cases where we can decide
//! early that something is going to be a flow (block) element instead of a
//! phrasing (inline) element.
//! We *can* interrupt and don’t have to care too much about it being
//! well-formed.
//!
//! ## Tokens
//!
//! * [`HtmlFlow`][Name::HtmlFlow]
//! * [`HtmlFlowData`][Name::HtmlFlowData]
//! * [`LineEnding`][Name::LineEnding]
//!
//! ## References
//!
//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js)
//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
//!
//! [flow]: crate::construct::flow
//! [html_text]: crate::construct::html_text
//! [paragraph]: crate::construct::paragraph
//! [html_raw_names]: crate::util::constant::HTML_RAW_NAMES
//! [html_block_names]: crate::util::constant::HTML_BLOCK_NAMES
//! [html_parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
use crate::construct::partial_space_or_tab::{
space_or_tab_with_options, Options as SpaceOrTabOptions,
};
use crate::event::Name;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use crate::util::{
constant::{HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE},
slice::Slice,
};
/// In raw continuation, after `<`, at `/`.
///
/// If the current byte is `/`, it is consumed and the current point index is
/// stored in `tokenize_state.start`, so that the raw end tag state can slice
/// the tag name from there.
/// Any other byte is retried as regular continuation.
///
/// ```markdown
/// > | <script>console.log(1)</script>
///                            ^
/// ```
pub fn continuation_raw_tag_open(tokenizer: &mut Tokenizer) -> State {
    // Without a slash this `<` cannot start a closing tag.
    if tokenizer.current != Some(b'/') {
        return State::Retry(StateName::HtmlFlowContinuation);
    }

    tokenizer.consume();
    // Remember where the candidate tag name begins.
    tokenizer.tokenize_state.start = tokenizer.point.index;
    State::Next(StateName::HtmlFlowContinuationRawEndTag)
}
/// In raw continuation, after `</`, in a raw tag name.
///
/// * On `>`, the bytes from `tokenize_state.start` up to the current point are
///   lowercased and looked up in `HTML_RAW_NAMES`: a match consumes the `>`
///   and moves to the closed state, anything else is retried as regular
///   continuation.
/// * On an ASCII letter, while fewer than `HTML_RAW_SIZE_MAX` bytes have been
///   collected, the letter is consumed and this state repeats.
/// * Otherwise, the byte is retried as regular continuation.
///
/// `tokenize_state.start` is reset to `0` whenever this state is left.
///
/// ```markdown
/// > | <script>console.log(1)</script>
///                             ^^^^^^
/// ```
pub fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State {
    let name_start = tokenizer.tokenize_state.start;

    if tokenizer.current == Some(b'>') {
        // Guaranteed to be valid ASCII bytes.
        let candidate = Slice::from_indices(
            tokenizer.parse_state.bytes,
            name_start,
            tokenizer.point.index,
        );
        let lowercased = candidate.as_str().to_ascii_lowercase();

        tokenizer.tokenize_state.start = 0;

        // Only a known raw tag name closes the construct.
        return if HTML_RAW_NAMES.contains(&lowercased.as_str()) {
            tokenizer.consume();
            State::Next(StateName::HtmlFlowContinuationClose)
        } else {
            State::Retry(StateName::HtmlFlowContinuation)
        };
    }

    // The length is only checked when we are actually at a letter.
    let in_name = matches!(tokenizer.current, Some(b'A'..=b'Z' | b'a'..=b'z'))
        && tokenizer.point.index - name_start < HTML_RAW_SIZE_MAX;

    if in_name {
        tokenizer.consume();
        State::Next(StateName::HtmlFlowContinuationRawEndTag)
    } else {
        tokenizer.tokenize_state.start = 0;
        State::Retry(StateName::HtmlFlowContinuation)
    }
}
/// In cdata continuation, after `]`, expecting `]>`.
///
/// A second `]` is consumed and hands over to the declaration-inside state,
/// which looks for the `>`.
/// Any other byte is retried as regular continuation.
///
/// ```markdown
/// > | <![CDATA[>&<]]>
///                  ^
/// ```
pub fn continuation_cdata_inside(tokenizer: &mut Tokenizer) -> State {
    let at_second_bracket = tokenizer.current == Some(b']');

    if at_second_bracket {
        tokenizer.consume();
        State::Next(StateName::HtmlFlowContinuationDeclarationInside)
    } else {
        State::Retry(StateName::HtmlFlowContinuation)
    }
}
/// In declaration or instruction continuation, at `>`.
///
/// * A `-` is consumed, staying in this state, but only when the marker is
///   `COMMENT`.
/// * A `>` is consumed and moves to the closed state.
/// * Anything else is retried as regular continuation.
///
/// ```markdown
/// > | <!--ab-->
///             ^
/// > | <?>
///       ^
/// > | <![CDATA[>&<]]>
///                   ^
/// ```
pub fn continuation_declaration_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Extra dashes are only meaningful at the end of a comment.
        Some(b'-') if tokenizer.tokenize_state.marker == COMMENT => {
            tokenizer.consume();
            State::Next(StateName::HtmlFlowContinuationDeclarationInside)
        }
        Some(b'>') => {
            tokenizer.consume();
            State::Next(StateName::HtmlFlowContinuationClose)
        }
        _ => State::Retry(StateName::HtmlFlowContinuation),
    }
}
/// In closed continuation: everything we get until the eol/eof is part of it.
///
/// Bytes are consumed one at a time, staying in this state, until an eol or
/// eof is found; at that point the `HtmlFlowData` event is exited and the
/// after state is retried.
///
/// ```markdown
/// > | <!doctype>
///                   ^
/// ```
pub fn continuation_close(tokenizer: &mut Tokenizer) -> State {
    let at_line_end = matches!(tokenizer.current, None | Some(b'\n'));

    if at_line_end {
        tokenizer.exit(Name::HtmlFlowData);
        State::Retry(StateName::HtmlFlowContinuationAfter)
    } else {
        // Still on the closing line: keep taking bytes.
        tokenizer.consume();
        State::Next(StateName::HtmlFlowContinuationClose)
    }
}
/// Done.
///
/// Exits the `HtmlFlow` event and resets the state this construct used: the
/// marker in `tokenize_state` goes back to `0`, and the tokenizer’s
/// `interrupt` and `concrete` flags are cleared.
///
/// ```markdown
/// > | <!doctype>
///                   ^
/// ```
pub fn continuation_after(tokenizer: &mut Tokenizer) -> State {
    tokenizer.exit(Name::HtmlFlow);

    // No longer concrete.
    tokenizer.concrete = false;
    // Feel free to interrupt.
    tokenizer.interrupt = false;
    // Reset the marker used while tokenizing this construct.
    tokenizer.tokenize_state.marker = 0;

    State::Ok
}
/// Before eol, expecting blank line.
///
/// Consumes the current byte inside a `LineEnding` event and then moves on
/// to the blank line construct.
///
/// ```markdown
/// > | <div>
///              ^
///   |
/// ```
pub fn blank_line_before(tokenizer: &mut Tokenizer) -> State {
    // The current byte (expected to be the eol) gets its own event.
    tokenizer.enter(Name::LineEnding);
    tokenizer.consume();
    tokenizer.exit(Name::LineEnding);

    let next = StateName::BlankLineStart;
    State::Next(next)
}