//! HTML (flow) occurs in the [flow][] content type. //! //! ## Grammar //! //! HTML (flow) forms with the following BNF //! (see [construct][crate::construct] for character groups): //! //! ```bnf //! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete //! //! ; Note: closing tag name does not need to match opening tag name. //! raw ::= '<' raw_tag_name [[space_or_tab *line | '>' *line] eol] *(*line eol) ['</' raw_tag_name '>' *line] //! comment ::= '<!--' ['-' '>' *line | *line *(eol *line) ['-->' *line]] //! instruction ::= '<?' ['>' *line | *line *(eol *line) ['?>' *line]] //! declaration ::= '<!' ascii_alphabetic *line *(eol *line) ['>' *line] //! cdata ::= '<![CDATA[' *line *(eol *line) [']]>' *line] //! basic ::= '<' ['/'] basic_tag_name [['/'] '>' *line *(eol 1*line)] //! complete ::= (opening_tag | closing_tag) [*space_or_tab *(eol 1*line)] //! //! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive. //! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive. //! opening_tag ::= '<' tag_name *(space_or_tab_eol attribute) [[space_or_tab_eol] '/'] [space_or_tab_eol] '>' //! closing_tag ::= '</' tag_name [space_or_tab_eol] '>' //! tag_name ::= ascii_alphabetic *('-' | ascii_alphanumeric) //! attribute ::= attribute_name [[space_or_tab_eol] '=' [space_or_tab_eol] attribute_value] //! attribute_name ::= (':' | '_' | ascii_alphabetic) *('-' | '.' | ':' | '_' | ascii_alphanumeric) //! attribute_value ::= '"' *(line - '"') '"' | "'" *(line - "'") "'" | 1*(text - '"' - "'" - '/' - '<' - '=' - '>' - '`') //! ``` //! //! As this construct occurs in flow, like all flow constructs, it must be //! followed by an eol (line ending) or eof (end of file). //! //! The grammar for HTML in markdown does not follow the rules of parsing //! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML //! spec][html_parsing]. //! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?) //! attempt to parse an XML-like language. //! By extension, another notable property of the grammar is that it can //!
result in invalid HTML, in that it allows things that wouldn’t work or //! wouldn’t work well in HTML, such as mismatched tags. //! //! Interestingly, most of the productions above have a clear opening and //! closing condition (raw, comment, instruction, declaration, cdata), but the //! closing condition does not need to be satisfied. //! In this case, the parser never has to backtrack. //! //! Because the **basic** and **complete** productions in the grammar form with //! a tag, followed by more stuff, and stop at a blank line, it is possible to //! interleave (a word for switching between languages) markdown and HTML //! together, by placing the opening and closing tags on their own lines, //! with blank lines between them and markdown. //! For example: //! //! ```markdown //!
<div>This is <code>code</code> but this is not *emphasis*.</div>
//! //! <div>
//! //! This is a paragraph in a `div` and with `code` and *emphasis*. //! //! </div>
//! ``` //! //! The **complete** production of HTML (flow) is not allowed to interrupt //! content. //! That means that a blank line is needed between a [paragraph][] and it. //! However, [HTML (text)][html_text] has a similar production, which will //! typically kick-in instead. //! //! The list of tag names allowed in the **raw** production are defined in //! [`HTML_RAW_NAMES`][html_raw_names]. //! This production exists because there are a few cases where markdown //! *inside* some elements, and hence interleaving, does not make sense. //! //! The list of tag names allowed in the **basic** production are defined in //! [`HTML_BLOCK_NAMES`][html_block_names]. //! This production exists because there are a few cases where we can decide //! early that something is going to be a flow (block) element instead of a //! phrasing (inline) element. //! We *can* interrupt and don’t have to care too much about it being //! well-formed. //! //! ## Tokens //! //! * [`HtmlFlow`][Name::HtmlFlow] //! * [`HtmlFlowData`][Name::HtmlFlowData] //! * [`LineEnding`][Name::LineEnding] //! //! ## References //! //! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js) //! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) //! //! [flow]: crate::construct::flow //! [html_text]: crate::construct::html_text //! [paragraph]: crate::construct::paragraph //! [html_raw_names]: crate::util::constant::HTML_RAW_NAMES //! [html_block_names]: crate::util::constant::HTML_BLOCK_NAMES //! 
[html_parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing use crate::construct::partial_space_or_tab::{ space_or_tab_with_options, Options as SpaceOrTabOptions, }; use crate::event::Name; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use crate::util::{ constant::{HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE}, slice::Slice, }; /// Symbol for ` /// ^ /// ``` pub fn continuation_raw_tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'/') => { tokenizer.consume(); tokenizer.tokenize_state.start = tokenizer.point.index; State::Next(StateName::HtmlFlowContinuationRawEndTag) } _ => State::Retry(StateName::HtmlFlowContinuation), } } /// In raw continuation, after ` | /// ^^^^^^ /// ``` pub fn continuation_raw_end_tag(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'>') => { // Guaranteed to be valid ASCII bytes. let slice = Slice::from_indices( tokenizer.parse_state.bytes, tokenizer.tokenize_state.start, tokenizer.point.index, ); let name = slice.as_str().to_ascii_lowercase(); tokenizer.tokenize_state.start = 0; if HTML_RAW_NAMES.contains(&name.as_str()) { tokenizer.consume(); State::Next(StateName::HtmlFlowContinuationClose) } else { State::Retry(StateName::HtmlFlowContinuation) } } Some(b'A'..=b'Z' | b'a'..=b'z') if tokenizer.point.index - tokenizer.tokenize_state.start < HTML_RAW_SIZE_MAX => { tokenizer.consume(); State::Next(StateName::HtmlFlowContinuationRawEndTag) } _ => { tokenizer.tokenize_state.start = 0; State::Retry(StateName::HtmlFlowContinuation) } } } /// In cdata continuation, after `]`, expecting `]>`. 
///
/// ```markdown
/// > | <![CDATA[>&<]]>
///                  ^
/// ```
pub fn continuation_cdata_inside(tokenizer: &mut Tokenizer) -> State {
    // Without a second `]` this cannot be the closing sequence.
    if tokenizer.current != Some(b']') {
        return State::Retry(StateName::HtmlFlowContinuation);
    }

    tokenizer.consume();
    // The final `>` is handled by the declaration-inside state.
    State::Next(StateName::HtmlFlowContinuationDeclarationInside)
}

/// In comment, instruction, declaration, or cdata continuation, where the
/// closing `>` may follow.
///
/// ```markdown
/// > | <!-->
///          ^
/// > | <?>
///        ^
/// > | <!q>
///         ^
/// > | <!--ab-->
///              ^
/// > | <![CDATA[>&<]]>
///                    ^
/// ```
///
/// While the marker is `COMMENT`, extra `-` bytes are consumed and this state
/// is entered again.
/// A `>` is consumed and leads to the closed state; every other byte is
/// retried in the general continuation state.
pub fn continuation_declaration_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Only comments tolerate more dashes in front of the `>`.
        Some(b'-') if tokenizer.tokenize_state.marker == COMMENT => {
            tokenizer.consume();
            State::Next(StateName::HtmlFlowContinuationDeclarationInside)
        }
        Some(b'>') => {
            tokenizer.consume();
            State::Next(StateName::HtmlFlowContinuationClose)
        }
        _ => State::Retry(StateName::HtmlFlowContinuation),
    }
}

/// In closed continuation: the rest of the line, up to the eol/eof, still
/// belongs to the HTML.
///
/// ```markdown
/// > | <!doctype>
///               ^
/// ```
///
/// At eol/eof the `HtmlFlowData` event is exited and the after state is
/// retried; any other byte is consumed and this state is entered again.
pub fn continuation_close(tokenizer: &mut Tokenizer) -> State {
    if matches!(tokenizer.current, None | Some(b'\n')) {
        tokenizer.exit(Name::HtmlFlowData);
        return State::Retry(StateName::HtmlFlowContinuationAfter);
    }

    tokenizer.consume();
    State::Next(StateName::HtmlFlowContinuationClose)
}

/// Done.
///
/// ```markdown
/// > | <!doctype>
///               ^
/// ```
///
/// Exits the `HtmlFlow` event, resets the marker to `0`, clears the
/// `interrupt` and `concrete` flags on the tokenizer, and reports success.
pub fn continuation_after(tokenizer: &mut Tokenizer) -> State {
    tokenizer.exit(Name::HtmlFlow);

    // Reset the state that was set up for this construct.
    tokenizer.tokenize_state.marker = 0;
    // What follows is free to interrupt again.
    tokenizer.interrupt = false;
    // The construct is done, so it is no longer concrete.
    tokenizer.concrete = false;

    State::Ok
}

/// Before eol, expecting blank line.
///
/// ```markdown
/// > | <div>
///          ^
///   |
/// ```
///
/// Wraps the current byte in a `LineEnding` enter/exit pair (consuming it in
/// between) and then moves on to the `BlankLineStart` state.
pub fn blank_line_before(tokenizer: &mut Tokenizer) -> State {
    tokenizer.enter(Name::LineEnding);
    tokenizer.consume();
    tokenizer.exit(Name::LineEnding);
    // Whether the next line really is blank is checked by the next state.
    State::Next(StateName::BlankLineStart)
}