From 4c06c8554c35887f8f5147783953b2b7e7c2327f Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 8 Jun 2022 15:52:16 +0200 Subject: . --- src/construct/html_flow.rs | 1068 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1068 insertions(+) create mode 100644 src/construct/html_flow.rs (limited to 'src/construct/html_flow.rs') diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs new file mode 100644 index 0000000..b7d5570 --- /dev/null +++ b/src/construct/html_flow.rs @@ -0,0 +1,1068 @@ +//! HTML (flow) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete +//! +//! ; Note: closing tag name needs to match opening tag name. +//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ] +//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ] +//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ] +//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ] +//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ] +//! basic ::= '<' [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ] +//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional ) +//! +//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive. +//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive. +//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>' +//! closing_tag ::= '</' tag_name whitespace_optional '>' +//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) +//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] +//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) +//! 
attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`') +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ space_or_tab ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! The grammar for HTML in markdown does not resemble the rules of parsing +//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML +//! spec][html-parsing]. +//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?) +//! attempt to parse an XML-like language. +//! By extension, another notable property of the grammar is that it can +//! result in invalid HTML, in that it allows things that wouldn’t work or +//! wouldn’t work well in HTML, such as mismatched tags. +//! +//! Because the **basic** and **complete** productions in the grammar form with +//! a tag, followed by more stuff, and stop at a blank line, it is possible to +//! interleave (a word for switching between languages) markdown and HTML +//! together, by placing the opening and closing tags on their own lines, +//! with blank lines between them and markdown. +//! For example: +//! +//! ```markdown +//!
<div>This is a div but *this* is not emphasis.</div>
+//! +//! <div>
+//! +//! This is a paragraph in a `div` and *this* is emphasis. +//! +//! </div>
+//! ``` +//! +//! The **complete** production of HTML (flow) is not allowed to interrupt +//! content. +//! That means that a blank line is needed between a paragraph and it. +//! However, HTML (text) has a similar production, which will typically kick-in +//! instead. +//! +//! The list of tag names allowed in the **raw** production are defined in +//! [`HTML_RAW_NAMES`][html_raw_names]. +//! This production exists because there are a few cases where markdown +//! *inside* some elements, and hence interleaving, does not make sense. +//! +//! The list of tag names allowed in the **basic** production are defined in +//! [`HTML_BLOCK_NAMES`][html_block_names]. +//! This production exists because there are a few cases where we can decide +//! early that something is going to be a flow (block) element instead of a +//! phrasing (inline) element. +//! We *can* interrupt and don’t have to care too much about it being +//! well-formed. +//! +//! ## References +//! +//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js) +//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) +//! +//! [html_raw_names]: crate::constant::HTML_RAW_NAMES +//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES +//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +//! +//! + +use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; +use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of HTML (flow). +#[derive(Debug, Clone, PartialEq)] +enum Kind { + /// Not yet known. + Unknown, + /// Symbol for `