diff options
Diffstat (limited to '')
-rw-r--r-- | src/construct/blank_line.rs | 61 | ||||
-rw-r--r-- | src/construct/character_escape.rs | 69 | ||||
-rw-r--r-- | src/construct/character_reference.rs | 237 | ||||
-rw-r--r-- | src/construct/code_fenced.rs | 581 | ||||
-rw-r--r-- | src/construct/code_indented.rs | 190 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 175 | ||||
-rw-r--r-- | src/construct/html_flow.rs | 1068 | ||||
-rw-r--r-- | src/construct/mod.rs | 11 | ||||
-rw-r--r-- | src/construct/partial_whitespace.rs | 66 | ||||
-rw-r--r-- | src/construct/thematic_break.rs | 137 |
10 files changed, 2595 insertions, 0 deletions
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs new file mode 100644 index 0000000..7b7962b --- /dev/null +++ b/src/construct/blank_line.rs @@ -0,0 +1,61 @@ +//! Blank lines are a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! blank_line ::= *(' ' | '\t') +//! ``` +//! +//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! from another paragraph. +//! In several cases, blank lines are not needed between flow constructs, +//! such as between two headings. +//! Sometimes, whether blank lines are present changes how +//! HTML is rendered, such as whether blank lines are present between list +//! items in a list. +//! More than one blank line is never needed in `CommonMark`. +//! +//! Because blank lines can be empty (line endings are not considered part of +//! them), and events cannot be empty, blank lines are not present as a token. +//! +//! ## References +//! +//! * [`blank-line.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/blank-line.js) +//! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) +//! +//! <!-- To do: link `flow`, `heading`, `list`, `paragraph` --> + +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a blank line. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace), + |_ok| Box::new(after), + )(tokenizer, code) +} + +/// After zero or more spaces or tabs, before a line ending or EOF. +/// +/// Note: `␠` represents a space character. 
+/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +fn after(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs new file mode 100644 index 0000000..5ea995e --- /dev/null +++ b/src/construct/character_escape.rs @@ -0,0 +1,69 @@ +//! Character escapes are a construct that occurs in the string and text +//! content types. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! character_escape ::= '\\' ascii_punctuation +//! ``` +//! +//! Like much of markdown, there are no “invalid” character escapes: just a +//! backslash, or a backslash followed by anything other than an ASCII punctuation +//! character, is exactly that: just a backslash. +//! To escape (most) arbitrary characters, use a +//! [character reference][] instead +//! (as in, `&amp;`, `&#123;`, or say `&#x9;`). +//! It is also possible to escape a line ending in text with a similar +//! construct: a backslash followed by a line ending (that is part of the +//! construct instead of ending it). +//! +//! ## References +//! +//! * [`character-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-escape.js) +//! * [*§ 2.4 Backslash escapes* in `CommonMark`](https://spec.commonmark.org/0.30/#backslash-escapes) +//! +//! [character reference]: crate::construct::character_reference +//! +//! <!-- To do: link `hard_break_escape`, `string`, `text` --> + +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a character escape. 
+/// +/// ```markdown +/// a|\*b +/// a|\b +/// a|\ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('\\') => { + tokenizer.enter(TokenType::CharacterEscape); + tokenizer.enter(TokenType::CharacterEscapeMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterEscapeMarker); + (State::Fn(Box::new(inside)), None) + } + _ => (State::Nok, None), + } +} + +/// Inside a character escape, after `\`. +/// +/// ```markdown +/// a\|*b +/// a\|b +/// a\| b +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char.is_ascii_punctuation() => { + tokenizer.enter(TokenType::CharacterEscapeValue); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterEscapeValue); + tokenizer.exit(TokenType::CharacterEscape); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs new file mode 100644 index 0000000..27275d5 --- /dev/null +++ b/src/construct/character_reference.rs @@ -0,0 +1,237 @@ +//! Character references are a construct that occurs in the string and text +//! content types. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! character_reference ::= '&' (numeric | named) ';' +//! +//! numeric ::= '#' (hexadecimal | decimal) +//! ; Note: Limit of `6` imposed as all bigger numbers are invalid: +//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit) +//! ; Note: Limit of `7` imposed as all bigger numbers are invalid: +//! decimal ::= 1*7(ascii_digit) +//! ; Note: Limit of `31` imposed by `CounterClockwiseContourIntegral`: +//! ; Note: Limited to any known named character reference (see `constants.rs`) +//! named ::= 1*31(ascii_alphanumeric) +//! ``` +//! +//! Like much of markdown, there are no “invalid” character references. +//! However, for security reasons, several numeric character references parse +//! 
fine but are not rendered as their corresponding character and they are +//! instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`). +//! See [`decode_numeric_character_reference`][decode_numeric] for more info. +//! +//! To escape ASCII punctuation characters, use the terser +//! [character escape][character_escape] construct instead (as in, `\&`). +//! +//! Character references in markdown are not the same as character references +//! in HTML. +//! Notably, HTML allows several character references without a closing +//! semicolon. +//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info. +//! +//! Character references are parsed insensitive to casing. +//! The casing of hexadecimal numeric character references has no effect. +//! The casing of named character references does not matter when parsing them, +//! but does affect whether they match. +//! Depending on the name, one or more cases are allowed, such as that `AMP` +//! and `amp` are both allowed but other cases are not. +//! See [`CHARACTER_REFERENCE_NAMES`][character_reference_names] for which +//! names match. +//! +//! ## References +//! +//! * [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js) +//! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +//! +//! [character_escape]: crate::construct::character_escape +//! [decode_numeric]: crate::util::decode_numeric_character_reference +//! [character_reference_names]: crate::constant::CHARACTER_REFERENCE_NAMES +//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state +//! +//! 
<!-- To do: link `string`, `text` --> + +use crate::constant::{ + CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + CHARACTER_REFERENCE_NAMED_SIZE_MAX, CHARACTER_REFERENCE_NAMES, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of a character reference. +#[derive(Debug, Clone)] +pub enum Kind { + /// Numeric decimal character reference (`&#123;`). + Decimal, + /// Numeric hexadecimal character reference (`&#x9;`). + Hexadecimal, + /// Named character reference (`&amp;`). + Named, +} + +/// State needed to parse character references. +#[derive(Debug, Clone)] +struct Info { + /// All parsed characters. + buffer: Vec<char>, + /// Kind of character reference. + kind: Kind, +} + +/// Start of a character reference. +/// +/// ```markdown +/// a|&amp;b +/// a|&#123;b +/// a|&#x9;b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('&') => { + tokenizer.enter(TokenType::CharacterReference); + tokenizer.enter(TokenType::CharacterReferenceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarker); + (State::Fn(Box::new(open)), None) + } + _ => (State::Nok, None), + } +} + +/// Inside a character reference, after `&`, before `#` for numeric references +/// or an alphanumeric for named references. +/// +/// ```markdown +/// a&|amp;b +/// a&|#123;b +/// a&|#x9;b +/// ``` +fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::Char('#') = code { + tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric); + (State::Fn(Box::new(numeric)), None) + } else { + tokenizer.enter(TokenType::CharacterReferenceValue); + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Named, + }, + ) + } +} + +/// Inside a numeric character reference, right before `x` for hexadecimals, +/// or a digit for decimals. 
+/// +/// ```markdown +/// a&#|123;b +/// a&#|x9;b +/// ``` +fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char == 'x' || char == 'X' => { + tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal); + tokenizer.enter(TokenType::CharacterReferenceValue); + + ( + State::Fn(Box::new(|tokenizer, code| { + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Hexadecimal, + }, + ) + })), + None, + ) + } + _ => { + tokenizer.enter(TokenType::CharacterReferenceValue); + + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Decimal, + }, + ) + } + } +} + +/// Inside a character reference value, after the markers (`&#x`, `&#`, or +/// `&`) that define its kind, but before the `;`. +/// The character reference kind defines what and how many characters are +/// allowed. +/// +/// ```markdown +/// a&a|mp;b +/// a&#1|23;b +/// a&#x|9;b +/// ``` +fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + match code { + Code::Char(';') if !info.buffer.is_empty() => { + tokenizer.exit(TokenType::CharacterReferenceValue); + let value = info.buffer.iter().collect::<String>(); + + if let Kind::Named = info.kind { + if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) { + return (State::Nok, Some(vec![code])); + } + } + + tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); + tokenizer.exit(TokenType::CharacterReference); + (State::Ok, None) + } + Code::Char(char) => { + let len = info.buffer.len(); + + let cont = match info.kind { + Kind::Hexadecimal + if char.is_ascii_hexdigit() + && len < CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX => + { + true + } + Kind::Decimal + if char.is_ascii_digit() && len < CHARACTER_REFERENCE_DECIMAL_SIZE_MAX => + { + true + } + Kind::Named + if 
char.is_ascii_alphanumeric() && len < CHARACTER_REFERENCE_NAMED_SIZE_MAX => + { + true + } + _ => false, + }; + + if cont { + let mut clone = info; + clone.buffer.push(char); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| value(tokenizer, code, clone))), + None, + ) + } else { + (State::Nok, None) + } + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs new file mode 100644 index 0000000..2068a62 --- /dev/null +++ b/src/construct/code_fenced.rs @@ -0,0 +1,581 @@ +//! Code (fenced) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ] +//! +//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab +//! ; Restriction: the number of markers in the closing fence sequence must be +//! ; equal to or greater than the number of markers in the opening fence +//! ; sequence. +//! ; Restriction: the marker in the closing fence sequence must match the +//! ; marker in the opening fence sequence +//! fence_close ::= sequence *space_or_tab +//! sequence ::= 3*'`' | 3*'~' +//! info ::= 1*text +//! meta ::= 1*text *( *space_or_tab 1*text ) +//! +//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the +//! ; marker of the opening fence sequence. +//! text ::= code - eol - space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! code ::= . ; any unicode code point (other than line endings). +//! ``` +//! +//! The above grammar does not show how whitespace is handled. +//! To parse code (fenced), let `X` be the number of whitespace characters +//! before the opening fence sequence. +//! Each line of content is then allowed (not required) to be indented with up +//! to `X` spaces or tabs, which are then ignored as an indent instead of being +//! considered as part of the code. +//! 
This indent does not affect the closing fence. +//! It can be indented up to a separate 3 spaces or tabs. +//! A bigger indent makes it part of the code instead of a fence. +//! +//! Code (fenced) relates to both the `<pre>` and the `<code>` elements in +//! HTML. +//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` +//! element*][html-code] in the HTML spec for more info. +//! +//! The optional `meta` part is ignored: it is not used when parsing or +//! rendering. +//! The optional `info` part is used and is expected to specify the programming +//! language that the code is in. +//! Which value it holds depends on what your syntax highlighter supports, if +//! one is used. +//! The `info` is, when rendering to HTML, typically exposed as a class. +//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code` +//! element*][html-code]). +//! For example: +//! +//! ```markdown +//! ~~~css +//! * { color: tomato } +//! ~~~ +//! ``` +//! +//! Yields: +//! +//! ```html +//! <pre><code class="language-css">* { color: tomato } +//! </code></pre> +//! ``` +//! +//! The `info` and `meta` parts are interpreted as the string content type. +//! That means that character escapes and character reference are allowed. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! It is also possible to create code with the +//! [code (indented)][code-indented] construct. +//! That construct is less explicit, different from code (text), and has no +//! support for specifying the programming language, so it is recommended to +//! use code (fenced) instead of code (indented). +//! +//! ## References +//! +//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) +//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) +//! +//! [code-indented]: crate::construct::code_indented +//! 
[html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! <!-- To do: link `flow`, `text`, `code_text`, `string` --> + +use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::get_span; + +/// Kind of fences. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { + /// Grave accent (tick) code. + GraveAccent, + /// Tilde code. + Tilde, +} + +/// State needed to parse code (fenced). +#[derive(Debug, Clone)] +struct Info { + /// Number of markers on the opening fence sequence. + size: usize, + /// Number of tabs or spaces of indentation before the opening fence + /// sequence. + prefix: usize, + /// Kind of fences. + kind: Kind, +} + +/// Start of fenced code. +/// +/// ```markdown +/// | ~~~js +/// console.log(1); +/// ~~~ +/// ``` +/// +/// Parsing note: normally, the prefix is already stripped. +/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need +/// it. +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFenced); + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(before_sequence_open), + )(tokenizer, code) +} + +/// Inside the opening fence, after an optional prefix, before a sequence. 
+/// +/// ```markdown +/// |~~~js +/// console.log(1); +/// ~~~ +/// ``` +fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + match code { + Code::Char(char) if char == '`' || char == '~' => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + sequence_open( + tokenizer, + Info { + prefix, + size: 0, + kind: if char == '`' { + Kind::GraveAccent + } else { + Kind::Tilde + }, + }, + code, + ) + } + _ => (State::Nok, None), + } +} + +/// Inside the opening fence sequence. +/// +/// ```markdown +/// ~|~~js +/// console.log(1); +/// ~~~ +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + let mut info = info; + info.size += 1; + sequence_open(tokenizer, info, code) + })), + None, + ) + } + _ => { + if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN { + (State::Nok, None) + } else { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| { + whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace) + }, + |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)), + )(tokenizer, code) + } + } + } +} + +/// Inside the opening fence, after the sequence (and optional whitespace), before the info. 
+/// +/// ```markdown +/// ~~~|js +/// console.log(1); +/// ~~~ +/// ``` +fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceInfo); + tokenizer.enter(TokenType::ChunkString); + info_inside(tokenizer, info, code, vec![]) + } + } +} + +/// Inside the opening fence info. +/// +/// ```markdown +/// ~~~j|s +/// console.log(1); +/// ~~~ +/// ``` +fn info_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, + codes: Vec<Code>, +) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)), + )(tokenizer, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + Code::Char(_) => { + let mut codes = codes; + codes.push(code); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + info_inside(tokenizer, info, code, codes) + })), + None, + ) + } + } +} + +/// Inside the opening fence, after the info and whitespace, before the meta. 
+/// +/// ```markdown +/// ~~~js |eval +/// console.log(1); +/// ~~~ +/// ``` +fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceMeta); + tokenizer.enter(TokenType::ChunkString); + meta(tokenizer, info, code) + } + } +} + +/// Inside the opening fence meta. +/// +/// ```markdown +/// ~~~js e|val +/// console.log(1); +/// ~~~ +/// ``` +fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceMeta); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))), + None, + ) + } + } +} + +/// At an eol/eof in code, before a closing fence or before content. 
+/// +/// ```markdown +/// ~~~js| +/// aa| +/// ~~~ +/// ``` +fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let clone = info.clone(); + + match code { + Code::None => after(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt( + |tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + close_before(tokenizer, info, code) + })), + None, + ) + }, + |ok| { + if ok { + Box::new(after) + } else { + Box::new(|tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + content_start(tokenizer, clone, code) + })), + None, + ) + }) + } + }, + )(tokenizer, code), + _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), + } +} + +/// Before a closing fence, before optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// | ~~~ +/// ``` +fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)), + )(tokenizer, code) +} + +/// In a closing fence, after optional whitespace, before sequence. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// |~~~ +/// ``` +fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == marker => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + close_sequence(tokenizer, info, code, 0) + } + _ => (State::Nok, None), + } +} + +/// In the closing fence sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~|~~ +/// ``` +fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + close_sequence(tokenizer, info, code, size + 1) + })), + None, + ) + } + _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(close_whitespace_after), + )(tokenizer, code) + } + _ => (State::Nok, None), + } +} + +/// After the closing fence sequence after optional whitespace. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~ | +/// ``` +fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} + +/// Before code content, definitely not before a closing fence. +/// +/// ```markdown +/// ~~~js +/// |aa +/// ~~~ +/// ``` +fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => { + tokenizer.enter(TokenType::Whitespace); + content_prefix(tokenizer, info, 0, code) + } + _ => { + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// Before code content, in a prefix. +/// +/// ```markdown +/// ~~~js +/// | aa +/// ~~~ +/// ``` +fn content_prefix( + tokenizer: &mut Tokenizer, + info: Info, + prefix: usize, + code: Code, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + content_prefix(tokenizer, info, prefix + 1, code) + })), + None, + ) + } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::Whitespace); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.exit(TokenType::Whitespace); + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// In code content. 
+/// +/// ```markdown +/// ~~~js +/// |ab +/// a|b +/// ab| +/// ~~~ +/// ``` +fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFlowChunk); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + content_continue(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// After fenced code. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::CodeFenced); + (State::Ok, Some(vec![code])) +} diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs new file mode 100644 index 0000000..6bf089b --- /dev/null +++ b/src/construct/code_indented.rs @@ -0,0 +1,190 @@ +//! Code (indented) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line ) +//! +//! ; Restriction: at least one `code` must not be whitespace. +//! indented_filled_line ::= 4space_or_tab *code +//! blank_line ::= *space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Code (indented) relates to both the `<pre>` and the `<code>` elements in +//! HTML. +//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` +//! element*][html-code] in the HTML spec for more info. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! It is also possible to create code with the [code (fenced)][code-fenced] +//! construct. +//! That construct is more explicit, more similar to code (text), and has +//! 
support for specifying the programming language that the code is in, so it +//! is recommended to use that instead of indented code. +//! +//! ## References +//! +//! * [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js) +//! * [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks) +//! +//! [code-fenced]: crate::construct::code_fenced +//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! <!-- To do: link `flow`, `code_text` --> + +use crate::constant::TAB_SIZE; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of code (indented). +/// +/// ```markdown +/// | asd +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char(' ' | '\t') => { + tokenizer.enter(TokenType::CodeIndented); + tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace); + indent(tokenizer, code, 0) + } + _ => (State::Nok, None), + } +} + +/// Inside the initial whitespace. +/// +/// ```markdown +/// | asd +/// | asd +/// | asd +/// |asd +/// ``` +/// +/// > **Parsing note**: it is not needed to check if this first line is a +/// > filled line (that it has a non-whitespace character), because blank lines +/// > are parsed already, so we never run into that. 
+fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        _ if size == TAB_SIZE => { // A full indent was consumed: the prefix is done.
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            at_break(tokenizer, code)
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    indent(tokenizer, code, size + 1)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// At a break: at the start of content, at a line ending, or at the end of the file.
+///
+/// ```markdown
+///     |asd
+///     asd|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => after(tokenizer, code),
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
+            .attempt(further_start, |ok| {
+                Box::new(if ok { at_break } else { after }) // Another indented line continues the code; otherwise it ends here.
+            })(tokenizer, code),
+        _ => {
+            tokenizer.enter(TokenType::CodeFlowChunk);
+            content(tokenizer, code)
+        }
+    }
+}
+
+/// Inside code content, consuming until a line ending or the end of the file.
+///
+/// ```markdown
+///     |ab
+///     a|b
+///     ab|
+/// ```
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::CodeFlowChunk);
+            at_break(tokenizer, code)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(content)), None)
+        }
+    }
+}
+
+/// After indented code: closes `CodeIndented` and hands the current code back unconsumed.
+///
+/// ```markdown
+///     ab|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::CodeIndented);
+    (State::Ok, Some(vec![code]))
+}
+
+/// Right at a line ending, trying to parse another indent.
+///
+/// ```markdown
+///     ab|
+///     cd
+/// ```
+fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // To do: `nok` if lazy line.
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (State::Fn(Box::new(further_start)), None)
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+            further_indent(tokenizer, code, 0)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside further whitespace: a full indent is `Ok`, a whitespace-only line loops back to `further_start`.
+///
+/// ```markdown
+///     asd
+///   |  asd
+/// ```
+fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        _ if size == TAB_SIZE => {
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            (State::Ok, Some(vec![code]))
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    further_indent(tokenizer, code, size + 1)
+                })),
+                None,
+            )
+        }
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            further_start(tokenizer, code)
+        }
+        _ => (State::Nok, None),
+    }
+}
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
new file mode 100644
index 0000000..b3aef1b
--- /dev/null
+++ b/src/construct/heading_atx.rs
@@ -0,0 +1,175 @@
+//! Heading (atx) is a construct that occurs in the flow content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab
+//!
+//! code ::= . ; any unicode code point (other than line endings).
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML.
+//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
+//! HTML spec][html] for more info.
+//!
+//! `CommonMark` introduced the requirement on whitespace existing after the
+//! opening sequence and before text.
+//! 
In older markdown versions, this was not required, and headings would form
+//! without it.
+//!
+//! In markdown, it is also possible to create headings with the setext heading
+//! construct.
+//! The benefit of setext headings is that their text can include line endings.
+//! However, their limit is that they cannot form `<h3>` through `<h6>`
+//! headings.
+//! Due to this limitation, it is recommended to use atx headings.
+//!
+//! > 🏛 **Background**: the word *setext* originates from a small markup
+//! > language by Ian Feldman from 1991.
+//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info.
+//! > The word *atx* originates from a tiny markup language by Aaron Swartz
+//! > from 2002.
+//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
+//! > more info.
+//!
+//! ## References
+//!
+//! * [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js)
+//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
+//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
+//! [atx]: http://www.aaronsw.com/2002/atx/
+//!
+//! <!-- To do: link `flow`, `setext` -->
+
+use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a heading (atx): on `#`, opens `AtxHeading` and its first sequence; anything else is `Nok`.
+///
+/// ```markdown
+/// |## alpha
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if Code::Char('#') == code {
+        tokenizer.enter(TokenType::AtxHeading);
+        tokenizer.enter(TokenType::AtxHeadingSequence);
+        sequence_open(tokenizer, code, 0) // The `#` is not consumed here, so the rank starts at zero.
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// In the opening sequence. 
+///
+/// ```markdown
+/// #|# alpha
+/// ```
+fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult {
+    match code {
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\t' | '\n' | '\r' | ' ')
+            if rank > 0 =>
+        {
+            tokenizer.exit(TokenType::AtxHeadingSequence);
+            at_break(tokenizer, code)
+        }
+        Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { // Too many `#`s fall through to `Nok`.
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    sequence_open(tokenizer, code, rank + 1)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After something but before something else: dispatches to whitespace, a `#` sequence, text, or the end.
+///
+/// ```markdown
+/// ## |alpha
+/// ## alpha| bravo
+/// ## alpha |bravo
+/// ## alpha bravo|##
+/// ## alpha bravo ##|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::AtxHeading);
+            (State::Ok, Some(vec![code]))
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.enter(TokenType::AtxHeadingWhitespace);
+            whitespace(tokenizer, code)
+        }
+        Code::Char('#') => {
+            tokenizer.enter(TokenType::AtxHeadingSequence);
+            further_sequence(tokenizer, code)
+        }
+        Code::Char(_) => {
+            tokenizer.enter(TokenType::AtxHeadingText);
+            data(tokenizer, code)
+        }
+    }
+}
+
+/// In a further sequence (after whitespace).
+/// Could be normal “visible” hashes in the heading or a final sequence.
+///
+/// ```markdown
+/// ## alpha #|#
+/// ```
+fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char('#') = code {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(further_sequence)), None)
+    } else {
+        tokenizer.exit(TokenType::AtxHeadingSequence);
+        at_break(tokenizer, code)
+    }
+}
+
+/// In whitespace. 
+///
+/// ```markdown
+/// ## alpha | bravo
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(whitespace)), None)
+        }
+        _ => {
+            tokenizer.exit(TokenType::AtxHeadingWhitespace);
+            at_break(tokenizer, code)
+        }
+    }
+}
+
+/// In text, consuming until whitespace, a line ending, or the end of the file.
+///
+/// ```markdown
+/// ## al|pha
+/// ```
+fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
+            tokenizer.exit(TokenType::AtxHeadingText);
+            at_break(tokenizer, code)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(data)), None)
+        }
+    }
+}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
new file mode 100644
index 0000000..b7d5570
--- /dev/null
+++ b/src/construct/html_flow.rs
@@ -0,0 +1,1068 @@
+//! HTML (flow) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete
+//!
+//! ; Note: closing tag name need to match opening tag name.
+//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ]
+//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ]
+//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ]
+//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ]
+//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ]
+//! basic ::= '<' [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ]
+//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional )
+//!
+//! 
raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive. +//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive. +//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>' +//! closing_tag ::= '</' tag_name whitespace_optional '>' +//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) +//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] +//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) +//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`') +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ space_or_tab ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! The grammar for HTML in markdown does not resemble the rules of parsing +//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML +//! spec][html-parsing]. +//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?) +//! attempt to parse an XML-like language. +//! By extension, another notable property of the grammar is that it can +//! result in invalid HTML, in that it allows things that wouldn’t work or +//! wouldn’t work well in HTML, such as mismatched tags. +//! +//! Because the **basic** and **complete** productions in the grammar form with +//! a tag, followed by more stuff, and stop at a blank line, it is possible to +//! interleave (a word for switching between languages) markdown and HTML +//! together, by placing the opening and closing tags on their own lines, +//! with blank lines between them and markdown. +//! For example: +//! +//! ```markdown +//! <div>This is a <code>div</code> but *this* is not emphasis.</div> +//! +//! <div> +//! +//! 
This is a paragraph in a `div` and *this* is emphasis.
+//!
+//! </div>
+//! ```
+//!
+//! The **complete** production of HTML (flow) is not allowed to interrupt
+//! content.
+//! That means that a blank line is needed between a paragraph and it.
+//! However, HTML (text) has a similar production, which will typically kick in
+//! instead.
+//!
+//! The list of tag names allowed in the **raw** production is defined in
+//! [`HTML_RAW_NAMES`][html_raw_names].
+//! This production exists because there are a few cases where markdown
+//! *inside* some elements, and hence interleaving, does not make sense.
+//!
+//! The list of tag names allowed in the **basic** production is defined in
+//! [`HTML_BLOCK_NAMES`][html_block_names].
+//! This production exists because there are a few cases where we can decide
+//! early that something is going to be a flow (block) element instead of a
+//! phrasing (inline) element.
+//! We *can* interrupt and don’t have to care too much about it being
+//! well-formed.
+//!
+//! ## References
+//!
+//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js)
+//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+//!
+//! [html_raw_names]: crate::constant::HTML_RAW_NAMES
+//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
+//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
+//!
+//! <!-- To do: link stuff -->
+
+use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX};
+use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of HTML (flow).
+#[derive(Debug, Clone, PartialEq)]
+enum Kind {
+    /// Not yet known.
+    Unknown,
+    /// Symbol for `<script>` (condition 1).
+    Raw,
+    /// Symbol for `<!---->` (condition 2).
+    Comment,
+    /// Symbol for `<?php?>` (condition 3).
+    Instruction,
+    /// Symbol for `<!doctype>` (condition 4).
+    Declaration,
+    /// Symbol for `<![CDATA[]]>` (condition 5).
+    Cdata,
+    /// Symbol for `<div` (condition 6).
+    Basic,
+    /// Symbol for `<x>` (condition 7).
+    Complete,
+}
+
+/// Type of quote, if we’re in an attribute, in complete (condition 7).
+#[derive(Debug, Clone, PartialEq)]
+enum QuoteKind {
+    /// Not in a quoted attribute.
+    None,
+    /// In a double quoted (`"`) attribute.
+    Double,
+    /// In a single quoted (`'`) attribute.
+    Single,
+}
+
+/// State needed to parse HTML (flow).
+#[derive(Debug, Clone)]
+struct Info {
+    /// Kind of HTML (flow).
+    kind: Kind,
+    /// Whether this is a start tag (`<` not followed by `/`).
+    start_tag: bool,
+    /// Used depending on `kind` to either collect all parsed characters, or to
+    /// store expected characters.
+    buffer: Vec<char>,
+    /// `index` into `buffer` when expecting certain characters.
+    index: usize,
+    /// Current quote, when in a double or single quoted attribute value.
+    quote: QuoteKind,
+}
+
+// To do: mark as concrete (block quotes or lists can’t “pierce” into HTML).
+
+/// Start of HTML (flow), before optional whitespace.
+///
+/// ```markdown
+/// |<x />
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.enter(TokenType::HtmlFlow);
+    tokenizer.enter(TokenType::HtmlFlowData);
+    tokenizer.attempt(
+        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+        |_ok| Box::new(before), // Whitespace is optional: continue at `before` either way.
+    )(tokenizer, code)
+}
+
+/// After optional whitespace, before `<`. 
+///
+/// ```markdown
+/// |<x />
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if Code::Char('<') == code {
+        tokenizer.consume(code);
+        (
+            State::Fn(Box::new(|tokenizer, code| {
+                open(
+                    tokenizer,
+                    Info { // Fresh parse state; the kind is decided by what follows `<`.
+                        kind: Kind::Unknown,
+                        start_tag: false,
+                        buffer: vec![],
+                        index: 0,
+                        quote: QuoteKind::None,
+                    },
+                    code,
+                )
+            })),
+            None,
+        )
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// After `<`, before a tag name or other stuff (`!`, `/`, or `?`).
+///
+/// ```markdown
+/// <|x />
+/// <|!doctype />
+/// <|!--xxx--/>
+/// ```
+fn open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('!') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    declaration_start(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('/') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    tag_close_start(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('?') => {
+            // To do: life times.
+            let mut clone = info;
+            clone.kind = Kind::Instruction;
+            tokenizer.consume(code);
+            // While we’re in an instruction instead of a declaration, we’re on a `?`
+            // right now, so we do need to search for `>`, similar to declarations.
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_declaration_inside(tokenizer, clone, code)
+                })),
+                None,
+            )
+        }
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            // To do: life times.
+            let mut clone = info;
+            clone.start_tag = true;
+            tag_name(tokenizer, clone, code) // Not consumed here: `tag_name` consumes and buffers the letter.
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!`, so inside a declaration, comment, or CDATA. 
+///
+/// ```markdown
+/// <!|doctype />
+/// <!|--xxx--/>
+/// <!|[CDATA[>&<]]>
+/// ```
+fn declaration_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            let mut clone = info;
+            clone.kind = Kind::Comment;
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    comment_open_inside(tokenizer, clone, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('[') => {
+            tokenizer.consume(code);
+            let mut clone = info;
+            clone.kind = Kind::Cdata;
+            clone.buffer = vec!['C', 'D', 'A', 'T', 'A', '[']; // The characters that must follow `<![`.
+            clone.index = 0;
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    cdata_open_inside(tokenizer, clone, code)
+                })),
+                None,
+            )
+        }
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            // To do: life times.
+            let mut clone = info;
+            clone.kind = Kind::Declaration;
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_declaration_inside(tokenizer, clone, code)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<!-`, inside a comment, before another `-`.
+///
+/// ```markdown
+/// <!-|-xxx--/>
+/// ```
+fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_declaration_inside(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<![`, inside CDATA, expecting `CDATA[`. 
+///
+/// ```markdown
+/// <![|CDATA[>&<]]>
+/// <![CD|ATA[>&<]]>
+/// <![CDA|TA[>&<]]>
+/// <![CDAT|A[>&<]]>
+/// <![CDATA|[>&<]]>
+/// ```
+fn cdata_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == info.buffer[info.index] => {
+            let mut clone = info;
+            clone.index += 1;
+            tokenizer.consume(code);
+
+            if clone.index == clone.buffer.len() { // Every expected character matched.
+                clone.buffer.clear();
+                (
+                    State::Fn(Box::new(|tokenizer, code| {
+                        continuation(tokenizer, clone, code)
+                    })),
+                    None,
+                )
+            } else {
+                (
+                    State::Fn(Box::new(|tokenizer, code| {
+                        cdata_open_inside(tokenizer, clone, code)
+                    })),
+                    None,
+                )
+            }
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `</`, in a closing tag, before a tag name.
+///
+/// ```markdown
+/// </|x>
+/// ```
+fn tag_close_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            // To do: life times.
+            let mut clone = info;
+            clone.buffer.push(char);
+            (
+                State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// In a tag name; at its end, the lowercased name decides between raw, basic, and complete.
+///
+/// ```markdown
+/// <a|b>
+/// </a|b>
+/// ```
+fn tag_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
+            let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
+            let name = tag_name_buffer.as_str();
+            let slash = if let Code::Char(char) = code {
+                char == '/'
+            } else {
+                false
+            };
+
+            if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) {
+                // To do: life times.
+                let mut clone = info;
+                clone.kind = Kind::Raw;
+                clone.buffer.clear();
+                continuation(tokenizer, clone, code)
+            } else if HTML_BLOCK_NAMES.contains(&name) {
+                // To do: life times.
+                let mut clone = info;
+                clone.kind = Kind::Basic;
+                clone.buffer.clear();
+
+                if slash {
+                    tokenizer.consume(code);
+                    (
+                        State::Fn(Box::new(|tokenizer, code| {
+                            basic_self_closing(tokenizer, clone, code)
+                        })),
+                        None,
+                    )
+                } else {
+                    continuation(tokenizer, clone, code)
+                }
+            } else {
+                // To do: life times.
+                let mut clone = info;
+                clone.kind = Kind::Complete;
+
+                // To do: do not support complete HTML when interrupting.
+                if clone.start_tag {
+                    complete_attribute_name_before(tokenizer, clone, code)
+                } else {
+                    complete_closing_tag_after(tokenizer, clone, code)
+                }
+            }
+        }
+        Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+            tokenizer.consume(code);
+            let mut clone = info;
+            clone.buffer.push(char);
+            (
+                State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))),
+                None,
+            )
+        }
+        Code::Char(_) => (State::Nok, None),
+    }
+}
+
+/// After a closing slash of a basic tag name.
+///
+/// ```markdown
+/// <div/|>
+/// ```
+fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After the name of a complete closing tag: optional whitespace, then `>`.
+///
+/// ```markdown
+/// </x|>
+/// </x |>
+/// ```
+fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_closing_tag_after(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => complete_end(tokenizer, info, code),
+    }
+}
+
+/// At a place where an attribute name would be valid.
+///
+/// At first, this state is used after a complete tag name, after whitespace,
+/// where it expects optional attributes or the end of the tag. 
+/// It is also reused after attributes, when expecting more optional
+/// attributes.
+///
+/// ```markdown
+/// <x |/>
+/// <x |:asd>
+/// <x |_asd>
+/// <x |asd>
+/// <x | >
+/// <x |>
+/// ```
+fn complete_attribute_name_before(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        Code::Char('/') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_end(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_name(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_name_before(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => complete_end(tokenizer, info, code), // Only `>` can still be valid here.
+    }
+}
+
+/// In an attribute name.
+///
+/// ```markdown
+/// <x :|>
+/// <x _|>
+/// <x a|>
+/// ```
+fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char)
+            if char == '-'
+                || char == '.'
+                || char == ':'
+                || char == '_'
+                || char.is_ascii_alphanumeric() =>
+        {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_name(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => complete_attribute_name_after(tokenizer, info, code),
+    }
+}
+
+/// After an attribute name, before an attribute initializer, the end of the
+/// tag, or whitespace. 
+///
+/// ```markdown
+/// <x a|>
+/// <x a|=b>
+/// <x a|="c">
+/// ```
+fn complete_attribute_name_after(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        Code::Char('=') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_value_before(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_name_after(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => complete_attribute_name_before(tokenizer, info, code), // No value: expect another attribute or the tag end.
+    }
+}
+
+/// Before an unquoted, double quoted, or single quoted attribute value,
+/// allowing whitespace.
+///
+/// ```markdown
+/// <x a=|b>
+/// <x a=|"c">
+/// ```
+fn complete_attribute_value_before(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
+        Code::Char(char) if char == '"' || char == '\'' => {
+            tokenizer.consume(code);
+            // To do: life times.
+            let mut clone = info;
+            clone.quote = if char == '"' {
+                QuoteKind::Double
+            } else {
+                QuoteKind::Single
+            };
+
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_value_quoted(tokenizer, clone, code)
+                })),
+                None,
+            )
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_value_before(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => complete_attribute_value_unquoted(tokenizer, info, code),
+    }
+}
+
+/// In a double or single quoted attribute value. 
+///
+/// ```markdown
+/// <x a="|">
+/// <x a='|'>
+/// ```
+fn complete_attribute_value_quoted(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    let marker = if info.quote == QuoteKind::Double {
+        '"'
+    } else {
+        '\''
+    };
+
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None),
+        Code::Char(char) if char == marker => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_value_quoted_after(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_value_quoted(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+    }
+}
+
+/// In an unquoted attribute value.
+///
+/// ```markdown
+/// <x a=b|c>
+/// ```
+fn complete_attribute_value_unquoted(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
+            complete_attribute_name_after(tokenizer, info, code) // The value ended; this code is not consumed here.
+        }
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_attribute_value_unquoted(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+    }
+}
+
+/// After a double or single quoted attribute value, before whitespace or the
+/// end of the tag.
+///
+/// ```markdown
+/// <x a="b"|>
+/// ```
+fn complete_attribute_value_quoted_after(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
+            complete_attribute_name_before(tokenizer, info, code)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// In certain circumstances of a complete tag where only an `>` is allowed. 
+///
+/// ```markdown
+/// <x a="b"|>
+/// ```
+fn complete_end(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_after(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `>` in a complete tag: only whitespace may follow on the same line.
+///
+/// ```markdown
+/// <x>|
+/// ```
+fn complete_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            continuation(tokenizer, info, code)
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    complete_after(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char(_) => (State::Nok, None),
+    }
+}
+
+/// Inside continuation of any HTML kind; each kind watches for the character that may start its end.
+///
+/// ```markdown
+/// <!--x|xx-->
+/// ```
+fn continuation(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('-') if info.kind == Kind::Comment => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_comment_inside(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('<') if info.kind == Kind::Raw => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_raw_tag_open(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('>') if info.kind == Kind::Declaration => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_close(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('?') if info.kind == Kind::Instruction => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_declaration_inside(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char(']') if info.kind == Kind::Cdata => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_character_data_inside(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+            if info.kind == Kind::Basic || info.kind == Kind::Complete =>
+        {
+            let clone = info;
+
+            tokenizer.check(blank_line_before, |ok| { // Basic and complete HTML end at a blank line.
+                if ok {
+                    Box::new(|tokenizer, code| continuation_close(tokenizer, clone, code))
+                } else {
+                    Box::new(|tokenizer, code| continuation_at_line_ending(tokenizer, clone, code))
+                }
+            })(tokenizer, code)
+        }
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            continuation_at_line_ending(tokenizer, info, code)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+    }
+}
+
+/// In continuation, before an eol or eof.
+///
+/// ```markdown
+/// <x>|
+/// ```
+fn continuation_at_line_ending(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::HtmlFlowData);
+    html_continue_start(tokenizer, info, code)
+}
+
+/// In continuation, at an eol or right after one.
+///
+/// ```markdown
+/// <x>|
+/// asd
+/// ```
+fn html_continue_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::None => {
+            tokenizer.exit(TokenType::HtmlFlow);
+            (State::Ok, Some(vec![code]))
+        }
+        // To do: do not allow lazy lines.
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    html_continue_start(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.enter(TokenType::HtmlFlowData);
+            continuation(tokenizer, info, code)
+        }
+    }
+}
+
+/// In comment continuation, after one `-`, expecting another. 
+/// +/// ```markdown +/// <!--xxx-|-> +/// ``` +fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In raw continuation, after `<`, expecting a `/`. +/// +/// ```markdown +/// <script>console.log(1)<|/script> +/// ``` +fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('/') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_end_tag(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In raw continuation, after `</`, expecting or inside a raw tag name. +/// +/// ```markdown +/// <script>console.log(1)</|script> +/// <script>console.log(1)</s|cript> +/// <script>console.log(1)</script|> +/// ``` +fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); + // To do: life times. + let mut clone = info; + clone.buffer.clear(); + + if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, clone, code) + })), + None, + ) + } else { + continuation(tokenizer, clone, code) + } + } + Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => { + tokenizer.consume(code); + // To do: life times. 
+ let mut clone = info; + clone.buffer.push(char); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_end_tag(tokenizer, clone, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In cdata continuation, after `]`, expecting `]>`. +/// +/// ```markdown +/// <![CDATA[>&<]|]> +/// ``` +fn continuation_character_data_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char(']') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In declaration or instruction continuation, waiting for `>` to close it. +/// +/// ```markdown +/// <!--|> +/// <?ab?|> +/// <?|> +/// <!q|> +/// <!--ab--|> +/// <!--ab--|-> +/// <!--ab---|> +/// <![CDATA[>&<]]|> +/// ``` +fn continuation_declaration_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, info, code) + })), + None, + ) + } + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In closed continuation: everything we get until the eol/eof is part of it. 
+///
+/// ```markdown
+/// <!doctype>|
+/// ```
+fn continuation_close(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    let at_eol_or_eof = matches!(
+        code,
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+    );
+
+    if at_eol_or_eof {
+        // Done: close the data and the HTML itself, and return the eol/eof
+        // alongside `State::Ok` instead of consuming it.
+        tokenizer.exit(TokenType::HtmlFlowData);
+        tokenizer.exit(TokenType::HtmlFlow);
+        return (State::Ok, Some(vec![code]));
+    }
+
+    // Anything else on the line still belongs to the HTML.
+    tokenizer.consume(code);
+    (
+        State::Fn(Box::new(|tokenizer, code| {
+            continuation_close(tokenizer, info, code)
+        })),
+        None,
+    )
+}
+
+/// Before a line ending, expecting a blank line.
+///
+/// The current code is consumed as a `LineEnding` token, after which
+/// `blank_line` takes over.
+///
+/// ```markdown
+/// <div>|
+///
+/// ```
+fn blank_line_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // The line ending gets a token of its own.
+    tokenizer.enter(TokenType::LineEnding);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::LineEnding);
+
+    (State::Fn(Box::new(blank_line)), None)
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
new file mode 100644
index 0000000..d671db6
--- /dev/null
+++ b/src/construct/mod.rs
@@ -0,0 +1,11 @@
+//! Constructs found in markdown.
+
+pub mod blank_line;
+pub mod character_escape;
+pub mod character_reference;
+pub mod code_fenced;
+pub mod code_indented;
+pub mod heading_atx;
+pub mod html_flow;
+pub mod partial_whitespace;
+pub mod thematic_break;
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
new file mode 100644
index 0000000..dd0d2b5
--- /dev/null
+++ b/src/construct/partial_whitespace.rs
@@ -0,0 +1,66 @@
+//! A little helper to parse `space_or_tab`.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! space_or_tab ::= 1*(' ' | '\t')
+//! ```
+//!
+//! Depending on where whitespace can occur, it can be optional (or not),
+//! and present in the rendered result (or not).
+//!
+//! ## References
+//!
+//! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js)
+//!
+//!
<!-- To do: link stuff --> + +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +// To do: should `token_type` be a `Some`, with `None` defaulting to something? +// To do: should `max: Some(usize)` be added? + +/// Before whitespace. +/// +/// ```markdown +/// alpha| bravo +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') => { + // To do: lifetimes. + let clone = token_type.clone(); + tokenizer.enter(token_type); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| inside(tokenizer, code, clone))), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// In whitespace. +/// +/// ```markdown +/// alpha |bravo +/// alpha | bravo +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + inside(tokenizer, code, token_type) + })), + None, + ) + } + _ => { + tokenizer.exit(token_type); + (State::Ok, Some(vec![code])) + } + } +} diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs new file mode 100644 index 0000000..15ebac7 --- /dev/null +++ b/src/construct/thematic_break.rs @@ -0,0 +1,137 @@ +//! Thematic breaks, sometimes called horizontal rules, are a construct that +//! occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! ; Restriction: all markers must be identical. +//! ; Restriction: at least 3 markers must be used. +//! thematic_break ::= *space_or_tab 1*(1*marker *space_or_tab) +//! +//! space_or_tab ::= ' ' | '\t' +//! marker ::= '*' | '-' | '_' +//! ``` +//! +//! Thematic breaks in markdown typically relate to the HTML element `<hr>`. +//! See [*§ 4.4.2 The `hr` element* in the HTML spec][html] for more info. +//! +//! 
It is recommended to use exactly three asterisks without whitespace when
+//! writing markdown.
+//! As using more than three markers has no effect other than wasting space,
+//! it is recommended to use exactly three markers.
+//! Thematic breaks formed with asterisks or dashes can interfere with lists
+//! if there is whitespace between them: `* * *` and `- - -`.
+//! For these reasons, it is recommended to not use spaces or tabs between the
+//! markers.
+//! Thematic breaks formed with dashes (without whitespace) can also form
+//! setext headings.
+//! As dashes and underscores frequently occur in natural language and URLs, it
+//! is recommended to use asterisks for thematic breaks to distinguish from
+//! such use.
+//! Because asterisks can be used to form the most markdown constructs, using
+//! them has the added benefit of making it easier to gloss over markdown: you
+//! can look for asterisks to find syntax while not worrying about other
+//! characters.
+//!
+//! ## References
+//!
+//! * [`thematic-break.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/thematic-break.js)
+//! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element
+//!
+//! <!-- To do: link `flow` -->
+
+use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a thematic break.
+///
+/// Only `*`, `-`, and `_` can start one: the `ThematicBreak` token is entered
+/// and the character found becomes the marker that `at_break` is called with,
+/// together with a count of zero.
+/// Any other code results in `State::Nok`.
+///
+/// ```markdown
+/// |***
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char(marker @ ('*' | '-' | '_')) = code {
+        tokenizer.enter(TokenType::ThematicBreak);
+        // Nothing is consumed yet: `at_break` sees the same code again.
+        at_break(tokenizer, code, marker, 0)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// At the start, or after a sequence of markers or whitespace: expecting more markers, whitespace, or the end of the line.
+///
+/// A marker enters a `ThematicBreakSequence` token and hands over to
+/// `sequence`; a space, tab, or virtual space enters a
+/// `ThematicBreakWhitespace` token and hands over to `whitespace`.
+/// At an eol/eof, the `ThematicBreak` token is exited if at least
+/// `THEMATIC_BREAK_MARKER_COUNT_MIN` markers were seen.
+/// Everything else results in `State::Nok`.
+///
+/// ```markdown
+/// |***
+/// *| * *
+/// * |* *
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    if matches!(code, Code::Char(char) if char == marker) {
+        tokenizer.enter(TokenType::ThematicBreakSequence);
+        return sequence(tokenizer, code, marker, size);
+    }
+
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.enter(TokenType::ThematicBreakWhitespace);
+            whitespace(tokenizer, code, marker, size)
+        }
+        // The line is over: it is a thematic break if it had enough markers.
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+            if size >= THEMATIC_BREAK_MARKER_COUNT_MIN =>
+        {
+            tokenizer.exit(TokenType::ThematicBreak);
+            (State::Ok, Some(vec![code]))
+        }
+        // Too few markers, or something that is neither marker nor whitespace.
+        _ => (State::Nok, None),
+    }
+}
+
+/// In a sequence of markers.
+///
+/// Every marker is consumed and counted in `size`.
+/// The first other code exits the `ThematicBreakSequence` token and goes back
+/// to `at_break` with the count so far.
+///
+/// ```markdown
+/// |***
+/// *|**
+/// **|*
+/// ```
+fn sequence(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    if matches!(code, Code::Char(char) if char == marker) {
+        tokenizer.consume(code);
+        return (
+            State::Fn(Box::new(move |tokenizer, code| {
+                sequence(tokenizer, code, marker, size + 1)
+            })),
+            None,
+        );
+    }
+
+    tokenizer.exit(TokenType::ThematicBreakSequence);
+    at_break(tokenizer, code, marker, size)
+}
+
+/// In whitespace.
+///
+/// Spaces, tabs, and virtual spaces are consumed without changing the marker
+/// count.
+/// The first other code exits the `ThematicBreakWhitespace` token and goes
+/// back to `at_break`.
+///
+/// ```markdown
+/// * |* *
+/// * | * *
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    if matches!(code, Code::VirtualSpace | Code::Char('\t' | ' ')) {
+        tokenizer.consume(code);
+        return (
+            State::Fn(Box::new(move |tokenizer, code| {
+                whitespace(tokenizer, code, marker, size)
+            })),
+            None,
+        );
+    }
+
+    tokenizer.exit(TokenType::ThematicBreakWhitespace);
+    at_break(tokenizer, code, marker, size)
+}
|