From 4c06c8554c35887f8f5147783953b2b7e7c2327f Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 8 Jun 2022 15:52:16 +0200 Subject: . --- src/construct/blank_line.rs | 61 ++ src/construct/character_escape.rs | 69 +++ src/construct/character_reference.rs | 237 ++++++++ src/construct/code_fenced.rs | 581 ++++++++++++++++++ src/construct/code_indented.rs | 190 ++++++ src/construct/heading_atx.rs | 175 ++++++ src/construct/html_flow.rs | 1068 ++++++++++++++++++++++++++++++++++ src/construct/mod.rs | 11 + src/construct/partial_whitespace.rs | 66 +++ src/construct/thematic_break.rs | 137 +++++ 10 files changed, 2595 insertions(+) create mode 100644 src/construct/blank_line.rs create mode 100644 src/construct/character_escape.rs create mode 100644 src/construct/character_reference.rs create mode 100644 src/construct/code_fenced.rs create mode 100644 src/construct/code_indented.rs create mode 100644 src/construct/heading_atx.rs create mode 100644 src/construct/html_flow.rs create mode 100644 src/construct/mod.rs create mode 100644 src/construct/partial_whitespace.rs create mode 100644 src/construct/thematic_break.rs (limited to 'src/construct') diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs new file mode 100644 index 0000000..7b7962b --- /dev/null +++ b/src/construct/blank_line.rs @@ -0,0 +1,61 @@ +//! Blank lines are a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! blank_line ::= *(' ' '\t') +//! ``` +//! +//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! from another paragraph. +//! In several cases, blank lines are not needed between flow constructs, +//! such as between two headings. +//! Sometimes, whether blank lines are present, changes the behavior of how +//! HTML is rendered, such as whether blank lines are present between list +//! items in a list. +//! More than one blank line is never needed in `CommonMark`. +//! +//! 
Because blank lines can be empty (line endings are not considered part of +//! it), and events cannot be empty, blank lines are not present as a token. +//! +//! ## References +//! +//! * [`blank-line.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/blank-line.js) +//! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) +//! +//! + +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a blank line. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace), + |_ok| Box::new(after), + )(tokenizer, code) +} + +/// After zero or more spaces or tabs, before a line ending or EOF. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +fn after(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs new file mode 100644 index 0000000..5ea995e --- /dev/null +++ b/src/construct/character_escape.rs @@ -0,0 +1,69 @@ +//! Character escapes are a construct that occurs in the string and text +//! content types. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! character_escape ::= '\\' ascii_punctuation +//! ``` +//! +//! Like much of markdown, there are no “invalid” character escapes: just a +//! slash, or a slash followed by anything other than an ASCII punctuation +//! character, is exactly that: just a slash. +//! To escape (most) arbitrary characters, use a +//! 
[character reference][] instead +//! (as in, `&`, `{`, or say ` `). +//! It is also possible to escape a line ending in text with a similar +//! construct: a backslash followed by a line ending (that is part of the +//! construct instead of ending it). +//! +//! ## References +//! +//! * [`character-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-escape.js) +//! * [*§ 2.4 Backslash escapes* in `CommonMark`](https://spec.commonmark.org/0.30/#backslash-escapes) +//! +//! [character reference]: crate::construct::character_reference +//! +//! + +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a character escape. +/// +/// ```markdown +/// a|\*b +/// a|\b +/// a|\ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('\\') => { + tokenizer.enter(TokenType::CharacterEscape); + tokenizer.enter(TokenType::CharacterEscapeMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterEscapeMarker); + (State::Fn(Box::new(inside)), None) + } + _ => (State::Nok, None), + } +} + +/// Inside a character escape, after `\`. +/// +/// ```markdown +/// a\|*b +/// a\|b +/// a\| b +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char.is_ascii_punctuation() => { + tokenizer.enter(TokenType::CharacterEscapeValue); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterEscapeValue); + tokenizer.exit(TokenType::CharacterEscape); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs new file mode 100644 index 0000000..27275d5 --- /dev/null +++ b/src/construct/character_reference.rs @@ -0,0 +1,237 @@ +//! Character references are a construct that occurs in the string and text +//! content types. +//! +//! 
They’re formed with the following BNF: +//! +//! ```bnf +//! character_reference ::= '&' (numeric | named) ';' +//! +//! numeric ::= '#' (hexadecimal | decimal) +//! ; Note: Limit of `6` imposed as all bigger numbers are invalid: +//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit) +//! ; Note: Limit of `7` imposed as all bigger numbers are invalid: +//! decimal ::= 1*7(ascii_digit) +//! ; Note: Limit of `31` imposed by `CounterClockwiseContourIntegral`: +//! ; Note: Limited to any known named character reference (see `constants.rs`) +//! named ::= 1*31(ascii_alphanumeric) +//! ``` +//! +//! Like much of markdown, there are no “invalid” character references. +//! However, for security reasons, several numeric character references parse +//! fine but are not rendered as their corresponding character and they are +//! instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`). +//! See [`decode_numeric_character_reference`][decode_numeric] for more info. +//! +//! To escape ASCII punctuation characters, use the terser +//! [character escape][character_escape] construct instead (as in, `\&`). +//! +//! Character references in markdown are not the same as character references +//! in HTML. +//! Notably, HTML allows several character references without a closing +//! semicolon. +//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info. +//! +//! Character references are parsed insensitive to casing. +//! The casing of hexadecimal numeric character references has no effect. +//! The casing of named character references does not matter when parsing them, +//! but does affect whether they match. +//! Depending on the name, one or more cases are allowed, such as that `AMP` +//! and `amp` are both allowed but other cases are not. +//! See [`CHARACTER_REFERENCE_NAMES`][character_reference_names] for which +//! names match. +//! +//! ## References +//! +//! 
* [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js) +//! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +//! +//! [character_escape]: crate::construct::character_reference +//! [decode_numeric]: crate::util::decode_numeric_character_reference +//! [character_reference_names]: crate::constant::CHARACTER_REFERENCE_NAMES +//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state +//! +//! + +use crate::constant::{ + CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + CHARACTER_REFERENCE_NAMED_SIZE_MAX, CHARACTER_REFERENCE_NAMES, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of a character reference. +#[derive(Debug, Clone)] +pub enum Kind { + /// Numeric decimal character reference (` `). + Decimal, + /// Numeric hexadecimal character reference (`{`). + Hexadecimal, + /// Named character reference (`&`). + Named, +} + +/// State needed to parse character references. +#[derive(Debug, Clone)] +struct Info { + /// All parsed characters. + buffer: Vec, + /// Kind of character reference. + kind: Kind, +} + +/// Start of a character reference. +/// +/// ```markdown +/// a|&b +/// a|{b +/// a| b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('&') => { + tokenizer.enter(TokenType::CharacterReference); + tokenizer.enter(TokenType::CharacterReferenceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarker); + (State::Fn(Box::new(open)), None) + } + _ => (State::Nok, None), + } +} + +/// Inside a character reference, after `&`, before `#` for numeric references +/// or an alphanumeric for named references. 
+/// +/// ```markdown +/// a&|amp;b +/// a&|#123;b +/// a&|#x9;b +/// ``` +fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::Char('#') = code { + tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric); + (State::Fn(Box::new(numeric)), None) + } else { + tokenizer.enter(TokenType::CharacterReferenceValue); + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Named, + }, + ) + } +} + +/// Inside a numeric character reference, right before `x` for hexadecimals, +/// or a digit for decimals. +/// +/// ```markdown +/// a&#|123;b +/// a&#|x9;b +/// ``` +fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char == 'x' || char == 'X' => { + tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal); + tokenizer.enter(TokenType::CharacterReferenceValue); + + ( + State::Fn(Box::new(|tokenizer, code| { + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Hexadecimal, + }, + ) + })), + None, + ) + } + _ => { + tokenizer.enter(TokenType::CharacterReferenceValue); + + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Decimal, + }, + ) + } + } +} + +/// Inside a character reference value, after the markers (`&#x`, `&#`, or +/// `&`) that define its kind, but before the `;`. +/// The character reference kind defines what and how many characters are +/// allowed. 
+/// +/// ```markdown +/// a&a|mp;b +/// a|23;b +/// a&#x|9;b +/// ``` +fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + match code { + Code::Char(';') if !info.buffer.is_empty() => { + tokenizer.exit(TokenType::CharacterReferenceValue); + let value = info.buffer.iter().collect::(); + + if let Kind::Named = info.kind { + if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) { + return (State::Nok, Some(vec![code])); + } + } + + tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); + tokenizer.exit(TokenType::CharacterReference); + (State::Ok, None) + } + Code::Char(char) => { + let len = info.buffer.len(); + + let cont = match info.kind { + Kind::Hexadecimal + if char.is_ascii_hexdigit() + && len < CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX => + { + true + } + Kind::Decimal + if char.is_ascii_digit() && len < CHARACTER_REFERENCE_DECIMAL_SIZE_MAX => + { + true + } + Kind::Named + if char.is_ascii_alphanumeric() && len < CHARACTER_REFERENCE_NAMED_SIZE_MAX => + { + true + } + _ => false, + }; + + if cont { + let mut clone = info; + clone.buffer.push(char); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| value(tokenizer, code, clone))), + None, + ) + } else { + (State::Nok, None) + } + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs new file mode 100644 index 0000000..2068a62 --- /dev/null +++ b/src/construct/code_fenced.rs @@ -0,0 +1,581 @@ +//! Code (fenced) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ] +//! +//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab +//! ; Restriction: the number of markers in the closing fence sequence must be +//! 
; equal to or greater than the number of markers in the opening fence +//! ; sequence. +//! ; Restriction: the marker in the closing fence sequence must match the +//! ; marker in the opening fence sequence +//! fence_close ::= sequence *space_or_tab +//! sequence ::= 3*'`' | 3*'~' +//! info ::= 1*text +//! meta ::= 1*text *( *space_or_tab 1*text ) +//! +//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the +//! ; marker of the opening fence sequence. +//! text ::= code - eol - space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! code ::= . ; any unicode code point (other than line endings). +//! ``` +//! +//! The above grammar does not show how whitespace is handled. +//! To parse code (fenced), let `X` be the number of whitespace characters +//! before the opening fence sequence. +//! Each line of content is then allowed (not required) to be indented with up +//! to `X` spaces or tabs, which are then ignored as an indent instead of being +//! considered as part of the code. +//! This indent does not affect the closing fence. +//! It can be indented up to a separate 3 spaces or tabs. +//! A bigger indent makes it part of the code instead of a fence. +//! +//! Code (fenced) relates to both the `
<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! The optional `meta` part is ignored: it is not used when parsing or
+//! rendering.
+//! The optional `info` part is used and is expected to specify the programming
+//! language that the code is in.
+//! Which value it holds depends on what your syntax highlighter supports, if
+//! one is used.
+//! The `info` is, when rendering to HTML, typically exposed as a class.
+//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
+//! element*][html-code]).
+//! For example:
+//!
+//! ```markdown
+//! ~~~css
+//! * { color: tomato }
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code class="language-css">* { color: tomato }
+//! </code></pre>
+//! ``` +//! +//! The `info` and `meta` parts are interpreted as the string content type. +//! That means that character escapes and character reference are allowed. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! It is also possible to create code with the +//! [code (indented)][code-indented] construct. +//! That construct is less explicit, different from code (text), and has no +//! support for specifying the programming language, so it is recommended to +//! use code (fenced) instead of code (indented). +//! +//! ## References +//! +//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) +//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) +//! +//! [code-indented]: crate::construct::code_indented +//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! + +use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::get_span; + +/// Kind of fences. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { + /// Grave accent (tick) code. + GraveAccent, + /// Tilde code. + Tilde, +} + +/// State needed to parse code (fenced). +#[derive(Debug, Clone)] +struct Info { + /// Number of markers on the opening fence sequence. + size: usize, + /// Number of tabs or spaces of indentation before the opening fence + /// sequence. + prefix: usize, + /// Kind of fences. + kind: Kind, +} + +/// Start of fenced code. +/// +/// ```markdown +/// | ~~~js +/// console.log(1); +/// ~~~ +/// ``` +/// +/// Parsing note: normally, the prefix is already stripped. 
+/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need +/// it. +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFenced); + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(before_sequence_open), + )(tokenizer, code) +} + +/// Inside the opening fence, after an optional prefix, before a sequence. +/// +/// ```markdown +/// |~~~js +/// console.log(1); +/// ~~~ +/// ``` +fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + match code { + Code::Char(char) if char == '`' || char == '~' => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + sequence_open( + tokenizer, + Info { + prefix, + size: 0, + kind: if char == '`' { + Kind::GraveAccent + } else { + Kind::Tilde + }, + }, + code, + ) + } + _ => (State::Nok, None), + } +} + +/// Inside the opening fence sequence. 
+/// +/// ```markdown +/// ~|~~js +/// console.log(1); +/// ~~~ +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + let mut info = info; + info.size += 1; + sequence_open(tokenizer, info, code) + })), + None, + ) + } + _ => { + if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN { + (State::Nok, None) + } else { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| { + whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace) + }, + |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)), + )(tokenizer, code) + } + } + } +} + +/// Inside the opening fence, after the sequence (and optional whitespace), before the info. +/// +/// ```markdown +/// ~~~|js +/// console.log(1); +/// ~~~ +/// ``` +fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceInfo); + tokenizer.enter(TokenType::ChunkString); + info_inside(tokenizer, info, code, vec![]) + } + } +} + +/// Inside the opening fence info. 
+/// +/// ```markdown +/// ~~~j|s +/// console.log(1); +/// ~~~ +/// ``` +fn info_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, + codes: Vec, +) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)), + )(tokenizer, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + Code::Char(_) => { + let mut codes = codes; + codes.push(code); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + info_inside(tokenizer, info, code, codes) + })), + None, + ) + } + } +} + +/// Inside the opening fence, after the info and whitespace, before the meta. +/// +/// ```markdown +/// ~~~js |eval +/// console.log(1); +/// ~~~ +/// ``` +fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceMeta); + tokenizer.enter(TokenType::ChunkString); + meta(tokenizer, info, code) + } + } +} + +/// Inside the opening fence meta. 
+/// +/// ```markdown +/// ~~~js e|val +/// console.log(1); +/// ~~~ +/// ``` +fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceMeta); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))), + None, + ) + } + } +} + +/// At an eol/eof in code, before a closing fence or before content. +/// +/// ```markdown +/// ~~~js| +/// aa| +/// ~~~ +/// ``` +fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let clone = info.clone(); + + match code { + Code::None => after(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt( + |tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + close_before(tokenizer, info, code) + })), + None, + ) + }, + |ok| { + if ok { + Box::new(after) + } else { + Box::new(|tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + content_start(tokenizer, clone, code) + })), + None, + ) + }) + } + }, + )(tokenizer, code), + _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), + } +} + +/// Before a closing fence, before optional whitespace. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// | ~~~ +/// ``` +fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)), + )(tokenizer, code) +} + +/// In a closing fence, after optional whitespace, before sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// |~~~ +/// ``` +fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == marker => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + close_sequence(tokenizer, info, code, 0) + } + _ => (State::Nok, None), + } +} + +/// In the closing fence sequence. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~|~~ +/// ``` +fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + close_sequence(tokenizer, info, code, size + 1) + })), + None, + ) + } + _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(close_whitespace_after), + )(tokenizer, code) + } + _ => (State::Nok, None), + } +} + +/// After the closing fence sequence after optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~ | +/// ``` +fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} + +/// Before code content, definitely not before a closing fence. +/// +/// ```markdown +/// ~~~js +/// |aa +/// ~~~ +/// ``` +fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => { + tokenizer.enter(TokenType::Whitespace); + content_prefix(tokenizer, info, 0, code) + } + _ => { + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// Before code content, in a prefix. 
+/// +/// ```markdown +/// ~~~js +/// | aa +/// ~~~ +/// ``` +fn content_prefix( + tokenizer: &mut Tokenizer, + info: Info, + prefix: usize, + code: Code, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + content_prefix(tokenizer, info, prefix + 1, code) + })), + None, + ) + } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::Whitespace); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.exit(TokenType::Whitespace); + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// In code content. +/// +/// ```markdown +/// ~~~js +/// |ab +/// a|b +/// ab| +/// ~~~ +/// ``` +fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFlowChunk); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + content_continue(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// After fenced code. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::CodeFenced); + (State::Ok, Some(vec![code])) +} diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs new file mode 100644 index 0000000..6bf089b --- /dev/null +++ b/src/construct/code_indented.rs @@ -0,0 +1,190 @@ +//! Code (indented) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line ) +//! +//! ; Restriction: at least one `code` must not be whitespace. +//! 
indented_filled_line ::= 4space_or_tab *code +//! blank_line ::= *space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Code (indented) relates to both the `
<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! In markdown, it is also possible to use code (text) in the text content
+//! type.
+//! It is also possible to create code with the [code (fenced)][code-fenced]
+//! construct.
+//! That construct is more explicit, more similar to code (text), and has
+//! support for specifying the programming language that the code is in, so it
+//! is recommended to use that instead of indented code.
+//!
+//! ## References
+//!
+//! *   [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js)
+//! *   [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks)
+//!
+//! [code-fenced]: crate::construct::code_fenced
+//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+//!
+//! 
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of code (indented).
+///
+/// ```markdown
+/// |    asd
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.enter(TokenType::CodeIndented);
+            tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+            indent(tokenizer, code, 0)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside the initial whitespace.
+///
+/// ```markdown
+///  |   asd
+///   |  asd
+///    | asd
+///     |asd
+/// ```
+///
+/// > **Parsing note**: it is not needed to check if this first line is a
+/// > filled line (that it has a non-whitespace character), because blank lines
+/// > are parsed already, so we never run into that.
+fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        _ if size == TAB_SIZE => {
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            at_break(tokenizer, code)
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    indent(tokenizer, code, size + 1)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// At a break.
+///
+/// ```markdown
+///     |asd
+///     asd|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => after(tokenizer, code), // EOF ends the construct
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
+            .attempt(further_start, |ok| { // try to eat eol(s) plus another indented line
+                Box::new(if ok { at_break } else { after }) // on failure the construct closes
+            })(tokenizer, code),
+        _ => { // any other code starts (more) content on this line
+            tokenizer.enter(TokenType::CodeFlowChunk);
+            content(tokenizer, code)
+        }
+    }
+}
+
+/// Inside code content.
+///
+/// ```markdown
+///     |ab
+///     a|b
+///     ab|
+/// ```
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { // chunk ends at eol or EOF
+            tokenizer.exit(TokenType::CodeFlowChunk);
+            at_break(tokenizer, code)
+        }
+        _ => {
+            tokenizer.consume(code); // everything else is part of the chunk
+            (State::Fn(Box::new(content)), None)
+        }
+    }
+}
+
+/// After indented code.
+///
+/// ```markdown
+///     ab|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::CodeIndented);
+    (State::Ok, Some(vec![code])) // succeed, returning the current `code` unconsumed
+}
+
+/// Right at a line ending, trying to parse another indent.
+///
+/// ```markdown
+///     ab|
+///     cd
+/// ```
+fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // To do: `nok` if lazy line.
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { // eat the eol and look again
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (State::Fn(Box::new(further_start)), None) // loops: several eols may follow each other
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => { // whitespace: maybe another indent prefix
+            tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+            further_indent(tokenizer, code, 0)
+        }
+        _ => (State::Nok, None), // non-whitespace right after an eol: no further indent
+    }
+}
+
+/// Inside further whitespace.
+///
+/// ```markdown
+///     asd
+///   |  asd
+/// ```
+fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        _ if size == TAB_SIZE => { // guard checked first: a full `TAB_SIZE` prefix was found
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            (State::Ok, Some(vec![code])) // succeed, returning `code` unconsumed
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    further_indent(tokenizer, code, size + 1) // tally one more whitespace code
+                })),
+                None,
+            )
+        }
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { // line held only whitespace: try the next line
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            further_start(tokenizer, code)
+        }
+        _ => (State::Nok, None), // content before `TAB_SIZE` whitespace: not a continuation
+    }
+}
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
new file mode 100644
index 0000000..b3aef1b
--- /dev/null
+++ b/src/construct/heading_atx.rs
@@ -0,0 +1,175 @@
+//! Heading (atx) is a construct that occurs in the flow content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab
+//!
+//! code ::= . ; any unicode code point (other than line endings).
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML.
+//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
+//! HTML spec][html] for more info.
+//!
+//! `CommonMark` introduced the requirement on whitespace existing after the
+//! opening sequence and before text.
+//! In older markdown versions, this was not required, and headings would form
+//! without it.
+//!
+//! In markdown, it is also possible to create headings with the setext heading
+//! construct.
+//! The benefit of setext headings is that their text can include line endings.
+//! However, their limit is that they cannot form `<h1>` through `<h6>`
+//! headings.
+//! Due to this limitation, it is recommended to use atx headings.
+//!
+//! > 🏛 **Background**: the word *setext* originates from a small markup
+//! > language by Ian Feldman from 1991.
+//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info.
+//! > The word *atx* originates from a tiny markup language by Aaron Swartz
+//! > from 2002.
+//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
+//! > more info.
+//!
+//! ## References
+//!
+//! *   [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js)
+//! *   [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
+//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
+//! [atx]: http://www.aaronsw.com/2002/atx/
+//!
+//!
+
+use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a heading (atx).
+///
+/// ```markdown
+/// |## alpha
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if Code::Char('#') == code {
+        tokenizer.enter(TokenType::AtxHeading);
+        tokenizer.enter(TokenType::AtxHeadingSequence);
+        sequence_open(tokenizer, code, 0)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// In the opening sequence.
+/// +/// ```markdown +/// #|# alpha +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult { + match code { + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\n' | '\r' | ' ') + if rank > 0 => + { + tokenizer.exit(TokenType::AtxHeadingSequence); + at_break(tokenizer, code) + } + Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + sequence_open(tokenizer, code, rank + 1) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After something but before something else. +/// +/// ```markdown +/// ## |alpha +/// ## alpha| bravo +/// ## alpha |bravo +/// ## alpha bravo|## +/// ## alpha bravo ##| +/// ``` +fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::AtxHeading); + (State::Ok, Some(vec![code])) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.enter(TokenType::AtxHeadingWhitespace); + whitespace(tokenizer, code) + } + Code::Char('#') => { + tokenizer.enter(TokenType::AtxHeadingSequence); + further_sequence(tokenizer, code) + } + Code::Char(_) => { + tokenizer.enter(TokenType::AtxHeadingText); + data(tokenizer, code) + } + } +} + +/// In a further sequence (after whitespace). +/// Could be normal “visible” hashes in the heading or a final sequence. +/// +/// ```markdown +/// ## alpha #|# +/// ``` +fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::Char('#') = code { + tokenizer.consume(code); + (State::Fn(Box::new(further_sequence)), None) + } else { + tokenizer.exit(TokenType::AtxHeadingSequence); + at_break(tokenizer, code) + } +} + +/// In whitespace. 
+/// +/// ```markdown +/// ## alpha | bravo +/// ``` +fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + (State::Fn(Box::new(whitespace)), None) + } + _ => { + tokenizer.exit(TokenType::AtxHeadingWhitespace); + at_break(tokenizer, code) + } + } +} + +/// In text. +/// +/// ```markdown +/// ## al|pha +/// ``` +fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { + tokenizer.exit(TokenType::AtxHeadingText); + at_break(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(data)), None) + } + } +} diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs new file mode 100644 index 0000000..b7d5570 --- /dev/null +++ b/src/construct/html_flow.rs @@ -0,0 +1,1068 @@ +//! HTML (flow) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete +//! +//! ; Note: closing tag name need to match opening tag name. +//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '' *line | *line *( eol *line ) [ '-->' *line ] ] +//! instruction ::= '' *line | *line *( eol *line ) [ '?>' *line ] ] +//! declaration ::= '' *line ] +//! cdata ::= '' *line ] +//! basic ::= '< [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ] +//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional ) +//! +//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive. +//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive. +//! 
opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>' +//! closing_tag ::= '' +//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) +//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] +//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) +//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`') +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ space_or_tab ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! The grammar for HTML in markdown does not resemble the rules of parsing +//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML +//! spec][html-parsing]. +//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?) +//! attempt to parse an XML-like language. +//! By extension, another notable property of the grammar is that it can +//! result in invalid HTML, in that it allows things that wouldn’t work or +//! wouldn’t work well in HTML, such as mismatched tags. +//! +//! Because the **basic** and **complete** productions in the grammar form with +//! a tag, followed by more stuff, and stop at a blank line, it is possible to +//! interleave (a word for switching between languages) markdown and HTML +//! together, by placing the opening and closing tags on their own lines, +//! with blank lines between them and markdown. +//! For example: +//! +//! ```markdown +//!
<div>This is a div but *this* is not emphasis.</div>
+//!
+//! <div>
+//!
+//! This is a paragraph in a `div` and *this* is emphasis.
+//!
+//! </div>
+//! ``` +//! +//! The **complete** production of HTML (flow) is not allowed to interrupt +//! content. +//! That means that a blank line is needed between a paragraph and it. +//! However, HTML (text) has a similar production, which will typically kick-in +//! instead. +//! +//! The list of tag names allowed in the **raw** production are defined in +//! [`HTML_RAW_NAMES`][html_raw_names]. +//! This production exists because there are a few cases where markdown +//! *inside* some elements, and hence interleaving, does not make sense. +//! +//! The list of tag names allowed in the **basic** production are defined in +//! [`HTML_BLOCK_NAMES`][html_block_names]. +//! This production exists because there are a few cases where we can decide +//! early that something is going to be a flow (block) element instead of a +//! phrasing (inline) element. +//! We *can* interrupt and don’t have to care too much about it being +//! well-formed. +//! +//! ## References +//! +//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js) +//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) +//! +//! [html_raw_names]: crate::constant::HTML_RAW_NAMES +//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES +//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +//! +//! + +use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; +use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of HTML (flow). +#[derive(Debug, Clone, PartialEq)] +enum Kind { + /// Not yet known. + Unknown, + /// Symbol for `