From 4c06c8554c35887f8f5147783953b2b7e7c2327f Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 8 Jun 2022 15:52:16 +0200 Subject: . --- src/construct/code_fenced.rs | 581 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 581 insertions(+) create mode 100644 src/construct/code_fenced.rs (limited to 'src/construct/code_fenced.rs') diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs new file mode 100644 index 0000000..2068a62 --- /dev/null +++ b/src/construct/code_fenced.rs @@ -0,0 +1,581 @@ +//! Code (fenced) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ] +//! +//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab +//! ; Restriction: the number of markers in the closing fence sequence must be +//! ; equal to or greater than the number of markers in the opening fence +//! ; sequence. +//! ; Restriction: the marker in the closing fence sequence must match the +//! ; marker in the opening fence sequence +//! fence_close ::= sequence *space_or_tab +//! sequence ::= 3*'`' | 3*'~' +//! info ::= 1*text +//! meta ::= 1*text *( *space_or_tab 1*text ) +//! +//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the +//! ; marker of the opening fence sequence. +//! text ::= code - eol - space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! code ::= . ; any unicode code point (other than line endings). +//! ``` +//! +//! The above grammar does not show how whitespace is handled. +//! To parse code (fenced), let `X` be the number of whitespace characters +//! before the opening fence sequence. +//! Each line of content is then allowed (not required) to be indented with up +//! to `X` spaces or tabs, which are then ignored as an indent instead of being +//! considered as part of the code. +//! This indent does not affect the closing fence. +//! It can be indented up to a separate 3 spaces or tabs. +//! A bigger indent makes it part of the code instead of a fence. +//! +//! Code (fenced) relates to both the `
` and the `` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! The optional `meta` part is ignored: it is not used when parsing or
+//! rendering.
+//! The optional `info` part is used and is expected to specify the programming
+//! language that the code is in.
+//! Which value it holds depends on what your syntax highlighter supports, if
+//! one is used.
+//! The `info` is, when rendering to HTML, typically exposed as a class.
+//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
+//! element*][html-code]).
+//! For example:
+//!
+//! ```markdown
+//! ~~~css
+//! * { color: tomato }
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! 
* { color: tomato }
+//! 
+//! ``` +//! +//! The `info` and `meta` parts are interpreted as the string content type. +//! That means that character escapes and character reference are allowed. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! It is also possible to create code with the +//! [code (indented)][code-indented] construct. +//! That construct is less explicit, different from code (text), and has no +//! support for specifying the programming language, so it is recommended to +//! use code (fenced) instead of code (indented). +//! +//! ## References +//! +//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) +//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) +//! +//! [code-indented]: crate::construct::code_indented +//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! + +use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::get_span; + +/// Kind of fences. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { + /// Grave accent (tick) code. + GraveAccent, + /// Tilde code. + Tilde, +} + +/// State needed to parse code (fenced). +#[derive(Debug, Clone)] +struct Info { + /// Number of markers on the opening fence sequence. + size: usize, + /// Number of tabs or spaces of indentation before the opening fence + /// sequence. + prefix: usize, + /// Kind of fences. + kind: Kind, +} + +/// Start of fenced code. +/// +/// ```markdown +/// | ~~~js +/// console.log(1); +/// ~~~ +/// ``` +/// +/// Parsing note: normally, the prefix is already stripped. +/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need +/// it. +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFenced); + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(before_sequence_open), + )(tokenizer, code) +} + +/// Inside the opening fence, after an optional prefix, before a sequence. +/// +/// ```markdown +/// |~~~js +/// console.log(1); +/// ~~~ +/// ``` +fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + match code { + Code::Char(char) if char == '`' || char == '~' => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + sequence_open( + tokenizer, + Info { + prefix, + size: 0, + kind: if char == '`' { + Kind::GraveAccent + } else { + Kind::Tilde + }, + }, + code, + ) + } + _ => (State::Nok, None), + } +} + +/// Inside the opening fence sequence. +/// +/// ```markdown +/// ~|~~js +/// console.log(1); +/// ~~~ +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + let mut info = info; + info.size += 1; + sequence_open(tokenizer, info, code) + })), + None, + ) + } + _ => { + if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN { + (State::Nok, None) + } else { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| { + whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace) + }, + |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)), + )(tokenizer, code) + } + } + } +} + +/// Inside the opening fence, after the sequence (and optional whitespace), before the info. +/// +/// ```markdown +/// ~~~|js +/// console.log(1); +/// ~~~ +/// ``` +fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceInfo); + tokenizer.enter(TokenType::ChunkString); + info_inside(tokenizer, info, code, vec![]) + } + } +} + +/// Inside the opening fence info. +/// +/// ```markdown +/// ~~~j|s +/// console.log(1); +/// ~~~ +/// ``` +fn info_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, + codes: Vec, +) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)), + )(tokenizer, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + Code::Char(_) => { + let mut codes = codes; + codes.push(code); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + info_inside(tokenizer, info, code, codes) + })), + None, + ) + } + } +} + +/// Inside the opening fence, after the info and whitespace, before the meta. +/// +/// ```markdown +/// ~~~js |eval +/// console.log(1); +/// ~~~ +/// ``` +fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceMeta); + tokenizer.enter(TokenType::ChunkString); + meta(tokenizer, info, code) + } + } +} + +/// Inside the opening fence meta. +/// +/// ```markdown +/// ~~~js e|val +/// console.log(1); +/// ~~~ +/// ``` +fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceMeta); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))), + None, + ) + } + } +} + +/// At an eol/eof in code, before a closing fence or before content. +/// +/// ```markdown +/// ~~~js| +/// aa| +/// ~~~ +/// ``` +fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let clone = info.clone(); + + match code { + Code::None => after(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt( + |tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + close_before(tokenizer, info, code) + })), + None, + ) + }, + |ok| { + if ok { + Box::new(after) + } else { + Box::new(|tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + content_start(tokenizer, clone, code) + })), + None, + ) + }) + } + }, + )(tokenizer, code), + _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), + } +} + +/// Before a closing fence, before optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// | ~~~ +/// ``` +fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)), + )(tokenizer, code) +} + +/// In a closing fence, after optional whitespace, before sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// |~~~ +/// ``` +fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == marker => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + close_sequence(tokenizer, info, code, 0) + } + _ => (State::Nok, None), + } +} + +/// In the closing fence sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~|~~ +/// ``` +fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + close_sequence(tokenizer, info, code, size + 1) + })), + None, + ) + } + _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(close_whitespace_after), + )(tokenizer, code) + } + _ => (State::Nok, None), + } +} + +/// After the closing fence sequence after optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~ | +/// ``` +fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} + +/// Before code content, definitely not before a closing fence. +/// +/// ```markdown +/// ~~~js +/// |aa +/// ~~~ +/// ``` +fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => { + tokenizer.enter(TokenType::Whitespace); + content_prefix(tokenizer, info, 0, code) + } + _ => { + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// Before code content, in a prefix. +/// +/// ```markdown +/// ~~~js +/// | aa +/// ~~~ +/// ``` +fn content_prefix( + tokenizer: &mut Tokenizer, + info: Info, + prefix: usize, + code: Code, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + content_prefix(tokenizer, info, prefix + 1, code) + })), + None, + ) + } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::Whitespace); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.exit(TokenType::Whitespace); + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// In code content. +/// +/// ```markdown +/// ~~~js +/// |ab +/// a|b +/// ab| +/// ~~~ +/// ``` +fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFlowChunk); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + content_continue(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// After fenced code. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::CodeFenced); + (State::Ok, Some(vec![code])) +} -- cgit