From acc35758778bfda5cb01951533868eb8baa2e2d2 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 15 Jun 2022 18:17:01 +0200 Subject: Add code (text) --- src/construct/code_fenced.rs | 13 +-- src/construct/code_indented.rs | 12 +-- src/construct/code_text.rs | 217 +++++++++++++++++++++++++++++++++++++++++ src/construct/mod.rs | 3 +- 4 files changed, 230 insertions(+), 15 deletions(-) create mode 100644 src/construct/code_text.rs (limited to 'src/construct') diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index c852e8d..12c8bd6 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -66,10 +66,10 @@ //! The `info` and `meta` parts are interpreted as the [string][] content type. //! That means that character escapes and character reference are allowed. //! -//! In markdown, it is also possible to use code (text) in the [text][] content -//! type. +//! In markdown, it is also possible to use [code (text)][code_text] in the +//! [text][] content type. //! It is also possible to create code with the -//! [code (indented)][code-indented] construct. +//! [code (indented)][code_indented] construct. //! That construct is less explicit, different from code (text), and has no //! support for specifying the programming language, so it is recommended to //! use code (fenced) instead of code (indented). @@ -82,11 +82,10 @@ //! [flow]: crate::content::flow //! [string]: crate::content::string //! [text]: crate::content::text -//! [code-indented]: crate::construct::code_indented +//! [code_indented]: crate::construct::code_indented +//! [code_text]: crate::construct::code_text //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element -//! -//! 
use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; use crate::construct::partial_whitespace::start as whitespace; @@ -251,14 +250,12 @@ fn info_inside( ) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - println!("to do: subtokenize: {:?}", codes); tokenizer.exit(TokenType::ChunkString); tokenizer.exit(TokenType::CodeFencedFenceInfo); tokenizer.exit(TokenType::CodeFencedFence); at_break(tokenizer, info, code) } Code::VirtualSpace | Code::Char('\t' | ' ') => { - println!("to do: subtokenize: {:?}", codes); tokenizer.exit(TokenType::ChunkString); tokenizer.exit(TokenType::CodeFencedFenceInfo); tokenizer.attempt( diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 936f174..55b8901 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -18,9 +18,9 @@ //! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` //! element*][html-code] in the HTML spec for more info. //! -//! In markdown, it is also possible to use code (text) in the text content -//! type. -//! It is also possible to create code with the [code (fenced)][code-fenced] +//! In markdown, it is also possible to use [code (text)][code_text] in the +//! [text][] content type. +//! It is also possible to create code with the [code (fenced)][code_fenced] //! construct. //! That construct is more explicit, more similar to code (text), and has //! support for specifying the programming language that the code is in, so it @@ -32,11 +32,11 @@ //! * [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks) //! //! [flow]: crate::content::flow -//! [code-fenced]: crate::construct::code_fenced +//! [text]: crate::content::text +//! [code_text]: crate::construct::code_text +//! [code_fenced]: crate::construct::code_fenced //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element //! 
[html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element -//! -//! use crate::constant::TAB_SIZE; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs new file mode 100644 index 0000000..3c01070 --- /dev/null +++ b/src/construct/code_text.rs @@ -0,0 +1,217 @@ +//! Code (text) is a construct that occurs in the [text][] content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! ; Restriction: the number of markers in the closing sequence must be equal +//! ; to the number of markers in the opening sequence. +//! code_text ::= sequence 1*code sequence +//! +//! sequence ::= 1*'`' +//! ``` +//! +//! The above grammar shows that it is not possible to create empty code. +//! It is possible to include grave accents (ticks) in code, by wrapping it +//! in bigger or smaller sequences: +//! +//! ```markdown +//! Include more: `a``b` or include less: ``a`b``. +//! ``` +//! +//! When turning markdown into HTML, each line ending is turned into a space. +//! +//! It is also possible to include just one grave accent (tick): +//! +//! ```markdown +//! Include just one: `` ` ``. +//! ``` +//! +//! Sequences are “greedy”, in that they cannot be preceded or succeeded by +//! more grave accents (ticks). +//! To illustrate: +//! +//! ```markdown +//! Not code: ``x`. +//! +//! Not code: `x``. +//! +//! Escapes work, this is code: \``x`. +//! +//! Escapes work, this is code: `x`\`. +//! ``` +//! +//! Yields: +//! +//! ```html +//!

<p>Not code: ``x`.</p>

+//!

<p>Not code: `x``.</p>

+//!

<p>Escapes work, this is code: `<code>x</code>.</p>

+//!

<p>Escapes work, this is code: <code>x</code>`.</p>

+//! ``` +//! +//! That is because, when turning markdown into HTML, the first and last space, +//! if both exist and there is also a non-space in the code, are removed. +//! Line endings, at that stage, are considered as spaces. +//! +//! Code (text) relates to the `<code>` element in HTML. +//! See [*§ 4.5.15 The `code` element*][html-code] in the HTML spec for more +//! info. +//! +//! In markdown, it is possible to create code with the +//! [code (fenced)][code_fenced] or [code (indented)][code_indented] constructs +//! in the [flow][] content type. +//! Compared to code (indented), fenced code is more explicit and more similar +//! to code (text), and it has support for specifying the programming language +//! that the code is in, so it is recommended to use that instead of indented +//! code. +//! +//! ## References +//! +//! * [`code-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-text.js) +//! * [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans) +//! +//! [flow]: crate::content::flow +//! [text]: crate::content::text +//! [code_indented]: crate::construct::code_indented +//! [code_fenced]: crate::construct::code_fenced +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element + +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of code (text). +/// +/// ```markdown +/// |`a` +/// +/// |\``a` +/// +/// |``a` +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let len = tokenizer.events.len(); + + match code { + Code::Char('`') + if tokenizer.previous != Code::Char('`') + || (len > 0 + && tokenizer.events[len - 1].token_type == TokenType::CharacterEscape) => + { + tokenizer.enter(TokenType::CodeText); + tokenizer.enter(TokenType::CodeTextSequence); + sequence_open(tokenizer, code, 0) + } + _ => (State::Nok, None), + } +} + +/// In the opening sequence. 
+/// +/// ```markdown +/// `|`a`` +/// ``` +pub fn sequence_open(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { + if let Code::Char('`') = code { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + sequence_open(tokenizer, code, size + 1) + })), + None, + ) + } else { + tokenizer.exit(TokenType::CodeTextSequence); + between(tokenizer, code, size) + } +} + +/// Between the opening and closing sequences: at data, a line ending, or a run of grave accents. +/// +/// ```markdown +/// `|a` +/// `a|` +/// ``` +pub fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult { + match code { + Code::None => (State::Nok, None), + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.enter(TokenType::CodeTextLineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::CodeTextLineEnding); + ( + State::Fn(Box::new(move |tokenizer, code| { + between(tokenizer, code, size_open) + })), + None, + ) + } + Code::Char('`') => { + tokenizer.enter(TokenType::CodeTextSequence); + sequence_close(tokenizer, code, size_open, 0) + } + _ => { + tokenizer.enter(TokenType::CodeTextData); + data(tokenizer, code, size_open) + } + } +} + +/// In data. +/// +/// ```markdown +/// `a|b` +/// ``` +pub fn data(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '`') => { + tokenizer.exit(TokenType::CodeTextData); + between(tokenizer, code, size_open) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + data(tokenizer, code, size_open) + })), + None, + ) + } + } +} + +/// In the closing sequence. 
+/// +/// ```markdown +/// ``a`|` +/// ``` +pub fn sequence_close( + tokenizer: &mut Tokenizer, + code: Code, + size_open: usize, + size: usize, +) -> StateFnResult { + match code { + Code::Char('`') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + sequence_close(tokenizer, code, size_open, size + 1) + })), + None, + ) + } + _ if size_open == size => { + tokenizer.exit(TokenType::CodeTextSequence); + tokenizer.exit(TokenType::CodeText); + (State::Ok, Some(vec![code])) + } + _ => { + let tail_index = tokenizer.events.len(); + let head_index = tokenizer.events.len() - 1; + tokenizer.exit(TokenType::CodeTextSequence); + // Size differs from the opening sequence: retype this sequence's enter/exit events as data. + tokenizer.events[head_index].token_type = TokenType::CodeTextData; + tokenizer.events[tail_index].token_type = TokenType::CodeTextData; + between(tokenizer, code, size_open) + } + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 14f53a0..1fa57d5 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -23,7 +23,7 @@ //! * [character reference][character_reference] //! * [code (fenced)][code_fenced] //! * [code (indented)][code_indented] -//! * code (text) +//! * [code (text)][code_text] //! * content //! * definition //! * hard break escape @@ -59,6 +59,7 @@ pub mod character_escape; pub mod character_reference; pub mod code_fenced; pub mod code_indented; +pub mod code_text; pub mod heading_atx; pub mod html_flow; pub mod html_text; -- cgit