From acc35758778bfda5cb01951533868eb8baa2e2d2 Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Wed, 15 Jun 2022 18:17:01 +0200
Subject: Add code (text)

---
 src/construct/code_fenced.rs   |  13 +--
 src/construct/code_indented.rs |  12 +--
 src/construct/code_text.rs     | 217 +++++++++++++++++++++++++++++++++++++++++
 src/construct/mod.rs           |   3 +-
 4 files changed, 230 insertions(+), 15 deletions(-)
 create mode 100644 src/construct/code_text.rs

(limited to 'src/construct')

diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index c852e8d..12c8bd6 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -66,10 +66,10 @@
 //! The `info` and `meta` parts are interpreted as the [string][] content type.
 //! That means that character escapes and character reference are allowed.
 //!
-//! In markdown, it is also possible to use code (text) in the [text][] content
-//! type.
+//! In markdown, it is also possible to use [code (text)][code_text] in the
+//! [text][] content type.
 //! It is also possible to create code with the
-//! [code (indented)][code-indented] construct.
+//! [code (indented)][code_indented] construct.
 //! That construct is less explicit, different from code (text), and has no
 //! support for specifying the programming language, so it is recommended to
 //! use code (fenced) instead of code (indented).
@@ -82,11 +82,10 @@
 //! [flow]: crate::content::flow
 //! [string]: crate::content::string
 //! [text]: crate::content::text
-//! [code-indented]: crate::construct::code_indented
+//! [code_indented]: crate::construct::code_indented
+//! [code_text]: crate::construct::code_text
 //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
 //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
-//!
-//! <!-- To do: link `code_text` -->
 
 use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};
 use crate::construct::partial_whitespace::start as whitespace;
@@ -251,14 +250,12 @@ fn info_inside(
 ) -> StateFnResult {
     match code {
         Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            println!("to do: subtokenize: {:?}", codes);
             tokenizer.exit(TokenType::ChunkString);
             tokenizer.exit(TokenType::CodeFencedFenceInfo);
             tokenizer.exit(TokenType::CodeFencedFence);
             at_break(tokenizer, info, code)
         }
         Code::VirtualSpace | Code::Char('\t' | ' ') => {
-            println!("to do: subtokenize: {:?}", codes);
             tokenizer.exit(TokenType::ChunkString);
             tokenizer.exit(TokenType::CodeFencedFenceInfo);
             tokenizer.attempt(
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 936f174..55b8901 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -18,9 +18,9 @@
 //! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
 //! element*][html-code] in the HTML spec for more info.
 //!
-//! In markdown, it is also possible to use code (text) in the text content
-//! type.
-//! It is also possible to create code with the [code (fenced)][code-fenced]
+//! In markdown, it is also possible to use [code (text)][code_text] in the
+//! [text][] content type.
+//! It is also possible to create code with the [code (fenced)][code_fenced]
 //! construct.
 //! That construct is more explicit, more similar to code (text), and has
 //! support for specifying the programming language that the code is in, so it
@@ -32,11 +32,11 @@
 //! *   [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks)
 //!
 //! [flow]: crate::content::flow
-//! [code-fenced]: crate::construct::code_fenced
+//! [text]: crate::content::text
+//! [code_text]: crate::construct::code_text
+//! [code_fenced]: crate::construct::code_fenced
 //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
 //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
-//!
-//! <!-- To do: link `code_text` -->
 
 use crate::constant::TAB_SIZE;
 use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
new file mode 100644
index 0000000..3c01070
--- /dev/null
+++ b/src/construct/code_text.rs
@@ -0,0 +1,217 @@
+//! Code (text) is a construct that occurs in the [text][] content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: the number of markers in the closing sequence must be equal
+//! ; to the number of markers in the opening sequence.
+//! code_text ::= sequence 1*code sequence
+//!
+//! sequence ::= 1*'`'
+//! ```
+//!
+//! The above grammar shows that it is not possible to create empty code.
+//! It is possible to include grave accents (ticks) in code, by wrapping it
+//! in bigger or smaller sequences:
+//!
+//! ```markdown
+//! Include more: `a``b` or include less: ``a`b``.
+//! ```
+//!
+//! When turning markdown into HTML, each line ending is turned into a space.
+//!
+//! It is also possible to include just one grave accent (tick):
+//!
+//! ```markdown
+//! Include just one: `` ` ``.
+//! ```
+//!
+//! Sequences are “greedy”, in that they cannot be preceded or succeeded by
+//! more grave accents (ticks).
+//! To illustrate:
+//!
+//! ```markdown
+//! Not code: ``x`.
+//!
+//! Not code: `x``.
+//!
+//! Escapes work, this is code: \``x`.
+//!
+//! Escapes work, this is code: `x`\`.
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p>Not code: ``x`.</p>
+//! <p>Not code: `x``.</p>
+//! <p>Escapes work, this is code: `<code>x</code>.</p>
+//! <p>Escapes work, this is code: <code>x</code>`.</p>
+//! ```
+//!
+//! Note that, when turning markdown into HTML, the first and last space, if
+//! both exist and there is also a non-space in the code, are removed.
+//! Line endings, at that stage, are considered as spaces.
+//!
+//! Code (text) relates to the `<code>` element in HTML.
+//! See [*§ 4.5.15 The `code` element*][html-code] in the HTML spec for more
+//! info.
+//!
+//! In markdown, it is possible to create code with the
+//! [code (fenced)][code_fenced] or [code (indented)][code_indented] constructs
+//! in the [flow][] content type.
+//! Compared to code (indented), fenced code is more explicit and more similar
+//! to code (text), and it has support for specifying the programming language
+//! that the code is in, so it is recommended to use that instead of indented
+//! code.
+//!
+//! ## References
+//!
+//! *   [`code-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-text.js)
+//! *   [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans)
+//!
+//! [flow]: crate::content::flow
+//! [text]: crate::content::text
+//! [code_indented]: crate::construct::code_indented
+//! [code_fenced]: crate::construct::code_fenced
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of code (text).
+///
+/// ```markdown
+/// |`a`
+///
+/// |\``a`
+///
+/// |``a`
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let len = tokenizer.events.len();
+
+    match code {
+        Code::Char('`')
+            if tokenizer.previous != Code::Char('`')
+                || (len > 0
+                    && tokenizer.events[len - 1].token_type == TokenType::CharacterEscape) =>
+        {
+            tokenizer.enter(TokenType::CodeText);
+            tokenizer.enter(TokenType::CodeTextSequence);
+            sequence_open(tokenizer, code, 0)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// In the opening sequence.
+///
+/// ```markdown
+/// `|`a``
+/// ```
+pub fn sequence_open(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    if let Code::Char('`') = code {
+        tokenizer.consume(code);
+        (
+            State::Fn(Box::new(move |tokenizer, code| {
+                sequence_open(tokenizer, code, size + 1)
+            })),
+            None,
+        )
+    } else {
+        tokenizer.exit(TokenType::CodeTextSequence);
+        between(tokenizer, code, size)
+    }
+}
+
+/// Between the opening sequence, data, line endings, and a closing sequence.
+///
+/// ```markdown
+/// `|a`
+/// `a|`
+/// ```
+pub fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {
+    match code {
+        Code::None => (State::Nok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+            tokenizer.enter(TokenType::CodeTextLineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::CodeTextLineEnding);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    between(tokenizer, code, size_open)
+                })),
+                None,
+            )
+        }
+        Code::Char('`') => {
+            tokenizer.enter(TokenType::CodeTextSequence);
+            sequence_close(tokenizer, code, size_open, 0)
+        }
+        _ => {
+            tokenizer.enter(TokenType::CodeTextData);
+            data(tokenizer, code, size_open)
+        }
+    }
+}
+
+/// In data.
+///
+/// ```markdown
+/// `a|b`
+/// ```
+pub fn data(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '`') => {
+            tokenizer.exit(TokenType::CodeTextData);
+            between(tokenizer, code, size_open)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    data(tokenizer, code, size_open)
+                })),
+                None,
+            )
+        }
+    }
+}
+
+/// In the closing sequence.
+///
+/// ```markdown
+/// ``a`|`
+/// ```
+pub fn sequence_close(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    size_open: usize,
+    size: usize,
+) -> StateFnResult {
+    match code {
+        Code::Char('`') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    sequence_close(tokenizer, code, size_open, size + 1)
+                })),
+                None,
+            )
+        }
+        _ if size_open == size => {
+            tokenizer.exit(TokenType::CodeTextSequence);
+            tokenizer.exit(TokenType::CodeText);
+            (State::Ok, Some(vec![code]))
+        }
+        _ => {
+            let tail_index = tokenizer.events.len();
+            let head_index = tokenizer.events.len() - 1;
+            tokenizer.exit(TokenType::CodeTextSequence);
+            // Size mismatch, so not a closing sequence: relabel its events as data.
+            tokenizer.events[head_index].token_type = TokenType::CodeTextData;
+            tokenizer.events[tail_index].token_type = TokenType::CodeTextData;
+            between(tokenizer, code, size_open)
+        }
+    }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 14f53a0..1fa57d5 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -23,7 +23,7 @@
 //! *   [character reference][character_reference]
 //! *   [code (fenced)][code_fenced]
 //! *   [code (indented)][code_indented]
-//! *   code (text)
+//! *   [code (text)][code_text]
 //! *   content
 //! *   definition
 //! *   hard break escape
@@ -59,6 +59,7 @@ pub mod character_escape;
 pub mod character_reference;
 pub mod code_fenced;
 pub mod code_indented;
+pub mod code_text;
 pub mod heading_atx;
 pub mod html_flow;
 pub mod html_text;
-- 
cgit