diff options
Diffstat (limited to '')
-rw-r--r-- | src/construct/raw_text.rs | 270 |
1 files changed, 270 insertions, 0 deletions
diff --git a/src/construct/raw_text.rs b/src/construct/raw_text.rs new file mode 100644 index 0000000..7f3990d --- /dev/null +++ b/src/construct/raw_text.rs @@ -0,0 +1,270 @@ +//! Raw (text) occurs in the [text][] content type. +//! It forms code (text) and math (text). +//! +//! ## Grammar +//! +//! Raw (text) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>): +//! +//! ```bnf +//! ; Restriction: the number of markers in the closing sequence must be equal +//! ; to the number of markers in the opening sequence. +//! raw_text ::= sequence 1*byte sequence +//! +//! ; Restriction: not preceded or followed by the same marker. +//! sequence ::= 1*'`' | 1*'$' +//! ``` +//! +//! The above grammar shows that it is not possible to create empty raw (text). +//! It is possible to include the sequence marker (grave accent for code, +//! dollar for math) in raw (text), by wrapping it in bigger or smaller +//! sequences: +//! +//! ```markdown +//! Include more: `a``b` or include less: ``a`b``. +//! ``` +//! +//! It is also possible to include just one marker: +//! +//! ```markdown +//! Include just one: `` ` ``. +//! ``` +//! +//! Sequences are “gready”, in that they cannot be preceded or followed by +//! more markers. +//! To illustrate: +//! +//! ```markdown +//! Not code: ``x`. +//! +//! Not code: `x``. +//! +//! Escapes work, this is code: \``x`. +//! +//! Escapes work, this is code: `x`\`. +//! ``` +//! +//! Yields: +//! +//! ```html +//! <p>Not code: ``x`.</p> +//! <p>Not code: `x``.</p> +//! <p>Escapes work, this is code: `<code>x</code>.</p> +//! <p>Escapes work, this is code: <code>x</code>`.</p> +//! ``` +//! +//! That is because, when turning markdown into HTML, the first and last space, +//! if both exist and there is also a non-space in the code, are removed. +//! Line endings, at that stage, are considered as spaces. +//! +//! In markdown, it is possible to create code with the +//! [code (fenced)][code_fenced] or [code (indented)][code_indented], +//! and math with the [math (flow)][math_flow] constructs in the [flow][] +//! content type. +//! +//! ## HTML +//! +//! Code (text) relates to the `<code>` element in HTML. +//! See [*§ 4.5.15 The `code` element*][html_code] in the HTML spec for more +//! info. +//! +//! Math (text) does not relate to HTML elements. +//! `MathML`, which is sort of like SVG but for math, exists but it doesn’t work +//! well and isn’t widely supported. +//! Instead, it is recommended to use client side JavaScript with something like +//! `KaTeX` or `MathJax` to process the math +//! For that, the math is compiled as a `<code>` element with two classes: +//! `lang-math` and `math-inline`. +//! Client side JavaScript can look for these classes to process them further. +//! +//! When turning markdown into HTML, each line ending in raw (text) is turned +//! into a space. +//! +//! ## Recommendations +//! +//! When authoring markdown with math, keep in mind that math doesn’t work in +//! most places. +//! Notably, GitHub currently has a really weird crappy client-side regex-based +//! thing. +//! But on your own (math-heavy?) site it can be great! +//! Alternatively, set `options.math_text_single_dollar: false`, which prevents +//! single dollars from being seen as math, and thus prevents normal dollars in +//! text from being seen as math. +//! +//! ## Tokens +//! +//! * [`CodeText`][Name::CodeText] +//! * [`CodeTextData`][Name::CodeTextData] +//! * [`CodeTextSequence`][Name::CodeTextSequence] +//! * [`MathText`][Name::MathText] +//! * [`MathTextData`][Name::MathTextData] +//! * [`MathTextSequence`][Name::MathTextSequence] +//! * [`LineEnding`][Name::LineEnding] +//! +//! ## References +//! +//! * [`code-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-text.js) +//! * [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math) +//! * [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans) +//! +//! [flow]: crate::construct::flow +//! [text]: crate::construct::text +//! [code_indented]: crate::construct::code_indented +//! [code_fenced]: crate::construct::code_fenced +//! [math_flow]: # "to do" +//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element + +use crate::event::Name; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; + +/// Start of raw (text). +/// +/// ```markdown +/// > | `a` +/// ^ +/// > | \`a` +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + // Code (text): + if ((tokenizer.parse_state.options.constructs.code_text && tokenizer.current == Some(b'`')) + // Math (text): + || (tokenizer.parse_state.options.constructs.math_text && tokenizer.current == Some(b'$'))) + // Not the same marker (except when escaped). + && (tokenizer.previous != tokenizer.current + || (!tokenizer.events.is_empty() + && tokenizer.events[tokenizer.events.len() - 1].name == Name::CharacterEscape)) + { + let marker = tokenizer.current.unwrap(); + if marker == b'`' { + tokenizer.tokenize_state.token_1 = Name::CodeText; + tokenizer.tokenize_state.token_2 = Name::CodeTextSequence; + tokenizer.tokenize_state.token_3 = Name::CodeTextData; + } else { + tokenizer.tokenize_state.token_1 = Name::MathText; + tokenizer.tokenize_state.token_2 = Name::MathTextSequence; + tokenizer.tokenize_state.token_3 = Name::MathTextData; + } + tokenizer.tokenize_state.marker = marker; + tokenizer.enter(tokenizer.tokenize_state.token_1.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); + State::Retry(StateName::RawTextSequenceOpen) + } else { + State::Nok + } +} + +/// In opening sequence. +/// +/// ```markdown +/// > | `a` +/// ^ +/// ``` +pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::RawTextSequenceOpen) + } + // Not enough markers in the sequence. + else if tokenizer.tokenize_state.marker == b'$' + && tokenizer.tokenize_state.size == 1 + && !tokenizer.parse_state.options.math_text_single_dollar + { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Nok + } else { + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + State::Retry(StateName::RawTextBetween) + } +} + +/// Between something and something else +/// +/// ```markdown +/// > | `a` +/// ^^ +/// ``` +pub fn between(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None => { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Nok + } + Some(b'\n') => { + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::RawTextBetween) + } + _ => { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); + State::Retry(StateName::RawTextSequenceClose) + } else { + tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); + State::Retry(StateName::RawTextData) + } + } + } +} + +/// In data. +/// +/// ```markdown +/// > | `a` +/// ^ +/// ``` +pub fn data(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, None | Some(b'\n')) + || tokenizer.current == Some(tokenizer.tokenize_state.marker) + { + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + State::Retry(StateName::RawTextBetween) + } else { + tokenizer.consume(); + State::Next(StateName::RawTextData) + } +} + +/// In closing sequence. +/// +/// ```markdown +/// > | `a` +/// ^ +/// ``` +pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.tokenize_state.size_b += 1; + tokenizer.consume(); + State::Next(StateName::RawTextSequenceClose) + } else { + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_b { + tokenizer.exit(tokenizer.tokenize_state.token_1.clone()); + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.size_b = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Ok + } else { + // More or less accents: mark as data. + let len = tokenizer.events.len(); + tokenizer.events[len - 2].name = tokenizer.tokenize_state.token_3.clone(); + tokenizer.events[len - 1].name = tokenizer.tokenize_state.token_3.clone(); + tokenizer.tokenize_state.size_b = 0; + State::Retry(StateName::RawTextBetween) + } + } +} |