From f41688c067be261279804b8ab3e04cd5d67f492f Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 26 Aug 2022 10:57:20 +0200 Subject: Add support for math (text) --- readme.md | 2 +- src/compiler.rs | 33 ++--- src/construct/code_fenced.rs | 4 +- src/construct/code_indented.rs | 4 +- src/construct/code_text.rs | 213 -------------------------------- src/construct/mod.rs | 6 +- src/construct/paragraph.rs | 4 +- src/construct/partial_label.rs | 4 +- src/construct/raw_text.rs | 270 +++++++++++++++++++++++++++++++++++++++++ src/construct/text.rs | 35 +++--- src/event.rs | 66 +++++++++- src/lib.rs | 144 +++++++++++++++------- src/state.rs | 20 +-- tests/math_text.rs | 190 +++++++++++++++++++++++++++++ 14 files changed, 682 insertions(+), 313 deletions(-) delete mode 100644 src/construct/code_text.rs create mode 100644 src/construct/raw_text.rs create mode 100644 tests/math_text.rs diff --git a/readme.md b/readme.md index b4788ae..7db6cdb 100644 --- a/readme.md +++ b/readme.md @@ -196,7 +196,7 @@ The files in `src/` are as follows: ## Examples - + > 🚧 **To do**. diff --git a/src/compiler.rs b/src/compiler.rs index 6ad34b2..73834a4 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -90,7 +90,7 @@ struct CompileContext<'a> { /// Number of code (fenced) fenced. pub code_fenced_fences_count: Option, /// Whether we are in code (text). - pub code_text_inside: bool, + pub raw_text_inside: bool, /// Whether we are in image text. pub image_alt_inside: bool, /// Marker of character reference. 
@@ -145,7 +145,7 @@ impl<'a> CompileContext<'a> { heading_setext_buffer: None, code_flow_seen_data: None, code_fenced_fences_count: None, - code_text_inside: false, + raw_text_inside: false, character_reference_marker: None, list_expect_first_marker: None, media_stack: vec![], @@ -341,7 +341,7 @@ fn enter(context: &mut CompileContext) { Name::BlockQuote => on_enter_block_quote(context), Name::CodeIndented => on_enter_code_indented(context), Name::CodeFenced => on_enter_code_fenced(context), - Name::CodeText => on_enter_code_text(context), + Name::CodeText | Name::MathText => on_enter_raw_text(context), Name::Definition => on_enter_definition(context), Name::DefinitionDestinationString => on_enter_definition_destination_string(context), Name::Emphasis => on_enter_emphasis(context), @@ -368,8 +368,9 @@ fn enter(context: &mut CompileContext) { fn exit(context: &mut CompileContext) { match context.events[context.index].name { Name::CodeFencedFenceMeta | Name::Resource => on_exit_drop(context), - Name::CharacterEscapeValue | Name::CodeTextData | Name::Data => on_exit_data(context), - + Name::CharacterEscapeValue | Name::CodeTextData | Name::Data | Name::MathTextData => { + on_exit_data(context); + } Name::AutolinkEmail => on_exit_autolink_email(context), Name::AutolinkProtocol => on_exit_autolink_protocol(context), Name::BlankLineEnding => on_exit_blank_line_ending(context), @@ -386,7 +387,7 @@ fn exit(context: &mut CompileContext) { Name::CodeFencedFence => on_exit_code_fenced_fence(context), Name::CodeFencedFenceInfo => on_exit_code_fenced_fence_info(context), Name::CodeFlowChunk => on_exit_code_flow_chunk(context), - Name::CodeText => on_exit_code_text(context), + Name::CodeText | Name::MathText => on_exit_raw_text(context), Name::Definition => on_exit_definition(context), Name::DefinitionDestinationString => on_exit_definition_destination_string(context), Name::DefinitionLabelString => on_exit_definition_label_string(context), @@ -460,11 +461,15 @@ fn 
on_enter_code_fenced(context: &mut CompileContext) { context.code_fenced_fences_count = Some(0); } -/// Handle [`Enter`][Kind::Enter]:[`CodeText`][Name::CodeText]. -fn on_enter_code_text(context: &mut CompileContext) { - context.code_text_inside = true; +/// Handle [`Enter`][Kind::Enter]:{[`CodeText`][Name::CodeText],[`MathText`][Name::MathText]}. +fn on_enter_raw_text(context: &mut CompileContext) { + context.raw_text_inside = true; if !context.image_alt_inside { - context.push(""); + context.push(""); } context.buffer(); } @@ -875,8 +880,8 @@ fn on_exit_code_flow(context: &mut CompileContext) { context.slurp_one_line_ending = false; } -/// Handle [`Exit`][Kind::Exit]:[`CodeText`][Name::CodeText]. -fn on_exit_code_text(context: &mut CompileContext) { +/// Handle [`Exit`][Kind::Exit]:{[`CodeText`][Name::CodeText],[`MathText`][Name::MathText]}. +fn on_exit_raw_text(context: &mut CompileContext) { let result = context.resume(); let mut bytes = result.as_bytes(); let mut trim = false; @@ -899,7 +904,7 @@ fn on_exit_code_text(context: &mut CompileContext) { bytes = &bytes[1..end]; } - context.code_text_inside = false; + context.raw_text_inside = false; context.push(str::from_utf8(bytes).unwrap()); if !context.image_alt_inside { @@ -1209,7 +1214,7 @@ fn on_exit_label_text(context: &mut CompileContext) { /// Handle [`Exit`][Kind::Exit]:[`LineEnding`][Name::LineEnding]. fn on_exit_line_ending(context: &mut CompileContext) { - if context.code_text_inside { + if context.raw_text_inside { context.push(" "); } else if context.slurp_one_line_ending // Ignore line endings after definitions. diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index d117006..edb2d93 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -48,7 +48,7 @@ //! Which value it holds depends on what your syntax highlighter supports, if //! one is used. //! -//! In markdown, it is also possible to use [code (text)][code_text] in the +//! 
In markdown, it is also possible to use [code (text)][raw_text] in the //! [text][] content type. //! It is also possible to create code with the //! [code (indented)][code_indented] construct. @@ -106,7 +106,7 @@ //! [character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference //! [code_indented]: crate::construct::code_indented -//! [code_text]: crate::construct::code_text +//! [raw_text]: crate::construct::raw_text //! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element //! [html_pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 7d279c1..5fc9446 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -16,7 +16,7 @@ //! As this construct occurs in flow, like all flow constructs, it must be //! followed by an eol (line ending) or eof (end of file). //! -//! In markdown, it is also possible to use [code (text)][code_text] in the +//! In markdown, it is also possible to use [code (text)][raw_text] in the //! [text][] content type. //! It is also possible to create code with the [code (fenced)][code_fenced] //! construct. @@ -49,7 +49,7 @@ //! [flow]: crate::construct::flow //! [text]: crate::construct::text //! [code_fenced]: crate::construct::code_fenced -//! [code_text]: crate::construct::code_text +//! [raw_text]: crate::construct::raw_text //! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element //! [html_pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs deleted file mode 100644 index b2cfd17..0000000 --- a/src/construct/code_text.rs +++ /dev/null @@ -1,213 +0,0 @@ -//! Code (text) occurs in the [text][] content type. -//! -//! ## Grammar -//! -//! 
Code (text) forms with the following BNF -//! (see [construct][crate::construct] for character groups): -//! -//! ```bnf -//! ; Restriction: the number of markers in the closing sequence must be equal -//! ; to the number of markers in the opening sequence. -//! code_text ::= sequence 1*byte sequence -//! -//! ; Restriction: not preceded or followed by `` ` ``. -//! sequence ::= 1*'`' -//! ``` -//! -//! The above grammar shows that it is not possible to create empty code. -//! It is possible to include grave accents (ticks) in code, by wrapping it -//! in bigger or smaller sequences: -//! -//! ```markdown -//! Include more: `a``b` or include less: ``a`b``. -//! ``` -//! -//! It is also possible to include just one grave accent (tick): -//! -//! ```markdown -//! Include just one: `` ` ``. -//! ``` -//! -//! Sequences are “gready”, in that they cannot be preceded or followed by -//! more grave accents (ticks). -//! To illustrate: -//! -//! ```markdown -//! Not code: ``x`. -//! -//! Not code: `x``. -//! -//! Escapes work, this is code: \``x`. -//! -//! Escapes work, this is code: `x`\`. -//! ``` -//! -//! Yields: -//! -//! ```html -//!

Not code: ``x`.

-//!

Not code: `x``.

-//!

Escapes work, this is code: `x.

-//!

Escapes work, this is code: x`.

-//! ``` -//! -//! That is because, when turning markdown into HTML, the first and last space, -//! if both exist and there is also a non-space in the code, are removed. -//! Line endings, at that stage, are considered as spaces. -//! -//! In markdown, it is possible to create code with the -//! [code (fenced)][code_fenced] or [code (indented)][code_indented] constructs -//! in the [flow][] content type. -//! -//! ## HTML -//! -//! Code (text) relates to the `` element in HTML. -//! See [*§ 4.5.15 The `code` element*][html_code] in the HTML spec for more -//! info. -//! -//! When turning markdown into HTML, each line ending is turned into a space. -//! -//! ## Tokens -//! -//! * [`CodeText`][Name::CodeText] -//! * [`CodeTextData`][Name::CodeTextData] -//! * [`CodeTextSequence`][Name::CodeTextSequence] -//! * [`LineEnding`][Name::LineEnding] -//! -//! ## References -//! -//! * [`code-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-text.js) -//! * [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans) -//! -//! [flow]: crate::construct::flow -//! [text]: crate::construct::text -//! [code_indented]: crate::construct::code_indented -//! [code_fenced]: crate::construct::code_fenced -//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element - -use crate::event::Name; -use crate::state::{Name as StateName, State}; -use crate::tokenizer::Tokenizer; - -/// Start of code (text). 
-/// -/// ```markdown -/// > | `a` -/// ^ -/// > | \`a` -/// ^ -/// ``` -pub fn start(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Some(b'`') - if tokenizer.parse_state.options.constructs.code_text - && (tokenizer.previous != Some(b'`') - || (!tokenizer.events.is_empty() - && tokenizer.events[tokenizer.events.len() - 1].name - == Name::CharacterEscape)) => - { - tokenizer.enter(Name::CodeText); - tokenizer.enter(Name::CodeTextSequence); - State::Retry(StateName::CodeTextSequenceOpen) - } - _ => State::Nok, - } -} - -/// In opening sequence. -/// -/// ```markdown -/// > | `a` -/// ^ -/// ``` -pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { - if let Some(b'`') = tokenizer.current { - tokenizer.tokenize_state.size += 1; - tokenizer.consume(); - State::Next(StateName::CodeTextSequenceOpen) - } else { - tokenizer.exit(Name::CodeTextSequence); - State::Retry(StateName::CodeTextBetween) - } -} - -/// Between something and something else -/// -/// ```markdown -/// > | `a` -/// ^^ -/// ``` -pub fn between(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None => { - tokenizer.tokenize_state.size = 0; - State::Nok - } - Some(b'\n') => { - tokenizer.enter(Name::LineEnding); - tokenizer.consume(); - tokenizer.exit(Name::LineEnding); - State::Next(StateName::CodeTextBetween) - } - Some(b'`') => { - tokenizer.enter(Name::CodeTextSequence); - State::Retry(StateName::CodeTextSequenceClose) - } - _ => { - tokenizer.enter(Name::CodeTextData); - State::Retry(StateName::CodeTextData) - } - } -} - -/// In data. -/// -/// ```markdown -/// > | `a` -/// ^ -/// ``` -pub fn data(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n' | b'`') => { - tokenizer.exit(Name::CodeTextData); - State::Retry(StateName::CodeTextBetween) - } - _ => { - tokenizer.consume(); - State::Next(StateName::CodeTextData) - } - } -} - -/// In closing sequence. 
-/// -/// ```markdown -/// > | `a` -/// ^ -/// ``` -pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Some(b'`') => { - tokenizer.tokenize_state.size_b += 1; - tokenizer.consume(); - State::Next(StateName::CodeTextSequenceClose) - } - _ => { - if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_b { - tokenizer.exit(Name::CodeTextSequence); - tokenizer.exit(Name::CodeText); - tokenizer.tokenize_state.size = 0; - tokenizer.tokenize_state.size_b = 0; - State::Ok - } else { - let index = tokenizer.events.len(); - tokenizer.exit(Name::CodeTextSequence); - // More or less accents: mark as data. - tokenizer.events[index - 1].name = Name::CodeTextData; - tokenizer.events[index].name = Name::CodeTextData; - tokenizer.tokenize_state.size_b = 0; - State::Retry(StateName::CodeTextBetween) - } - } - } -} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index c5002bb..a0065fa 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -38,7 +38,7 @@ //! * [character reference][character_reference] //! * [code (fenced)][code_fenced] //! * [code (indented)][code_indented] -//! * [code (text)][code_text] +//! * [code (text)][raw_text] //! * [definition][] //! * [hard break (escape)][hard_break_escape] //! * [heading (atx)][heading_atx] @@ -62,6 +62,8 @@ //! * [gfm footnote definition][gfm_footnote_definition] //! * [gfm task list item check][gfm_task_list_item_check] //! * [gfm label start footnote][gfm_label_start_footnote] +//! * math (text) (in `raw_text`) +//! * gfm strikethrough (in attention) //! //! There are also several small subroutines typically used in different places: //! 
@@ -143,7 +145,6 @@ pub mod character_escape; pub mod character_reference; pub mod code_fenced; pub mod code_indented; -pub mod code_text; pub mod definition; pub mod document; pub mod flow; @@ -171,6 +172,7 @@ pub mod partial_space_or_tab; pub mod partial_space_or_tab_eol; pub mod partial_title; pub mod partial_whitespace; +pub mod raw_text; pub mod string; pub mod text; pub mod thematic_break; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 9e20643..c1e7311 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -18,7 +18,7 @@ //! allowed to contain blank lines, or to be blank themselves. //! //! The paragraph is interpreted as the [text][] content type. -//! That means that [autolinks][autolink], [code (text)][code_text], etc are allowed. +//! That means that [autolinks][autolink], [code (text)][raw_text], etc are allowed. //! //! ## HTML //! @@ -37,7 +37,7 @@ //! [flow]: crate::construct::flow //! [text]: crate::construct::text //! [autolink]: crate::construct::autolink -//! [code_text]: crate::construct::code_text +//! [raw_text]: crate::construct::raw_text //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element use crate::event::{Content, Kind, Link, Name}; diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index ab436b2..255fde1 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -44,7 +44,7 @@ //! > ([label start (image)][label_start_image] or //! > [label start (link)][label_start_link]) and a closing //! > ([label end][label_end]), so as to allow further phrasing such as -//! > [code (text)][code_text] or [attention][]. +//! > [code (text)][raw_text] or [attention][]. //! //! ## References //! @@ -58,7 +58,7 @@ //! [label_start_image]: crate::construct::label_start_image //! [label_start_link]: crate::construct::label_start_link //! [label_end]: crate::construct::label_end -//! 
[code_text]: crate::construct::code_text +//! [raw_text]: crate::construct::raw_text //! [link_reference_size_max]: crate::util::constant::LINK_REFERENCE_SIZE_MAX use crate::construct::partial_space_or_tab_eol::{space_or_tab_eol_with_options, Options}; diff --git a/src/construct/raw_text.rs b/src/construct/raw_text.rs new file mode 100644 index 0000000..7f3990d --- /dev/null +++ b/src/construct/raw_text.rs @@ -0,0 +1,270 @@ +//! Raw (text) occurs in the [text][] content type. +//! It forms code (text) and math (text). +//! +//! ## Grammar +//! +//! Raw (text) forms with the following BNF +//! (see [construct][crate::construct] for character groups): +//! +//! ```bnf +//! ; Restriction: the number of markers in the closing sequence must be equal +//! ; to the number of markers in the opening sequence. +//! raw_text ::= sequence 1*byte sequence +//! +//! ; Restriction: not preceded or followed by the same marker. +//! sequence ::= 1*'`' | 1*'$' +//! ``` +//! +//! The above grammar shows that it is not possible to create empty raw (text). +//! It is possible to include the sequence marker (grave accent for code, +//! dollar for math) in raw (text), by wrapping it in bigger or smaller +//! sequences: +//! +//! ```markdown +//! Include more: `a``b` or include less: ``a`b``. +//! ``` +//! +//! It is also possible to include just one marker: +//! +//! ```markdown +//! Include just one: `` ` ``. +//! ``` +//! +//! Sequences are “greedy”, in that they cannot be preceded or followed by +//! more markers. +//! To illustrate: +//! +//! ```markdown +//! Not code: ``x`. +//! +//! Not code: `x``. +//! +//! Escapes work, this is code: \``x`. +//! +//! Escapes work, this is code: `x`\`. +//! ``` +//! +//! Yields: +//! +//! ```html +//!

Not code: ``x`.

+//!

Not code: `x``.

+//!

Escapes work, this is code: `<code>x</code>.

+//!

Escapes work, this is code: <code>x</code>`.

+//! ``` +//! +//! That is because, when turning markdown into HTML, the first and last space, +//! if both exist and there is also a non-space in the code, are removed. +//! Line endings, at that stage, are considered as spaces. +//! +//! In markdown, it is possible to create code with the +//! [code (fenced)][code_fenced] or [code (indented)][code_indented], +//! and math with the [math (flow)][math_flow] constructs in the [flow][] +//! content type. +//! +//! ## HTML +//! +//! Code (text) relates to the `<code>` element in HTML. +//! See [*§ 4.5.15 The `code` element*][html_code] in the HTML spec for more +//! info. +//! +//! Math (text) does not relate to HTML elements. +//! `MathML`, which is sort of like SVG but for math, exists but it doesn’t work +//! well and isn’t widely supported. +//! Instead, it is recommended to use client side JavaScript with something like +//! `KaTeX` or `MathJax` to process the math. +//! For that, the math is compiled as a `<code>` element with two classes: +//! `lang-math` and `math-inline`. +//! Client side JavaScript can look for these classes to process them further. +//! +//! When turning markdown into HTML, each line ending in raw (text) is turned +//! into a space. +//! +//! ## Recommendations +//! +//! When authoring markdown with math, keep in mind that math doesn’t work in +//! most places. +//! Notably, GitHub currently has a really weird crappy client-side regex-based +//! thing. +//! But on your own (math-heavy?) site it can be great! +//! Alternatively, set `options.math_text_single_dollar: false`, which prevents +//! single dollars from being seen as math, and thus prevents normal dollars in +//! text from being seen as math. +//! +//! ## Tokens +//! +//! * [`CodeText`][Name::CodeText] +//! * [`CodeTextData`][Name::CodeTextData] +//! * [`CodeTextSequence`][Name::CodeTextSequence] +//! * [`MathText`][Name::MathText] +//! * [`MathTextData`][Name::MathTextData] +//! * [`MathTextSequence`][Name::MathTextSequence] +//!
* [`LineEnding`][Name::LineEnding] +//! +//! ## References +//! +//! * [`code-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-text.js) +//! * [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math) +//! * [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans) +//! +//! [flow]: crate::construct::flow +//! [text]: crate::construct::text +//! [code_indented]: crate::construct::code_indented +//! [code_fenced]: crate::construct::code_fenced +//! [math_flow]: # "to do" +//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element + +use crate::event::Name; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; + +/// Start of raw (text). +/// +/// ```markdown +/// > | `a` +/// ^ +/// > | \`a` +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + // Code (text): + if ((tokenizer.parse_state.options.constructs.code_text && tokenizer.current == Some(b'`')) + // Math (text): + || (tokenizer.parse_state.options.constructs.math_text && tokenizer.current == Some(b'$'))) + // Not the same marker (except when escaped). 
+ && (tokenizer.previous != tokenizer.current + || (!tokenizer.events.is_empty() + && tokenizer.events[tokenizer.events.len() - 1].name == Name::CharacterEscape)) + { + let marker = tokenizer.current.unwrap(); + if marker == b'`' { + tokenizer.tokenize_state.token_1 = Name::CodeText; + tokenizer.tokenize_state.token_2 = Name::CodeTextSequence; + tokenizer.tokenize_state.token_3 = Name::CodeTextData; + } else { + tokenizer.tokenize_state.token_1 = Name::MathText; + tokenizer.tokenize_state.token_2 = Name::MathTextSequence; + tokenizer.tokenize_state.token_3 = Name::MathTextData; + } + tokenizer.tokenize_state.marker = marker; + tokenizer.enter(tokenizer.tokenize_state.token_1.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); + State::Retry(StateName::RawTextSequenceOpen) + } else { + State::Nok + } +} + +/// In opening sequence. +/// +/// ```markdown +/// > | `a` +/// ^ +/// ``` +pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::RawTextSequenceOpen) + } + // Not enough markers in the sequence. 
+ else if tokenizer.tokenize_state.marker == b'$' + && tokenizer.tokenize_state.size == 1 + && !tokenizer.parse_state.options.math_text_single_dollar + { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Nok + } else { + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + State::Retry(StateName::RawTextBetween) + } +} + +/// Between something and something else +/// +/// ```markdown +/// > | `a` +/// ^^ +/// ``` +pub fn between(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None => { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Nok + } + Some(b'\n') => { + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::RawTextBetween) + } + _ => { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); + State::Retry(StateName::RawTextSequenceClose) + } else { + tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); + State::Retry(StateName::RawTextData) + } + } + } +} + +/// In data. +/// +/// ```markdown +/// > | `a` +/// ^ +/// ``` +pub fn data(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, None | Some(b'\n')) + || tokenizer.current == Some(tokenizer.tokenize_state.marker) + { + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + State::Retry(StateName::RawTextBetween) + } else { + tokenizer.consume(); + State::Next(StateName::RawTextData) + } +} + +/// In closing sequence. 
+/// +/// ```markdown +/// > | `a` +/// ^ +/// ``` +pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.tokenize_state.size_b += 1; + tokenizer.consume(); + State::Next(StateName::RawTextSequenceClose) + } else { + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_b { + tokenizer.exit(tokenizer.tokenize_state.token_1.clone()); + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.size_b = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + State::Ok + } else { + // More or less accents: mark as data. + let len = tokenizer.events.len(); + tokenizer.events[len - 2].name = tokenizer.tokenize_state.token_3.clone(); + tokenizer.events[len - 1].name = tokenizer.tokenize_state.token_3.clone(); + tokenizer.tokenize_state.size_b = 0; + State::Retry(StateName::RawTextBetween) + } + } +} diff --git a/src/construct/text.rs b/src/construct/text.rs index 5535e3f..3cb0f10 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -1,16 +1,16 @@ //! The text content type. //! //! **Text** contains phrasing content such as -//! [attention][crate::construct::attention] (emphasis, strong), -//! [code (text)][crate::construct::code_text], and actual text. +//! [attention][crate::construct::attention] (emphasis, gfm strikethrough, strong), +//! [raw (text)][crate::construct::raw_text] (code (text), math (text)), and actual text. //! //! The constructs found in text are: //! -//! * [Attention][crate::construct::attention] +//! * [Attention][crate::construct::attention] (emphasis, gfm strikethrough, strong) //! * [Autolink][crate::construct::autolink] //! * [Character escape][crate::construct::character_escape] //! 
* [Character reference][crate::construct::character_reference] -//! * [Code (text)][crate::construct::code_text] +//! * [Raw (text)][crate::construct::raw_text] (code (text), math (text)) //! * [GFM: Label start (footnote)][crate::construct::gfm_label_start_footnote] //! * [GFM: Task list item check][crate::construct::gfm_task_list_item_check] //! * [Hard break (escape)][crate::construct::hard_break_escape] @@ -29,17 +29,18 @@ use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; /// Characters that can start something in text. -const MARKERS: [u8; 10] = [ +const MARKERS: [u8; 11] = [ b'!', // `label_start_image` + b'$', // `raw_text` (math (text)) b'&', // `character_reference` - b'*', // `attention` + b'*', // `attention` (emphasis, strong) b'<', // `autolink`, `html_text` b'[', // `label_start_link` b'\\', // `character_escape`, `hard_break_escape` b']', // `label_end`, `gfm_label_start_footnote` - b'_', // `attention` - b'`', // `code_text` - b'~', // `attention` (w/ `gfm_strikethrough`) + b'_', // `attention` (emphasis, strong) + b'`', // `raw_text` (code (text)) + b'~', // `attention` (gfm strikethrough) ]; /// Start of text. 
@@ -81,6 +82,14 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::LabelStartImageStart) } + // raw (text) (code (text), math (text)) + Some(b'$' | b'`') => { + tokenizer.attempt( + State::Next(StateName::TextBefore), + State::Next(StateName::TextBeforeData), + ); + State::Retry(StateName::RawTextStart) + } Some(b'&') => { tokenizer.attempt( State::Next(StateName::TextBefore), @@ -88,6 +97,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::CharacterReferenceStart) } + // attention (emphasis, gfm strikethrough, strong) Some(b'*' | b'_' | b'~') => { tokenizer.attempt( State::Next(StateName::TextBefore), @@ -124,13 +134,6 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::LabelEndStart) } - Some(b'`') => { - tokenizer.attempt( - State::Next(StateName::TextBefore), - State::Next(StateName::TextBeforeData), - ); - State::Retry(StateName::CodeTextStart) - } _ => State::Retry(StateName::TextBeforeData), } } diff --git a/src/event.rs b/src/event.rs index 3b805e5..869f2e8 100644 --- a/src/event.rs +++ b/src/event.rs @@ -507,7 +507,7 @@ pub enum Name { /// [`CodeTextSequence`][Name::CodeTextSequence], /// [`LineEnding`][Name::LineEnding] /// * **Construct**: - /// [`code_text`][crate::construct::code_text] + /// [`raw_text`][crate::construct::raw_text] /// /// ## Example /// @@ -525,7 +525,7 @@ pub enum Name { /// * **Content model**: /// void /// * **Construct**: - /// [`code_text`][crate::construct::code_text] + /// [`raw_text`][crate::construct::raw_text] /// /// ## Example /// @@ -543,7 +543,7 @@ pub enum Name { /// * **Content model**: /// void /// * **Construct**: - /// [`code_text`][crate::construct::code_text] + /// [`raw_text`][crate::construct::raw_text] /// /// ## Example /// @@ -1889,6 +1889,62 @@ pub enum Name { /// ^^^ /// ``` ListUnordered, + /// Whole math (text). 
+ /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// [`MathTextData`][Name::MathTextData], + /// [`MathTextSequence`][Name::MathTextSequence], + /// [`LineEnding`][Name::LineEnding] + /// * **Construct**: + /// [`raw_text`][crate::construct::raw_text] + /// + /// ## Example + /// + /// ```markdown + /// > | a $b$ c + /// ^^^ + /// ``` + MathText, + /// Math (text) data. + /// + /// ## Info + /// + /// * **Context**: + /// [`MathText`][Name::MathText], + /// * **Content model**: + /// void + /// * **Construct**: + /// [`raw_text`][crate::construct::raw_text] + /// + /// ## Example + /// + /// ```markdown + /// > | a $b$ c + /// ^ + /// ``` + MathTextData, + /// Math (text) sequence. + /// + /// ## Info + /// + /// * **Context**: + /// [`MathText`][Name::MathText], + /// * **Content model**: + /// void + /// * **Construct**: + /// [`raw_text`][crate::construct::raw_text] + /// + /// ## Example + /// + /// ```markdown + /// > | a $b$ c + /// ^ ^ + /// ``` + MathTextSequence, /// Whole paragraph. /// /// ## Info @@ -2271,7 +2327,7 @@ pub enum Name { } /// List of void events, used to make sure everything is working well. -pub const VOID_EVENTS: [Name; 53] = [ +pub const VOID_EVENTS: [Name; 55] = [ Name::AttentionSequence, Name::AutolinkEmail, Name::AutolinkMarker, @@ -2319,6 +2375,8 @@ pub const VOID_EVENTS: [Name; 53] = [ Name::LineEnding, Name::ListItemMarker, Name::ListItemValue, + Name::MathTextData, + Name::MathTextSequence, Name::ReferenceMarker, Name::ResourceMarker, Name::ResourceTitleMarker, diff --git a/src/lib.rs b/src/lib.rs index fd5e500..98a4936 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -272,6 +272,13 @@ pub struct Constructs { /// ^^^ /// ``` pub list_item: bool, + /// Math (text). + /// + /// ```markdown + /// > | a $b$ c + /// ^^^ + /// ``` + pub math_text: bool, /// Thematic break. 
/// /// ```markdown @@ -310,6 +317,7 @@ impl Default for Constructs { label_start_link: true, label_end: true, list_item: true, + math_text: false, thematic_break: true, } } @@ -333,6 +341,7 @@ impl Constructs { } /// Configuration (optional). +#[allow(clippy::struct_excessive_bools)] #[derive(Clone, Debug)] pub struct Options { /// Whether to allow (dangerous) HTML. @@ -393,6 +402,74 @@ pub struct Options { /// ``` pub allow_dangerous_protocol: bool, + /// Which constructs to enable and disable. + /// The default is to follow `CommonMark`. + /// + /// ## Examples + /// + /// ``` + /// use micromark::{micromark, micromark_with_options, Options, Constructs}; + /// + /// // micromark follows CommonMark by default: + /// assert_eq!( + /// micromark(" indented code?"), + /// "
<pre><code>indented code?\n</code></pre>
" + /// ); + /// + /// // Pass `constructs` to choose what to enable and disable: + /// assert_eq!( + /// micromark_with_options( + /// " indented code?", + /// &Options { + /// constructs: Constructs { + /// code_indented: false, + /// ..Constructs::default() + /// }, + /// ..Options::default() + /// } + /// ), + /// "

<p>indented code?</p>

" + /// ); + /// ``` + pub constructs: Constructs, + + /// Default line ending to use, for line endings not in `value`. + /// + /// Generally, micromark copies line endings (`\r`, `\n`, `\r\n`) in the + /// markdown document over to the compiled HTML. + /// In some cases, such as `> a`, CommonMark requires that extra line + /// endings are added: `
<blockquote>\n<p>a</p>\n</blockquote>
`. + /// + /// To create that line ending, the document is checked for the first line + /// ending that is used. + /// If there is no line ending, `default_line_ending` is used. + /// If that isn’t configured, `\n` is used. + /// + /// ## Examples + /// + /// ``` + /// use micromark::{micromark, micromark_with_options, Options, LineEnding}; + /// + /// // micromark uses `\n` by default: + /// assert_eq!( + /// micromark("> a"), + /// "
<blockquote>\n<p>a</p>\n</blockquote>
" + /// ); + /// + /// // Define `default_line_ending` to configure the default: + /// assert_eq!( + /// micromark_with_options( + /// "> a", + /// &Options { + /// default_line_ending: LineEnding::CarriageReturnLineFeed, + /// ..Options::default() + /// } + /// ), + /// "
<blockquote>\r\n<p>a</p>\r\n</blockquote>
" + /// ); + /// ``` + pub default_line_ending: LineEnding, + /// Label to use for the footnotes section. /// /// Change it when the markdown is not in English. @@ -594,7 +671,7 @@ pub struct Options { pub gfm_footnote_clobber_prefix: Option, /// Whether to support GFM strikethrough (if enabled in `constructs`) with - /// a single tilde (default: true). + /// a single tilde (default: `true`). /// /// Single tildes work on github.com but are technically prohibited by GFM. /// @@ -630,73 +707,49 @@ pub struct Options { /// ``` pub gfm_strikethrough_single_tilde: bool, - /// Default line ending to use, for line endings not in `value`. + /// Whether to support math (text) (if enabled in `constructs`) with a + /// single dollar (default: `true`). /// - /// Generally, micromark copies line endings (`\r`, `\n`, `\r\n`) in the - /// markdown document over to the compiled HTML. - /// In some cases, such as `> a`, CommonMark requires that extra line - /// endings are added: `
<blockquote>\n<p>a</p>\n</blockquote>
`. - /// - /// To create that line ending, the document is checked for the first line - /// ending that is used. - /// If there is no line ending, `default_line_ending` is used. - /// If that isn’t configured, `\n` is used. + /// Single dollars work in Pandoc and many other places, but often + /// interfere with “normal” dollars in text. /// /// ## Examples /// /// ``` - /// use micromark::{micromark, micromark_with_options, Options, LineEnding}; - /// - /// // micromark uses `\n` by default: - /// assert_eq!( - /// micromark("> a"), - /// "
<blockquote>\n<p>a</p>\n</blockquote>
" - /// ); + /// use micromark::{micromark, micromark_with_options, Options, Constructs}; /// - /// // Define `default_line_ending` to configure the default: + /// // micromark supports single dollars by default: /// assert_eq!( /// micromark_with_options( - /// "> a", + /// "$a$", /// &Options { - /// default_line_ending: LineEnding::CarriageReturnLineFeed, + /// constructs: Constructs { + /// math_text: true, + /// ..Constructs::default() + /// }, /// ..Options::default() /// } /// ), - /// "
<blockquote>\r\n<p>a</p>\r\n</blockquote>
" - /// ); - /// ``` - pub default_line_ending: LineEnding, - - /// Which constructs to enable and disable. - /// The default is to follow `CommonMark`. - /// - /// ## Examples - /// - /// ``` - /// use micromark::{micromark, micromark_with_options, Options, Constructs}; - /// - /// // micromark follows CommonMark by default: - /// assert_eq!( - /// micromark(" indented code?"), - /// "
<pre><code>indented code?\n</code></pre>
" + /// "

a

" /// ); /// - /// // Pass `constructs` to choose what to enable and disable: + /// // Pass `math_text_single_dollar: false` to turn that off: /// assert_eq!( /// micromark_with_options( - /// " indented code?", + /// "$a$", /// &Options { /// constructs: Constructs { - /// code_indented: false, + /// math_text: true, /// ..Constructs::default() /// }, + /// math_text_single_dollar: false, /// ..Options::default() /// } /// ), - /// "

<p>indented code?</p>

" + /// "

<p>$a$</p>

" /// ); /// ``` - pub constructs: Constructs, + pub math_text_single_dollar: bool, } impl Default for Options { @@ -705,14 +758,15 @@ impl Default for Options { Self { allow_dangerous_html: false, allow_dangerous_protocol: false, + constructs: Constructs::default(), + default_line_ending: LineEnding::default(), gfm_footnote_label: None, gfm_footnote_label_tag_name: None, gfm_footnote_label_attributes: None, gfm_footnote_back_label: None, gfm_footnote_clobber_prefix: None, gfm_strikethrough_single_tilde: true, - default_line_ending: LineEnding::default(), - constructs: Constructs::default(), + math_text_single_dollar: true, } } } diff --git a/src/state.rs b/src/state.rs index 6c3f563..0c04821 100644 --- a/src/state.rs +++ b/src/state.rs @@ -78,11 +78,11 @@ pub enum Name { CodeIndentedFurtherBegin, CodeIndentedFurtherAfter, - CodeTextStart, - CodeTextSequenceOpen, - CodeTextBetween, - CodeTextData, - CodeTextSequenceClose, + RawTextStart, + RawTextSequenceOpen, + RawTextBetween, + RawTextData, + RawTextSequenceClose, DataStart, DataInside, @@ -392,11 +392,11 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::CodeIndentedFurtherBegin => construct::code_indented::further_begin, Name::CodeIndentedFurtherAfter => construct::code_indented::further_after, - Name::CodeTextStart => construct::code_text::start, - Name::CodeTextSequenceOpen => construct::code_text::sequence_open, - Name::CodeTextBetween => construct::code_text::between, - Name::CodeTextData => construct::code_text::data, - Name::CodeTextSequenceClose => construct::code_text::sequence_close, + Name::RawTextStart => construct::raw_text::start, + Name::RawTextSequenceOpen => construct::raw_text::sequence_open, + Name::RawTextBetween => construct::raw_text::between, + Name::RawTextData => construct::raw_text::data, + Name::RawTextSequenceClose => construct::raw_text::sequence_close, Name::DataStart => construct::partial_data::start, Name::DataInside => construct::partial_data::inside, diff 
--git a/tests/math_text.rs b/tests/math_text.rs new file mode 100644 index 0000000..d0e7589 --- /dev/null +++ b/tests/math_text.rs @@ -0,0 +1,190 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, Constructs, Options}; +use pretty_assertions::assert_eq; + +#[test] +fn math_text() { + let math = Options { + constructs: Constructs { + math_text: true, + // To do: enable `math_flow`. + ..Constructs::default() + }, + ..Options::default() + }; + + assert_eq!( + micromark("$a$"), + "

<p>$a$</p>

", + "should not support math (text) by default" + ); + + assert_eq!( + micromark_with_options("$foo$ $$bar$$", &math), + "

foo bar

", + "should support math (text) if enabled" + ); + + assert_eq!( + micromark_with_options( + "$foo$ $$bar$$", + &Options { + math_text_single_dollar: false, + ..math.clone() + } + ), + "

$foo$ bar

", + "should not support math (text) w/ a single dollar, w/ `math_text_single_dollar: false`" + ); + + // assert_eq!( + // micromark_with_options("$foo$", &math), + // "

foo

", + // "should support math (text)" + // ); + + assert_eq!( + micromark_with_options("$$ foo $ bar $$", &math), + "

foo $ bar

", + "should support math (text) w/ more dollars" + ); + + assert_eq!( + micromark_with_options("$ $$ $", &math), + "

$$

", + "should support math (text) w/ fences inside, and padding" + ); + + assert_eq!( + micromark_with_options("$ $$ $", &math), + "

$$

", + "should support math (text) w/ extra padding" + ); + + assert_eq!( + micromark_with_options("$ a$", &math), + "

a

", + "should support math (text) w/ unbalanced padding" + ); + + assert_eq!( + micromark_with_options("$\u{a0}b\u{a0}$", &math), + "

\u{a0}b\u{a0}

", + "should support math (text) w/ non-padding whitespace" + ); + + assert_eq!( + micromark_with_options("$ $\n$ $", &math), + "

\n

", + "should support math (text) w/o data" + ); + + assert_eq!( + micromark_with_options("$$\nfoo\nbar \nbaz\n$$", &math), + "

foo bar baz

", + "should support math (text) w/o line endings (1)" + ); + + assert_eq!( + micromark_with_options("$$\nfoo \n$$", &math), + "

foo

", + "should support math (text) w/o line endings (2)" + ); + + assert_eq!( + micromark_with_options("$foo bar \nbaz$", &math), + "

foo bar baz

", + "should not support whitespace collapsing" + ); + + assert_eq!( + micromark_with_options("$foo\\$bar$", &math), + "

foo\\bar$

", + "should not support character escapes" + ); + + assert_eq!( + micromark_with_options("$$foo$bar$$", &math), + "

foo$bar

", + "should support more dollars" + ); + + assert_eq!( + micromark_with_options("$ foo $$ bar $", &math), + "

foo $$ bar

", + "should support less dollars" + ); + + assert_eq!( + micromark_with_options("*foo$*$", &math), + "

*foo*

", + "should precede over emphasis" + ); + + assert_eq!( + micromark_with_options("[not a $link](/foo$)", &math), + "

[not a link](/foo)

", + "should precede over links" + ); + + assert_eq!( + micromark_with_options("$$", &math), + "

<a href="">$

", + "should have same precedence as HTML (1)" + ); + + assert_eq!( + micromark_with_options( + "
$", + &Options { + allow_dangerous_html: true, + allow_dangerous_protocol: true, + ..math.clone() + } + ), + "

$

", + "should have same precedence as HTML (2)" + ); + + assert_eq!( + micromark_with_options("$$", &math), + "

<http://foo.bar.baz>$

", + "should have same precedence as autolinks (1)" + ); + + assert_eq!( + micromark_with_options("$", &math), + "

http://foo.bar.$baz$

", + "should have same precedence as autolinks (2)" + ); + + assert_eq!( + micromark_with_options("$$$foo$$", &math), + "

<p>$$$foo$$</p>

", + "should not support more dollars before a fence" + ); + + assert_eq!( + micromark_with_options("$foo", &math), + "

<p>$foo</p>

", + "should not support no closing fence (1)" + ); + + assert_eq!( + micromark_with_options("$foo$$bar$$", &math), + "

$foobar

", + "should not support no closing fence (2)" + ); + + assert_eq!( + micromark_with_options("$foo\t\tbar$", &math), + "

foo\t\tbar

", + "should support tabs in code" + ); + + assert_eq!( + micromark_with_options("\\$$x$", &math), + "

$x

", + "should support an escaped initial dollar" + ); +} -- cgit