From 670f1d82e01ea2394b21d7d1857f41bdc67b3fce Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 26 Aug 2022 13:29:10 +0200 Subject: Add support for math (flow) --- src/compiler.rs | 79 +-- src/construct/code_fenced.rs | 569 --------------------- src/construct/code_indented.rs | 4 +- src/construct/flow.rs | 26 +- src/construct/mod.rs | 6 +- src/construct/partial_non_lazy_continuation.rs | 4 +- src/construct/raw_flow.rs | 665 +++++++++++++++++++++++++ src/construct/raw_text.rs | 19 +- src/event.rs | 127 ++++- src/lib.rs | 14 +- src/state.rs | 72 +-- src/tokenizer.rs | 3 + src/util/constant.rs | 8 +- 13 files changed, 915 insertions(+), 681 deletions(-) delete mode 100644 src/construct/code_fenced.rs create mode 100644 src/construct/raw_flow.rs (limited to 'src') diff --git a/src/compiler.rs b/src/compiler.rs index 73834a4..9057505 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -85,10 +85,10 @@ struct CompileContext<'a> { pub heading_atx_rank: Option, /// Buffer of heading (setext) text. pub heading_setext_buffer: Option, - /// Whether code (flow) contains data. - pub code_flow_seen_data: Option, - /// Number of code (fenced) fenced. - pub code_fenced_fences_count: Option, + /// Whether raw (flow) (code (fenced), math (flow)) or code (indented) contains data. + pub raw_flow_seen_data: Option, + /// Number of raw (flow) fences. + pub raw_flow_fences_count: Option, /// Whether we are in code (text). pub raw_text_inside: bool, /// Whether we are in image text. 
@@ -143,8 +143,8 @@ impl<'a> CompileContext<'a> { bytes, heading_atx_rank: None, heading_setext_buffer: None, - code_flow_seen_data: None, - code_fenced_fences_count: None, + raw_flow_seen_data: None, + raw_flow_fences_count: None, raw_text_inside: false, character_reference_marker: None, list_expect_first_marker: None, @@ -329,6 +329,7 @@ fn enter(context: &mut CompileContext) { match context.events[context.index].name { Name::CodeFencedFenceInfo | Name::CodeFencedFenceMeta + | Name::MathFlowFenceMeta | Name::DefinitionLabelString | Name::DefinitionTitleString | Name::GfmFootnoteDefinitionPrefix @@ -340,7 +341,7 @@ fn enter(context: &mut CompileContext) { Name::BlockQuote => on_enter_block_quote(context), Name::CodeIndented => on_enter_code_indented(context), - Name::CodeFenced => on_enter_code_fenced(context), + Name::CodeFenced | Name::MathFlow => on_enter_raw_flow(context), Name::CodeText | Name::MathText => on_enter_raw_text(context), Name::Definition => on_enter_definition(context), Name::DefinitionDestinationString => on_enter_definition_destination_string(context), @@ -367,7 +368,9 @@ fn enter(context: &mut CompileContext) { /// Handle [`Exit`][Kind::Exit]. 
fn exit(context: &mut CompileContext) { match context.events[context.index].name { - Name::CodeFencedFenceMeta | Name::Resource => on_exit_drop(context), + Name::CodeFencedFenceMeta | Name::MathFlowFenceMeta | Name::Resource => { + on_exit_drop(context); + } Name::CharacterEscapeValue | Name::CodeTextData | Name::Data | Name::MathTextData => { on_exit_data(context); } @@ -383,10 +386,10 @@ fn exit(context: &mut CompileContext) { on_exit_character_reference_marker_hexadecimal(context); } Name::CharacterReferenceValue => on_exit_character_reference_value(context), - Name::CodeFenced | Name::CodeIndented => on_exit_code_flow(context), - Name::CodeFencedFence => on_exit_code_fenced_fence(context), - Name::CodeFencedFenceInfo => on_exit_code_fenced_fence_info(context), - Name::CodeFlowChunk => on_exit_code_flow_chunk(context), + Name::CodeFenced | Name::CodeIndented | Name::MathFlow => on_exit_raw_flow(context), + Name::CodeFencedFence | Name::MathFlowFence => on_exit_raw_flow_fence(context), + Name::CodeFencedFenceInfo => on_exit_raw_flow_fence_info(context), + Name::CodeFlowChunk | Name::MathFlowChunk => on_exit_raw_flow_chunk(context), Name::CodeText | Name::MathText => on_exit_raw_text(context), Name::Definition => on_exit_definition(context), Name::DefinitionDestinationString => on_exit_definition_destination_string(context), @@ -447,18 +450,22 @@ fn on_enter_block_quote(context: &mut CompileContext) { /// Handle [`Enter`][Kind::Enter]:[`CodeIndented`][Name::CodeIndented]. fn on_enter_code_indented(context: &mut CompileContext) { - context.code_flow_seen_data = Some(false); + context.raw_flow_seen_data = Some(false); context.line_ending_if_needed(); context.push("
");
 }
 
-/// Handle [`Enter`][Kind::Enter]:[`CodeFenced`][Name::CodeFenced].
-fn on_enter_code_fenced(context: &mut CompileContext) {
-    context.code_flow_seen_data = Some(false);
+/// Handle [`Enter`][Kind::Enter]:{[`CodeFenced`][Name::CodeFenced],[`MathFlow`][Name::MathFlow]}.
+fn on_enter_raw_flow(context: &mut CompileContext) {
+    context.raw_flow_seen_data = Some(false);
     context.line_ending_if_needed();
-    // Note that no `>` is used, which is added later.
+    // Note that no `>` is used, which is added later (due to info).
     context.push("
");
     }
@@ -802,9 +809,9 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {
     context.push(&encode(&value, context.encode_html));
 }
 
-/// Handle [`Exit`][Kind::Exit]:[`CodeFlowChunk`][Name::CodeFlowChunk].
-fn on_exit_code_flow_chunk(context: &mut CompileContext) {
-    context.code_flow_seen_data = Some(true);
+/// Handle [`Exit`][Kind::Exit]:{[`CodeFlowChunk`][Name::CodeFlowChunk],[`MathFlowChunk`][Name::MathFlowChunk]}.
+fn on_exit_raw_flow_chunk(context: &mut CompileContext) {
+    context.raw_flow_seen_data = Some(true);
     context.push(&encode(
         &Slice::from_position(
             context.bytes,
@@ -816,9 +823,9 @@ fn on_exit_code_flow_chunk(context: &mut CompileContext) {
     ));
 }
 
-/// Handle [`Exit`][Kind::Exit]:[`CodeFencedFence`][Name::CodeFencedFence].
-fn on_exit_code_fenced_fence(context: &mut CompileContext) {
-    let count = if let Some(count) = context.code_fenced_fences_count {
+/// Handle [`Exit`][Kind::Exit]:{[`CodeFencedFence`][Name::CodeFencedFence],[`MathFlowFence`][Name::MathFlowFence]}.
+fn on_exit_raw_flow_fence(context: &mut CompileContext) {
+    let count = if let Some(count) = context.raw_flow_fences_count {
         count
     } else {
         0
@@ -829,31 +836,33 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) {
         context.slurp_one_line_ending = true;
     }
 
-    context.code_fenced_fences_count = Some(count + 1);
+    context.raw_flow_fences_count = Some(count + 1);
 }
 
 /// Handle [`Exit`][Kind::Exit]:[`CodeFencedFenceInfo`][Name::CodeFencedFenceInfo].
-fn on_exit_code_fenced_fence_info(context: &mut CompileContext) {
+///
+/// Note: math (flow) does not support `info`.
+fn on_exit_raw_flow_fence_info(context: &mut CompileContext) {
     let value = context.resume();
     context.push(" class=\"language-");
     context.push(&value);
     context.push("\"");
 }
 
-/// Handle [`Exit`][Kind::Exit]:{[`CodeFenced`][Name::CodeFenced],[`CodeIndented`][Name::CodeIndented]}.
-fn on_exit_code_flow(context: &mut CompileContext) {
-    // One special case is if we are inside a container, and the fenced code was
+/// Handle [`Exit`][Kind::Exit]:{[`CodeFenced`][Name::CodeFenced],[`CodeIndented`][Name::CodeIndented],[`MathFlow`][Name::MathFlow]}.
+fn on_exit_raw_flow(context: &mut CompileContext) {
+    // One special case is if we are inside a container, and the raw (flow) was
     // not closed (meaning it runs to the end).
     // In that case, the following line ending, is considered *outside* the
     // fenced code and block quote by micromark, but CM wants to treat that
     // ending as part of the code.
-    if let Some(count) = context.code_fenced_fences_count {
+    if let Some(count) = context.raw_flow_fences_count {
         // No closing fence.
         if count == 1
             // In a container.
             && !context.tight_stack.is_empty()
             // Empty (as the closing is right at the opening fence)
-            && context.events[context.index - 1].name != Name::CodeFencedFence
+            && !matches!(context.events[context.index - 1].name, Name::CodeFencedFence | Name::MathFlowFence)
         {
             context.line_ending();
         }
@@ -862,16 +871,16 @@ fn on_exit_code_flow(context: &mut CompileContext) {
     // But in most cases, it’s simpler: when we’ve seen some data, emit an extra
     // line ending when needed.
     if context
-        .code_flow_seen_data
+        .raw_flow_seen_data
         .take()
-        .expect("`code_flow_seen_data` must be defined")
+        .expect("`raw_flow_seen_data` must be defined")
     {
         context.line_ending_if_needed();
     }
 
     context.push("
"); - if let Some(count) = context.code_fenced_fences_count.take() { + if let Some(count) = context.raw_flow_fences_count.take() { if count < 2 { context.line_ending_if_needed(); } diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs deleted file mode 100644 index edb2d93..0000000 --- a/src/construct/code_fenced.rs +++ /dev/null @@ -1,569 +0,0 @@ -//! Code (fenced) occurs in the [flow][] content type. -//! -//! ## Grammar -//! -//! Code (fenced) forms with the following BNF -//! (see [construct][crate::construct] for character groups): -//! -//! ```bnf -//! code_fenced ::= fence_open *( eol *byte ) [ eol fence_close ] -//! -//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab -//! ; Restriction: the number of markers in the closing fence sequence must be -//! ; equal to or greater than the number of markers in the opening fence -//! ; sequence. -//! ; Restriction: the marker in the closing fence sequence must match the -//! ; marker in the opening fence sequence -//! fence_close ::= sequence *space_or_tab -//! sequence ::= 3*'`' | 3*'~' -//! ; Restriction: the `` ` `` character cannot occur in `info` if it is the marker. -//! info ::= 1*text -//! ; Restriction: the `` ` `` character cannot occur in `meta` if it is the marker. -//! meta ::= 1*text *( *space_or_tab 1*text ) -//! ``` -//! -//! As this construct occurs in flow, like all flow constructs, it must be -//! followed by an eol (line ending) or eof (end of file). -//! -//! The above grammar does not show how indentation (with `space_or_tab`) of -//! each line is handled. -//! To parse code (fenced), let `x` be the number of `space_or_tab` characters -//! before the opening fence sequence. -//! Each line of text is then allowed (not required) to be indented with up -//! to `x` spaces or tabs, which are then ignored as an indent instead of being -//! considered as part of the code. -//! This indent does not affect the closing fence. -//! 
It can be indented up to a separate 3 spaces or tabs. -//! A bigger indent makes it part of the code instead of a fence. -//! -//! The `info` and `meta` parts are interpreted as the [string][] content type. -//! That means that [character escapes][character_escape] and -//! [character references][character_reference] are allowed. -//! -//! The optional `meta` part is ignored: it is not used when parsing or -//! rendering. -//! -//! The optional `info` part is used and is expected to specify the programming -//! language that the code is in. -//! Which value it holds depends on what your syntax highlighter supports, if -//! one is used. -//! -//! In markdown, it is also possible to use [code (text)][raw_text] in the -//! [text][] content type. -//! It is also possible to create code with the -//! [code (indented)][code_indented] construct. -//! -//! ## HTML -//! -//! Code (fenced) relates to both the `
<pre>` and the `<code>` elements in
-//! HTML.
-//! See [*§ 4.4.3 The `pre` element*][html_pre] and the [*§ 4.5.15 The `code`
-//! element*][html_code] in the HTML spec for more info.
-//!
-//! The `info` is, when rendering to HTML, typically exposed as a class.
-//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
-//! element*][html_code]).
-//! For example:
-//!
-//! ```markdown
-//! ~~~css
-//! * { color: tomato }
-//! ~~~
-//! ```
-//!
-//! Yields:
-//!
-//! ```html
-//! <pre><code class="language-css">* { color: tomato }
-//! </code></pre>
-//! ``` -//! -//! ## Recommendation -//! -//! It is recommended to use code (fenced) instead of code (indented). -//! Code (fenced) is more explicit, similar to code (text), and has support -//! for specifying the programming language. -//! -//! ## Tokens -//! -//! * [`CodeFenced`][Name::CodeFenced] -//! * [`CodeFencedFence`][Name::CodeFencedFence] -//! * [`CodeFencedFenceInfo`][Name::CodeFencedFenceInfo] -//! * [`CodeFencedFenceMeta`][Name::CodeFencedFenceMeta] -//! * [`CodeFencedFenceSequence`][Name::CodeFencedFenceSequence] -//! * [`CodeFlowChunk`][Name::CodeFlowChunk] -//! * [`LineEnding`][Name::LineEnding] -//! * [`SpaceOrTab`][Name::SpaceOrTab] -//! -//! ## References -//! -//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) -//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) -//! -//! [flow]: crate::construct::flow -//! [string]: crate::construct::string -//! [text]: crate::construct::text -//! [character_escape]: crate::construct::character_escape -//! [character_reference]: crate::construct::character_reference -//! [code_indented]: crate::construct::code_indented -//! [raw_text]: crate::construct::raw_text -//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element -//! [html_pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element - -use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; -use crate::event::{Content, Link, Name}; -use crate::state::{Name as StateName, State}; -use crate::tokenizer::Tokenizer; -use crate::util::{ - constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}, - slice::{Position, Slice}, -}; - -/// Start of fenced code. 
-/// -/// ```markdown -/// > | ~~~js -/// ^ -/// | console.log(1) -/// | ~~~ -/// ``` -pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.options.constructs.code_fenced { - if matches!(tokenizer.current, Some(b'\t' | b' ')) { - tokenizer.enter(Name::CodeFenced); - tokenizer.enter(Name::CodeFencedFence); - tokenizer.attempt( - State::Next(StateName::CodeFencedBeforeSequenceOpen), - State::Nok, - ); - return State::Retry(space_or_tab_min_max( - tokenizer, - 0, - if tokenizer.parse_state.options.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - )); - } - - if matches!(tokenizer.current, Some(b'`' | b'~')) { - tokenizer.enter(Name::CodeFenced); - tokenizer.enter(Name::CodeFencedFence); - return State::Retry(StateName::CodeFencedBeforeSequenceOpen); - } - } - - State::Nok -} - -/// In opening fence, after prefix, at sequence. -/// -/// ```markdown -/// > | ~~~js -/// ^ -/// | console.log(1) -/// | ~~~ -/// ``` -pub fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { - let tail = tokenizer.events.last(); - let mut prefix = 0; - - if let Some(event) = tail { - if event.name == Name::SpaceOrTab { - prefix = Slice::from_position( - tokenizer.parse_state.bytes, - &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1), - ) - .len(); - } - } - - if let Some(b'`' | b'~') = tokenizer.current { - tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); - tokenizer.tokenize_state.size_c = prefix; - tokenizer.enter(Name::CodeFencedFenceSequence); - State::Retry(StateName::CodeFencedSequenceOpen) - } else { - State::Nok - } -} - -/// In opening fence sequence. 
-/// -/// ```markdown -/// > | ~~~js -/// ^ -/// | console.log(1) -/// | ~~~ -/// ``` -pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { - if tokenizer.current == Some(tokenizer.tokenize_state.marker) { - tokenizer.tokenize_state.size += 1; - tokenizer.consume(); - State::Next(StateName::CodeFencedSequenceOpen) - } else if tokenizer.tokenize_state.size < CODE_FENCED_SEQUENCE_SIZE_MIN { - tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.size_c = 0; - tokenizer.tokenize_state.size = 0; - State::Nok - } else if matches!(tokenizer.current, Some(b'\t' | b' ')) { - tokenizer.exit(Name::CodeFencedFenceSequence); - tokenizer.attempt(State::Next(StateName::CodeFencedInfoBefore), State::Nok); - State::Retry(space_or_tab(tokenizer)) - } else { - tokenizer.exit(Name::CodeFencedFenceSequence); - State::Retry(StateName::CodeFencedInfoBefore) - } -} - -/// In opening fence, after the sequence (and optional whitespace), before info. -/// -/// ```markdown -/// > | ~~~js -/// ^ -/// | console.log(1) -/// | ~~~ -/// ``` -pub fn info_before(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => { - tokenizer.exit(Name::CodeFencedFence); - // Do not form containers. - tokenizer.concrete = true; - tokenizer.check( - State::Next(StateName::CodeFencedAtNonLazyBreak), - State::Next(StateName::CodeFencedAfter), - ); - State::Retry(StateName::NonLazyContinuationStart) - } - _ => { - tokenizer.enter(Name::CodeFencedFenceInfo); - tokenizer.enter_link( - Name::Data, - Link { - previous: None, - next: None, - content: Content::String, - }, - ); - State::Retry(StateName::CodeFencedInfo) - } - } -} - -/// In info. 
-/// -/// ```markdown -/// > | ~~~js -/// ^ -/// | console.log(1) -/// | ~~~ -/// ``` -pub fn info(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => { - tokenizer.exit(Name::Data); - tokenizer.exit(Name::CodeFencedFenceInfo); - State::Retry(StateName::CodeFencedInfoBefore) - } - Some(b'\t' | b' ') => { - tokenizer.exit(Name::Data); - tokenizer.exit(Name::CodeFencedFenceInfo); - tokenizer.attempt(State::Next(StateName::CodeFencedMetaBefore), State::Nok); - State::Retry(space_or_tab(tokenizer)) - } - Some(byte) => { - if tokenizer.tokenize_state.marker == byte && byte == b'`' { - tokenizer.concrete = false; - tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.size_c = 0; - tokenizer.tokenize_state.size = 0; - State::Nok - } else { - tokenizer.consume(); - State::Next(StateName::CodeFencedInfo) - } - } - } -} - -/// In opening fence, after info and whitespace, before meta. -/// -/// ```markdown -/// > | ~~~js eval -/// ^ -/// | console.log(1) -/// | ~~~ -/// ``` -pub fn meta_before(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => State::Retry(StateName::CodeFencedInfoBefore), - _ => { - tokenizer.enter(Name::CodeFencedFenceMeta); - tokenizer.enter_link( - Name::Data, - Link { - previous: None, - next: None, - content: Content::String, - }, - ); - State::Retry(StateName::CodeFencedMeta) - } - } -} - -/// In meta. 
-/// -/// ```markdown -/// > | ~~~js eval -/// ^ -/// | console.log(1) -/// | ~~~ -/// ``` -pub fn meta(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => { - tokenizer.exit(Name::Data); - tokenizer.exit(Name::CodeFencedFenceMeta); - State::Retry(StateName::CodeFencedInfoBefore) - } - Some(byte) => { - if tokenizer.tokenize_state.marker == byte && byte == b'`' { - tokenizer.concrete = false; - tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.size_c = 0; - tokenizer.tokenize_state.size = 0; - State::Nok - } else { - tokenizer.consume(); - State::Next(StateName::CodeFencedMeta) - } - } - } -} - -/// At eol/eof in code, before a non-lazy closing fence or content. -/// -/// ```markdown -/// > | ~~~js -/// ^ -/// > | console.log(1) -/// ^ -/// | ~~~ -/// ``` -pub fn at_non_lazy_break(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt( - State::Next(StateName::CodeFencedAfter), - State::Next(StateName::CodeFencedContentBefore), - ); - tokenizer.enter(Name::LineEnding); - tokenizer.consume(); - tokenizer.exit(Name::LineEnding); - State::Next(StateName::CodeFencedCloseStart) -} - -/// Before closing fence, at optional whitespace. -/// -/// ```markdown -/// | ~~~js -/// | console.log(1) -/// > | ~~~ -/// ^ -/// ``` -pub fn close_start(tokenizer: &mut Tokenizer) -> State { - tokenizer.enter(Name::CodeFencedFence); - - if matches!(tokenizer.current, Some(b'\t' | b' ')) { - tokenizer.attempt( - State::Next(StateName::CodeFencedBeforeSequenceClose), - State::Nok, - ); - - State::Retry(space_or_tab_min_max( - tokenizer, - 0, - if tokenizer.parse_state.options.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - )) - } else { - State::Retry(StateName::CodeFencedBeforeSequenceClose) - } -} - -/// In closing fence, after optional whitespace, at sequence. 
-/// -/// ```markdown -/// | ~~~js -/// | console.log(1) -/// > | ~~~ -/// ^ -/// ``` -pub fn before_sequence_close(tokenizer: &mut Tokenizer) -> State { - if tokenizer.current == Some(tokenizer.tokenize_state.marker) { - tokenizer.enter(Name::CodeFencedFenceSequence); - State::Retry(StateName::CodeFencedSequenceClose) - } else { - State::Nok - } -} - -/// In closing fence sequence. -/// -/// ```markdown -/// | ~~~js -/// | console.log(1) -/// > | ~~~ -/// ^ -/// ``` -pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { - if tokenizer.current == Some(tokenizer.tokenize_state.marker) { - tokenizer.tokenize_state.size_b += 1; - tokenizer.consume(); - State::Next(StateName::CodeFencedSequenceClose) - } else if tokenizer.tokenize_state.size_b >= CODE_FENCED_SEQUENCE_SIZE_MIN - && tokenizer.tokenize_state.size_b >= tokenizer.tokenize_state.size - { - tokenizer.tokenize_state.size_b = 0; - tokenizer.exit(Name::CodeFencedFenceSequence); - - if matches!(tokenizer.current, Some(b'\t' | b' ')) { - tokenizer.attempt( - State::Next(StateName::CodeFencedAfterSequenceClose), - State::Nok, - ); - State::Retry(space_or_tab(tokenizer)) - } else { - State::Retry(StateName::CodeFencedAfterSequenceClose) - } - } else { - tokenizer.tokenize_state.size_b = 0; - State::Nok - } -} - -/// After closing fence sequence, after optional whitespace. -/// -/// ```markdown -/// | ~~~js -/// | console.log(1) -/// > | ~~~ -/// ^ -/// ``` -pub fn sequence_close_after(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => { - tokenizer.exit(Name::CodeFencedFence); - State::Ok - } - _ => State::Nok, - } -} - -/// Before closing fence, at eol. 
-/// -/// ```markdown -/// | ~~~js -/// > | console.log(1) -/// ^ -/// | ~~~ -/// ``` -pub fn content_before(tokenizer: &mut Tokenizer) -> State { - tokenizer.enter(Name::LineEnding); - tokenizer.consume(); - tokenizer.exit(Name::LineEnding); - State::Next(StateName::CodeFencedContentStart) -} - -/// Before code content, definitely not before a closing fence. -/// -/// ```markdown -/// | ~~~js -/// > | console.log(1) -/// ^ -/// | ~~~ -/// ``` -pub fn content_start(tokenizer: &mut Tokenizer) -> State { - if matches!(tokenizer.current, Some(b'\t' | b' ')) { - tokenizer.attempt( - State::Next(StateName::CodeFencedBeforeContentChunk), - State::Nok, - ); - State::Retry(space_or_tab_min_max( - tokenizer, - 0, - tokenizer.tokenize_state.size_c, - )) - } else { - State::Retry(StateName::CodeFencedBeforeContentChunk) - } -} - -/// Before code content, after optional prefix. -/// -/// ```markdown -/// | ~~~js -/// > | console.log(1) -/// ^ -/// | ~~~ -/// ``` -pub fn before_content_chunk(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => { - tokenizer.check( - State::Next(StateName::CodeFencedAtNonLazyBreak), - State::Next(StateName::CodeFencedAfter), - ); - State::Retry(StateName::NonLazyContinuationStart) - } - _ => { - tokenizer.enter(Name::CodeFlowChunk); - State::Retry(StateName::CodeFencedContentChunk) - } - } -} - -/// In code content. -/// -/// ```markdown -/// | ~~~js -/// > | console.log(1) -/// ^^^^^^^^^^^^^^ -/// | ~~~ -/// ``` -pub fn content_chunk(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - None | Some(b'\n') => { - tokenizer.exit(Name::CodeFlowChunk); - State::Retry(StateName::CodeFencedBeforeContentChunk) - } - _ => { - tokenizer.consume(); - State::Next(StateName::CodeFencedContentChunk) - } - } -} - -/// After fenced code. 
-/// -/// ```markdown -/// | ~~~js -/// | console.log(1) -/// > | ~~~ -/// ^ -/// ``` -pub fn after(tokenizer: &mut Tokenizer) -> State { - tokenizer.exit(Name::CodeFenced); - tokenizer.tokenize_state.marker = 0; - tokenizer.tokenize_state.size_c = 0; - tokenizer.tokenize_state.size = 0; - // Feel free to interrupt. - tokenizer.interrupt = false; - // No longer concrete. - tokenizer.concrete = false; - State::Ok -} diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 5fc9446..f2644d4 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -18,7 +18,7 @@ //! //! In markdown, it is also possible to use [code (text)][raw_text] in the //! [text][] content type. -//! It is also possible to create code with the [code (fenced)][code_fenced] +//! It is also possible to create code with the [code (fenced)][raw_flow] //! construct. //! //! ## HTML @@ -48,7 +48,7 @@ //! //! [flow]: crate::construct::flow //! [text]: crate::construct::text -//! [code_fenced]: crate::construct::code_fenced +//! [raw_flow]: crate::construct::raw_flow //! [raw_text]: crate::construct::raw_text //! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element //! [html_pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element diff --git a/src/construct/flow.rs b/src/construct/flow.rs index f3c7685..3f1cd77 100644 --- a/src/construct/flow.rs +++ b/src/construct/flow.rs @@ -11,12 +11,12 @@ //! The constructs found in flow are: //! //! * [Blank line][crate::construct::blank_line] -//! * [Code (fenced)][crate::construct::code_fenced] //! * [Code (indented)][crate::construct::code_indented] //! * [Definition][crate::construct::definition] //! * [Heading (atx)][crate::construct::heading_atx] //! * [Heading (setext)][crate::construct::heading_setext] //! * [HTML (flow)][crate::construct::html_flow] +//! * [Raw (flow)][crate::construct::raw_flow] (code (fenced), math (flow)) //! 
* [Thematic break][crate::construct::thematic_break] use crate::event::Name; @@ -42,6 +42,15 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::HeadingAtxStart) } + Some(b'$' | b'`' | b'~') => { + tokenizer.attempt( + State::Next(StateName::FlowAfter), + State::Next(StateName::FlowBeforeParagraph), + ); + State::Retry(StateName::RawFlowStart) + } + // Note: `-` is also used in setext heading underline so it’s not + // included here. Some(b'*' | b'_') => { tokenizer.attempt( State::Next(StateName::FlowAfter), @@ -71,13 +80,6 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::DefinitionStart) } - Some(b'`' | b'~') => { - tokenizer.attempt( - State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), - ); - State::Retry(StateName::CodeFencedStart) - } // Actual parsing: blank line? Indented code? Indented anything? // Also includes `-` which can be a setext heading underline or thematic break. None | Some(b'\t' | b'\n' | b' ' | b'-') => State::Retry(StateName::FlowBlankLineBefore), @@ -112,23 +114,23 @@ pub fn blank_line_before(tokenizer: &mut Tokenizer) -> State { pub fn before_code_indented(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeCodeFenced), + State::Next(StateName::FlowBeforeRaw), ); State::Retry(StateName::CodeIndentedStart) } -/// At code (fenced). +/// At raw. /// /// ````markdown /// > | ``` /// ^ /// ```` -pub fn before_code_fenced(tokenizer: &mut Tokenizer) -> State { +pub fn before_raw(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( State::Next(StateName::FlowAfter), State::Next(StateName::FlowBeforeHtml), ); - State::Retry(StateName::CodeFencedStart) + State::Retry(StateName::RawFlowStart) } /// At html (flow). diff --git a/src/construct/mod.rs b/src/construct/mod.rs index a0065fa..9add015 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -25,7 +25,7 @@ //! 
thematic break. //! These things are called constructs here. //! Sometimes, there are several constructs that result in an equivalent thing. -//! For example, [code (fenced)][code_fenced] and +//! For example, [code (fenced)][raw_flow] and //! [code (indented)][code_indented] are considered different constructs. //! //! The following constructs are found in markdown (CommonMark): @@ -36,7 +36,6 @@ //! * [block quote][block_quote] //! * [character escape][character_escape] //! * [character reference][character_reference] -//! * [code (fenced)][code_fenced] //! * [code (indented)][code_indented] //! * [code (text)][raw_text] //! * [definition][] @@ -50,6 +49,7 @@ //! * [label start (link)][label_start_link] //! * [list item][list_item] //! * [paragraph][] +//! * [raw (flow)][raw_flow] (code (fenced), math (flow)) //! * [thematic break][thematic_break] //! //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by @@ -143,7 +143,6 @@ pub mod blank_line; pub mod block_quote; pub mod character_escape; pub mod character_reference; -pub mod code_fenced; pub mod code_indented; pub mod definition; pub mod document; @@ -172,6 +171,7 @@ pub mod partial_space_or_tab; pub mod partial_space_or_tab_eol; pub mod partial_title; pub mod partial_whitespace; +pub mod raw_flow; pub mod raw_text; pub mod string; pub mod text; diff --git a/src/construct/partial_non_lazy_continuation.rs b/src/construct/partial_non_lazy_continuation.rs index 35641ee..26a20dd 100644 --- a/src/construct/partial_non_lazy_continuation.rs +++ b/src/construct/partial_non_lazy_continuation.rs @@ -2,11 +2,11 @@ //! //! This is a tiny helper that [flow][] constructs can use to make sure that //! the following line is not lazy. -//! For example, [html (flow)][html_flow] and code ([fenced][code_fenced], +//! For example, [html (flow)][html_flow] and ([raw (flow)][raw_flow], //! [indented][code_indented]), stop when the next line is lazy. //! //! [flow]: crate::construct::flow -//! 
[code_fenced]: crate::construct::code_fenced +//! [raw_flow]: crate::construct::raw_flow //! [code_indented]: crate::construct::code_indented //! [html_flow]: crate::construct::html_flow diff --git a/src/construct/raw_flow.rs b/src/construct/raw_flow.rs new file mode 100644 index 0000000..7eaac0c --- /dev/null +++ b/src/construct/raw_flow.rs @@ -0,0 +1,665 @@ +//! Raw (flow) occurs in the [flow][] content type. +//! It forms code (fenced) and math (flow). +//! +//! ## Grammar +//! +//! Code (fenced) forms with the following BNF +//! (see [construct][crate::construct] for character groups): +//! +//! ```bnf +//! raw_flow ::= fence_open *( eol *byte ) [ eol fence_close ] +//! +//! ; Restriction: math (flow) does not support the `info` part. +//! fence_open ::= sequence [1*space_or_tab info [1*space_or_tab meta]] *space_or_tab +//! ; Restriction: the number of markers in the closing fence sequence must be +//! ; equal to or greater than the number of markers in the opening fence +//! ; sequence. +//! ; Restriction: the marker in the closing fence sequence must match the +//! ; marker in the opening fence sequence +//! fence_close ::= sequence *space_or_tab +//! sequence ::= 3*'`' | 3*'~' | 2*'$' +//! ; Restriction: the marker cannot occur in `info` if it is the `$` or `` ` `` character. +//! info ::= 1*text +//! ; Restriction: the marker cannot occur in `meta` if it is the `$` or `` ` `` character. +//! meta ::= 1*text *(*space_or_tab 1*text) +//! ``` +//! +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file). +//! +//! The above grammar does not show how indentation (with `space_or_tab`) of +//! each line is handled. +//! To parse raw (flow), let `x` be the number of `space_or_tab` characters +//! before the opening fence sequence. +//! Each line of text is then allowed (not required) to be indented with up +//! 
to `x` spaces or tabs, which are then ignored as an indent instead of being +//! considered as part of the content. +//! This indent does not affect the closing fence. +//! It can be indented up to a separate 3 spaces or tabs. +//! A bigger indent makes it part of the content instead of a fence. +//! +//! The `info` and `meta` parts are interpreted as the [string][] content type. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. +//! Math (flow) does not support `info`. +//! +//! The optional `meta` part is ignored: it is not used when parsing or +//! rendering. +//! +//! The optional `info` part is used and is expected to specify the programming +//! language that the content is in. +//! Which value it holds depends on what your syntax highlighter supports, if +//! one is used. +//! +//! In markdown, it is also possible to use [raw (text)][raw_text] in the +//! [text][] content type. +//! It is also possible to create code with the +//! [code (indented)][code_indented] construct. +//! +//! ## HTML +//! +//! Code (fenced) relates to both the `
<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html_pre] and the [*§ 4.5.15 The `code`
+//! element*][html_code] in the HTML spec for more info.
+//!
+//! Math (flow) does not relate to HTML elements.
+//! `MathML`, which is sort of like SVG but for math, exists but it doesn’t work
+//! well and isn’t widely supported.
+//! Instead, it is recommended to use client side JavaScript with something like
+//! `KaTeX` or `MathJax` to process the math.
+//! For that, the math is compiled as a `
<pre>`, and a `<code>` element with two
+//! classes: `language-math` and `math-display`.
+//! Client side JavaScript can look for these classes to process them further.
+//!
+//! The `info` is, when rendering to HTML, typically exposed as a class.
+//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
+//! element*][html_code]).
+//! For example:
+//!
+//! ```markdown
+//! ~~~css
+//! * { color: tomato }
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code class="language-css">* { color: tomato }
+//! </code></pre>
+//! ``` +//! +//! ## Recommendation +//! +//! It is recommended to use code (fenced) instead of code (indented). +//! Code (fenced) is more explicit, similar to code (text), and has support +//! for specifying the programming language. +//! +//! When authoring markdown with math, keep in mind that math doesn’t work in +//! most places. +//! Notably, GitHub currently has a really weird crappy client-side regex-based +//! thing. +//! But on your own (math-heavy?) site it can be great! +//! You can use code (fenced) with an info string of `math` to improve this, as +//! that works in many places. +//! +//! ## Tokens +//! +//! * [`CodeFenced`][Name::CodeFenced] +//! * [`CodeFencedFence`][Name::CodeFencedFence] +//! * [`CodeFencedFenceInfo`][Name::CodeFencedFenceInfo] +//! * [`CodeFencedFenceMeta`][Name::CodeFencedFenceMeta] +//! * [`CodeFencedFenceSequence`][Name::CodeFencedFenceSequence] +//! * [`CodeFlowChunk`][Name::CodeFlowChunk] +//! * [`LineEnding`][Name::LineEnding] +//! * [`MathFlow`][Name::MathFlow] +//! * [`MathFlowFence`][Name::MathFlowFence] +//! * [`MathFlowFenceMeta`][Name::MathFlowFenceMeta] +//! * [`MathFlowFenceSequence`][Name::MathFlowFenceSequence] +//! * [`MathFlowChunk`][Name::MathFlowChunk] +//! * [`SpaceOrTab`][Name::SpaceOrTab] +//! +//! ## References +//! +//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) +//! * [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math) +//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) +//! +//! > 👉 **Note**: math is not specified anywhere. +//! +//! [flow]: crate::construct::flow +//! [string]: crate::construct::string +//! [text]: crate::construct::text +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference +//! 
[code_indented]: crate::construct::code_indented +//! [raw_text]: crate::construct::raw_text +//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! [html_pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element + +use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::event::{Content, Link, Name}; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{ + constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}, + slice::{Position, Slice}, +}; + +/// Start of raw. +/// +/// ```markdown +/// > | ~~~js +/// ^ +/// | console.log(1) +/// | ~~~ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + if tokenizer.parse_state.options.constructs.code_fenced + || tokenizer.parse_state.options.constructs.math_flow + { + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt( + State::Next(StateName::RawFlowBeforeSequenceOpen), + State::Nok, + ); + return State::Retry(space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.options.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )); + } + + if matches!(tokenizer.current, Some(b'$' | b'`' | b'~')) { + return State::Retry(StateName::RawFlowBeforeSequenceOpen); + } + } + + State::Nok +} + +/// In opening fence, after prefix, at sequence. +/// +/// ```markdown +/// > | ~~~js +/// ^ +/// | console.log(1) +/// | ~~~ +/// ``` +pub fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.name == Name::SpaceOrTab { + prefix = Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1), + ) + .len(); + } + } + + // Code (fenced). 
+ if (tokenizer.parse_state.options.constructs.code_fenced + && matches!(tokenizer.current, Some(b'`' | b'~'))) + // Math (flow). + || (tokenizer.parse_state.options.constructs.math_flow && tokenizer.current == Some(b'$')) + { + tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); + tokenizer.tokenize_state.size_c = prefix; + if tokenizer.tokenize_state.marker == b'$' { + tokenizer.tokenize_state.token_1 = Name::MathFlow; + tokenizer.tokenize_state.token_2 = Name::MathFlowFence; + tokenizer.tokenize_state.token_3 = Name::MathFlowFenceSequence; + // Math (flow) does not support an `info` part: everything after the + // opening sequence is the `meta` part. + tokenizer.tokenize_state.token_5 = Name::MathFlowFenceMeta; + tokenizer.tokenize_state.token_6 = Name::MathFlowChunk; + } else { + tokenizer.tokenize_state.token_1 = Name::CodeFenced; + tokenizer.tokenize_state.token_2 = Name::CodeFencedFence; + tokenizer.tokenize_state.token_3 = Name::CodeFencedFenceSequence; + tokenizer.tokenize_state.token_4 = Name::CodeFencedFenceInfo; + tokenizer.tokenize_state.token_5 = Name::CodeFencedFenceMeta; + tokenizer.tokenize_state.token_6 = Name::CodeFlowChunk; + } + + tokenizer.enter(tokenizer.tokenize_state.token_1.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); + tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); + State::Retry(StateName::RawFlowSequenceOpen) + } else { + State::Nok + } +} + +/// In opening fence sequence. +/// +/// ```markdown +/// > | ~~~js +/// ^ +/// | console.log(1) +/// | ~~~ +/// ``` +pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::RawFlowSequenceOpen) + } + // To do: constant. 
+ else if tokenizer.tokenize_state.size + < (if tokenizer.tokenize_state.marker == b'$' { + 2 + } else { + CODE_FENCED_SEQUENCE_SIZE_MIN + }) + { + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size_c = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + tokenizer.tokenize_state.token_4 = Name::Data; + tokenizer.tokenize_state.token_5 = Name::Data; + tokenizer.tokenize_state.token_6 = Name::Data; + State::Nok + } else { + // Math (flow) does not support an `info` part: everything after the + // opening sequence is the `meta` part. + let next = if tokenizer.tokenize_state.marker == b'$' { + StateName::RawFlowMetaBefore + } else { + StateName::RawFlowInfoBefore + }; + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + tokenizer.attempt(State::Next(next), State::Nok); + State::Retry(space_or_tab(tokenizer)) + } else { + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + State::Retry(next) + } + } +} + +/// In opening fence, after the sequence (and optional whitespace), before info. +/// +/// ```markdown +/// > | ~~~js +/// ^ +/// | console.log(1) +/// | ~~~ +/// ``` +pub fn info_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + // Do not form containers. + tokenizer.concrete = true; + tokenizer.check( + State::Next(StateName::RawFlowAtNonLazyBreak), + State::Next(StateName::RawFlowAfter), + ); + State::Retry(StateName::NonLazyContinuationStart) + } + _ => { + tokenizer.enter(tokenizer.tokenize_state.token_4.clone()); + tokenizer.enter_link( + Name::Data, + Link { + previous: None, + next: None, + content: Content::String, + }, + ); + State::Retry(StateName::RawFlowInfo) + } + } +} + +/// In info. 
+/// +/// ```markdown +/// > | ~~~js +/// ^ +/// | console.log(1) +/// | ~~~ +/// ``` +pub fn info(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(Name::Data); + tokenizer.exit(tokenizer.tokenize_state.token_4.clone()); + State::Retry(StateName::RawFlowInfoBefore) + } + Some(b'\t' | b' ') => { + tokenizer.exit(Name::Data); + tokenizer.exit(tokenizer.tokenize_state.token_4.clone()); + tokenizer.attempt(State::Next(StateName::RawFlowMetaBefore), State::Nok); + State::Retry(space_or_tab(tokenizer)) + } + Some(byte) => { + // This looks like code (text) / math (text). + // Note: no reason to check for `~`, because 3 of them can‘t be + // used as strikethrough in text. + if tokenizer.tokenize_state.marker == byte && matches!(byte, b'$' | b'`') { + tokenizer.concrete = false; + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size_c = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + tokenizer.tokenize_state.token_4 = Name::Data; + tokenizer.tokenize_state.token_5 = Name::Data; + tokenizer.tokenize_state.token_6 = Name::Data; + State::Nok + } else { + tokenizer.consume(); + State::Next(StateName::RawFlowInfo) + } + } + } +} + +/// In opening fence, after info and whitespace, before meta. +/// +/// ```markdown +/// > | ~~~js eval +/// ^ +/// | console.log(1) +/// | ~~~ +/// ``` +pub fn meta_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => State::Retry(StateName::RawFlowInfoBefore), + _ => { + tokenizer.enter(tokenizer.tokenize_state.token_5.clone()); + tokenizer.enter_link( + Name::Data, + Link { + previous: None, + next: None, + content: Content::String, + }, + ); + State::Retry(StateName::RawFlowMeta) + } + } +} + +/// In meta. 
+/// +/// ```markdown +/// > | ~~~js eval +/// ^ +/// | console.log(1) +/// | ~~~ +/// ``` +pub fn meta(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(Name::Data); + tokenizer.exit(tokenizer.tokenize_state.token_5.clone()); + State::Retry(StateName::RawFlowInfoBefore) + } + Some(byte) => { + // This looks like code (text) / math (text). + // Note: no reason to check for `~`, because 3 of them can‘t be + // used as strikethrough in text. + if tokenizer.tokenize_state.marker == byte && matches!(byte, b'$' | b'`') { + tokenizer.concrete = false; + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size_c = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + tokenizer.tokenize_state.token_4 = Name::Data; + tokenizer.tokenize_state.token_5 = Name::Data; + tokenizer.tokenize_state.token_6 = Name::Data; + State::Nok + } else { + tokenizer.consume(); + State::Next(StateName::RawFlowMeta) + } + } + } +} + +/// At eol/eof in code, before a non-lazy closing fence or content. +/// +/// ```markdown +/// > | ~~~js +/// ^ +/// > | console.log(1) +/// ^ +/// | ~~~ +/// ``` +pub fn at_non_lazy_break(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt( + State::Next(StateName::RawFlowAfter), + State::Next(StateName::RawFlowContentBefore), + ); + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::RawFlowCloseStart) +} + +/// Before closing fence, at optional whitespace. 
+/// +/// ```markdown +/// | ~~~js +/// | console.log(1) +/// > | ~~~ +/// ^ +/// ``` +pub fn close_start(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(tokenizer.tokenize_state.token_2.clone()); + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt( + State::Next(StateName::RawFlowBeforeSequenceClose), + State::Nok, + ); + + State::Retry(space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.options.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )) + } else { + State::Retry(StateName::RawFlowBeforeSequenceClose) + } +} + +/// In closing fence, after optional whitespace, at sequence. +/// +/// ```markdown +/// | ~~~js +/// | console.log(1) +/// > | ~~~ +/// ^ +/// ``` +pub fn before_sequence_close(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); + State::Retry(StateName::RawFlowSequenceClose) + } else { + State::Nok + } +} + +/// In closing fence sequence. +/// +/// ```markdown +/// | ~~~js +/// | console.log(1) +/// > | ~~~ +/// ^ +/// ``` +pub fn sequence_close(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(tokenizer.tokenize_state.marker) { + tokenizer.tokenize_state.size_b += 1; + tokenizer.consume(); + State::Next(StateName::RawFlowSequenceClose) + } else if tokenizer.tokenize_state.size_b >= tokenizer.tokenize_state.size { + tokenizer.tokenize_state.size_b = 0; + tokenizer.exit(tokenizer.tokenize_state.token_3.clone()); + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt( + State::Next(StateName::RawFlowAfterSequenceClose), + State::Nok, + ); + State::Retry(space_or_tab(tokenizer)) + } else { + State::Retry(StateName::RawFlowAfterSequenceClose) + } + } else { + tokenizer.tokenize_state.size_b = 0; + State::Nok + } +} + +/// After closing fence sequence, after optional whitespace. 
+/// +/// ```markdown +/// | ~~~js +/// | console.log(1) +/// > | ~~~ +/// ^ +/// ``` +pub fn sequence_close_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); + State::Ok + } + _ => State::Nok, + } +} + +/// Before closing fence, at eol. +/// +/// ```markdown +/// | ~~~js +/// > | console.log(1) +/// ^ +/// | ~~~ +/// ``` +pub fn content_before(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::RawFlowContentStart) +} + +/// Before code content, definitely not before a closing fence. +/// +/// ```markdown +/// | ~~~js +/// > | console.log(1) +/// ^ +/// | ~~~ +/// ``` +pub fn content_start(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt( + State::Next(StateName::RawFlowBeforeContentChunk), + State::Nok, + ); + State::Retry(space_or_tab_min_max( + tokenizer, + 0, + tokenizer.tokenize_state.size_c, + )) + } else { + State::Retry(StateName::RawFlowBeforeContentChunk) + } +} + +/// Before code content, after optional prefix. +/// +/// ```markdown +/// | ~~~js +/// > | console.log(1) +/// ^ +/// | ~~~ +/// ``` +pub fn before_content_chunk(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.check( + State::Next(StateName::RawFlowAtNonLazyBreak), + State::Next(StateName::RawFlowAfter), + ); + State::Retry(StateName::NonLazyContinuationStart) + } + _ => { + tokenizer.enter(tokenizer.tokenize_state.token_6.clone()); + State::Retry(StateName::RawFlowContentChunk) + } + } +} + +/// In code content. 
+/// +/// ```markdown +/// | ~~~js +/// > | console.log(1) +/// ^^^^^^^^^^^^^^ +/// | ~~~ +/// ``` +pub fn content_chunk(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(tokenizer.tokenize_state.token_6.clone()); + State::Retry(StateName::RawFlowBeforeContentChunk) + } + _ => { + tokenizer.consume(); + State::Next(StateName::RawFlowContentChunk) + } + } +} + +/// After raw. +/// +/// ```markdown +/// | ~~~js +/// | console.log(1) +/// > | ~~~ +/// ^ +/// ``` +pub fn after(tokenizer: &mut Tokenizer) -> State { + tokenizer.exit(tokenizer.tokenize_state.token_1.clone()); + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.size_c = 0; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + tokenizer.tokenize_state.token_4 = Name::Data; + tokenizer.tokenize_state.token_5 = Name::Data; + tokenizer.tokenize_state.token_6 = Name::Data; + // Feel free to interrupt. + tokenizer.interrupt = false; + // No longer concrete. + tokenizer.concrete = false; + State::Ok +} diff --git a/src/construct/raw_text.rs b/src/construct/raw_text.rs index 7f3990d..1a4d03c 100644 --- a/src/construct/raw_text.rs +++ b/src/construct/raw_text.rs @@ -57,10 +57,9 @@ //! if both exist and there is also a non-space in the code, are removed. //! Line endings, at that stage, are considered as spaces. //! -//! In markdown, it is possible to create code with the -//! [code (fenced)][code_fenced] or [code (indented)][code_indented], -//! and math with the [math (flow)][math_flow] constructs in the [flow][] -//! content type. +//! In markdown, it is possible to create code or math with the +//! [raw (flow)][raw_flow] (or [code (indented)][code_indented]) constructs +//! in the [flow][] content type. //! //! ## HTML //! @@ -74,7 +73,7 @@ //! 
Instead, it is recommended to use client side JavaScript with something like //! `KaTeX` or `MathJax` to process the math //! For that, the math is compiled as a `` element with two classes: -//! `lang-math` and `math-inline`. +//! `language-math` and `math-inline`. //! Client side JavaScript can look for these classes to process them further. //! //! When turning markdown into HTML, each line ending in raw (text) is turned @@ -87,9 +86,9 @@ //! Notably, GitHub currently has a really weird crappy client-side regex-based //! thing. //! But on your own (math-heavy?) site it can be great! -//! Alternatively, set `options.math_text_single_dollar: false`, which prevents -//! single dollars from being seen as math, and thus prevents normal dollars in -//! text from being seen as math. +//! You can set `options.math_text_single_dollar: false` to improve this, as it +//! prevents single dollars from being seen as math, and thus prevents normal +//! dollars in text from being seen as math. //! //! ## Tokens //! @@ -107,10 +106,12 @@ //! * [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math) //! * [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans) //! +//! > 👉 **Note**: math is not specified anywhere. +//! //! [flow]: crate::construct::flow //! [text]: crate::construct::text //! [code_indented]: crate::construct::code_indented -//! [code_fenced]: crate::construct::code_fenced +//! [raw_flow]: crate::construct::raw_flow //! [math_flow]: # "to do" //! 
[html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element diff --git a/src/event.rs b/src/event.rs index 869f2e8..3e540c0 100644 --- a/src/event.rs +++ b/src/event.rs @@ -351,7 +351,7 @@ pub enum Name { /// [`LineEnding`][Name::LineEnding], /// [`SpaceOrTab`][Name::SpaceOrTab] /// * **Construct**: - /// [`code_fenced`][crate::construct::code_fenced] + /// [`raw_flow`][crate::construct::raw_flow] /// /// ## Example /// @@ -376,7 +376,7 @@ pub enum Name { /// [`CodeFencedFenceSequence`][Name::CodeFencedFenceSequence], /// [`SpaceOrTab`][Name::SpaceOrTab] /// * **Construct**: - /// [`code_fenced`][crate::construct::code_fenced] + /// [`raw_flow`][crate::construct::raw_flow] /// /// ## Example /// @@ -397,7 +397,7 @@ pub enum Name { /// * **Content model**: /// [string content][crate::construct::string] /// * **Construct**: - /// [`code_fenced`][crate::construct::code_fenced] + /// [`raw_flow`][crate::construct::raw_flow] /// /// ## Example /// @@ -417,7 +417,7 @@ pub enum Name { /// * **Content model**: /// [string content][crate::construct::string] /// * **Construct**: - /// [`code_fenced`][crate::construct::code_fenced] + /// [`raw_flow`][crate::construct::raw_flow] /// /// ## Example /// @@ -437,7 +437,7 @@ pub enum Name { /// * **Content model**: /// void /// * **Construct**: - /// [`code_fenced`][crate::construct::code_fenced] + /// [`raw_flow`][crate::construct::raw_flow] /// /// ## Example /// @@ -459,7 +459,7 @@ pub enum Name { /// * **Content model**: /// void /// * **Construct**: - /// [`code_fenced`][crate::construct::code_fenced], + /// [`raw_flow`][crate::construct::raw_flow], /// [`code_indented`][crate::construct::code_indented] /// /// ## Example @@ -487,7 +487,7 @@ pub enum Name { /// [`LineEnding`][Name::LineEnding], /// [`SpaceOrTab`][Name::SpaceOrTab] /// * **Construct**: - /// [`code_fenced`][crate::construct::code_fenced] + /// [`raw_flow`][crate::construct::raw_flow] /// /// ## Example /// @@ -1889,6 
+1889,115 @@ pub enum Name { /// ^^^ /// ``` ListUnordered, + /// Whole math (flow). + /// + /// ## Info + /// + /// * **Context**: + /// [flow content][crate::construct::flow] + /// * **Content model**: + /// [`MathFlowFence`][Name::MathFlowFence], + /// [`MathFlowChunk`][Name::MathFlowChunk], + /// [`LineEnding`][Name::LineEnding], + /// [`SpaceOrTab`][Name::SpaceOrTab] + /// * **Construct**: + /// [`raw_flow`][crate::construct::raw_flow] + /// + /// ## Example + /// + /// ```markdown + /// > | $$ + /// ^^ + /// > | \frac{1}{2} + /// ^^^^^^^^^^^ + /// > | $$ + /// ^^ + /// ``` + MathFlow, + /// A math (flow) fence. + /// + /// ## Info + /// + /// * **Context**: + /// [`MathFlow`][Name::MathFlow] + /// * **Content model**: + /// [`MathFlowFenceMeta`][Name::MathFlowFenceMeta], + /// [`MathFlowFenceSequence`][Name::MathFlowFenceSequence], + /// [`SpaceOrTab`][Name::SpaceOrTab] + /// * **Construct**: + /// [`raw_flow`][crate::construct::raw_flow] + /// + /// ## Example + /// + /// ```markdown + /// > | $$ + /// ^^ + /// | \frac{1}{2} + /// > | $$ + /// ^^ + /// ``` + MathFlowFence, + /// A math (flow) fence meta string. + /// + /// ## Info + /// + /// * **Context**: + /// [`MathFlowFence`][Name::MathFlowFence] + /// * **Content model**: + /// [string content][crate::construct::string] + /// * **Construct**: + /// [`raw_flow`][crate::construct::raw_flow] + /// + /// ## Example + /// + /// ```markdown + /// > | $$alpha bravo + /// ^^^^^^^^^^^ + /// | \frac{1}{2} + /// | $$ + /// ``` + MathFlowFenceMeta, + /// A math (flow) fence sequence. + /// + /// ## Info + /// + /// * **Context**: + /// [`MathFlowFence`][Name::MathFlowFence] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`raw_flow`][crate::construct::raw_flow] + /// + /// ## Example + /// + /// ```markdown + /// > | $$ + /// ^^ + /// | \frac{1}{2} + /// > | $$ + /// ^^ + /// ``` + MathFlowFenceSequence, + /// A math (flow) chunk. 
+ /// + /// ## Info + /// + /// * **Context**: + /// [`MathFlow`][Name::MathFlow] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`raw_flow`][crate::construct::raw_flow] + /// + /// ## Example + /// + /// ```markdown + /// | $$ + /// > | \frac{1}{2} + /// ^^^^^^^^^^^ + /// | $$ + /// ``` + MathFlowChunk, /// Whole math (text). /// /// ## Info @@ -2327,7 +2436,7 @@ pub enum Name { } /// List of void events, used to make sure everything is working well. -pub const VOID_EVENTS: [Name; 55] = [ +pub const VOID_EVENTS: [Name; 57] = [ Name::AttentionSequence, Name::AutolinkEmail, Name::AutolinkMarker, @@ -2375,6 +2484,8 @@ pub const VOID_EVENTS: [Name; 55] = [ Name::LineEnding, Name::ListItemMarker, Name::ListItemValue, + Name::MathFlowFenceSequence, + Name::MathFlowChunk, Name::MathTextData, Name::MathTextSequence, Name::ReferenceMarker, diff --git a/src/lib.rs b/src/lib.rs index 98a4936..4d1b762 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -272,6 +272,17 @@ pub struct Constructs { /// ^^^ /// ``` pub list_item: bool, + /// Math (flow). + /// + /// ```markdown + /// > | $$ + /// ^^ + /// > | \frac{1}{2} + /// ^^^^^^^^^^^ + /// > | $$ + /// ^^ + /// ``` + pub math_flow: bool, /// Math (text). /// /// ```markdown @@ -317,6 +328,7 @@ impl Default for Constructs { label_start_link: true, label_end: true, list_item: true, + math_flow: false, math_text: false, thematic_break: true, } @@ -730,7 +742,7 @@ pub struct Options { /// ..Options::default() /// } /// ), - /// "
<p><code class=\"lang-math math-inline\">a</code></p>
" + /// "
<p><code class=\"language-math math-inline\">a</code></p>
" /// ); /// /// // Pass `math_text_single_dollar: false` to turn that off: diff --git a/src/state.rs b/src/state.rs index 0c04821..a42e802 100644 --- a/src/state.rs +++ b/src/state.rs @@ -52,23 +52,23 @@ pub enum Name { CharacterReferenceNumeric, CharacterReferenceValue, - CodeFencedStart, - CodeFencedBeforeSequenceOpen, - CodeFencedSequenceOpen, - CodeFencedInfoBefore, - CodeFencedInfo, - CodeFencedMetaBefore, - CodeFencedMeta, - CodeFencedAtNonLazyBreak, - CodeFencedCloseStart, - CodeFencedBeforeSequenceClose, - CodeFencedSequenceClose, - CodeFencedAfterSequenceClose, - CodeFencedContentBefore, - CodeFencedContentStart, - CodeFencedBeforeContentChunk, - CodeFencedContentChunk, - CodeFencedAfter, + RawFlowStart, + RawFlowBeforeSequenceOpen, + RawFlowSequenceOpen, + RawFlowInfoBefore, + RawFlowInfo, + RawFlowMetaBefore, + RawFlowMeta, + RawFlowAtNonLazyBreak, + RawFlowCloseStart, + RawFlowBeforeSequenceClose, + RawFlowSequenceClose, + RawFlowAfterSequenceClose, + RawFlowContentBefore, + RawFlowContentStart, + RawFlowBeforeContentChunk, + RawFlowContentChunk, + RawFlowAfter, CodeIndentedStart, CodeIndentedAtBreak, @@ -124,7 +124,7 @@ pub enum Name { FlowStart, FlowBeforeCodeIndented, - FlowBeforeCodeFenced, + FlowBeforeRaw, FlowBeforeHtml, FlowBeforeHeadingAtx, FlowBeforeHeadingSetext, @@ -366,23 +366,23 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::CharacterReferenceNumeric => construct::character_reference::numeric, Name::CharacterReferenceValue => construct::character_reference::value, - Name::CodeFencedStart => construct::code_fenced::start, - Name::CodeFencedBeforeSequenceOpen => construct::code_fenced::before_sequence_open, - Name::CodeFencedSequenceOpen => construct::code_fenced::sequence_open, - Name::CodeFencedInfoBefore => construct::code_fenced::info_before, - Name::CodeFencedInfo => construct::code_fenced::info, - Name::CodeFencedMetaBefore => construct::code_fenced::meta_before, - Name::CodeFencedMeta => 
construct::code_fenced::meta, - Name::CodeFencedAtNonLazyBreak => construct::code_fenced::at_non_lazy_break, - Name::CodeFencedCloseStart => construct::code_fenced::close_start, - Name::CodeFencedBeforeSequenceClose => construct::code_fenced::before_sequence_close, - Name::CodeFencedSequenceClose => construct::code_fenced::sequence_close, - Name::CodeFencedAfterSequenceClose => construct::code_fenced::sequence_close_after, - Name::CodeFencedContentBefore => construct::code_fenced::content_before, - Name::CodeFencedContentStart => construct::code_fenced::content_start, - Name::CodeFencedBeforeContentChunk => construct::code_fenced::before_content_chunk, - Name::CodeFencedContentChunk => construct::code_fenced::content_chunk, - Name::CodeFencedAfter => construct::code_fenced::after, + Name::RawFlowStart => construct::raw_flow::start, + Name::RawFlowBeforeSequenceOpen => construct::raw_flow::before_sequence_open, + Name::RawFlowSequenceOpen => construct::raw_flow::sequence_open, + Name::RawFlowInfoBefore => construct::raw_flow::info_before, + Name::RawFlowInfo => construct::raw_flow::info, + Name::RawFlowMetaBefore => construct::raw_flow::meta_before, + Name::RawFlowMeta => construct::raw_flow::meta, + Name::RawFlowAtNonLazyBreak => construct::raw_flow::at_non_lazy_break, + Name::RawFlowCloseStart => construct::raw_flow::close_start, + Name::RawFlowBeforeSequenceClose => construct::raw_flow::before_sequence_close, + Name::RawFlowSequenceClose => construct::raw_flow::sequence_close, + Name::RawFlowAfterSequenceClose => construct::raw_flow::sequence_close_after, + Name::RawFlowContentBefore => construct::raw_flow::content_before, + Name::RawFlowContentStart => construct::raw_flow::content_start, + Name::RawFlowBeforeContentChunk => construct::raw_flow::before_content_chunk, + Name::RawFlowContentChunk => construct::raw_flow::content_chunk, + Name::RawFlowAfter => construct::raw_flow::after, Name::CodeIndentedStart => construct::code_indented::start, 
Name::CodeIndentedAtBreak => construct::code_indented::at_break, @@ -446,7 +446,7 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::FlowStart => construct::flow::start, Name::FlowBeforeCodeIndented => construct::flow::before_code_indented, - Name::FlowBeforeCodeFenced => construct::flow::before_code_fenced, + Name::FlowBeforeRaw => construct::flow::before_raw, Name::FlowBeforeHtml => construct::flow::before_html, Name::FlowBeforeHeadingAtx => construct::flow::before_heading_atx, Name::FlowBeforeHeadingSetext => construct::flow::before_heading_setext, diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c6a209b..9b73836 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -256,6 +256,8 @@ pub struct TokenizeState<'a> { pub token_4: Name, /// Slot for an event name. pub token_5: Name, + /// Slot for an event name. + pub token_6: Name, } /// A tokenizer itself. @@ -364,6 +366,7 @@ impl<'a> Tokenizer<'a> { token_3: Name::Data, token_4: Name::Data, token_5: Name::Data, + token_6: Name::Data, }, map: EditMap::new(), interrupt: false, diff --git a/src/util/constant.rs b/src/util/constant.rs index 0c82378..f397f38 100644 --- a/src/util/constant.rs +++ b/src/util/constant.rs @@ -1,7 +1,7 @@ //! Constants needed to parse markdown. //! //! Most of these constants are magic numbers, such as the number of markers -//! needed to parse [code (fenced)][code_fenced] +//! needed to parse [code (fenced)][raw_flow] //! ([`CODE_FENCED_SEQUENCE_SIZE_MIN`][]) or the max number of allowed markers //! in a [heading (atx)][heading_atx] //! ([`HEADING_ATX_OPENING_FENCE_SIZE_MAX`][]). @@ -11,7 +11,7 @@ //! ([`HTML_RAW_NAMES`][]), or the list of named character references //! ([`CHARACTER_REFERENCES`][]). //! -//! [code_fenced]: crate::construct::code_fenced +//! [raw_flow]: crate::construct::raw_flow //! [heading_atx]: crate::construct::heading_atx //! 
[html_flow]: crate::construct::html_flow @@ -60,11 +60,11 @@ pub const CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX: usize = 6; /// [character_reference]: crate::construct::character_reference pub const CHARACTER_REFERENCE_NAMED_SIZE_MAX: usize = 31; -/// The number of markers needed for [code (fenced)][code_fenced] to form. +/// The number of markers needed for [code (fenced)][raw_flow] to form. /// /// Like many things in markdown, the number is `3`. /// -/// [code_fenced]: crate::construct::code_fenced +/// [raw_flow]: crate::construct::raw_flow pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3; /// The number of markers needed for [frontmatter][] to form. -- cgit