From 8774b207b7251730eaa7fbfe4f144122a472dda0 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 22 Aug 2022 16:16:59 +0200 Subject: Add support for GFM task list item --- Untitled.txt | 3 + readme.md | 2 +- src/compiler.rs | 26 +++- src/construct/gfm_task_list_item_check.rs | 157 ++++++++++++++++++++ src/construct/mod.rs | 1 + src/construct/text.rs | 10 +- src/event.rs | 79 +++++++++- src/lib.rs | 9 ++ src/state.rs | 14 ++ src/subtokenize.rs | 32 +++- src/tokenizer.rs | 10 +- src/util/skip.rs | 12 +- tests/gfm_task_list_item.rs | 236 ++++++++++++++++++++++++++++++ 13 files changed, 578 insertions(+), 13 deletions(-) create mode 100644 src/construct/gfm_task_list_item_check.rs create mode 100644 tests/gfm_task_list_item.rs diff --git a/Untitled.txt b/Untitled.txt index ca56d67..fb1e53c 100644 --- a/Untitled.txt +++ b/Untitled.txt @@ -6,6 +6,9 @@ micromark.js: `atLineEnding` in html (text) should always eat arbitrary whitespa ```rs // --------------------- // Useful helper: +extern crate std; +use std::println; +use alloc::string::String; let mut index = 0; let mut balance = 0; diff --git a/readme.md b/readme.md index 762a882..565d90a 100644 --- a/readme.md +++ b/readme.md @@ -145,7 +145,7 @@ They are not enabled by default but can be turned on with `options.constructs`. - [x] strikethrough - [ ] table - [ ] tagfilter - - [ ] task list item + - [x] task list item - [ ] math It is not a goal of this project to support lots of different extensions. 
diff --git a/src/compiler.rs b/src/compiler.rs index abf35c8..f1003fd 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -327,6 +327,7 @@ fn enter(context: &mut CompileContext) { Name::Emphasis => on_enter_emphasis(context), Name::Frontmatter => on_enter_frontmatter(context), Name::GfmStrikethrough => on_enter_gfm_strikethrough(context), + Name::GfmTaskListItemCheck => on_enter_gfm_task_list_item_check(context), Name::HtmlFlow => on_enter_html_flow(context), Name::HtmlText => on_enter_html_text(context), Name::Image => on_enter_image(context), @@ -370,10 +371,12 @@ fn exit(context: &mut CompileContext) { Name::DefinitionTitleString => on_exit_definition_title_string(context), Name::Emphasis => on_exit_emphasis(context), Name::Frontmatter => on_exit_frontmatter(context), - Name::GfmStrikethrough => on_exit_gfm_strikethrough(context), Name::GfmAutolinkLiteralProtocol => on_exit_gfm_autolink_literal_protocol(context), Name::GfmAutolinkLiteralWww => on_exit_gfm_autolink_literal_www(context), Name::GfmAutolinkLiteralEmail => on_exit_gfm_autolink_literal_email(context), + Name::GfmStrikethrough => on_exit_gfm_strikethrough(context), + Name::GfmTaskListItemCheck => on_exit_gfm_task_list_item_check(context), + Name::GfmTaskListItemValueChecked => on_exit_gfm_task_list_item_value_checked(context), Name::HardBreakEscape | Name::HardBreakTrailing => on_exit_break(context), Name::HeadingAtx => on_exit_heading_atx(context), Name::HeadingAtxSequence => on_exit_heading_atx_sequence(context), @@ -476,6 +479,13 @@ fn on_enter_gfm_strikethrough(context: &mut CompileContext) { } } +/// Handle [`Enter`][Kind::Enter]:[`GfmTaskListItemCheck`][Name::GfmTaskListItemCheck]. +fn on_enter_gfm_task_list_item_check(context: &mut CompileContext) { + if !context.image_alt_inside { + context.push(""); + } +} + +/// Handle [`Exit`][Kind::Exit]:[`GfmTaskListItemValueChecked`][Name::GfmTaskListItemValueChecked]. 
+fn on_exit_gfm_task_list_item_value_checked(context: &mut CompileContext) { + if !context.image_alt_inside { + context.push("checked=\"\" "); + } +} + /// Handle [`Exit`][Kind::Exit]:[`HeadingAtx`][Name::HeadingAtx]. fn on_exit_heading_atx(context: &mut CompileContext) { let rank = context diff --git a/src/construct/gfm_task_list_item_check.rs b/src/construct/gfm_task_list_item_check.rs new file mode 100644 index 0000000..62ff8aa --- /dev/null +++ b/src/construct/gfm_task_list_item_check.rs @@ -0,0 +1,157 @@ +//! GFM: Task list item check occurs in the [text][] content type. +//! +//! ## Grammar +//! +//! Checks form with the following BNF +//! (see [construct][crate::construct] for character groups): +//! +//! ```bnf +//! gfm_task_list_item_check ::= '[' (0x09 | ' ' | 'X' | 'x') ']' +//! ``` +//! +//! The check is only allowed at the start of the first paragraph, optionally +//! following zero or more definitions or a blank line, in a list item. +//! The check must be followed by whitespace, which is in turn followed by +//! non-whitespace. +//! +//! ## HTML +//! +//! Checks relate to the `<input>` element, in the checkbox state +//! (`type=checkbox`), in HTML. +//! See [*§ 4.10.5.1.15 Checkbox state (`type=checkbox`)*][html-input-checkbox] +//! in the HTML spec for more info. +//! +//! ## Recommendation +//! +//! It is recommended to use lowercase `x` (instead of uppercase `X`), because +//! in markdown, it is more common to use lowercase in places where casing does +//! not matter. +//! It is also recommended to use a space (instead of a tab), as there is no +//! benefit of using tabs in this case. +//! +//! ## Tokens +//! +//! * [`GfmTaskListItemCheck`][Name::GfmTaskListItemCheck] +//! * [`GfmTaskListItemMarker`][Name::GfmTaskListItemMarker] +//! * [`GfmTaskListItemValueChecked`][Name::GfmTaskListItemValueChecked] +//! * [`GfmTaskListItemValueUnchecked`][Name::GfmTaskListItemValueUnchecked] +//! +//! ## References +//!
* [`micromark-extension-gfm-task-list-item`](https://github.com/micromark/micromark-extension-gfm-task-list-item) +//! * [*§ 5.3 Task list items (extension)* in `GFM`](https://github.github.com/gfm/#task-list-items-extension-) +//! +//! [text]: crate::construct::text +//! [html-input-checkbox]: https://html.spec.whatwg.org/multipage/input.html#checkbox-state-(type=checkbox) + +use crate::construct::partial_space_or_tab::space_or_tab; +use crate::event::Name; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; + +/// At start of task list item check. +/// +/// ```markdown +/// > | * [x] y. +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + if tokenizer.parse_state.options.constructs.gfm_task_list_item + && tokenizer + .tokenize_state + .document_at_first_paragraph_of_list_item + && tokenizer.current == Some(b'[') + && tokenizer.previous == None + { + tokenizer.enter(Name::GfmTaskListItemCheck); + tokenizer.enter(Name::GfmTaskListItemMarker); + tokenizer.consume(); + tokenizer.exit(Name::GfmTaskListItemMarker); + State::Next(StateName::GfmTaskListItemCheckInside) + } else { + State::Nok + } +} + +/// In task list item check. +/// +/// ```markdown +/// > | * [x] y. +/// ^ +/// ``` +pub fn inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.enter(Name::GfmTaskListItemValueUnchecked); + tokenizer.consume(); + tokenizer.exit(Name::GfmTaskListItemValueUnchecked); + State::Next(StateName::GfmTaskListItemCheckClose) + } + Some(b'X' | b'x') => { + tokenizer.enter(Name::GfmTaskListItemValueChecked); + tokenizer.consume(); + tokenizer.exit(Name::GfmTaskListItemValueChecked); + State::Next(StateName::GfmTaskListItemCheckClose) + } + _ => State::Nok, + } +} + +/// At close of task list item check. +/// +/// ```markdown +/// > | * [x] y. 
+/// ^ +/// ``` +pub fn close(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b']') => { + tokenizer.enter(Name::GfmTaskListItemMarker); + tokenizer.consume(); + tokenizer.exit(Name::GfmTaskListItemMarker); + tokenizer.exit(Name::GfmTaskListItemCheck); + State::Next(StateName::GfmTaskListItemCheckAfter) + } + _ => State::Nok, + } +} + +/// After task list item check. +/// +/// ```markdown +/// > | * [x] y. +/// ^ +/// ``` +pub fn after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // EOL in paragraph means there must be something else after it. + Some(b'\n') => State::Ok, + // Space or tab? + // Check what comes after. + Some(b'\t' | b' ') => { + tokenizer.check(State::Ok, State::Nok); + tokenizer.attempt( + State::Next(StateName::GfmTaskListItemCheckAfterSpaceOrTab), + State::Nok, + ); + State::Retry(space_or_tab(tokenizer)) + } + // EOF, or non-whitespace, both wrong. + _ => State::Nok, + } +} + +/// After whitespace, after task list item check. +/// +/// ```markdown +/// > | * [x] y. +/// ^ +/// ``` +pub fn after_space_or_tab(tokenizer: &mut Tokenizer) -> State { + // End of paragraph, after whitespace, after check, is not okay. + if tokenizer.current == None { + State::Nok + } else { + State::Ok + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index ba1a0b3..7ac3899 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -146,6 +146,7 @@ pub mod document; pub mod flow; pub mod frontmatter; pub mod gfm_autolink_literal; +pub mod gfm_task_list_item_check; pub mod hard_break_escape; pub mod heading_atx; pub mod heading_setext; diff --git a/src/construct/text.rs b/src/construct/text.rs index 9d40585..65f55d4 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -42,13 +42,21 @@ const MARKERS: [u8; 10] = [ /// Start of text. /// +/// There is a slightly weird case where task list items have their check at +/// the start of the first paragraph. +/// So we start by checking for that. 
+/// /// ```markdown /// > | abc /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { tokenizer.tokenize_state.markers = &MARKERS; - State::Retry(StateName::TextBefore) + tokenizer.attempt( + State::Next(StateName::TextBefore), + State::Next(StateName::TextBefore), + ); + State::Retry(StateName::GfmTaskListItemCheckStart) } /// Before text. diff --git a/src/event.rs b/src/event.rs index 3c690e1..f20c599 100644 --- a/src/event.rs +++ b/src/event.rs @@ -1074,6 +1074,80 @@ pub enum Name { /// ^ /// ``` GfmStrikethroughText, + /// GFM: Task list item check. + /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// [`GfmTaskListItemMarker`][Name::GfmTaskListItemMarker], + /// [`GfmTaskListItemValueChecked`][Name::GfmTaskListItemValueChecked], + /// [`GfmTaskListItemValueUnchecked`][Name::GfmTaskListItemValueUnchecked] + /// * **Construct**: + /// [`gfm_task_list_item_check`][crate::construct::gfm_task_list_item_check] + /// + /// ## Example + /// + /// ```markdown + /// > | * [x] y. + /// ^^^ + /// ``` + GfmTaskListItemCheck, + /// GFM: Task list item check marker. + /// + /// ## Info + /// + /// * **Context**: + /// [`GfmTaskListItemCheck`][Name::GfmTaskListItemCheck] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`gfm_task_list_item_check`][crate::construct::gfm_task_list_item_check] + /// + /// ## Example + /// + /// ```markdown + /// > | * [x] y. + /// ^ ^ + /// ``` + GfmTaskListItemMarker, + /// GFM: Task list item value: checked. + /// + /// ## Info + /// + /// * **Context**: + /// [`GfmTaskListItemCheck`][Name::GfmTaskListItemCheck] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`gfm_task_list_item_check`][crate::construct::gfm_task_list_item_check] + /// + /// ## Example + /// + /// ```markdown + /// > | * [x] y. + /// ^ + /// ``` + GfmTaskListItemValueChecked, + /// GFM: Task list item value: unchecked. 
+ /// + /// ## Info + /// + /// * **Context**: + /// [`GfmTaskListItemCheck`][Name::GfmTaskListItemCheck] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`gfm_task_list_item_check`][crate::construct::gfm_task_list_item_check] + /// + /// ## Example + /// + /// ```markdown + /// > | * [ ] z. + /// ^ + /// ``` + GfmTaskListItemValueUnchecked, /// Whole hard break (escape). /// /// ## Info @@ -2031,7 +2105,7 @@ pub enum Name { } /// List of void events, used to make sure everything is working well. -pub const VOID_EVENTS: [Name; 47] = [ +pub const VOID_EVENTS: [Name; 50] = [ Name::AttentionSequence, Name::AutolinkEmail, Name::AutolinkMarker, @@ -2061,6 +2135,9 @@ pub const VOID_EVENTS: [Name; 47] = [ Name::GfmAutolinkLiteralProtocol, Name::GfmAutolinkLiteralWww, Name::GfmStrikethroughSequence, + Name::GfmTaskListItemMarker, + Name::GfmTaskListItemValueChecked, + Name::GfmTaskListItemValueUnchecked, Name::FrontmatterSequence, Name::HardBreakEscape, Name::HardBreakTrailing, diff --git a/src/lib.rs b/src/lib.rs index 893255a..5b7836c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -178,6 +178,13 @@ pub struct Constructs { /// ^^^ /// ``` pub gfm_strikethrough: bool, + /// GFM: task list item. + /// + /// ```markdown + /// > | * [x] y. + /// ^^^ + /// ``` + pub gfm_task_list_item: bool, /// Hard break (escape). 
/// /// ```markdown @@ -277,6 +284,7 @@ impl Default for Constructs { frontmatter: false, gfm_autolink_literal: false, gfm_strikethrough: false, + gfm_task_list_item: false, hard_break_escape: true, hard_break_trailing: true, heading_atx: true, @@ -301,6 +309,7 @@ impl Constructs { Self { gfm_autolink_literal: true, gfm_strikethrough: true, + gfm_task_list_item: true, ..Self::default() } } diff --git a/src/state.rs b/src/state.rs index da935d1..65ffbeb 100644 --- a/src/state.rs +++ b/src/state.rs @@ -145,6 +145,12 @@ pub enum Name { FrontmatterCloseSequence, FrontmatterCloseAfter, + GfmTaskListItemCheckStart, + GfmTaskListItemCheckInside, + GfmTaskListItemCheckClose, + GfmTaskListItemCheckAfter, + GfmTaskListItemCheckAfterSpaceOrTab, + HardBreakEscapeStart, HardBreakEscapeAfter, @@ -444,6 +450,14 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::FrontmatterCloseSequence => construct::frontmatter::close_sequence, Name::FrontmatterCloseAfter => construct::frontmatter::close_after, + Name::GfmTaskListItemCheckStart => construct::gfm_task_list_item_check::start, + Name::GfmTaskListItemCheckInside => construct::gfm_task_list_item_check::inside, + Name::GfmTaskListItemCheckClose => construct::gfm_task_list_item_check::close, + Name::GfmTaskListItemCheckAfter => construct::gfm_task_list_item_check::after, + Name::GfmTaskListItemCheckAfterSpaceOrTab => { + construct::gfm_task_list_item_check::after_space_or_tab + } + Name::HardBreakEscapeStart => construct::hard_break_escape::start, Name::HardBreakEscapeAfter => construct::hard_break_escape::after, diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 5932f11..7fcc481 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -17,11 +17,11 @@ //! whole document needs to be parsed up to the level of definitions, before //! any level that can include references can be parsed. 
-use crate::event::{Content, Event, Kind, VOID_EVENTS}; +use crate::event::{Content, Event, Kind, Name, VOID_EVENTS}; use crate::parser::ParseState; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::util::edit_map::EditMap; +use crate::util::{edit_map::EditMap, skip}; use alloc::{vec, vec::Vec}; /// Link two [`Event`][]s. @@ -94,6 +94,34 @@ pub fn subtokenize(events: &mut Vec, parse_state: &ParseState) -> bool { StateName::TextStart }); + // Check if this is the first paragraph, after zero or more + // definitions (or a blank line), in a list item. + // Used for GFM task list items. + if tokenizer.parse_state.options.constructs.gfm_task_list_item + && index > 2 + && events[index - 1].kind == Kind::Enter + && events[index - 1].name == Name::Paragraph + { + let before = skip::opt_back( + events, + index - 2, + &[ + Name::BlankLineEnding, + Name::Definition, + Name::LineEnding, + Name::SpaceOrTab, + ], + ); + + if events[before].kind == Kind::Exit + && events[before].name == Name::ListItemPrefix + { + tokenizer + .tokenize_state + .document_at_first_paragraph_of_list_item = true; + } + } + // Loop through links to pass them in order to the subtokenizer. while let Some(index) = link_index { let enter = &events[index]; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 0bd1f31..731b829 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -144,6 +144,10 @@ pub struct TokenizeState<'a> { pub document_exits: Vec>>, /// Whether the previous flow was a paragraph. pub document_paragraph_before: bool, + /// Whether this is the first paragraph (potentially after definitions) in + /// a list item. + /// Used for GFM task list items. + pub document_at_first_paragraph_of_list_item: bool, // Couple of very frequent settings for parsing whitespace. 
pub space_or_tab_eol_content: Option, @@ -282,6 +286,7 @@ impl<'a> Tokenizer<'a> { document_data_index: None, document_child_state: None, document_child: None, + document_at_first_paragraph_of_list_item: false, definitions: vec![], end: 0, label_starts: vec![], @@ -509,11 +514,6 @@ impl<'a> Tokenizer<'a> { /// Stack an attempt, moving to `ok` on [`State::Ok`][] and `nok` on /// [`State::Nok`][], reverting in both cases. pub fn check(&mut self, ok: State, nok: State) { - debug_assert_ne!( - nok, - State::Nok, - "checking w/ `State::Nok` should likely be an attempt" - ); // Always capture (and restore) when checking. // No need to capture (and restore) when `nok` is `State::Nok`, because the // parent attempt will do it. diff --git a/src/util/skip.rs b/src/util/skip.rs index a7de408..df63498 100644 --- a/src/util/skip.rs +++ b/src/util/skip.rs @@ -59,12 +59,20 @@ fn skip_opt_impl(events: &[Event], mut index: usize, names: &[Name], forward: bo balance - 1 }; + let next = if forward { + index + 1 + } else if index > 0 { + index - 1 + } else { + index + }; + if events[index].name == *current && balance == 0 { - index = if forward { index + 1 } else { index - 1 }; + index = next; break; } - index = if forward { index + 1 } else { index - 1 }; + index = next; } } diff --git a/tests/gfm_task_list_item.rs b/tests/gfm_task_list_item.rs new file mode 100644 index 0000000..5db5d1b --- /dev/null +++ b/tests/gfm_task_list_item.rs @@ -0,0 +1,236 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, Constructs, Options}; +use pretty_assertions::assert_eq; + +#[test] +fn gfm_task_list_item() { + let gfm = Options { + constructs: Constructs::gfm(), + ..Options::default() + }; + + assert_eq!( + micromark("* [x] y."), + "
<ul>\n<li>[x] y.</li>\n</ul>
", + "should ignore task list item checks by default" + ); + + assert_eq!( + micromark_with_options("* [x] y.", &gfm), + "
<ul>\n<li><input type=\"checkbox\" disabled=\"\" checked=\"\" /> y.</li>\n</ul>
", + "should support task list item checks" + ); + + assert_eq!( + micromark_with_options("* [ ] z.", &gfm), + "
<ul>\n<li><input type=\"checkbox\" disabled=\"\" /> z.</li>\n</ul>
", + "should support unchecked task list item checks" + ); + + assert_eq!( + micromark_with_options("*\n [x]", &gfm), + "
<ul>\n<li>[x]</li>\n</ul>
", + "should not support laziness (1)" + ); + + assert_eq!( + micromark_with_options("*\n[x]", &gfm), + "
<ul>\n<li></li>\n</ul>\n<p>[x]</p>
", + "should not support laziness (2)" + ); + + assert_eq!( + micromark_with_options( + &r###" +* [ ] foo +* [x] bar + +- [x] foo + - [ ] bar + - [x] baz +- [ ] bim + ++ [ ] Unchecked? + +* [x] Checked? + ++ [y] What is this even? + +- [n]: # + [ ] After a definition + ++ [ ] In a setext heading + ======================= + +* In the… + + [ ] Second paragraph + +- [ ] With a tab + ++ [X] With an upper case `x` + +- [ ] With two spaces + ++ [x] Two spaces indent + +* [x] Three spaces indent + +- [x] Four spaces indent + ++ [x] Five spaces indent + +[ ] here? + +* > [ ] here? + +- [ ]No space? + +Empty? + ++ [ ] + +Space after: + ++ [ ]␠ + +* [ ]␠Text. + +Tab after: + ++ [ ]␉ + +* [ ]␉Text. + +EOL after: + ++ [ ] + +* [ ] + Text. + +- + [ ] after blank? + ++ # [ ] ATX Heading + +> * [x] In a list in a block quote +"### + .replace('␠', " ") + .replace('␉', "\t"), + &gfm + ), + r###"
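<ul>
+<li><input type="checkbox" disabled="" /> foo</li>
+<li><input type="checkbox" disabled="" checked="" /> bar</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" checked="" /> foo
+<ul>
+<li><input type="checkbox" disabled="" /> bar</li>
+<li><input type="checkbox" disabled="" checked="" /> baz</li>
+</ul>
+</li>
+<li><input type="checkbox" disabled="" /> bim</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" /> Unchecked?</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" checked="" /> Checked?</li>
+</ul>
+<ul>
+<li>[y] What is this even?</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" /> After a definition</li>
+</ul>
+<ul>
+<li>
+<h1>[ ] In a setext heading</h1>
+</li>
+</ul>
+<ul>
+<li>
+<p>In the…</p>
+<p>[ ] Second paragraph</p>
+</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" /> With a tab</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" checked="" /> With an upper case <code>x</code></li>
+</ul>
+<ul>
+<li>[ ] With two spaces</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" checked="" /> Two spaces indent</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" checked="" /> Three spaces indent</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" checked="" /> Four spaces indent</li>
+</ul>
+<ul>
+<li>
+<pre><code>[x] Five spaces indent
+</code></pre>
+</li>
+</ul>
+<p>[ ] here?</p>
+<ul>
+<li>
+<blockquote>
+<p>[ ] here?</p>
+</blockquote>
+</li>
+</ul>
+<ul>
+<li>[ ]No space?</li>
+</ul>
+<p>Empty?</p>
+<ul>
+<li>[ ]</li>
+</ul>
+<p>Space after:</p>
+<ul>
+<li>[ ]</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" /> Text.</li>
+</ul>
+<p>Tab after:</p>
+<ul>
+<li>[ ]</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" /> Text.</li>
+</ul>
+<p>EOL after:</p>
+<ul>
+<li>[ ]</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" />
+Text.</li>
+</ul>
+<ul>
+<li><input type="checkbox" disabled="" /> after blank?</li>
+</ul>
+<ul>
+<li>
+<h1>[ ] ATX Heading</h1>
+</li>
+</ul>
+<blockquote>
+<ul>
+<li><input type="checkbox" disabled="" checked="" /> In a list in a block quote</li>
+</ul>
+</blockquote>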
+"###, + "should handle things like GitHub" + ); +} -- cgit