diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-08-22 11:50:42 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-08-22 11:50:42 +0200 |
commit | 351c69644bdbdf52c95e322904273657892920b5 (patch) | |
tree | 114a93ff760b522232f9f7290bc6f632b7250095 | |
parent | 5e6829c2fb79c2b7f59e38f924e2b2900c52b5d5 (diff) | |
download | markdown-rs-351c69644bdbdf52c95e322904273657892920b5.tar.gz markdown-rs-351c69644bdbdf52c95e322904273657892920b5.tar.bz2 markdown-rs-351c69644bdbdf52c95e322904273657892920b5.zip |
Add support for GFM strikethrough
28 files changed, 696 insertions, 120 deletions
diff --git a/examples/lib.rs b/examples/lib.rs index 94e04f5..167f169 100644 --- a/examples/lib.rs +++ b/examples/lib.rs @@ -26,6 +26,7 @@ fn main() { println!( "{}", micromark_with_options( + // To do: use readme example when all of GFM work. "Just a link! https://example.com.", &Options { constructs: Constructs::gfm(), @@ -142,7 +142,7 @@ They are not enabled by default but can be turned on with `options.constructs`. - [ ] gfm - [x] autolink literal - [ ] footnote - - [ ] strikethrough + - [x] strikethrough - [ ] table - [ ] tagfilter - [ ] task list item diff --git a/src/compiler.rs b/src/compiler.rs index 2e13294..abf35c8 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -326,6 +326,7 @@ fn enter(context: &mut CompileContext) { Name::DefinitionDestinationString => on_enter_definition_destination_string(context), Name::Emphasis => on_enter_emphasis(context), Name::Frontmatter => on_enter_frontmatter(context), + Name::GfmStrikethrough => on_enter_gfm_strikethrough(context), Name::HtmlFlow => on_enter_html_flow(context), Name::HtmlText => on_enter_html_text(context), Name::Image => on_enter_image(context), @@ -369,6 +370,7 @@ fn exit(context: &mut CompileContext) { Name::DefinitionTitleString => on_exit_definition_title_string(context), Name::Emphasis => on_exit_emphasis(context), Name::Frontmatter => on_exit_frontmatter(context), + Name::GfmStrikethrough => on_exit_gfm_strikethrough(context), Name::GfmAutolinkLiteralProtocol => on_exit_gfm_autolink_literal_protocol(context), Name::GfmAutolinkLiteralWww => on_exit_gfm_autolink_literal_www(context), Name::GfmAutolinkLiteralEmail => on_exit_gfm_autolink_literal_email(context), @@ -467,6 +469,13 @@ fn on_enter_frontmatter(context: &mut CompileContext) { context.buffer(); } +/// Handle [`Enter`][Kind::Enter]:[`GfmStrikethrough`][Name::GfmStrikethrough]. 
+fn on_enter_gfm_strikethrough(context: &mut CompileContext) { + if !context.image_alt_inside { + context.push("<del>"); + } +} + /// Handle [`Enter`][Kind::Enter]:[`HtmlFlow`][Name::HtmlFlow]. fn on_enter_html_flow(context: &mut CompileContext) { context.line_ending_if_needed(); @@ -898,7 +907,7 @@ fn on_exit_definition_title_string(context: &mut CompileContext) { context.media_stack.last_mut().unwrap().title = Some(buf); } -/// Handle [`Exit`][Kind::Exit]:[`Strong`][Name::Emphasis]. +/// Handle [`Exit`][Kind::Exit]:[`Emphasis`][Name::Emphasis]. fn on_exit_emphasis(context: &mut CompileContext) { if !context.image_alt_inside { context.push("</em>"); @@ -942,6 +951,13 @@ fn on_exit_gfm_autolink_literal_email(context: &mut CompileContext) { on_exit_autolink_email(context); } +/// Handle [`Exit`][Kind::Exit]:[`GfmStrikethrough`][Name::GfmStrikethrough]. +fn on_exit_gfm_strikethrough(context: &mut CompileContext) { + if !context.image_alt_inside { + context.push("</del>"); + } +} + /// Handle [`Exit`][Kind::Exit]:[`HeadingAtx`][Name::HeadingAtx]. fn on_exit_heading_atx(context: &mut CompileContext) { let rank = context diff --git a/src/construct/attention.rs b/src/construct/attention.rs index ef960d4..526f58c 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -1,4 +1,5 @@ -//! Attention (emphasis and strong) occurs in the [text][] content type. +//! Attention (emphasis, strong, optionally GFM strikethrough) occurs in the +//! [text][] content type. //! //! ## Grammar //! @@ -7,24 +8,31 @@ //! //! ```bnf //! attention_sequence ::= 1*'*' | 1*'_' +//! gfm_attention_sequence ::= 1*'~' //! ``` //! //! Sequences are matched together to form attention based on which character -//! they contain, and what character occurs before and after each sequence. +//! they contain, how long they are, and what character occurs before and after +//! each sequence. //! Otherwise they are turned into data. //! //! ## HTML //! -//! 
When sequences match, and two markers can be “taken” from them, they -//! together relate to the `<strong>` element in HTML. +//! When asterisk/underscore sequences match, and two markers can be “taken” +//! from them, they together relate to the `<strong>` element in HTML. //! When one marker can be taken, they relate to the `<em>` element. //! See [*§ 4.5.2 The `em` element*][html-em] and //! [*§ 4.5.3 The `strong` element*][html-strong] in the HTML spec for more //! info. //! +//! When tilde sequences match, they together relate to the `<del>` element in +//! HTML. +//! See [*§ 4.7.2 The `del` element*][html-del] in the HTML spec for more info. +//! //! ## Recommendation //! -//! It is recommended to use asterisks for attention when writing markdown. +//! It is recommended to use asterisks for emphasis/strong attention when +//! writing markdown. //! //! There are some small differences in whether sequences can open and/or close //! based on whether they are formed with asterisks or underscores. @@ -37,11 +45,18 @@ //! can look for asterisks to find syntax while not worrying about other //! characters. //! +//! For strikethrough attention, it is recommended to use two markers. +//! While `github.com` allows single tildes too, it technically prohibits it in +//! their spec. +//! //! ## Tokens //! //! * [`Emphasis`][Name::Emphasis] //! * [`EmphasisSequence`][Name::EmphasisSequence] //! * [`EmphasisText`][Name::EmphasisText] +//! * [`GfmStrikethrough`][Name::GfmStrikethrough] +//! * [`GfmStrikethroughSequence`][Name::GfmStrikethroughSequence] +//! * [`GfmStrikethroughText`][Name::GfmStrikethroughText] //! * [`Strong`][Name::Strong] //! * [`StrongSequence`][Name::StrongSequence] //! * [`StrongText`][Name::StrongText] @@ -52,11 +67,14 @@ //! ## References //! //! * [`attention.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/attention.js) +//! 
* [`micromark-extension-gfm-strikethrough`](https://github.com/micromark/micromark-extension-gfm-strikethrough) //! * [*§ 6.2 Emphasis and strong emphasis* in `CommonMark`](https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis) +//! * [*§ 6.5 Strikethrough (extension)* in `GFM`](https://github.github.com/gfm/#strikethrough-extension-) //! //! [text]: crate::construct::text //! [html-em]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-em-element //! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element +//! [html-del]: https://html.spec.whatwg.org/multipage/edits.html#the-del-element use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; @@ -94,7 +112,11 @@ struct Sequence { /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.attention && matches!(tokenizer.current, Some(b'*' | b'_')) + // Emphasis/strong: + if (tokenizer.parse_state.options.constructs.attention + && matches!(tokenizer.current, Some(b'*' | b'_'))) + // GFM strikethrough: + || (tokenizer.parse_state.options.constructs.gfm_strikethrough && tokenizer.current == Some(b'~')) { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); tokenizer.enter(Name::AttentionSequence); @@ -117,85 +139,15 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } else { tokenizer.exit(Name::AttentionSequence); tokenizer.register_resolver(ResolveName::Attention); - tokenizer.tokenize_state.marker = b'\0'; + tokenizer.tokenize_state.marker = 0; State::Ok } } -/// Resolve attention sequences. +/// Resolve sequences. pub fn resolve(tokenizer: &mut Tokenizer) { - let mut index = 0; - let mut balance = 0; - let mut sequences = vec![]; - // Find all sequences, gather info about them. 
- while index < tokenizer.events.len() { - let enter = &tokenizer.events[index]; - - if enter.kind == Kind::Enter { - balance += 1; - - if enter.name == Name::AttentionSequence { - let end = index + 1; - let exit = &tokenizer.events[end]; - - let before_end = enter.point.index; - let before_start = if before_end < 4 { 0 } else { before_end - 4 }; - let char_before = - String::from_utf8_lossy(&tokenizer.parse_state.bytes[before_start..before_end]) - .chars() - .last(); - - let after_start = exit.point.index; - let after_end = if after_start + 4 > tokenizer.parse_state.bytes.len() { - tokenizer.parse_state.bytes.len() - } else { - after_start + 4 - }; - let char_after = - String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]) - .chars() - .next(); - - let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) - .head() - .unwrap(); - let before = classify_opt(char_before); - let after = classify_opt(char_after); - let open = after == CharacterKind::Other - || (after == CharacterKind::Punctuation && before != CharacterKind::Other); - // To do: GFM strikethrough? - // || char_after == '~' - let close = before == CharacterKind::Other - || (before == CharacterKind::Punctuation && after != CharacterKind::Other); - // To do: GFM strikethrough? - // || char_before == '~' - - sequences.push(Sequence { - index, - balance, - start_point: enter.point.clone(), - end_point: exit.point.clone(), - size: exit.point.index - enter.point.index, - open: if marker == b'*' { - open - } else { - open && (before != CharacterKind::Other || !close) - }, - close: if marker == b'*' { - close - } else { - close && (after != CharacterKind::Other || !open) - }, - marker, - }); - } - } else { - balance -= 1; - } - - index += 1; - } + let mut sequences = get_sequences(tokenizer); // Now walk through them and match them. let mut close = 0; @@ -230,7 +182,20 @@ pub fn resolve(tokenizer: &mut Tokenizer) { continue; } - // We’ve found a match! 
+ // For GFM strikethrough: + // * both sequences must have the same size + // * more than 2 markers don’t work + // * one marker is prohibited by the spec, but supported by GH + if sequence_close.marker == b'~' + && (sequence_close.size != sequence_open.size + || sequence_close.size > 2 + || sequence_close.size == 1 + && !tokenizer.parse_state.options.gfm_strikethrough_single_tilde) + { + continue; + } + + // We found a match! next_index = match_sequences(tokenizer, &mut sequences, open, close); break; @@ -253,7 +218,80 @@ pub fn resolve(tokenizer: &mut Tokenizer) { tokenizer.map.consume(&mut tokenizer.events); } +/// Get sequences. +fn get_sequences(tokenizer: &mut Tokenizer) -> Vec<Sequence> { + let mut index = 0; + let mut balance = 0; + let mut sequences = vec![]; + + while index < tokenizer.events.len() { + let enter = &tokenizer.events[index]; + + if enter.kind == Kind::Enter { + balance += 1; + + if enter.name == Name::AttentionSequence { + let end = index + 1; + let exit = &tokenizer.events[end]; + + let before_end = enter.point.index; + let before_start = if before_end < 4 { 0 } else { before_end - 4 }; + let after_start = exit.point.index; + let after_end = if after_start + 4 > tokenizer.parse_state.bytes.len() { + tokenizer.parse_state.bytes.len() + } else { + after_start + 4 + }; + + let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) + .head() + .unwrap(); + let before = classify_opt( + String::from_utf8_lossy(&tokenizer.parse_state.bytes[before_start..before_end]) + .chars() + .last(), + ); + let after = classify_opt( + String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]) + .chars() + .next(), + ); + let open = after == CharacterKind::Other + || (after == CharacterKind::Punctuation && before != CharacterKind::Other); + let close = before == CharacterKind::Other + || (before == CharacterKind::Punctuation && after != CharacterKind::Other); + + sequences.push(Sequence { + index, + balance, + start_point: 
enter.point.clone(), + end_point: exit.point.clone(), + size: exit.point.index - enter.point.index, + open: if marker == b'_' { + open && (before != CharacterKind::Other || !close) + } else { + open + }, + close: if marker == b'_' { + close && (after != CharacterKind::Other || !open) + } else { + close + }, + marker, + }); + } + } else { + balance -= 1; + } + + index += 1; + } + + sequences +} + /// Match two sequences. +#[allow(clippy::too_many_lines)] fn match_sequences( tokenizer: &mut Tokenizer, sequences: &mut Vec<Sequence>, @@ -292,7 +330,13 @@ fn match_sequences( between += 1; } - let (group_name, seq_name, text_name) = if take == 1 { + let (group_name, seq_name, text_name) = if sequences[open].marker == b'~' { + ( + Name::GfmStrikethrough, + Name::GfmStrikethroughSequence, + Name::GfmStrikethroughText, + ) + } else if take == 1 { (Name::Emphasis, Name::EmphasisSequence, Name::EmphasisText) } else { (Name::Strong, Name::StrongSequence, Name::StrongText) diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 4ecd580..21f8fa5 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -135,7 +135,7 @@ use crate::util::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.autolink && tokenizer.current == Some(b'<') { + if tokenizer.parse_state.options.constructs.autolink && tokenizer.current == Some(b'<') { tokenizer.enter(Name::Autolink); tokenizer.enter(Name::AutolinkMarker); tokenizer.consume(); diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs index 039c839..11783d0 100644 --- a/src/construct/block_quote.rs +++ b/src/construct/block_quote.rs @@ -59,7 +59,7 @@ use crate::util::constant::TAB_SIZE; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.block_quote { + if tokenizer.parse_state.options.constructs.block_quote { 
tokenizer.enter(Name::BlockQuote); State::Retry(StateName::BlockQuoteContStart) } else { @@ -82,7 +82,7 @@ pub fn cont_start(tokenizer: &mut Tokenizer) -> State { State::Retry(space_or_tab_min_max( tokenizer, 1, - if tokenizer.parse_state.constructs.code_indented { + if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index 438092e..67946a0 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -53,7 +53,8 @@ use crate::tokenizer::Tokenizer; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.character_escape && tokenizer.current == Some(b'\\') { + if tokenizer.parse_state.options.constructs.character_escape && tokenizer.current == Some(b'\\') + { tokenizer.enter(Name::CharacterEscape); tokenizer.enter(Name::CharacterEscapeMarker); tokenizer.consume(); diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 4669836..927e3d9 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -90,7 +90,9 @@ use crate::util::{ /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.character_reference && tokenizer.current == Some(b'&') { + if tokenizer.parse_state.options.constructs.character_reference + && tokenizer.current == Some(b'&') + { tokenizer.enter(Name::CharacterReference); tokenizer.enter(Name::CharacterReferenceMarker); tokenizer.consume(); diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index bfd15dc..d117006 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -128,7 +128,7 @@ use crate::util::{ /// | ~~~ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.code_fenced { + if tokenizer.parse_state.options.constructs.code_fenced { if 
matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.enter(Name::CodeFenced); tokenizer.enter(Name::CodeFencedFence); @@ -139,7 +139,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { return State::Retry(space_or_tab_min_max( tokenizer, 0, - if tokenizer.parse_state.constructs.code_indented { + if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX @@ -384,7 +384,7 @@ pub fn close_start(tokenizer: &mut Tokenizer) -> State { State::Retry(space_or_tab_min_max( tokenizer, 0, - if tokenizer.parse_state.constructs.code_indented { + if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 866c78e..7d279c1 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -72,7 +72,7 @@ use crate::util::constant::TAB_SIZE; pub fn start(tokenizer: &mut Tokenizer) -> State { // Do not interrupt paragraphs. if !tokenizer.interrupt - && tokenizer.parse_state.constructs.code_indented + && tokenizer.parse_state.options.constructs.code_indented && matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.enter(Name::CodeIndented); diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index 413b5ee..b2cfd17 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -100,7 +100,7 @@ use crate::tokenizer::Tokenizer; pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'`') - if tokenizer.parse_state.constructs.code_text + if tokenizer.parse_state.options.constructs.code_text && (tokenizer.previous != Some(b'`') || (!tokenizer.events.is_empty() && tokenizer.events[tokenizer.events.len() - 1].name diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 071e595..e65d979 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -120,7 +120,7 @@ use crate::util::{ /// ``` pub fn start(tokenizer: &mut 
Tokenizer) -> State { // Do not interrupt paragraphs (but do follow definitions). - if tokenizer.parse_state.constructs.definition + if tokenizer.parse_state.options.constructs.definition && (!tokenizer.interrupt || (!tokenizer.events.is_empty() && tokenizer.events[skip::opt_back( diff --git a/src/construct/frontmatter.rs b/src/construct/frontmatter.rs index 74006f6..268d91d 100644 --- a/src/construct/frontmatter.rs +++ b/src/construct/frontmatter.rs @@ -72,7 +72,7 @@ use crate::util::constant::FRONTMATTER_SEQUENCE_SIZE; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { // Indent not allowed. - if tokenizer.parse_state.constructs.frontmatter + if tokenizer.parse_state.options.constructs.frontmatter && matches!(tokenizer.current, Some(b'+' | b'-')) { tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index 64c909a..c562ff6 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -56,7 +56,9 @@ use crate::tokenizer::Tokenizer; /// | b /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.hard_break_escape && tokenizer.current == Some(b'\\') { + if tokenizer.parse_state.options.constructs.hard_break_escape + && tokenizer.current == Some(b'\\') + { tokenizer.enter(Name::HardBreakEscape); tokenizer.consume(); State::Next(StateName::HardBreakEscapeAfter) diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index dd09f74..c1090c4 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -77,14 +77,14 @@ use alloc::vec; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.heading_atx { + if tokenizer.parse_state.options.constructs.heading_atx { tokenizer.enter(Name::HeadingAtx); if matches!(tokenizer.current, Some(b'\t' | b' ')) { tokenizer.attempt(State::Next(StateName::HeadingAtxBefore), State::Nok); 
State::Retry(space_or_tab_min_max( tokenizer, 0, - if tokenizer.parse_state.constructs.code_indented { + if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 19d2dda..df1d4fb 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -85,7 +85,7 @@ use alloc::vec; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.heading_setext + if tokenizer.parse_state.options.constructs.heading_setext && !tokenizer.lazy // Require a paragraph before. && (!tokenizer.events.is_empty() @@ -102,7 +102,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { State::Retry(space_or_tab_min_max( tokenizer, 0, - if tokenizer.parse_state.constructs.code_indented { + if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index edb500e..3f6e19a 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -131,7 +131,7 @@ const COMPLETE: u8 = 7; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.html_flow { + if tokenizer.parse_state.options.constructs.html_flow { tokenizer.enter(Name::HtmlFlow); if matches!(tokenizer.current, Some(b'\t' | b' ')) { @@ -141,7 +141,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { SpaceOrTabOptions { kind: Name::HtmlFlowData, min: 0, - max: if tokenizer.parse_state.constructs.code_indented { + max: if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 5aa6137..d40361d 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -64,7 +64,7 @@ use crate::util::constant::HTML_CDATA_PREFIX; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if 
Some(b'<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { + if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.html_text { tokenizer.enter(Name::HtmlText); tokenizer.enter(Name::HtmlTextData); tokenizer.consume(); diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 4532920..0ea745f 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -183,7 +183,7 @@ use alloc::vec; /// > | [a] b /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if Some(b']') == tokenizer.current && tokenizer.parse_state.constructs.label_end { + if Some(b']') == tokenizer.current && tokenizer.parse_state.options.constructs.label_end { // If there is an okay opening: if !tokenizer.tokenize_state.label_starts.is_empty() { let label_start = tokenizer.tokenize_state.label_starts.last().unwrap(); diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index 8d35df2..a8c9ac3 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -44,7 +44,8 @@ use crate::tokenizer::{LabelStart, Tokenizer}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.label_start_image && tokenizer.current == Some(b'!') { + if tokenizer.parse_state.options.constructs.label_start_image && tokenizer.current == Some(b'!') + { tokenizer.enter(Name::LabelImage); tokenizer.enter(Name::LabelImageMarker); tokenizer.consume(); diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs index e079b2d..3aeb68b 100644 --- a/src/construct/label_start_link.rs +++ b/src/construct/label_start_link.rs @@ -43,7 +43,8 @@ use crate::tokenizer::{LabelStart, Tokenizer}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.label_start_link && tokenizer.current == Some(b'[') { + if tokenizer.parse_state.options.constructs.label_start_link && tokenizer.current == 
Some(b'[') + { let start = tokenizer.events.len(); tokenizer.enter(Name::LabelLink); tokenizer.enter(Name::LabelMarker); diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs index 7228a00..39b5d13 100644 --- a/src/construct/list_item.rs +++ b/src/construct/list_item.rs @@ -77,7 +77,7 @@ use alloc::{vec, vec::Vec}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.list_item { + if tokenizer.parse_state.options.constructs.list_item { tokenizer.enter(Name::ListItem); if matches!(tokenizer.current, Some(b'\t' | b' ')) { @@ -85,7 +85,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { State::Retry(space_or_tab_min_max( tokenizer, 0, - if tokenizer.parse_state.constructs.code_indented { + if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX diff --git a/src/construct/text.rs b/src/construct/text.rs index 06ba378..9d40585 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -27,7 +27,7 @@ use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; /// Characters that can start something in text. -const MARKERS: [u8; 9] = [ +const MARKERS: [u8; 10] = [ b'!', // `label_start_image` b'&', // `character_reference` b'*', // `attention` @@ -37,6 +37,7 @@ const MARKERS: [u8; 9] = [ b']', // `label_end` b'_', // `attention` b'`', // `code_text` + b'~', // `attention` (w/ `gfm_strikethrough`) ]; /// Start of text. 
@@ -77,7 +78,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::CharacterReferenceStart) } - Some(b'*' | b'_') => { + Some(b'*' | b'_' | b'~') => { tokenizer.attempt( State::Next(StateName::TextBefore), State::Next(StateName::TextBeforeData), @@ -171,11 +172,16 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { pub fn resolve(tokenizer: &mut Tokenizer) { resolve_whitespace( tokenizer, - tokenizer.parse_state.constructs.hard_break_trailing, + tokenizer.parse_state.options.constructs.hard_break_trailing, true, ); - if tokenizer.parse_state.constructs.gfm_autolink_literal { + if tokenizer + .parse_state + .options + .constructs + .gfm_autolink_literal + { resolve_gfm_autolink_literal(tokenizer); } } diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index f77f83e..12dd7cf 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -69,7 +69,7 @@ use crate::util::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.thematic_break { + if tokenizer.parse_state.options.constructs.thematic_break { tokenizer.enter(Name::ThematicBreak); if matches!(tokenizer.current, Some(b'\t' | b' ')) { @@ -77,7 +77,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { State::Retry(space_or_tab_min_max( tokenizer, 0, - if tokenizer.parse_state.constructs.code_indented { + if tokenizer.parse_state.options.constructs.code_indented { TAB_SIZE - 1 } else { usize::MAX diff --git a/src/event.rs b/src/event.rs index 169fdb5..3c690e1 100644 --- a/src/event.rs +++ b/src/event.rs @@ -878,7 +878,6 @@ pub enum Name { /// ^ /// ``` EmphasisText, - // To do: sort. /// Whole frontmatter. /// /// ## Info @@ -1020,6 +1019,61 @@ pub enum Name { /// ^^^^^^^^^^^^^^^ /// ``` GfmAutolinkLiteralWww, + /// GFM: Strikethrough. 
+ /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// [`GfmStrikethroughSequence`][Name::GfmStrikethroughSequence], + /// [`GfmStrikethroughText`][Name::GfmStrikethroughText] + /// * **Construct**: + /// [`attention`][crate::construct::attention] + /// + /// ## Example + /// + /// ```markdown + /// > | ~a~ + /// ^^^ + /// ``` + GfmStrikethrough, + /// Gfm: Strikethrough sequence. + /// + /// ## Info + /// + /// * **Context**: + /// [`GfmStrikethrough`][Name::GfmStrikethrough] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`attention`][crate::construct::attention] + /// + /// ## Example + /// + /// ```markdown + /// > | ~a~ + /// ^ ^ + /// ``` + GfmStrikethroughSequence, + /// Gfm: Strikethrough text. + /// + /// ## Info + /// + /// * **Context**: + /// [`GfmStrikethrough`][Name::GfmStrikethrough] + /// * **Content model**: + /// [text content][crate::construct::text] + /// * **Construct**: + /// [`attention`][crate::construct::attention] + /// + /// ## Example + /// + /// ```markdown + /// > | ~a~ + /// ^ + /// ``` + GfmStrikethroughText, /// Whole hard break (escape). /// /// ## Info @@ -1977,7 +2031,7 @@ pub enum Name { } /// List of void events, used to make sure everything is working well. -pub const VOID_EVENTS: [Name; 46] = [ +pub const VOID_EVENTS: [Name; 47] = [ Name::AttentionSequence, Name::AutolinkEmail, Name::AutolinkMarker, @@ -2006,6 +2060,7 @@ pub const VOID_EVENTS: [Name; 46] = [ Name::GfmAutolinkLiteralEmail, Name::GfmAutolinkLiteralProtocol, Name::GfmAutolinkLiteralWww, + Name::GfmStrikethroughSequence, Name::FrontmatterSequence, Name::HardBreakEscape, Name::HardBreakTrailing, @@ -171,6 +171,13 @@ pub struct Constructs { /// ^^^^^^^^^^^^^^^^^^^ /// ``` pub gfm_autolink_literal: bool, + /// GFM: strikethrough. + /// + /// ```markdown + /// > | a ~b~ c. + /// ^^^ + /// ``` + pub gfm_strikethrough: bool, /// Hard break (escape). 
/// /// ```markdown @@ -269,6 +276,7 @@ impl Default for Constructs { definition: true, frontmatter: false, gfm_autolink_literal: false, + gfm_strikethrough: false, hard_break_escape: true, hard_break_trailing: true, heading_atx: true, @@ -292,13 +300,14 @@ impl Constructs { pub fn gfm() -> Self { Self { gfm_autolink_literal: true, + gfm_strikethrough: true, ..Self::default() } } } /// Configuration (optional). -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug)] pub struct Options { /// Whether to allow (dangerous) HTML. /// The default is `false`, you can turn it on to `true` for trusted @@ -358,6 +367,43 @@ pub struct Options { /// ``` pub allow_dangerous_protocol: bool, + /// Whether to support GFM strikethrough (if enabled in `constructs`) with + /// a single tilde (default: true). + /// + /// Single tildes work on github.com but are technically prohibited by GFM. + /// + /// ## Examples + /// + /// ``` + /// use micromark::{micromark, micromark_with_options, Options, Constructs}; + /// + /// // micromark supports single tildes by default: + /// assert_eq!( + /// micromark_with_options( + /// "~a~", + /// &Options { + /// constructs: Constructs::gfm(), + /// ..Options::default() + /// } + /// ), + /// "<p><del>a</del></p>" + /// ); + /// + /// // Pass `gfm_strikethrough_single_tilde: false` to turn that off: + /// assert_eq!( + /// micromark_with_options( + /// "~a~", + /// &Options { + /// constructs: Constructs::gfm(), + /// gfm_strikethrough_single_tilde: false, + /// ..Options::default() + /// } + /// ), + /// "<p>~a~</p>" + /// ); + /// ``` + pub gfm_strikethrough_single_tilde: bool, + /// Default line ending to use, for line endings not in `value`. /// /// Generally, micromark copies line endings (`\r`, `\n`, `\r\n`) in the @@ -427,6 +473,19 @@ pub struct Options { pub constructs: Constructs, } +impl Default for Options { + /// Safe `CommonMark` defaults. 
+ fn default() -> Self { + Self { + allow_dangerous_html: false, + allow_dangerous_protocol: false, + gfm_strikethrough_single_tilde: true, + default_line_ending: LineEnding::default(), + constructs: Constructs::default(), + } + } +} + /// Turn markdown into HTML. /// /// ## Examples diff --git a/src/parser.rs b/src/parser.rs index 404fd0f..afa08ac 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -4,7 +4,7 @@ use crate::event::{Event, Point}; use crate::state::{Name as StateName, State}; use crate::subtokenize::subtokenize; use crate::tokenizer::Tokenizer; -use crate::{Constructs, Options}; +use crate::Options; use alloc::{string::String, vec, vec::Vec}; /// Info needed, in all content types, when parsing markdown. @@ -13,7 +13,8 @@ use alloc::{string::String, vec, vec::Vec}; /// It also references the input value as bytes (`u8`). #[derive(Debug)] pub struct ParseState<'a> { - pub constructs: &'a Constructs, + /// Configuration. + pub options: &'a Options, /// List of chars. pub bytes: &'a [u8], /// Set of defined identifiers. @@ -25,7 +26,7 @@ pub struct ParseState<'a> { /// Passes the bytes back so the compiler can access the source. 
pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, &'a [u8]) { let mut parse_state = ParseState { - constructs: &options.constructs, + options, bytes: value.as_bytes(), definitions: vec![], }; diff --git a/tests/gfm_strikethrough.rs b/tests/gfm_strikethrough.rs new file mode 100644 index 0000000..f39be07 --- /dev/null +++ b/tests/gfm_strikethrough.rs @@ -0,0 +1,387 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, Constructs, Options}; +use pretty_assertions::assert_eq; + +#[test] +fn gfm_strikethrough() { + let gfm = Options { + constructs: Constructs::gfm(), + ..Options::default() + }; + + assert_eq!( + micromark("a ~b~ c"), + "<p>a ~b~ c</p>", + "should ignore strikethrough by default" + ); + + assert_eq!( + micromark_with_options("a ~b~", &gfm), + "<p>a <del>b</del></p>", + "should support strikethrough w/ one tilde" + ); + + assert_eq!( + micromark_with_options("a ~~b~~", &gfm), + "<p>a <del>b</del></p>", + "should support strikethrough w/ two tildes" + ); + + assert_eq!( + micromark_with_options("a ~~~b~~~", &gfm), + "<p>a ~~~b~~~</p>", + "should not support strikethrough w/ three tildes" + ); + + assert_eq!( + micromark_with_options("a \\~~~b~~ c", &gfm), + "<p>a ~<del>b</del> c</p>", + "should support strikethrough after an escaped tilde" + ); + + assert_eq!( + micromark_with_options("a ~~b ~~c~~ d~~ e", &gfm), + "<p>a <del>b <del>c</del> d</del> e</p>", + "should support nested strikethrough" + ); + + assert_eq!( + micromark_with_options("a ~-1~ b", &gfm), + "<p>a <del>-1</del> b</p>", + "should open if preceded by whitespace and followed by punctuation" + ); + + assert_eq!( + micromark_with_options("a ~b.~ c", &gfm), + "<p>a <del>b.</del> c</p>", + "should close if preceded by punctuation and followed by whitespace" + ); + + assert_eq!( + micromark_with_options("~b.~.", &gfm), + "<p><del>b.</del>.</p>", + "should close if preceded and followed by punctuation" + ); + + assert_eq!( + micromark_with_options( 
+ r###" +# Balanced + +a ~one~ b + +a ~~two~~ b + +a ~~~three~~~ b + +a ~~~~four~~~~ b + +# Unbalanced + +a ~one/two~~ b + +a ~one/three~~~ b + +a ~one/four~~~~ b + +*** + +a ~~two/one~ b + +a ~~two/three~~~ b + +a ~~two/four~~~~ b + +*** + +a ~~~three/one~ b + +a ~~~three/two~~ b + +a ~~~three/four~~~~ b + +*** + +a ~~~~four/one~ b + +a ~~~~four/two~~ b + +a ~~~~four/three~~~ b + +## Multiple + +a ~one b one~ c one~ d + +a ~one b two~~ c one~ d + +a ~one b one~ c two~~ d + +a ~~two b two~~ c two~~ d + +a ~~two b one~ c two~~ d + +a ~~two b two~~ c one~ d +"###, + &gfm + ), + r###"<h1>Balanced</h1> +<p>a <del>one</del> b</p> +<p>a <del>two</del> b</p> +<p>a ~~~three~~~ b</p> +<p>a ~~~~four~~~~ b</p> +<h1>Unbalanced</h1> +<p>a ~one/two~~ b</p> +<p>a ~one/three~~~ b</p> +<p>a ~one/four~~~~ b</p> +<hr /> +<p>a ~~two/one~ b</p> +<p>a ~~two/three~~~ b</p> +<p>a ~~two/four~~~~ b</p> +<hr /> +<p>a ~~~three/one~ b</p> +<p>a ~~~three/two~~ b</p> +<p>a ~~~three/four~~~~ b</p> +<hr /> +<p>a ~~~~four/one~ b</p> +<p>a ~~~~four/two~~ b</p> +<p>a ~~~~four/three~~~ b</p> +<h2>Multiple</h2> +<p>a <del>one b one</del> c one~ d</p> +<p>a <del>one b two~~ c one</del> d</p> +<p>a <del>one b one</del> c two~~ d</p> +<p>a <del>two b two</del> c two~~ d</p> +<p>a <del>two b one~ c two</del> d</p> +<p>a <del>two b two</del> c one~ d</p> +"###, + "should handle balance like GitHub" + ); + + assert_eq!( + micromark_with_options( + r###" +# Flank + +a oneRight~ b oneRight~ c oneRight~ d + +a oneRight~ b oneRight~ c ~oneLeft d + +a oneRight~ b ~oneLeft c oneRight~ d + +a ~oneLeft b oneRight~ c oneRight~ d + +a ~oneLeft b oneRight~ c ~oneLeft d + +a ~oneLeft b ~oneLeft c oneRight~ d + +a ~oneLeft b ~oneLeft c ~oneLeft d + +*** + +a twoRight~~ b twoRight~~ c twoRight~~ d + +a twoRight~~ b twoRight~~ c ~~twoLeft d + +a twoRight~~ b ~~twoLeft c twoRight~~ d + +a ~~twoLeft b twoRight~~ c twoRight~~ d + +a ~~twoLeft b twoRight~~ c ~~twoLeft d + +a ~~twoLeft b ~~twoLeft c twoRight~~ d + +a ~~twoLeft 
b ~~twoLeft c ~~twoLeft d +"###, + &gfm + ), + r###"<h1>Flank</h1> +<p>a oneRight~ b oneRight~ c oneRight~ d</p> +<p>a oneRight~ b oneRight~ c ~oneLeft d</p> +<p>a oneRight~ b <del>oneLeft c oneRight</del> d</p> +<p>a <del>oneLeft b oneRight</del> c oneRight~ d</p> +<p>a <del>oneLeft b oneRight</del> c ~oneLeft d</p> +<p>a ~oneLeft b <del>oneLeft c oneRight</del> d</p> +<p>a ~oneLeft b ~oneLeft c ~oneLeft d</p> +<hr /> +<p>a twoRight~~ b twoRight~~ c twoRight~~ d</p> +<p>a twoRight~~ b twoRight~~ c ~~twoLeft d</p> +<p>a twoRight~~ b <del>twoLeft c twoRight</del> d</p> +<p>a <del>twoLeft b twoRight</del> c twoRight~~ d</p> +<p>a <del>twoLeft b twoRight</del> c ~~twoLeft d</p> +<p>a ~~twoLeft b <del>twoLeft c twoRight</del> d</p> +<p>a ~~twoLeft b ~~twoLeft c ~~twoLeft d</p> +"###, + "should handle flanking like GitHub" + ); + + assert_eq!( + micromark_with_options( + r###" +# Interplay + +## Interleave with attention + +a ~~two *emphasis* two~~ b + +a ~~two **strong** two~~ b + +a *marker ~~two marker* two~~ b + +a ~~two *marker two~~ marker* b + +## Interleave with links + +a ~~two [resource](#) two~~ b + +a ~~two [reference][#] two~~ b + +a [label start ~~two label end](#) two~~ b + +a ~~two [label start two~~ label end](#) b + +a ~~two [label start ~one one~ label end](#) two~~ b + +a ~one [label start ~~two two~~ label end](#) one~ b + +a ~one [label start ~one one~ label end](#) one~ b + +a ~~two [label start ~~two two~~ label end](#) two~~ b + +[#]: # + +## Interleave with code (text) + +a ~~two `code` two~~ b + +a ~~two `code two~~` b + +a `code start ~~two code end` two~~ b + +a ~~two `code start two~~ code end` b + +a ~~two `code start ~one one~ code end` two~~ b + +a ~one `code start ~~two two~~ code end` one~ b + +a ~one `code start ~one one~ code end` one~ b + +a ~~two `code start ~~two two~~ code end` two~~ b + +## Emphasis/strong/strikethrough interplay + +a ***~~xxx~~*** zzz + +b ***xxx***zzz + +c **xxx**zzz + +d *xxx*zzz + +e ***~~xxx~~***yyy + +f **~~xxx~~**yyy + +g *~~xxx~~*yyy + +h ***~~xxx~~*** zzz + +i **~~xxx~~** zzz + +j *~~xxx~~* zzz + +k ~~~**xxx**~~~ zzz + +l ~~~xxx~~~zzz + +m ~~xxx~~zzz + +n ~xxx~zzz + +o ~~~**xxx**~~~yyy + +p ~~**xxx**~~yyy + +r ~**xxx**~yyy + +s ~~~**xxx**~~~ zzz + +t ~~**xxx**~~ zzz + +u ~**xxx**~ zzz +"###, + &gfm + ), + r###"<h1>Interplay</h1> +<h2>Interleave with attention</h2> +<p>a <del>two <em>emphasis</em> two</del> b</p> +<p>a <del>two <strong>strong</strong> two</del> b</p> +<p>a <em>marker ~~two marker</em> two~~ b</p> +<p>a <del>two *marker two</del> marker* b</p> +<h2>Interleave with links</h2> +<p>a <del>two <a href="#">resource</a> two</del> b</p> +<p>a <del>two <a href="#">reference</a> two</del> b</p> +<p>a <a href="#">label start ~~two label end</a> two~~ b</p> +<p>a ~~two <a href="#">label start two~~ label end</a> b</p> +<p>a <del>two <a href="#">label start <del>one one</del> label end</a> two</del> b</p> +<p>a <del>one <a href="#">label start <del>two two</del> label end</a> one</del> b</p> +<p>a <del>one <a href="#">label start <del>one one</del> label end</a> one</del> b</p> +<p>a <del>two <a href="#">label start <del>two two</del> label end</a> two</del> b</p> +<h2>Interleave with code (text)</h2> +<p>a <del>two <code>code</code> two</del> b</p> +<p>a ~~two <code>code two~~</code> b</p> +<p>a <code>code start ~~two code end</code> two~~ b</p> +<p>a ~~two <code>code start two~~ code end</code> b</p> +<p>a <del>two <code>code start ~one one~ code end</code> two</del> b</p> +<p>a <del>one <code>code start ~~two two~~ code end</code> one</del> b</p> +<p>a <del>one <code>code start ~one one~ code end</code> one</del> b</p> +<p>a <del>two <code>code start ~~two two~~ code end</code> two</del> b</p> +<h2>Emphasis/strong/strikethrough interplay</h2> +<p>a <em><strong><del>xxx</del></strong></em> zzz</p> +<p>b <em><strong>xxx</strong></em>zzz</p> +<p>c <strong>xxx</strong>zzz</p> +<p>d <em>xxx</em>zzz</p> +<p>e <em><strong><del>xxx</del></strong></em>yyy</p> 
+<p>f <strong><del>xxx</del></strong>yyy</p> +<p>g <em><del>xxx</del></em>yyy</p> +<p>h <em><strong><del>xxx</del></strong></em> zzz</p> +<p>i <strong><del>xxx</del></strong> zzz</p> +<p>j <em><del>xxx</del></em> zzz</p> +<p>k ~~~<strong>xxx</strong>~~~ zzz</p> +<p>l ~~~xxx~~~zzz</p> +<p>m <del>xxx</del>zzz</p> +<p>n <del>xxx</del>zzz</p> +<p>o ~~~<strong>xxx</strong>~~~yyy</p> +<p>p ~~<strong>xxx</strong>~~yyy</p> +<p>r ~<strong>xxx</strong>~yyy</p> +<p>s ~~~<strong>xxx</strong>~~~ zzz</p> +<p>t <del><strong>xxx</strong></del> zzz</p> +<p>u <del><strong>xxx</strong></del> zzz</p> +"###, + "should handle interplay like GitHub" + ); + + assert_eq!( + micromark_with_options( + "a ~b~ ~~c~~ d", + &Options { + constructs: Constructs::gfm(), + gfm_strikethrough_single_tilde: false, + ..Options::default() + } + ), + "<p>a ~b~ <del>c</del> d</p>", + "should not support strikethrough w/ one tilde if `singleTilde: false`" + ); + + assert_eq!( + micromark_with_options( + "a ~b~ ~~c~~ d", + &Options { + constructs: Constructs::gfm(), + gfm_strikethrough_single_tilde: true, + ..Options::default() + } + ), + "<p>a <del>b</del> <del>c</del> d</p>", + "should support strikethrough w/ one tilde if `singleTilde: true`" + ); +} |