aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct/attention.rs
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-08-22 11:50:42 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-08-22 11:50:42 +0200
commit351c69644bdbdf52c95e322904273657892920b5 (patch)
tree114a93ff760b522232f9f7290bc6f632b7250095 /src/construct/attention.rs
parent5e6829c2fb79c2b7f59e38f924e2b2900c52b5d5 (diff)
downloadmarkdown-rs-351c69644bdbdf52c95e322904273657892920b5.tar.gz
markdown-rs-351c69644bdbdf52c95e322904273657892920b5.tar.bz2
markdown-rs-351c69644bdbdf52c95e322904273657892920b5.zip
Add support for GFM strikethrough
Diffstat (limited to '')
-rw-r--r--src/construct/attention.rs206
1 files changed, 125 insertions, 81 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index ef960d4..526f58c 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -1,4 +1,5 @@
-//! Attention (emphasis and strong) occurs in the [text][] content type.
+//! Attention (emphasis, strong, optionally GFM strikethrough) occurs in the
+//! [text][] content type.
//!
//! ## Grammar
//!
@@ -7,24 +8,31 @@
//!
//! ```bnf
//! attention_sequence ::= 1*'*' | 1*'_'
+//! gfm_attention_sequence ::= 1*'~'
//! ```
//!
//! Sequences are matched together to form attention based on which character
-//! they contain, and what character occurs before and after each sequence.
+//! they contain, how long they are, and what character occurs before and after
+//! each sequence.
//! Otherwise they are turned into data.
//!
//! ## HTML
//!
-//! When sequences match, and two markers can be “taken” from them, they
-//! together relate to the `<strong>` element in HTML.
+//! When asterisk/underscore sequences match, and two markers can be “taken”
+//! from them, they together relate to the `<strong>` element in HTML.
//! When one marker can be taken, they relate to the `<em>` element.
//! See [*§ 4.5.2 The `em` element*][html-em] and
//! [*§ 4.5.3 The `strong` element*][html-strong] in the HTML spec for more
//! info.
//!
+//! When tilde sequences match, they together relate to the `<del>` element in
+//! HTML.
+//! See [*§ 4.7.2 The `del` element*][html-del] in the HTML spec for more info.
+//!
//! ## Recommendation
//!
-//! It is recommended to use asterisks for attention when writing markdown.
+//! It is recommended to use asterisks for emphasis/strong attention when
+//! writing markdown.
//!
//! There are some small differences in whether sequences can open and/or close
//! based on whether they are formed with asterisks or underscores.
@@ -37,11 +45,18 @@
//! can look for asterisks to find syntax while not worrying about other
//! characters.
//!
+//! For strikethrough attention, it is recommended to use two markers.
+//! While `github.com` allows single tildes too, the GFM spec technically
+//! prohibits them.
+//!
//! ## Tokens
//!
//! * [`Emphasis`][Name::Emphasis]
//! * [`EmphasisSequence`][Name::EmphasisSequence]
//! * [`EmphasisText`][Name::EmphasisText]
+//! * [`GfmStrikethrough`][Name::GfmStrikethrough]
+//! * [`GfmStrikethroughSequence`][Name::GfmStrikethroughSequence]
+//! * [`GfmStrikethroughText`][Name::GfmStrikethroughText]
//! * [`Strong`][Name::Strong]
//! * [`StrongSequence`][Name::StrongSequence]
//! * [`StrongText`][Name::StrongText]
@@ -52,11 +67,14 @@
//! ## References
//!
//! * [`attention.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/attention.js)
+//! * [`micromark-extension-gfm-strikethrough`](https://github.com/micromark/micromark-extension-gfm-strikethrough)
//! * [*§ 6.2 Emphasis and strong emphasis* in `CommonMark`](https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis)
+//! * [*§ 6.5 Strikethrough (extension)* in `GFM`](https://github.github.com/gfm/#strikethrough-extension-)
//!
//! [text]: crate::construct::text
//! [html-em]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-em-element
//! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element
+//! [html-del]: https://html.spec.whatwg.org/multipage/edits.html#the-del-element
use crate::event::{Event, Kind, Name, Point};
use crate::resolve::Name as ResolveName;
@@ -94,7 +112,11 @@ struct Sequence {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- if tokenizer.parse_state.constructs.attention && matches!(tokenizer.current, Some(b'*' | b'_'))
+ // Emphasis/strong:
+ if (tokenizer.parse_state.options.constructs.attention
+ && matches!(tokenizer.current, Some(b'*' | b'_')))
+ // GFM strikethrough:
+ || (tokenizer.parse_state.options.constructs.gfm_strikethrough && tokenizer.current == Some(b'~'))
{
tokenizer.tokenize_state.marker = tokenizer.current.unwrap();
tokenizer.enter(Name::AttentionSequence);
@@ -117,85 +139,15 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State {
} else {
tokenizer.exit(Name::AttentionSequence);
tokenizer.register_resolver(ResolveName::Attention);
- tokenizer.tokenize_state.marker = b'\0';
+ tokenizer.tokenize_state.marker = 0;
State::Ok
}
}
-/// Resolve attention sequences.
+/// Resolve sequences.
pub fn resolve(tokenizer: &mut Tokenizer) {
- let mut index = 0;
- let mut balance = 0;
- let mut sequences = vec![];
-
// Find all sequences, gather info about them.
- while index < tokenizer.events.len() {
- let enter = &tokenizer.events[index];
-
- if enter.kind == Kind::Enter {
- balance += 1;
-
- if enter.name == Name::AttentionSequence {
- let end = index + 1;
- let exit = &tokenizer.events[end];
-
- let before_end = enter.point.index;
- let before_start = if before_end < 4 { 0 } else { before_end - 4 };
- let char_before =
- String::from_utf8_lossy(&tokenizer.parse_state.bytes[before_start..before_end])
- .chars()
- .last();
-
- let after_start = exit.point.index;
- let after_end = if after_start + 4 > tokenizer.parse_state.bytes.len() {
- tokenizer.parse_state.bytes.len()
- } else {
- after_start + 4
- };
- let char_after =
- String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end])
- .chars()
- .next();
-
- let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
- .head()
- .unwrap();
- let before = classify_opt(char_before);
- let after = classify_opt(char_after);
- let open = after == CharacterKind::Other
- || (after == CharacterKind::Punctuation && before != CharacterKind::Other);
- // To do: GFM strikethrough?
- // || char_after == '~'
- let close = before == CharacterKind::Other
- || (before == CharacterKind::Punctuation && after != CharacterKind::Other);
- // To do: GFM strikethrough?
- // || char_before == '~'
-
- sequences.push(Sequence {
- index,
- balance,
- start_point: enter.point.clone(),
- end_point: exit.point.clone(),
- size: exit.point.index - enter.point.index,
- open: if marker == b'*' {
- open
- } else {
- open && (before != CharacterKind::Other || !close)
- },
- close: if marker == b'*' {
- close
- } else {
- close && (after != CharacterKind::Other || !open)
- },
- marker,
- });
- }
- } else {
- balance -= 1;
- }
-
- index += 1;
- }
+ let mut sequences = get_sequences(tokenizer);
// Now walk through them and match them.
let mut close = 0;
@@ -230,7 +182,20 @@ pub fn resolve(tokenizer: &mut Tokenizer) {
continue;
}
- // We’ve found a match!
+ // For GFM strikethrough:
+ // * both sequences must have the same size
+ // * more than 2 markers don’t work
+ // * one marker is prohibited by the spec, but supported by GH (only allowed here when `gfm_strikethrough_single_tilde` is on)
+ if sequence_close.marker == b'~'
+ && (sequence_close.size != sequence_open.size
+ || sequence_close.size > 2
+ || sequence_close.size == 1
+ && !tokenizer.parse_state.options.gfm_strikethrough_single_tilde)
+ {
+ continue;
+ }
+
+ // We found a match!
next_index = match_sequences(tokenizer, &mut sequences, open, close);
break;
@@ -253,7 +218,80 @@ pub fn resolve(tokenizer: &mut Tokenizer) {
tokenizer.map.consume(&mut tokenizer.events);
}
+/// Find all attention sequences in the events and gather info about them.
+fn get_sequences(tokenizer: &mut Tokenizer) -> Vec<Sequence> {
+ let mut index = 0;
+ let mut balance = 0;
+ let mut sequences = vec![];
+
+ while index < tokenizer.events.len() {
+ let enter = &tokenizer.events[index];
+
+ if enter.kind == Kind::Enter {
+ balance += 1;
+
+ if enter.name == Name::AttentionSequence {
+ let end = index + 1;
+ let exit = &tokenizer.events[end];
+
+ let before_end = enter.point.index;
+ let before_start = if before_end < 4 { 0 } else { before_end - 4 };
+ let after_start = exit.point.index;
+ let after_end = if after_start + 4 > tokenizer.parse_state.bytes.len() {
+ tokenizer.parse_state.bytes.len()
+ } else {
+ after_start + 4
+ };
+
+ let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
+ .head()
+ .unwrap();
+ let before = classify_opt(
+ String::from_utf8_lossy(&tokenizer.parse_state.bytes[before_start..before_end])
+ .chars()
+ .last(),
+ );
+ let after = classify_opt(
+ String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end])
+ .chars()
+ .next(),
+ );
+ let open = after == CharacterKind::Other
+ || (after == CharacterKind::Punctuation && before != CharacterKind::Other);
+ let close = before == CharacterKind::Other
+ || (before == CharacterKind::Punctuation && after != CharacterKind::Other);
+
+ sequences.push(Sequence {
+ index,
+ balance,
+ start_point: enter.point.clone(),
+ end_point: exit.point.clone(),
+ size: exit.point.index - enter.point.index,
+ open: if marker == b'_' {
+ open && (before != CharacterKind::Other || !close)
+ } else {
+ open
+ },
+ close: if marker == b'_' {
+ close && (after != CharacterKind::Other || !open)
+ } else {
+ close
+ },
+ marker,
+ });
+ }
+ } else {
+ balance -= 1;
+ }
+
+ index += 1;
+ }
+
+ sequences
+}
+
/// Match two sequences.
+#[allow(clippy::too_many_lines)]
fn match_sequences(
tokenizer: &mut Tokenizer,
sequences: &mut Vec<Sequence>,
@@ -292,7 +330,13 @@ fn match_sequences(
between += 1;
}
- let (group_name, seq_name, text_name) = if take == 1 {
+ let (group_name, seq_name, text_name) = if sequences[open].marker == b'~' {
+ (
+ Name::GfmStrikethrough,
+ Name::GfmStrikethroughSequence,
+ Name::GfmStrikethroughText,
+ )
+ } else if take == 1 {
(Name::Emphasis, Name::EmphasisSequence, Name::EmphasisText)
} else {
(Name::Strong, Name::StrongSequence, Name::StrongText)