Add docs to attention

author: Titus Wormer <tituswormer@gmail.com> 2022-07-05 09:45:30 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-07-05 09:45:30 +0200
commit: 1baad17d92108de592905f8c69bf29fbec02a57b (patch)
tree: 45e0d72d7a7f1439aa99a07eee3b919a258dd9f8 /src/construct/attention.rs
parent: 63d8645773ee0bf34363e46384ff1149c985bf28 (diff)
download: markdown-rs-1baad17d92108de592905f8c69bf29fbec02a57b.tar.gz
markdown-rs-1baad17d92108de592905f8c69bf29fbec02a57b.tar.bz2
markdown-rs-1baad17d92108de592905f8c69bf29fbec02a57b.zip
1 files changed, 180 insertions, 66 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index f4bb841..dff8633 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -1,25 +1,119 @@
-//! To do.
+//! Attention is a construct that occurs in the [text][] content type.
+//!
+//! How attention parses is too complex to explain in BNF.
+//! Essentially, one or more of `*` or `_` form attention sequences.
+//! Depending on the code before and after a sequence, it can open or close
+//! attention.
+//! When everything is parsed, we find each sequence that can close, and a
+//! corresponding sequence that can open which uses the same marker.
+//! If both sequences have two or more markers, strong is formed.
+//! Otherwise emphasis is formed.
+//!
+//! Attention sequences do not, on their own, relate to anything in HTML.
+//! When matched with another sequence, and two markers can be “taken” from
+//! them, they together relate to the `<strong>` element in HTML.
+//! When one marker can be taken, they relate to the `<em>` element.
+//! See [*§ 4.5.2 The `em` element*][html-em] and
+//! [*§ 4.5.3 The `strong` element*][html-strong] in the HTML spec for more
+//! info.
+//!
+//! It is recommended to use asterisks for attention when writing markdown.
+//!
+//! There are some small differences in whether sequences can open and/or close
+//! based on whether they are formed with asterisks or underscores.
+//! Because underscores also frequently occur in natural language inside words,
+//! while asterisks typically never do, `CommonMark` prohibits underscore
+//! sequences from opening or closing when *inside* a word.
+//!
+//! Because asterisks can be used to form the most markdown constructs, using
+//! them has the added benefit of making it easier to gloss over markdown: you
+//! can look for asterisks to find syntax while not worrying about other
+//! characters.
+//!
+//! ## Tokens
+//!
+//! *   [`Emphasis`][TokenType::Emphasis]
+//! *   [`EmphasisSequence`][TokenType::EmphasisSequence]
+//! *   [`EmphasisText`][TokenType::EmphasisText]
+//! *   [`Strong`][TokenType::Strong]
+//! *   [`StrongSequence`][TokenType::StrongSequence]
+//! *   [`StrongText`][TokenType::StrongText]
+//!
+//! > 👉 **Note**: while parsing, [`AttentionSequence`][TokenType::AttentionSequence]
+//! > is used, which is later compiled away.
+//!
+//! ## References
+//!
+//! *   [`attention.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/attention.js)
+//! *   [*§ 6.2 Emphasis and strong emphasis* in `CommonMark`](https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis)
+//!
+//! [text]: crate::content::text
+//! [html-em]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-em-element
+//! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element
 
 use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer};
 use crate::unicode::PUNCTUATION;
 use crate::util::edit_map::EditMap;
 
-/// To do
+/// Character code kinds.
 #[derive(Debug, PartialEq)]
 enum GroupKind {
+    /// Whitespace.
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// > | **a_b_ c**.
+    ///    ^      ^    ^
+    /// ```
     Whitespace,
+    /// Punctuation.
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// > | **a_b_ c**.
+    ///     ^^ ^ ^    ^
+    /// ```
     Punctuation,
+    /// Everything else.
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// > | **a_b_ c**.
+    ///       ^ ^  ^
+    /// ```
     Other,
 }
 
-/// To do
+/// Type of sequence.
 #[derive(Debug, PartialEq)]
 enum MarkerKind {
+    /// In a run with asterisks.
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// *a*
+    /// ```
     Asterisk,
+    /// In a run with underscores.
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// _a_
+    /// ```
     Underscore,
 }
 
 impl MarkerKind {
+    /// Turn [char] into a kind.
+    ///
+    /// ## Panics
+    ///
+    /// Panics if `char` is not `*` or `_`.
     fn from_char(char: char) -> MarkerKind {
         match char {
             '*' => MarkerKind::Asterisk,
@@ -27,6 +121,11 @@ impl MarkerKind {
             _ => unreachable!("invalid char"),
         }
     }
+    /// Turn [Code] into a kind.
+    ///
+    /// ## Panics
+    ///
+    /// Panics if `code` is not `Code::Char('*' | '_')`.
     fn from_code(code: Code) -> MarkerKind {
         match code {
             Code::Char(char) => MarkerKind::from_char(char),
@@ -35,9 +134,9 @@ impl MarkerKind {
     }
 }
 
-/// To do
+/// Attention sequence that we can take markers from.
 #[derive(Debug)]
-struct Run {
+struct Sequence {
     marker: MarkerKind,
     event_index: usize,
     start_point: Point,
@@ -49,10 +148,10 @@ struct Run {
     close: bool,
 }
 
-/// Before a paragraph.
+/// Before a sequence.
 ///
 /// ```markdown
-/// |qwe
+/// |**
 /// ```
 pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
@@ -64,10 +163,10 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     }
 }
 
-/// In a paragraph.
+/// In a sequence.
 ///
 /// ```markdown
-/// al|pha
+/// *|*
 /// ```
 fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult {
     match code {
@@ -83,9 +182,9 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult
     }
 }
 
-/// To do.
+/// Resolve attention sequences.
 #[allow(clippy::too_many_lines)]
-pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
+fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
     let mut index = 0;
     println!("before: {:?}", tokenizer.events.len());
     while index < tokenizer.events.len() {
@@ -105,9 +204,9 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
     let codes = &tokenizer.parse_state.codes;
     let mut edit_map = EditMap::new();
     let mut start = 0;
-    let mut runs: Vec<Run> = vec![];
+    let mut sequences: Vec<Sequence> = vec![];
 
-    // Find runs of sequences and information about them.
+    // Find attention sequences and information about them.
     while start < tokenizer.events.len() {
         let enter = &tokenizer.events[start];
 
@@ -135,7 +234,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
             // To do: GFM strikethrough?
             // || attentionMarkers.includes(previous)
 
-            runs.push(Run {
+            sequences.push(Sequence {
                 event_index: start,
                 start_point: enter.point.clone(),
                 start_index: enter.index,
@@ -161,35 +260,35 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
         start += 1;
     }
 
-    // Walk through runs and match them.
+    // Walk through sequences and match them.
     let mut close = 0;
 
-    while close < runs.len() {
-        let run_close = &runs[close];
+    while close < sequences.len() {
+        let sequence_close = &sequences[close];
         let mut next_index = close + 1;
-        println!("walk! {:?} {:?}", close, runs.len());
+        println!("walk! {:?} {:?}", close, sequences.len());
 
-        // Find a run that can close.
-        if run_close.close {
-            println!("close! {:?} {:?}", close, run_close);
+        // Find a sequence that can close.
+        if sequence_close.close {
+            println!("close! {:?} {:?}", close, sequence_close);
             let mut open = close;
 
             // Now walk back to find an opener.
             while open > 0 {
                 open -= 1;
 
-                let run_open = &runs[open];
+                let sequence_open = &sequences[open];
 
-                // We found a run that can open the closer we found.
-                if run_open.open && run_close.marker == run_open.marker {
-                    println!("open! {:?} {:?}", open, run_open);
+                // We found a sequence that can open the closer we found.
+                if sequence_open.open && sequence_close.marker == sequence_open.marker {
+                    println!("open! {:?} {:?}", open, sequence_open);
                     // If the opening can close or the closing can open,
                     // and the close size *is not* a multiple of three,
                     // but the sum of the opening and closing size *is*
                     // multiple of three, then **don’t** match.
-                    if (run_open.close || run_close.open)
-                        && run_close.size % 3 != 0
-                        && (run_open.size + run_close.size) % 3 == 0
+                    if (sequence_open.close || sequence_close.open)
+                        && sequence_close.size % 3 != 0
+                        && (sequence_open.size + sequence_close.size) % 3 == 0
                     {
                         continue;
                     }
@@ -197,34 +296,40 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
                     // We’ve found a match!
 
                     // Number of markers to use from the sequence.
-                    let take = if run_open.size > 1 && run_close.size > 1 {
+                    let take = if sequence_open.size > 1 && sequence_close.size > 1 {
                         2
                     } else {
                         1
                     };
 
-                    let run_close = &mut runs[close];
-                    let close_event_index = run_close.event_index;
-                    let seq_close_enter = (run_close.start_point.clone(), run_close.start_index);
-                    run_close.size -= take;
-                    run_close.start_point.column += take;
-                    run_close.start_point.offset += take;
-                    run_close.start_index += take;
-                    let seq_close_exit = (run_close.start_point.clone(), run_close.start_index);
+                    let sequence_close = &mut sequences[close];
+                    let close_event_index = sequence_close.event_index;
+                    let seq_close_enter = (
+                        sequence_close.start_point.clone(),
+                        sequence_close.start_index,
+                    );
+                    sequence_close.size -= take;
+                    sequence_close.start_point.column += take;
+                    sequence_close.start_point.offset += take;
+                    sequence_close.start_index += take;
+                    let seq_close_exit = (
+                        sequence_close.start_point.clone(),
+                        sequence_close.start_index,
+                    );
 
-                    // Stay on this closing run for the next iteration: it
+                    // Stay on this closing sequence for the next iteration: it
                     // might close more things.
                     next_index -= 1;
 
-                    // Remove closing run if fully used.
-                    if run_close.size == 0 {
-                        runs.remove(close);
+                    // Remove closing sequence if fully used.
+                    if sequence_close.size == 0 {
+                        sequences.remove(close);
                         edit_map.add(close_event_index, 2, vec![]);
                         println!("remove close");
                     } else {
-                        // Shift remaining closing run forward.
-                        // Do it here because a run can open and close different
-                        // other runs, and the remainder can be on any side or
+                        // Shift remaining closing sequence forward.
+                        // Do it here because a sequence can open and close different
+                        // other sequences, and the remainder can be on any side or
                         // somewhere in the middle.
                         let mut enter = &mut tokenizer.events[close_event_index];
                         enter.point = seq_close_exit.0.clone();
@@ -232,23 +337,23 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
                         println!("change close");
                     }
 
-                    let run_open = &mut runs[open];
-                    let open_event_index = run_open.event_index;
-                    let seq_open_exit = (run_open.end_point.clone(), run_open.end_index);
-                    run_open.size -= take;
-                    run_open.end_point.column -= take;
-                    run_open.end_point.offset -= take;
-                    run_open.end_index -= take;
-                    let seq_open_enter = (run_open.end_point.clone(), run_open.end_index);
-
-                    // Remove opening run if fully used.
-                    if run_open.size == 0 {
-                        runs.remove(open);
+                    let sequence_open = &mut sequences[open];
+                    let open_event_index = sequence_open.event_index;
+                    let seq_open_exit = (sequence_open.end_point.clone(), sequence_open.end_index);
+                    sequence_open.size -= take;
+                    sequence_open.end_point.column -= take;
+                    sequence_open.end_point.offset -= take;
+                    sequence_open.end_index -= take;
+                    let seq_open_enter = (sequence_open.end_point.clone(), sequence_open.end_index);
+
+                    // Remove opening sequence if fully used.
+                    if sequence_open.size == 0 {
+                        sequences.remove(open);
                         edit_map.add(open_event_index, 2, vec![]);
                         next_index -= 1;
                         println!("remove open");
                     } else {
-                        // Shift remaining opening run backwards.
+                        // Shift remaining opening sequence backwards.
                         // See note above for why that happens here.
                         let mut exit = &mut tokenizer.events[open_event_index + 1];
                         exit.point = seq_open_enter.0.clone();
@@ -385,10 +490,10 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
 
     // Mark remaining sequences as data.
     let mut index = 0;
-    while index < runs.len() {
-        let run = &runs[index];
-        tokenizer.events[run.event_index].token_type = TokenType::Data;
-        tokenizer.events[run.event_index + 1].token_type = TokenType::Data;
+    while index < sequences.len() {
+        let sequence = &sequences[index];
+        tokenizer.events[sequence.event_index].token_type = TokenType::Data;
+        tokenizer.events[sequence.event_index + 1].token_type = TokenType::Data;
         index += 1;
     }
 
@@ -412,17 +517,26 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
     events
 }
 
+/// Classify whether a character code represents whitespace, punctuation, or
+/// something else.
+///
+/// Used for attention (emphasis, strong), whose sequences can open or close
+/// based on the class of surrounding characters.
+///
+/// > 👉 **Note** that eof (`Code::None`) is seen as whitespace.
+///
+/// ## References
+///
+/// *   [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
 fn classify_character(code: Code) -> GroupKind {
     match code {
-        // Markdown whitespace.
-        Code::None
-        | Code::CarriageReturnLineFeed
-        | Code::VirtualSpace
-        | Code::Char('\t' | '\r' | '\n' | ' ') => GroupKind::Whitespace,
+        // Custom characters.
+        Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace => GroupKind::Whitespace,
         // Unicode whitespace.
         Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace,
         // Unicode punctuation.
         Code::Char(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation,
+        // Everything else.
         Code::Char(_) => GroupKind::Other,
     }
 }
author	Titus Wormer <tituswormer@gmail.com>	2022-07-05 09:45:30 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-07-05 09:45:30 +0200
commit	1baad17d92108de592905f8c69bf29fbec02a57b (patch)
tree	45e0d72d7a7f1439aa99a07eee3b919a258dd9f8 /src/construct/attention.rs
parent	63d8645773ee0bf34363e46384ff1149c985bf28 (diff)
download	markdown-rs-1baad17d92108de592905f8c69bf29fbec02a57b.tar.gz markdown-rs-1baad17d92108de592905f8c69bf29fbec02a57b.tar.bz2 markdown-rs-1baad17d92108de592905f8c69bf29fbec02a57b.zip