From faca28020f4894bdfcf5a4b164ebbc75864d8776 Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Mon, 4 Jul 2022 12:16:51 +0200
Subject: Add support for attention (emphasis, strong)

---
 src/compiler.rs                |  24 +++
 src/construct/attention.rs     | 401 +++++++++++++++++++++++++++++++++++++++++
 src/construct/mod.rs           |   3 +-
 src/construct/partial_label.rs |   5 +-
 src/content/text.rs            |  18 +-
 src/tokenizer.rs               |   9 +
 6 files changed, 449 insertions(+), 11 deletions(-)
 create mode 100644 src/construct/attention.rs


diff --git a/src/compiler.rs b/src/compiler.rs
index 1f16648..061d3e3 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -421,6 +421,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
     enter_map.insert(TokenType::CodeIndented, on_enter_code_indented);
     enter_map.insert(TokenType::CodeFenced, on_enter_code_fenced);
     enter_map.insert(TokenType::CodeText, on_enter_code_text);
+    enter_map.insert(TokenType::Emphasis, on_enter_emphasis);
     enter_map.insert(TokenType::HtmlFlow, on_enter_html_flow);
     enter_map.insert(TokenType::HtmlText, on_enter_html_text);
     enter_map.insert(TokenType::Image, on_enter_image);
@@ -431,6 +432,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
         on_enter_resource_destination_string,
     );
     enter_map.insert(TokenType::Paragraph, on_enter_paragraph);
+    enter_map.insert(TokenType::Strong, on_enter_strong);
     enter_map.insert(TokenType::Definition, on_enter_definition);
     enter_map.insert(
         TokenType::DefinitionDestinationString,
@@ -441,6 +443,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
     enter_map.insert(TokenType::DefinitionTitleString, on_enter_buffer);
 
     let mut exit_map: Map = HashMap::new();
+    exit_map.insert(TokenType::Emphasis, on_exit_emphasis);
     exit_map.insert(TokenType::Label, on_exit_label);
     exit_map.insert(TokenType::LabelText, on_exit_label_text);
     exit_map.insert(TokenType::ReferenceString, on_exit_reference_string);
@@ -452,6 +455,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
         TokenType::ResourceTitleString,
         on_exit_resource_title_string,
     );
+    exit_map.insert(TokenType::Strong, on_exit_strong);
     exit_map.insert(TokenType::Image, on_exit_media);
     exit_map.insert(TokenType::Link, on_exit_media);
     exit_map.insert(TokenType::CodeTextData, on_exit_data);
@@ -644,6 +648,11 @@ fn on_enter_definition_destination_string(context: &mut CompileContext) {
     context.ignore_encode = true;
 }
 
+/// Handle [`Enter`][EventType::Enter]:[`Emphasis`][TokenType::Emphasis].
+fn on_enter_emphasis(context: &mut CompileContext) {
+    context.tag("<em>".to_string());
+}
+
 /// Handle [`Enter`][EventType::Enter]:[`HtmlFlow`][TokenType::HtmlFlow].
 fn on_enter_html_flow(context: &mut CompileContext) {
     context.line_ending_if_needed();
@@ -704,6 +713,11 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) {
     context.ignore_encode = true;
 }
 
+/// Handle [`Enter`][EventType::Enter]:[`Strong`][TokenType::Strong].
+fn on_enter_strong(context: &mut CompileContext) {
+    context.tag("<strong>".to_string());
+}
+
 /// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][TokenType::AutolinkEmail].
 fn on_exit_autolink_email(context: &mut CompileContext) {
     let slice = serialize(
@@ -933,6 +947,11 @@ fn on_exit_definition_title_string(context: &mut CompileContext) {
     definition.title = Some(buf);
 }
 
+/// Handle [`Exit`][EventType::Exit]:[`Emphasis`][TokenType::Emphasis].
+fn on_exit_emphasis(context: &mut CompileContext) {
+    context.tag("</em>".to_string());
+}
+
 /// Handle [`Exit`][EventType::Exit]:[`HeadingAtx`][TokenType::HeadingAtx].
 fn on_exit_heading_atx(context: &mut CompileContext) {
     let rank = context
@@ -1132,6 +1151,11 @@ fn on_exit_resource_title_string(context: &mut CompileContext) {
     media.title = Some(buf);
 }
 
+/// Handle [`Exit`][EventType::Exit]:[`Strong`][TokenType::Strong].
+fn on_exit_strong(context: &mut CompileContext) {
+    context.tag("</strong>".to_string());
+}
+
 /// Handle [`Exit`][EventType::Exit]:[`ThematicBreak`][TokenType::ThematicBreak].
 fn on_exit_thematic_break(context: &mut CompileContext) {
     context.tag("<hr />".to_string());
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
new file mode 100644
index 0000000..f022e6e
--- /dev/null
+++ b/src/construct/attention.rs
@@ -0,0 +1,401 @@
+//! Attention (emphasis, strong), which occurs in the text content type.
+
+use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::edit_map::EditMap;
+
+/// Character groups used to decide whether a sequence can open or close.
+#[derive(Debug, PartialEq)]
+enum GroupKind {
+    Whitespace,
+    Punctuation,
+    Other,
+}
+
+/// Type of attention marker: `*` or `_`.
+#[derive(Debug, PartialEq)]
+enum MarkerKind {
+    Asterisk,
+    Underscore,
+}
+
+impl MarkerKind {
+    fn from_char(char: char) -> MarkerKind {
+        match char {
+            '*' => MarkerKind::Asterisk,
+            '_' => MarkerKind::Underscore,
+            _ => unreachable!("invalid char"),
+        }
+    }
+    fn from_code(code: Code) -> MarkerKind {
+        match code {
+            Code::Char(char) => MarkerKind::from_char(char),
+            _ => unreachable!("invalid code"),
+        }
+    }
+}
+
+/// A run (sequence) of attention markers, with flanking info.
+#[derive(Debug)]
+struct Run {
+    marker: MarkerKind,
+    event_index: usize,
+    start_point: Point,
+    start_index: usize,
+    end_point: Point,
+    end_index: usize,
+    size: usize,
+    open: bool,
+    close: bool,
+}
+
+/// Before a sequence of attention markers.
+///
+/// ```markdown
+/// |**
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == '*' || char == '_' => {
+            tokenizer.enter(TokenType::AttentionSequence);
+            inside(tokenizer, code, char)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// In a sequence of attention markers.
+///
+/// ```markdown
+/// *|*
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == marker => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(move |t, c| inside(t, c, marker))), None)
+        }
+        _ => {
+            tokenizer.exit(TokenType::AttentionSequence);
+            tokenizer.register_resolver("attention".to_string(), Box::new(resolve));
+            (State::Ok, Some(vec![code]))
+        }
+    }
+}
+
+/// Resolve attention sequences: classify them, match openers to closers, and map matched pairs to emphasis/strong events.
+#[allow(clippy::too_many_lines)]
+pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
+    let mut index = 0;
+    println!("before: {:?}", tokenizer.events.len());
+    while index < tokenizer.events.len() {
+        let event = &tokenizer.events[index];
+        println!(
+            "ev: {:?} {:?} {:?} {:?} {:?} {:?}",
+            index,
+            event.event_type,
+            event.token_type,
+            event.content_type,
+            event.previous,
+            event.next
+        );
+        index += 1;
+    }
+
+    let codes = &tokenizer.parse_state.codes;
+    let mut edit_map = EditMap::new();
+    let mut start = 0;
+    let mut runs: Vec<Run> = vec![];
+
+    // Find runs of sequences and information about them.
+    while start < tokenizer.events.len() {
+        let enter = &tokenizer.events[start];
+
+        if enter.event_type == EventType::Enter && enter.token_type == TokenType::AttentionSequence
+        {
+            let end = start + 1;
+            let exit = &tokenizer.events[end];
+            let marker = MarkerKind::from_code(codes[enter.index]);
+            let before = classify_character(if enter.index > 0 {
+                codes[enter.index - 1]
+            } else {
+                Code::None
+            });
+            let after = classify_character(if exit.index < codes.len() {
+                codes[exit.index]
+            } else {
+                Code::None
+            });
+            let open = after == GroupKind::Other
+                || (after == GroupKind::Punctuation && before != GroupKind::Other);
+            // To do: GFM strikethrough?
+            // || attentionMarkers.includes(code)
+            let close = before == GroupKind::Other
+                || (before == GroupKind::Punctuation && after != GroupKind::Other);
+            // To do: GFM strikethrough?
+            // || attentionMarkers.includes(previous)
+
+            runs.push(Run {
+                event_index: start,
+                start_point: enter.point.clone(),
+                start_index: enter.index,
+                end_point: exit.point.clone(),
+                end_index: exit.index,
+                size: exit.index - enter.index,
+                open: if marker == MarkerKind::Asterisk {
+                    open
+                } else {
+                    open && (before != GroupKind::Other || !close)
+                },
+                close: if marker == MarkerKind::Asterisk {
+                    close
+                } else {
+                    close && (after != GroupKind::Other || !open)
+                },
+                marker,
+            });
+
+            start += 1;
+        }
+
+        start += 1;
+    }
+
+    // Walk through runs and match them.
+    let mut close = 0;
+
+    while close < runs.len() {
+        let run_close = &runs[close];
+
+        // Find a run that can close.
+        if run_close.close {
+            let mut open = close;
+
+            // Now walk back to find an opener.
+            while open > 0 {
+                open -= 1;
+
+                let run_open = &runs[open];
+
+                // Find a token that can open the closer.
+                if run_open.open && run_close.marker == run_open.marker {
+                    // If the opening can close or the closing can open,
+                    // and the close size *is not* a multiple of three,
+                    // but the sum of the opening and closing size *is*
+                    // multiple of three, then **don’t** match.
+                    if (run_open.close || run_close.open)
+                        && run_close.size % 3 != 0
+                        && (run_open.size + run_close.size) % 3 == 0
+                    {
+                        continue;
+                    }
+
+                    // Number of markers to use from the sequence.
+                    let take = if run_open.size > 1 && run_close.size > 1 {
+                        2
+                    } else {
+                        1
+                    };
+
+                    let run_close = &mut runs[close];
+                    let close_event_index = run_close.event_index;
+                    let seq_close_enter = (run_close.start_point.clone(), run_close.start_index);
+                    run_close.size -= take;
+                    run_close.start_point.column += take;
+                    run_close.start_point.offset += take;
+                    let seq_close_exit = (run_close.start_point.clone(), run_close.start_index);
+
+                    // Remove closing run if fully used.
+                    if run_close.size == 0 {
+                        runs.remove(close);
+                        edit_map.add(close_event_index, 2, vec![]);
+                    }
+
+                    let run_open = &mut runs[open];
+                    let open_event_index = run_open.event_index;
+                    let seq_open_exit = (run_open.end_point.clone(), run_open.end_index);
+                    run_open.size -= take;
+                    run_open.end_point.column -= take;
+                    run_open.end_point.offset -= take;
+                    let seq_open_enter = (run_open.end_point.clone(), run_open.end_index);
+
+                    // Remove opening run if fully used.
+                    if run_open.size == 0 {
+                        runs.remove(open);
+                        edit_map.add(open_event_index, 2, vec![]);
+                    }
+
+                    // Opening.
+                    edit_map.add(
+                        open_event_index,
+                        0,
+                        vec![
+                            Event {
+                                event_type: EventType::Enter,
+                                token_type: if take == 1 {
+                                    TokenType::Emphasis
+                                } else {
+                                    TokenType::Strong
+                                },
+                                point: seq_open_enter.0.clone(),
+                                index: seq_open_enter.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                            Event {
+                                event_type: EventType::Enter,
+                                token_type: if take == 1 {
+                                    TokenType::EmphasisSequence
+                                } else {
+                                    TokenType::StrongSequence
+                                },
+                                point: seq_open_enter.0.clone(),
+                                index: seq_open_enter.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                            Event {
+                                event_type: EventType::Exit,
+                                token_type: if take == 1 {
+                                    TokenType::EmphasisSequence
+                                } else {
+                                    TokenType::StrongSequence
+                                },
+                                point: seq_open_exit.0.clone(),
+                                index: seq_open_exit.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                            Event {
+                                event_type: EventType::Enter,
+                                token_type: if take == 1 {
+                                    TokenType::EmphasisText
+                                } else {
+                                    TokenType::StrongText
+                                },
+                                point: seq_open_exit.0.clone(),
+                                index: seq_open_exit.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                        ],
+                    );
+                    // Closing.
+                    edit_map.add(
+                        close_event_index,
+                        0,
+                        vec![
+                            Event {
+                                event_type: EventType::Exit,
+                                token_type: if take == 1 {
+                                    TokenType::EmphasisText
+                                } else {
+                                    TokenType::StrongText
+                                },
+                                point: seq_close_enter.0.clone(),
+                                index: seq_close_enter.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                            Event {
+                                event_type: EventType::Enter,
+                                token_type: if take == 1 {
+                                    TokenType::EmphasisSequence
+                                } else {
+                                    TokenType::StrongSequence
+                                },
+                                point: seq_close_enter.0.clone(),
+                                index: seq_close_enter.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                            Event {
+                                event_type: EventType::Exit,
+                                token_type: if take == 1 {
+                                    TokenType::EmphasisSequence
+                                } else {
+                                    TokenType::StrongSequence
+                                },
+                                point: seq_close_exit.0.clone(),
+                                index: seq_close_exit.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                            Event {
+                                event_type: EventType::Exit,
+                                token_type: if take == 1 {
+                                    TokenType::Emphasis
+                                } else {
+                                    TokenType::Strong
+                                },
+                                point: seq_close_exit.0.clone(),
+                                index: seq_close_exit.1,
+                                previous: None,
+                                next: None,
+                                content_type: None,
+                            },
+                        ],
+                    );
+
+                    break;
+                }
+            }
+        }
+
+        close += 1;
+    }
+
+    // Mark remaining sequences as data.
+    let mut index = 0;
+    while index < runs.len() {
+        let run = &runs[index];
+        // To do: resize!
+        tokenizer.events[run.event_index].token_type = TokenType::Data;
+        tokenizer.events[run.event_index + 1].token_type = TokenType::Data;
+
+        index += 1;
+    }
+
+    let events = edit_map.consume(&mut tokenizer.events);
+    let mut index = 0;
+    println!("after: {:?}", events.len());
+    while index < events.len() {
+        let event = &events[index];
+        println!(
+            "ev: {:?} {:?} {:?} {:?} {:?} {:?}",
+            index,
+            event.event_type,
+            event.token_type,
+            event.content_type,
+            event.previous,
+            event.next
+        );
+        index += 1;
+    }
+
+    events
+}
+
+fn classify_character(code: Code) -> GroupKind {
+    match code {
+        // Markdown whitespace.
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\t' | '\r' | '\n' | ' ') => GroupKind::Whitespace,
+        // Unicode whitespace.
+        Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace,
+        // Unicode punctuation.
+        // To do: Rust’s standard library has no `char::is_punctuation`, so only
+        // ASCII punctuation is classified for now; we may need to generate a table like:
+        // <https://github.com/micromark/micromark/blob/main/packages/micromark-util-character/dev/lib/unicode-punctuation-regex.js>.
+        Code::Char(char) if char.is_ascii_punctuation() => GroupKind::Punctuation,
+        Code::Char(_) => GroupKind::Other,
+    }
+}
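The `open`/`close` flags computed in `resolve` follow CommonMark’s left-/right-flanking rules, based on `classify_character` applied to the code before and after a run. A minimal, self-contained sketch of that decision (`Kind` and `flanking` are illustrative names, not the patch’s API):

```rust
// Sketch of the flanking decision used in `resolve`; illustrative names only.
#[derive(PartialEq)]
enum Kind {
    Whitespace,
    Punctuation,
    Other,
}

/// Whether a run can open (left-flanking) or close (right-flanking), given
/// the classified characters before and after it.
fn flanking(before: Kind, after: Kind) -> (bool, bool) {
    let open = after == Kind::Other || (after == Kind::Punctuation && before != Kind::Other);
    let close = before == Kind::Other || (before == Kind::Punctuation && after != Kind::Other);
    (open, close)
}

fn main() {
    // `foo *bar`: whitespace before, letter after -> opens, does not close.
    assert_eq!(flanking(Kind::Whitespace, Kind::Other), (true, false));
    // `bar* baz`: letter before, whitespace after -> closes, does not open.
    assert_eq!(flanking(Kind::Other, Kind::Whitespace), (false, true));
    // `"*hi*"`: punctuation before, letter after -> opens only.
    assert_eq!(flanking(Kind::Punctuation, Kind::Other), (true, false));
    // `foo*bar`: letters on both sides -> both (for `*`; the patch restricts
    // intraword `_` further).
    assert_eq!(flanking(Kind::Other, Kind::Other), (true, true));
    println!("flanking checks pass");
}
```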
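When matching a closer to an opener, the resolver applies the “multiple of three” restriction from CommonMark emphasis rules 9 and 10. A small sketch of just that check, using the same arithmetic as the code above (function name is illustrative):

```rust
/// Sketch of the "multiple of three" restriction applied when pairing an
/// opening and a closing delimiter run.
fn can_pair(open_size: usize, close_size: usize, open_can_close: bool, close_can_open: bool) -> bool {
    let blocked = (open_can_close || close_can_open)
        && close_size % 3 != 0
        && (open_size + close_size) % 3 == 0;
    !blocked
}

fn main() {
    // `*foo**bar*`: the `**` run sits between letters, so it could open *and*
    // close; its size (2) plus the closing `*` (1) is a multiple of three, so
    // they do not pair, and the outer `*…*` wins: `<em>foo**bar</em>`.
    assert!(!can_pair(2, 1, true, false));
    // `**foo*`: neither run is both-flanking, so the rule does not apply and
    // the runs pair on one marker each: `*<em>foo</em>`.
    assert!(can_pair(2, 1, false, false));
    println!("rule-of-three checks pass");
}
```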
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 9e3dfb0..66b2a3c 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -14,7 +14,7 @@
 //!
 //! The following constructs are found in markdown:
 //!
-//! *   attention (strong, emphasis)
+//! *   [attention (strong, emphasis)][attention]
 //! *   [autolink][]
 //! *   [blank line][blank_line]
 //! *   block quote
@@ -61,6 +61,7 @@
 //! example `ascii_punctuation` refers to
 //! [`char::is_ascii_punctuation`][char::is_ascii_punctuation].
 
+pub mod attention;
 pub mod autolink;
 pub mod blank_line;
 pub mod character_escape;
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index e505997..32182d6 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -41,7 +41,7 @@
 //! > ([label start (image)][label_start_image] or
 //! > [label start (link)][label_start_link]) and a closing
 //! > ([label end][label_end]), so as to allow further phrasing such as
-//! > [code (text)][code_text] or attention.
+//! > [code (text)][code_text] or [attention][].
 //!
 //! ## References
 //!
@@ -49,6 +49,7 @@
 //!
 //! [definition]: crate::construct::definition
 //! [string]: crate::content::string
+//! [attention]: crate::construct::attention
 //! [character_escape]: crate::construct::character_escape
 //! [character_reference]: crate::construct::character_reference
 //! [label_start_image]: crate::construct::label_start_image
@@ -56,8 +57,6 @@
 //! [label_end]: crate::construct::label_end
 //! [code_text]: crate::construct::code_text
 //! [link_reference_size_max]: crate::constant::LINK_REFERENCE_SIZE_MAX
-//!
-//! <!-- To do: link attention. -->
 
 use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions};
 use crate::constant::LINK_REFERENCE_SIZE_MAX;
diff --git a/src/content/text.rs b/src/content/text.rs
index c3f4e1b..ecb6ae1 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -1,12 +1,13 @@
 //! The text content type.
 //!
-//! **Text** contains phrasing content such as attention (emphasis, strong),
-//! media (links, images), and actual text.
+//! **Text** contains phrasing content such as
+//! [attention][crate::construct::attention] (emphasis, strong),
+//! [code (text)][crate::construct::code_text], and actual text.
 //!
 //! The constructs found in text are:
 //!
+//! *   [Attention][crate::construct::attention]
 //! *   [Autolink][crate::construct::autolink]
-//! *   Attention
 //! *   [HTML (text)][crate::construct::html_text]
 //! *   [Hard break (escape)][crate::construct::hard_break_escape]
 //! *   [Hard break (trailing)][crate::construct::hard_break_trailing]
@@ -18,9 +19,9 @@
 //! *   [Character reference][crate::construct::character_reference]
 
 use crate::construct::{
-    autolink::start as autolink, character_escape::start as character_escape,
-    character_reference::start as character_reference, code_text::start as code_text,
-    hard_break_escape::start as hard_break_escape,
+    attention::start as attention, autolink::start as autolink,
+    character_escape::start as character_escape, character_reference::start as character_reference,
+    code_text::start as code_text, hard_break_escape::start as hard_break_escape,
     hard_break_trailing::start as hard_break_trailing, html_text::start as html_text,
     label_end::start as label_end, label_start_image::start as label_start_image,
     label_start_link::start as label_start_link, partial_data::start as data,
@@ -28,16 +29,18 @@ use crate::construct::{
 };
 use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
 
-const MARKERS: [Code; 10] = [
+const MARKERS: [Code; 12] = [
     Code::VirtualSpace, // `whitespace`
     Code::Char('\t'),   // `whitespace`
     Code::Char(' '),    // `hard_break_trailing`, `whitespace`
     Code::Char('!'),    // `label_start_image`
     Code::Char('&'),    // `character_reference`
+    Code::Char('*'),    // `attention`
     Code::Char('<'),    // `autolink`, `html_text`
     Code::Char('['),    // `label_start_link`
     Code::Char('\\'),   // `character_escape`, `hard_break_escape`
     Code::Char(']'),    // `label_end`
+    Code::Char('_'),    // `attention`
     Code::Char('`'),    // `code_text`
 ];
 
@@ -55,6 +58,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         Code::None => (State::Ok, None),
         _ => tokenizer.attempt_n(
             vec![
+                Box::new(attention),
                 Box::new(autolink),
                 Box::new(character_escape),
                 Box::new(character_reference),
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index b70e706..282c99f 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1593,6 +1593,15 @@ pub enum TokenType {
     ///     ^ ^ ^
     /// ```
     ThematicBreakSequence,
+    Strong,
+    StrongSequence,
+    StrongText,
+    Emphasis,
+    EmphasisSequence,
+    EmphasisText,
+    // To do: this token is removed during resolve.
+    // Should it reuse an existing token, e.g., emphasis or data?
+    AttentionSequence,
 }
 
 /// Embedded content type.
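For orientation, a hedged sketch (hypothetical strings, not the crate’s event API) of the token nesting the resolver produces for `*a*`, showing where the new `Emphasis*` token types end up: the two `AttentionSequence` tokens become `EmphasisSequence`s wrapped in `Emphasis`, and the content between them is wrapped in `EmphasisText`.

```rust
// Illustrative nesting only; derived from the events injected in `resolve`.
fn main() {
    let expected = [
        "enter Emphasis",
        "enter EmphasisSequence", // the opening `*`
        "exit EmphasisSequence",
        "enter EmphasisText",
        "enter Data", // `a`
        "exit Data",
        "exit EmphasisText",
        "enter EmphasisSequence", // the closing `*`
        "exit EmphasisSequence",
        "exit Emphasis",
    ];
    for line in expected {
        println!("{}", line);
    }
}
```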
-- 