diff options
Diffstat (limited to '')
-rw-r--r-- | src/compiler.rs | 24 | ||||
-rw-r--r-- | src/construct/attention.rs | 401 | ||||
-rw-r--r-- | src/construct/mod.rs | 3 | ||||
-rw-r--r-- | src/construct/partial_label.rs | 5 | ||||
-rw-r--r-- | src/content/text.rs | 18 | ||||
-rw-r--r-- | src/tokenizer.rs | 9 |
6 files changed, 449 insertions, 11 deletions
diff --git a/src/compiler.rs b/src/compiler.rs index 1f16648..061d3e3 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -421,6 +421,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { enter_map.insert(TokenType::CodeIndented, on_enter_code_indented); enter_map.insert(TokenType::CodeFenced, on_enter_code_fenced); enter_map.insert(TokenType::CodeText, on_enter_code_text); + enter_map.insert(TokenType::Emphasis, on_enter_emphasis); enter_map.insert(TokenType::HtmlFlow, on_enter_html_flow); enter_map.insert(TokenType::HtmlText, on_enter_html_text); enter_map.insert(TokenType::Image, on_enter_image); @@ -431,6 +432,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { on_enter_resource_destination_string, ); enter_map.insert(TokenType::Paragraph, on_enter_paragraph); + enter_map.insert(TokenType::Strong, on_enter_strong); enter_map.insert(TokenType::Definition, on_enter_definition); enter_map.insert( TokenType::DefinitionDestinationString, @@ -441,6 +443,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { enter_map.insert(TokenType::DefinitionTitleString, on_enter_buffer); let mut exit_map: Map = HashMap::new(); + exit_map.insert(TokenType::Emphasis, on_exit_emphasis); exit_map.insert(TokenType::Label, on_exit_label); exit_map.insert(TokenType::LabelText, on_exit_label_text); exit_map.insert(TokenType::ReferenceString, on_exit_reference_string); @@ -452,6 +455,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { TokenType::ResourceTitleString, on_exit_resource_title_string, ); + exit_map.insert(TokenType::Strong, on_exit_strong); exit_map.insert(TokenType::Image, on_exit_media); exit_map.insert(TokenType::Link, on_exit_media); exit_map.insert(TokenType::CodeTextData, on_exit_data); @@ -644,6 +648,11 @@ fn on_enter_definition_destination_string(context: &mut CompileContext) { context.ignore_encode = true; } +/// Handle 
[`Enter`][EventType::Enter]:[`Emphasis`][TokenType::Emphasis]. +fn on_enter_emphasis(context: &mut CompileContext) { + context.tag("<em>".to_string()); +} + /// Handle [`Enter`][EventType::Enter]:[`HtmlFlow`][TokenType::HtmlFlow]. fn on_enter_html_flow(context: &mut CompileContext) { context.line_ending_if_needed(); @@ -704,6 +713,11 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) { context.ignore_encode = true; } +/// Handle [`Enter`][EventType::Enter]:[`Strong`][TokenType::Strong]. +fn on_enter_strong(context: &mut CompileContext) { + context.tag("<strong>".to_string()); +} + /// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][TokenType::AutolinkEmail]. fn on_exit_autolink_email(context: &mut CompileContext) { let slice = serialize( @@ -933,6 +947,11 @@ fn on_exit_definition_title_string(context: &mut CompileContext) { definition.title = Some(buf); } +/// Handle [`Exit`][EventType::Exit]:[`Emphasis`][TokenType::Emphasis]. +fn on_exit_emphasis(context: &mut CompileContext) { + context.tag("</em>".to_string()); +} + /// Handle [`Exit`][EventType::Exit]:[`HeadingAtx`][TokenType::HeadingAtx]. fn on_exit_heading_atx(context: &mut CompileContext) { let rank = context @@ -1132,6 +1151,11 @@ fn on_exit_resource_title_string(context: &mut CompileContext) { media.title = Some(buf); } +/// Handle [`Exit`][EventType::Exit]:[`Strong`][TokenType::Strong]. +fn on_exit_strong(context: &mut CompileContext) { + context.tag("</strong>".to_string()); +} + /// Handle [`Exit`][EventType::Exit]:[`ThematicBreak`][TokenType::ThematicBreak]. fn on_exit_thematic_break(context: &mut CompileContext) { context.tag("<hr />".to_string()); diff --git a/src/construct/attention.rs b/src/construct/attention.rs new file mode 100644 index 0000000..f022e6e --- /dev/null +++ b/src/construct/attention.rs @@ -0,0 +1,401 @@ +//! To do. 
+ +use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::edit_map::EditMap; + +/// To do +#[derive(Debug, PartialEq)] +enum GroupKind { + Whitespace, + Punctuation, + Other, +} + +/// To do +#[derive(Debug, PartialEq)] +enum MarkerKind { + Asterisk, + Underscore, +} + +impl MarkerKind { + fn from_char(char: char) -> MarkerKind { + match char { + '*' => MarkerKind::Asterisk, + '_' => MarkerKind::Underscore, + _ => unreachable!("invalid char"), + } + } + fn from_code(code: Code) -> MarkerKind { + match code { + Code::Char(char) => MarkerKind::from_char(char), + _ => unreachable!("invalid code"), + } + } +} + +/// To do +#[derive(Debug)] +struct Run { + marker: MarkerKind, + event_index: usize, + start_point: Point, + start_index: usize, + end_point: Point, + end_index: usize, + size: usize, + open: bool, + close: bool, +} + +/// Before an attention sequence (a run of `*` or `_`). +/// +/// ```markdown +/// |** +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char == '*' || char == '_' => { + tokenizer.enter(TokenType::AttentionSequence); + inside(tokenizer, code, char) + } + _ => (State::Nok, None), + } +} + +/// In an attention sequence, consuming further occurrences of the same marker. +/// +/// ```markdown +/// *|* +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult { + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + (State::Fn(Box::new(move |t, c| inside(t, c, marker))), None) + } + _ => { + tokenizer.exit(TokenType::AttentionSequence); + tokenizer.register_resolver("attention".to_string(), Box::new(resolve)); + (State::Ok, Some(vec![code])) + } + } +} + +/// To do. 
+#[allow(clippy::too_many_lines)] +pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { + let mut index = 0; + println!("before: {:?}", tokenizer.events.len()); + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + println!( + "ev: {:?} {:?} {:?} {:?} {:?} {:?}", + index, + event.event_type, + event.token_type, + event.content_type, + event.previous, + event.next + ); + index += 1; + } + + let codes = &tokenizer.parse_state.codes; + let mut edit_map = EditMap::new(); + let mut start = 0; + let mut runs: Vec<Run> = vec![]; + + // Find runs of sequences and information about them. + while start < tokenizer.events.len() { + let enter = &tokenizer.events[start]; + + if enter.event_type == EventType::Enter && enter.token_type == TokenType::AttentionSequence + { + let end = start + 1; + let exit = &tokenizer.events[end]; + let marker = MarkerKind::from_code(codes[enter.index]); + let before = classify_character(if enter.index > 0 { + codes[enter.index - 1] + } else { + Code::None + }); + let after = classify_character(if exit.index < codes.len() { + codes[exit.index] + } else { + Code::None + }); + let open = after == GroupKind::Other + || (after == GroupKind::Punctuation && before != GroupKind::Other); + // To do: GFM strikethrough? + // || attentionMarkers.includes(code) + let close = before == GroupKind::Other + || (before == GroupKind::Punctuation && after != GroupKind::Other); + // To do: GFM strikethrough? 
+ // || attentionMarkers.includes(previous) + + runs.push(Run { + event_index: start, + start_point: enter.point.clone(), + start_index: enter.index, + end_point: exit.point.clone(), + end_index: exit.index, + size: exit.index - enter.index, + open: if marker == MarkerKind::Asterisk { + open + } else { + open && (before != GroupKind::Other || !close) + }, + close: if marker == MarkerKind::Asterisk { + close + } else { + close && (after != GroupKind::Other || !open) + }, + marker, + }); + + start += 1; + } + + start += 1; + } + + // Walk through runs and match them. + let mut close = 0; + + while close < runs.len() { + let run_close = &runs[close]; + + // Find a run that can close. + if run_close.close { + let mut open = close; + + // Now walk back to find an opener. + while open > 0 { + open -= 1; + + let run_open = &runs[open]; + + // Find a token that can open the closer. + if run_open.open && run_close.marker == run_open.marker { + // If the opening can close or the closing can open, + // and the close size *is not* a multiple of three, + // but the sum of the opening and closing size *is* + // multiple of three, then **don’t** match. + if (run_open.close || run_close.open) + && run_close.size % 3 != 0 + && (run_open.size + run_close.size) % 3 == 0 + { + continue; + } + + // Number of markers to use from the sequence. + let take = if run_open.size > 1 && run_close.size > 1 { + 2 + } else { + 1 + }; + + let run_close = &mut runs[close]; + let close_event_index = run_close.event_index; + let seq_close_enter = (run_close.start_point.clone(), run_close.start_index); + run_close.size -= take; + run_close.start_point.column += take; + run_close.start_point.offset += take; + let seq_close_exit = (run_close.start_point.clone(), run_close.start_index); + + // Remove closing run if fully used. 
+ if run_close.size == 0 { + runs.remove(close); + edit_map.add(close_event_index, 2, vec![]); + } + + let run_open = &mut runs[open]; + let open_event_index = run_open.event_index; + let seq_open_exit = (run_open.end_point.clone(), run_open.end_index); + run_open.size -= take; + run_open.end_point.column -= take; + run_open.end_point.offset -= take; + let seq_open_enter = (run_open.end_point.clone(), run_open.end_index); + + // Remove opening run if fully used. + if run_open.size == 0 { + runs.remove(open); + edit_map.add(open_event_index, 2, vec![]); + } + + // Opening. + edit_map.add( + open_event_index, + 0, + vec![ + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::Emphasis + } else { + TokenType::Strong + }, + point: seq_open_enter.0.clone(), + index: seq_open_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_open_enter.0.clone(), + index: seq_open_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_open_exit.0.clone(), + index: seq_open_exit.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::EmphasisText + } else { + TokenType::StrongText + }, + point: seq_open_exit.0.clone(), + index: seq_open_exit.1, + previous: None, + next: None, + content_type: None, + }, + ], + ); + // Closing. 
+ edit_map.add( + close_event_index, + 0, + vec![ + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::EmphasisText + } else { + TokenType::StrongText + }, + point: seq_close_enter.0.clone(), + index: seq_close_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_close_enter.0.clone(), + index: seq_close_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_close_exit.0.clone(), + index: seq_close_exit.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::Emphasis + } else { + TokenType::Strong + }, + point: seq_close_exit.0.clone(), + index: seq_close_exit.1, + previous: None, + next: None, + content_type: None, + }, + ], + ); + + break; + } + } + } + + close += 1; + } + + // Mark remaining sequences as data. + let mut index = 0; + while index < runs.len() { + let run = &runs[index]; + // To do: resize! + tokenizer.events[run.event_index].token_type = TokenType::Data; + tokenizer.events[run.event_index + 1].token_type = TokenType::Data; + + index += 1; + } + + let events = edit_map.consume(&mut tokenizer.events); + let mut index = 0; + println!("after: {:?}", events.len()); + while index < events.len() { + let event = &events[index]; + println!( + "ev: {:?} {:?} {:?} {:?} {:?} {:?}", + index, + event.event_type, + event.token_type, + event.content_type, + event.previous, + event.next + ); + index += 1; + } + + events +} + +fn classify_character(code: Code) -> GroupKind { + match code { + // Markdown whitespace. 
+ Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\r' | '\n' | ' ') => GroupKind::Whitespace, + // Unicode whitespace. + Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace, + // Unicode punctuation. + // To do: `is_punctuation` is not in rust? Why not? + // Perhaps we need to generate stuff just like: + // <https://github.com/micromark/micromark/blob/main/packages/micromark-util-character/dev/lib/unicode-punctuation-regex.js>. + Code::Char(char) if char.is_ascii_punctuation() => GroupKind::Punctuation, + Code::Char(_) => GroupKind::Other, + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 9e3dfb0..66b2a3c 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -14,7 +14,7 @@ //! //! The following constructs are found in markdown: //! -//! * attention (strong, emphasis) +//! * [attention (strong, emphasis)][attention] //! * [autolink][] //! * [blank line][blank_line] //! * block quote @@ -61,6 +61,7 @@ //! example `ascii_punctuation` refers to //! [`char::is_ascii_punctuation`][char::is_ascii_punctuation]. +pub mod attention; pub mod autolink; pub mod blank_line; pub mod character_escape; diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index e505997..32182d6 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -41,7 +41,7 @@ //! > ([label start (image)][label_start_image] or //! > [label start (link)][label_start_link]) and a closing //! > ([label end][label_end]), so as to allow further phrasing such as -//! > [code (text)][code_text] or attention. +//! > [code (text)][code_text] or [attention][]. //! //! ## References //! @@ -49,6 +49,7 @@ //! //! [definition]: crate::construct::definition //! [string]: crate::content::string +//! [attention]: crate::construct::attention //! [character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference //! 
[label_start_image]: crate::construct::label_start_image @@ -56,8 +57,6 @@ //! [label_end]: crate::construct::label_end //! [code_text]: crate::construct::code_text //! [link_reference_size_max]: crate::constant::LINK_REFERENCE_SIZE_MAX -//! -//! <!-- To do: link attention. --> use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; use crate::constant::LINK_REFERENCE_SIZE_MAX; diff --git a/src/content/text.rs b/src/content/text.rs index c3f4e1b..ecb6ae1 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -1,12 +1,13 @@ //! The text content type. //! -//! **Text** contains phrasing content such as attention (emphasis, strong), -//! media (links, images), and actual text. +//! **Text** contains phrasing content such as +//! [attention][crate::construct::attention] (emphasis, strong), +//! [code (text)][crate::construct::code_text], and actual text. //! //! The constructs found in text are: //! +//! * [Attention][crate::construct::attention] //! * [Autolink][crate::construct::autolink] -//! * Attention //! * [HTML (text)][crate::construct::html_text] //! * [Hard break (escape)][crate::construct::hard_break_escape] //! * [Hard break (trailing)][crate::construct::hard_break_trailing] @@ -18,9 +19,9 @@ //! 
* [Character reference][crate::construct::character_reference] use crate::construct::{ - autolink::start as autolink, character_escape::start as character_escape, - character_reference::start as character_reference, code_text::start as code_text, - hard_break_escape::start as hard_break_escape, + attention::start as attention, autolink::start as autolink, + character_escape::start as character_escape, character_reference::start as character_reference, + code_text::start as code_text, hard_break_escape::start as hard_break_escape, hard_break_trailing::start as hard_break_trailing, html_text::start as html_text, label_end::start as label_end, label_start_image::start as label_start_image, label_start_link::start as label_start_link, partial_data::start as data, @@ -28,16 +29,18 @@ use crate::construct::{ }; use crate::tokenizer::{Code, State, StateFnResult, Tokenizer}; -const MARKERS: [Code; 10] = [ +const MARKERS: [Code; 12] = [ Code::VirtualSpace, // `whitespace` Code::Char('\t'), // `whitespace` Code::Char(' '), // `hard_break_trailing`, `whitespace` Code::Char('!'), // `label_start_image` Code::Char('&'), // `character_reference` + Code::Char('*'), // `attention` Code::Char('<'), // `autolink`, `html_text` Code::Char('['), // `label_start_link` Code::Char('\\'), // `character_escape`, `hard_break_escape` Code::Char(']'), // `label_end` + Code::Char('_'), // `attention` Code::Char('`'), // `code_text` ]; @@ -55,6 +58,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None => (State::Ok, None), _ => tokenizer.attempt_n( vec![ + Box::new(attention), Box::new(autolink), Box::new(character_escape), Box::new(character_reference), diff --git a/src/tokenizer.rs b/src/tokenizer.rs index b70e706..282c99f 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1593,6 +1593,15 @@ pub enum TokenType { /// ^ ^ ^ /// ``` ThematicBreakSequence, + Strong, + StrongSequence, + StrongText, + Emphasis, + EmphasisSequence, + EmphasisText, + // To do: 
this is removed. + // Should it reuse something e.g., emphasis? Data? + AttentionSequence, } /// Embedded content type. |