diff options
Diffstat (limited to '')
-rw-r--r-- | src/construct/attention.rs | 246 | ||||
-rw-r--r-- | src/construct/partial_title.rs | 4 |
2 files changed, 182 insertions, 68 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs index f4bb841..dff8633 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -1,25 +1,119 @@ -//! To do. +//! Attention is a construct that occurs in the [text][] content type. +//! +//! How attention parses is too complex to explain in BNF. +//! Essentially, one or more of `*` or `_` form attention sequences. +//! Depending on the code before and after a sequence, it can open or close +//! attention. +//! When everything is parsed, we find each sequence that can close, and a +//! corresponding sequence that can open which uses the same marker. +//! If both sequences have two or more markers, strong is formed. +//! Otherwise emphasis is formed. +//! +//! Attention sequences do not, on their own, relate to anything in HTML. +//! When matched with another sequence, and two markers can be “taken” from +//! them, they together relate to the `<strong>` element in HTML. +//! When one marker can be taken, they relate to the `<em>` element. +//! See [*§ 4.5.2 The `em` element*][html-em] and +//! [*§ 4.5.3 The `strong` element*][html-strong] in the HTML spec for more +//! info. +//! +//! It is recommended to use asterisks for attention when writing markdown. +//! +//! There are some small differences in whether sequences can open and/or close +//! based on whether they are formed with asterisks or underscores. +//! Because underscores also frequently occur in natural language inside words, +//! while asterisks typically never do, `CommonMark` prohibits underscore +//! sequences from opening or closing when *inside* a word. +//! +//! Because asterisks can be used to form the most markdown constructs, using +//! them has the added benefit of making it easier to gloss over markdown: you +//! can look for asterisks to find syntax while not worrying about other +//! characters. +//! +//! ## Tokens +//! +//! * [`Emphasis`][TokenType::Emphasis] +//! 
* [`EmphasisSequence`][TokenType::EmphasisSequence] +//! * [`EmphasisText`][TokenType::EmphasisText] +//! * [`Strong`][TokenType::Strong] +//! * [`StrongSequence`][TokenType::StrongSequence] +//! * [`StrongText`][TokenType::StrongText] +//! +//! > 👉 **Note**: while parsing, [`AttentionSequence`][TokenType::AttentionSequence] +//! > is used, which is later compiled away. +//! +//! ## References +//! +//! * [`attention.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/attention.js) +//! * [*§ 6.2 Emphasis and strong emphasis* in `CommonMark`](https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis) +//! +//! [text]: crate::content::text +//! [html-em]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-em-element +//! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer}; use crate::unicode::PUNCTUATION; use crate::util::edit_map::EditMap; -/// To do +/// Character code kinds. #[derive(Debug, PartialEq)] enum GroupKind { + /// Whitespace. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` Whitespace, + /// Punctuation. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^^ ^ ^ ^ + /// ``` Punctuation, + /// Everything else. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` Other, } -/// To do +/// Type of sequence. #[derive(Debug, PartialEq)] enum MarkerKind { + /// In a run with asterisks. + /// + /// ## Example + /// + /// ```markdown + /// *a* + /// ``` Asterisk, + /// In a run with underscores. + /// + /// ## Example + /// + /// ```markdown + /// _a_ + /// ``` Underscore, } impl MarkerKind { + /// Turn [char] into a kind. + /// + /// ## Panics + /// + /// Panics if `char` is not `*` or `_`. 
fn from_char(char: char) -> MarkerKind { match char { '*' => MarkerKind::Asterisk, @@ -27,6 +121,11 @@ impl MarkerKind { _ => unreachable!("invalid char"), } } + /// Turn [Code] into a kind. + /// + /// ## Panics + /// + /// Panics if `code` is not `Code::Char('*' | '_')`. fn from_code(code: Code) -> MarkerKind { match code { Code::Char(char) => MarkerKind::from_char(char), @@ -35,9 +134,9 @@ impl MarkerKind { } } -/// To do +/// Attention sequence that we can take markers from. #[derive(Debug)] -struct Run { +struct Sequence { marker: MarkerKind, event_index: usize, start_point: Point, @@ -49,10 +148,10 @@ struct Run { close: bool, } -/// Before a paragraph. +/// Before a sequence. /// /// ```markdown -/// |qwe +/// |** /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { @@ -64,10 +163,10 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// In a paragraph. +/// In a sequence. /// /// ```markdown -/// al|pha +/// *|* /// ``` fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult { match code { @@ -83,9 +182,9 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult } } -/// To do. +/// Resolve attention sequences. #[allow(clippy::too_many_lines)] -pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { +fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { let mut index = 0; println!("before: {:?}", tokenizer.events.len()); while index < tokenizer.events.len() { @@ -105,9 +204,9 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { let codes = &tokenizer.parse_state.codes; let mut edit_map = EditMap::new(); let mut start = 0; - let mut runs: Vec<Run> = vec![]; + let mut sequences: Vec<Sequence> = vec![]; - // Find runs of sequences and information about them. + // Find sequences of sequences and information about them. 
while start < tokenizer.events.len() { let enter = &tokenizer.events[start]; @@ -135,7 +234,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { // To do: GFM strikethrough? // || attentionMarkers.includes(previous) - runs.push(Run { + sequences.push(Sequence { event_index: start, start_point: enter.point.clone(), start_index: enter.index, @@ -161,35 +260,35 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { start += 1; } - // Walk through runs and match them. + // Walk through sequences and match them. let mut close = 0; - while close < runs.len() { - let run_close = &runs[close]; + while close < sequences.len() { + let sequence_close = &sequences[close]; let mut next_index = close + 1; - println!("walk! {:?} {:?}", close, runs.len()); + println!("walk! {:?} {:?}", close, sequences.len()); - // Find a run that can close. - if run_close.close { - println!("close! {:?} {:?}", close, run_close); + // Find a sequence that can close. + if sequence_close.close { + println!("close! {:?} {:?}", close, sequence_close); let mut open = close; // Now walk back to find an opener. while open > 0 { open -= 1; - let run_open = &runs[open]; + let sequence_open = &sequences[open]; - // We found a run that can open the closer we found. - if run_open.open && run_close.marker == run_open.marker { - println!("open! {:?} {:?}", open, run_open); + // We found a sequence that can open the closer we found. + if sequence_open.open && sequence_close.marker == sequence_open.marker { + println!("open! {:?} {:?}", open, sequence_open); // If the opening can close or the closing can open, // and the close size *is not* a multiple of three, // but the sum of the opening and closing size *is* // multiple of three, then **don’t** match. 
- if (run_open.close || run_close.open) - && run_close.size % 3 != 0 - && (run_open.size + run_close.size) % 3 == 0 + if (sequence_open.close || sequence_close.open) + && sequence_close.size % 3 != 0 + && (sequence_open.size + sequence_close.size) % 3 == 0 { continue; } @@ -197,34 +296,40 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { // We’ve found a match! // Number of markers to use from the sequence. - let take = if run_open.size > 1 && run_close.size > 1 { + let take = if sequence_open.size > 1 && sequence_close.size > 1 { 2 } else { 1 }; - let run_close = &mut runs[close]; - let close_event_index = run_close.event_index; - let seq_close_enter = (run_close.start_point.clone(), run_close.start_index); - run_close.size -= take; - run_close.start_point.column += take; - run_close.start_point.offset += take; - run_close.start_index += take; - let seq_close_exit = (run_close.start_point.clone(), run_close.start_index); + let sequence_close = &mut sequences[close]; + let close_event_index = sequence_close.event_index; + let seq_close_enter = ( + sequence_close.start_point.clone(), + sequence_close.start_index, + ); + sequence_close.size -= take; + sequence_close.start_point.column += take; + sequence_close.start_point.offset += take; + sequence_close.start_index += take; + let seq_close_exit = ( + sequence_close.start_point.clone(), + sequence_close.start_index, + ); - // Stay on this closing run for the next iteration: it + // Stay on this closing sequence for the next iteration: it // might close more things. next_index -= 1; - // Remove closing run if fully used. - if run_close.size == 0 { - runs.remove(close); + // Remove closing sequence if fully used. + if sequence_close.size == 0 { + sequences.remove(close); edit_map.add(close_event_index, 2, vec![]); println!("remove close"); } else { - // Shift remaining closing run forward. 
- // Do it here because a run can open and close different - // other runs, and the remainder can be on any side or + // Shift remaining closing sequence forward. + // Do it here because a sequence can open and close different + // other sequences, and the remainder can be on any side or // somewhere in the middle. let mut enter = &mut tokenizer.events[close_event_index]; enter.point = seq_close_exit.0.clone(); @@ -232,23 +337,23 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { println!("change close"); } - let run_open = &mut runs[open]; - let open_event_index = run_open.event_index; - let seq_open_exit = (run_open.end_point.clone(), run_open.end_index); - run_open.size -= take; - run_open.end_point.column -= take; - run_open.end_point.offset -= take; - run_open.end_index -= take; - let seq_open_enter = (run_open.end_point.clone(), run_open.end_index); - - // Remove opening run if fully used. - if run_open.size == 0 { - runs.remove(open); + let sequence_open = &mut sequences[open]; + let open_event_index = sequence_open.event_index; + let seq_open_exit = (sequence_open.end_point.clone(), sequence_open.end_index); + sequence_open.size -= take; + sequence_open.end_point.column -= take; + sequence_open.end_point.offset -= take; + sequence_open.end_index -= take; + let seq_open_enter = (sequence_open.end_point.clone(), sequence_open.end_index); + + // Remove opening sequence if fully used. + if sequence_open.size == 0 { + sequences.remove(open); edit_map.add(open_event_index, 2, vec![]); next_index -= 1; println!("remove open"); } else { - // Shift remaining opening run backwards. + // Shift remaining opening sequence backwards. // See note above for why that happens here. let mut exit = &mut tokenizer.events[open_event_index + 1]; exit.point = seq_open_enter.0.clone(); @@ -385,10 +490,10 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { // Mark remaining sequences as data. 
let mut index = 0; - while index < runs.len() { - let run = &runs[index]; - tokenizer.events[run.event_index].token_type = TokenType::Data; - tokenizer.events[run.event_index + 1].token_type = TokenType::Data; + while index < sequences.len() { + let sequence = &sequences[index]; + tokenizer.events[sequence.event_index].token_type = TokenType::Data; + tokenizer.events[sequence.event_index + 1].token_type = TokenType::Data; index += 1; } @@ -412,17 +517,26 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { events } +/// Classify whether a character code represents whitespace, punctuation, or +/// something else. +/// +/// Used for attention (emphasis, strong), whose sequences can open or close +/// based on the class of surrounding characters. +/// +/// > 👉 **Note** that eof (`Code::None`) is seen as whitespace. +/// +/// ## References +/// +/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) fn classify_character(code: Code) -> GroupKind { match code { - // Markdown whitespace. - Code::None - | Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\r' | '\n' | ' ') => GroupKind::Whitespace, + // Custom characters. + Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace => GroupKind::Whitespace, // Unicode whitespace. Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace, // Unicode punctuation. Code::Char(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation, + // Everything else. Code::Char(_) => GroupKind::Other, } } diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 3d0bfb6..caacb0d 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -79,7 +79,7 @@ enum Kind { impl Kind { /// Turn the kind into a [char]. /// - /// > 👉 **Note**: a closing paren is used. + /// > 👉 **Note**: a closing paren is used for `Kind::Paren`. 
fn as_char(&self) -> char { match self { Kind::Paren => ')', @@ -89,7 +89,7 @@ impl Kind { } /// Turn a [char] into a kind. /// - /// > 👉 **Note**: an opening paren must be used. + /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. /// /// ## Panics /// |