diff options
Diffstat (limited to '')
-rw-r--r-- | src/construct/attention.rs | 401 |
1 files changed, 401 insertions, 0 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs new file mode 100644 index 0000000..f022e6e --- /dev/null +++ b/src/construct/attention.rs @@ -0,0 +1,401 @@ +//! To do. + +use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::edit_map::EditMap; + +/// To do +#[derive(Debug, PartialEq)] +enum GroupKind { + Whitespace, + Punctuation, + Other, +} + +/// To do +#[derive(Debug, PartialEq)] +enum MarkerKind { + Asterisk, + Underscore, +} + +impl MarkerKind { + fn from_char(char: char) -> MarkerKind { + match char { + '*' => MarkerKind::Asterisk, + '_' => MarkerKind::Underscore, + _ => unreachable!("invalid char"), + } + } + fn from_code(code: Code) -> MarkerKind { + match code { + Code::Char(char) => MarkerKind::from_char(char), + _ => unreachable!("invalid code"), + } + } +} + +/// To do +#[derive(Debug)] +struct Run { + marker: MarkerKind, + event_index: usize, + start_point: Point, + start_index: usize, + end_point: Point, + end_index: usize, + size: usize, + open: bool, + close: bool, +} + +/// Before a paragraph. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char == '*' || char == '_' => { + tokenizer.enter(TokenType::AttentionSequence); + inside(tokenizer, code, char) + } + _ => (State::Nok, None), + } +} + +/// In a paragraph. +/// +/// ```markdown +/// al|pha +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult { + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + (State::Fn(Box::new(move |t, c| inside(t, c, marker))), None) + } + _ => { + tokenizer.exit(TokenType::AttentionSequence); + tokenizer.register_resolver("attention".to_string(), Box::new(resolve)); + (State::Ok, Some(vec![code])) + } + } +} + +/// To do. +#[allow(clippy::too_many_lines)] +pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { + let mut index = 0; + println!("before: {:?}", tokenizer.events.len()); + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + println!( + "ev: {:?} {:?} {:?} {:?} {:?} {:?}", + index, + event.event_type, + event.token_type, + event.content_type, + event.previous, + event.next + ); + index += 1; + } + + let codes = &tokenizer.parse_state.codes; + let mut edit_map = EditMap::new(); + let mut start = 0; + let mut runs: Vec<Run> = vec![]; + + // Find runs of sequences and information about them. + while start < tokenizer.events.len() { + let enter = &tokenizer.events[start]; + + if enter.event_type == EventType::Enter && enter.token_type == TokenType::AttentionSequence + { + let end = start + 1; + let exit = &tokenizer.events[end]; + let marker = MarkerKind::from_code(codes[enter.index]); + let before = classify_character(if enter.index > 0 { + codes[enter.index - 1] + } else { + Code::None + }); + let after = classify_character(if exit.index < codes.len() { + codes[exit.index] + } else { + Code::None + }); + let open = after == GroupKind::Other + || (after == GroupKind::Punctuation && before != GroupKind::Other); + // To do: GFM strikethrough? + // || attentionMarkers.includes(code) + let close = before == GroupKind::Other + || (before == GroupKind::Punctuation && after != GroupKind::Other); + // To do: GFM strikethrough? + // || attentionMarkers.includes(previous) + + runs.push(Run { + event_index: start, + start_point: enter.point.clone(), + start_index: enter.index, + end_point: exit.point.clone(), + end_index: exit.index, + size: exit.index - enter.index, + open: if marker == MarkerKind::Asterisk { + open + } else { + open && (before != GroupKind::Other || !close) + }, + close: if marker == MarkerKind::Asterisk { + close + } else { + close && (after != GroupKind::Other || !open) + }, + marker, + }); + + start += 1; + } + + start += 1; + } + + // Walk through runs and match them. + let mut close = 0; + + while close < runs.len() { + let run_close = &runs[close]; + + // Find a run that can close. + if run_close.close { + let mut open = close; + + // Now walk back to find an opener. + while open > 0 { + open -= 1; + + let run_open = &runs[open]; + + // Find a token that can open the closer. + if run_open.open && run_close.marker == run_open.marker { + // If the opening can close or the closing can open, + // and the close size *is not* a multiple of three, + // but the sum of the opening and closing size *is* + // multiple of three, then **don’t** match. + if (run_open.close || run_close.open) + && run_close.size % 3 != 0 + && (run_open.size + run_close.size) % 3 == 0 + { + continue; + } + + // Number of markers to use from the sequence. + let take = if run_open.size > 1 && run_close.size > 1 { + 2 + } else { + 1 + }; + + let run_close = &mut runs[close]; + let close_event_index = run_close.event_index; + let seq_close_enter = (run_close.start_point.clone(), run_close.start_index); + run_close.size -= take; + run_close.start_point.column += take; + run_close.start_point.offset += take; + let seq_close_exit = (run_close.start_point.clone(), run_close.start_index); + + // Remove closing run if fully used. + if run_close.size == 0 { + runs.remove(close); + edit_map.add(close_event_index, 2, vec![]); + } + + let run_open = &mut runs[open]; + let open_event_index = run_open.event_index; + let seq_open_exit = (run_open.end_point.clone(), run_open.end_index); + run_open.size -= take; + run_open.end_point.column -= take; + run_open.end_point.offset -= take; + let seq_open_enter = (run_open.end_point.clone(), run_open.end_index); + + // Remove opening run if fully used. + if run_open.size == 0 { + runs.remove(open); + edit_map.add(open_event_index, 2, vec![]); + } + + // Opening. + edit_map.add( + open_event_index, + 0, + vec![ + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::Emphasis + } else { + TokenType::Strong + }, + point: seq_open_enter.0.clone(), + index: seq_open_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_open_enter.0.clone(), + index: seq_open_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_open_exit.0.clone(), + index: seq_open_exit.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::EmphasisText + } else { + TokenType::StrongText + }, + point: seq_open_exit.0.clone(), + index: seq_open_exit.1, + previous: None, + next: None, + content_type: None, + }, + ], + ); + // Closing. + edit_map.add( + close_event_index, + 0, + vec![ + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::EmphasisText + } else { + TokenType::StrongText + }, + point: seq_close_enter.0.clone(), + index: seq_close_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Enter, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_close_enter.0.clone(), + index: seq_close_enter.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::EmphasisSequence + } else { + TokenType::StrongSequence + }, + point: seq_close_exit.0.clone(), + index: seq_close_exit.1, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Exit, + token_type: if take == 1 { + TokenType::Emphasis + } else { + TokenType::Strong + }, + point: seq_close_exit.0.clone(), + index: seq_close_exit.1, + previous: None, + next: None, + content_type: None, + }, + ], + ); + + break; + } + } + } + + close += 1; + } + + // Mark remaining sequences as data. + let mut index = 0; + while index < runs.len() { + let run = &runs[index]; + // To do: resize! + tokenizer.events[run.event_index].token_type = TokenType::Data; + tokenizer.events[run.event_index + 1].token_type = TokenType::Data; + + index += 1; + } + + let events = edit_map.consume(&mut tokenizer.events); + let mut index = 0; + println!("after: {:?}", events.len()); + while index < events.len() { + let event = &events[index]; + println!( + "ev: {:?} {:?} {:?} {:?} {:?} {:?}", + index, + event.event_type, + event.token_type, + event.content_type, + event.previous, + event.next + ); + index += 1; + } + + events +} + +fn classify_character(code: Code) -> GroupKind { + match code { + // Markdown whitespace. + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\r' | '\n' | ' ') => GroupKind::Whitespace, + // Unicode whitespace. + Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace, + // Unicode punctuation. + // To do: `is_punctuation` is not in rust? Why not? + // Perhaps we need to generate stuff just like: + // <https://github.com/micromark/micromark/blob/main/packages/micromark-util-character/dev/lib/unicode-punctuation-regex.js>. + Code::Char(char) if char.is_ascii_punctuation() => GroupKind::Punctuation, + Code::Char(_) => GroupKind::Other, + } +} |