From 053a2603e4bd5ec9caf40617b52136e5ef3fcf0a Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Thu, 11 Aug 2022 11:01:49 +0200
Subject: Add improved container exit injection

---
 src/compiler.rs         |  69 +++++++++++--------
 src/content/document.rs | 173 +++++++++++++++++++-----------------------------
 src/subtokenize.rs      |  13 ++--
 src/tokenizer.rs        |  23 +++----
 tests/misc_tabs.rs      |   8 +++
 5 files changed, 137 insertions(+), 149 deletions(-)

diff --git a/src/compiler.rs b/src/compiler.rs
index 57ab40a..4a9ec36 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -479,48 +479,63 @@ fn on_enter_list(context: &mut CompileContext) {
         } else {
             balance -= 1;
 
-            // Blank line directly in list or directly in list item,
-            // but not a blank line after an empty list item.
             if balance < 3 && event.token_type == Token::BlankLineEnding {
-                let mut at_marker = false;
+                // Blank line directly after a prefix:
+                //
+                // ```markdown
+                // > | -␊
+                //      ^
+                //   | a
+                // ```
+                let mut at_prefix = false;
+                // Blank line directly after item, which is just a prefix.
+                //
+                // ```markdown
+                // > | -␊
+                //      ^
+                //   | - a
+                // ```
+                let mut at_empty_list_item = false;
+                // Blank line at block quote prefix:
+                //
+                // ```markdown
+                // > | * >␊
+                //        ^
+                //   | * a
+                // ```
+                let mut at_empty_block_quote = false;
 
-                if balance == 2 {
+                if balance == 1 {
                     let mut before = index - 2;
 
-                    if events[before].token_type == Token::SpaceOrTab {
-                        before -= 2;
-                    }
-
-                    if events[before].token_type == Token::ListItemPrefix {
-                        at_marker = true;
-                    }
-                }
+                    if events[before].token_type == Token::ListItem {
+                        before -= 1;
 
-                let mut at_empty_list_item = false;
-                let mut at_empty_block_quote = false;
+                        if events[before].token_type == Token::SpaceOrTab {
+                            before -= 2;
+                        }
 
-                if balance == 1 {
+                        if events[before].token_type == Token::BlockQuote
+                            && events[before - 1].token_type == Token::BlockQuotePrefix
+                        {
+                            at_empty_block_quote = true;
+                        } else if events[before].token_type == Token::ListItemPrefix {
+                            at_empty_list_item = true;
+                        }
+                    }
+                } else {
                     let mut before = index - 2;
 
                     if events[before].token_type == Token::SpaceOrTab {
                         before -= 2;
                     }
 
-                    if events[before].token_type == Token::ListItem
-                        && events[before - 1].token_type == Token::ListItemPrefix
-                    {
-                        at_empty_list_item = true;
-                    }
-
-                    if events[before].token_type == Token::ListItem
-                        && events[before - 1].token_type == Token::BlockQuote
-                        && events[before - 2].token_type == Token::BlockQuotePrefix
-                    {
-                        at_empty_block_quote = true;
+                    if events[before].token_type == Token::ListItemPrefix {
+                        at_prefix = true;
                     }
                 }
 
-                if !at_marker && !at_empty_list_item && !at_empty_block_quote {
+                if !at_prefix && !at_empty_list_item && !at_empty_block_quote {
                     loose = true;
                     break;
                 }
diff --git a/src/content/document.rs b/src/content/document.rs
index 73c9803..98f8a7d 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -56,7 +56,11 @@ enum Phase {
 pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
     let mut tokenizer = Tokenizer::new(point, parse_state);
 
-    let state = tokenizer.push(0, parse_state.bytes.len(), StateName::DocumentStart);
+    let state = tokenizer.push(
+        (0, 0),
+        (parse_state.bytes.len(), 0),
+        StateName::DocumentStart,
+    );
     tokenizer.flush(state, true);
 
     let mut index = 0;
@@ -347,8 +351,13 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
         _ => unreachable!("expected state name"),
     };
 
-    // To do: handle VS?
-    let state = child.push(child.point.index, tokenizer.point.index, name);
+    tokenizer.tokenize_state.document_exits.push(None);
+
+    let state = child.push(
+        (child.point.index, child.point.vs),
+        (tokenizer.point.index, tokenizer.point.vs),
+        name,
+    );
 
     let paragraph = matches!(state, State::Next(StateName::ParagraphInside))
         || (!child.events.is_empty()
@@ -408,69 +417,8 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
     }
 
     if !stack_close.is_empty() {
-        let mut inject_index = tokenizer.events.len();
-
-        // Move past the current data to find the last container start if we’re
-        // closing due to a potential lazy flow that was not lazy.
-        if *phase == Phase::After {
-            inject_index -= 2;
-        }
-
-        // Move past the container starts to find the last data if we’re
-        // closing due to a different container or lazy flow like above.
-        if *phase != Phase::Eof {
-            while inject_index > 0 {
-                let event = &tokenizer.events[inject_index - 1];
-
-                if event.token_type == Token::Data {
-                    break;
-                }
-
-                inject_index -= 1;
-            }
-        }
-
-        // Move past data starts that are just whitespace only without
-        // container starts.
-        while inject_index > 0 {
-            let event = &tokenizer.events[inject_index - 1];
-
-            if event.token_type == Token::Data {
-                if event.event_type == EventType::Exit {
-                    let slice = Slice::from_position(
-                        tokenizer.parse_state.bytes,
-                        &Position::from_exit_event(&tokenizer.events, inject_index - 1),
-                    );
-                    let bytes = slice.bytes;
-                    let mut whitespace = true;
-                    let mut index = 0;
-                    while index < bytes.len() {
-                        match bytes[index] {
-                            b'\t' | b'\n' | b'\r' | b' ' => index += 1,
-                            _ => {
-                                whitespace = false;
-                                break;
-                            }
-                        }
-                    }
-
-                    if !whitespace {
-                        break;
-                    }
-                }
-            } else {
-                break;
-            }
-
-            inject_index -= 1;
-        }
-
-        let ref_point = if inject_index == tokenizer.events.len() {
-            tokenizer.point.clone()
-        } else {
-            tokenizer.events[inject_index].point.clone()
-        };
-
+        let index = tokenizer.tokenize_state.document_exits.len()
+            - (if *phase == Phase::After { 2 } else { 1 });
         let mut exits = Vec::with_capacity(stack_close.len());
 
         while !stack_close.is_empty() {
@@ -483,7 +431,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
             exits.push(Event {
                 event_type: EventType::Exit,
                 token_type: token_type.clone(),
-                point: ref_point.clone(),
+                point: tokenizer.point.clone(),
                 link: None,
             });
 
@@ -503,18 +451,49 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
             debug_assert!(found, "expected to find container token to exit");
         }
 
-        tokenizer.map.add(inject_index, 0, exits);
+        if let Some(ref mut list) = tokenizer.tokenize_state.document_exits[index] {
+            list.append(&mut exits);
+        } else {
+            tokenizer.tokenize_state.document_exits[index] = Some(exits);
+        }
     }
 
     child.interrupt = false;
 }
 
-/// Inject the container events.
+/// Inject everything together.
 fn resolve(tokenizer: &mut Tokenizer) {
     let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
 
-    // To do: see if we can do this less.
-    tokenizer.map.consume(&mut tokenizer.events);
+    // First, add the container exits into `child`.
+    let mut child_index = 0;
+    let mut line = 0;
+
+    while child_index < child.events.len() {
+        let event = &child.events[child_index];
+
+        if event.event_type == EventType::Enter
+            && (event.token_type == Token::LineEnding || event.token_type == Token::BlankLineEnding)
+        {
+            if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
+                let mut exit_index = 0;
+                while exit_index < exits.len() {
+                    exits[exit_index].point = event.point.clone();
+                    exit_index += 1;
+                }
+
+                child.map.add(child_index, 0, exits);
+            }
+
+            line += 1;
+        }
+
+        child_index += 1;
+    }
+
+    child.map.consume(&mut child.events);
+
+    // Now, add all child events into our parent document tokenizer.
     divide_events(
         &mut tokenizer.map,
         &tokenizer.events,
@@ -522,43 +501,29 @@ fn resolve(tokenizer: &mut Tokenizer) {
         &mut child.events,
     );
 
-    tokenizer
-        .resolvers
-        .append(&mut child.resolvers.split_off(0));
-    tokenizer
-        .resolver_ids
-        .append(&mut child.resolver_ids.split_off(0));
-
-    // To do: see if we can do this less.
+    // Replace the flow data with actual events.
     tokenizer.map.consume(&mut tokenizer.events);
 
-    let mut index = 0;
-    let mut last_eol_enter: Option<usize> = None;
-    while index < tokenizer.events.len() {
-        let event = &tokenizer.events[index];
-
-        if event.event_type == EventType::Exit {
-            if event.token_type == Token::BlockQuote || event.token_type == Token::ListItem {
-                if let Some(inject) = last_eol_enter {
-                    let point = tokenizer.events[inject].point.clone();
-                    let mut clone = event.clone();
-                    clone.point = point;
-                    // Inject a fixed exit.
-                    tokenizer.map.add(inject, 0, vec![clone]);
-                    // Remove this exit.
-                    tokenizer.map.add(index, 1, vec![]);
-                }
-            } else if event.token_type == Token::LineEnding
-                || event.token_type == Token::BlankLineEnding
-            {
-                last_eol_enter = Some(index - 1);
-            } else {
-                last_eol_enter = None;
+    // Now, add some final container exits due to the EOF.
+    // We can’t inject them into the child earlier, as they are “outside” its
+    // linked data.
+    if line < tokenizer.tokenize_state.document_exits.len() {
+        if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
+            let mut exit_index = 0;
+            while exit_index < exits.len() {
+                exits[exit_index].point = tokenizer.point.clone();
+                exit_index += 1;
             }
-        }
-
-        index += 1;
+
+            tokenizer.events.append(&mut exits);
+        }
     }
 
-    tokenizer.map.consume(&mut tokenizer.events);
+    // Add the resolvers from child.
+    tokenizer
+        .resolvers
+        .append(&mut child.resolvers.split_off(0));
+    tokenizer
+        .resolver_ids
+        .append(&mut child.resolver_ids.split_off(0));
 }
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index e0465a0..3d923d3 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -94,9 +94,11 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool {
                 tokenizer.define_skip(enter.point.clone());
             }
 
+            let end = &events[index + 1].point;
+
             state = tokenizer.push(
-                enter.point.index,
-                events[index + 1].point.index,
+                (enter.point.index, enter.point.vs),
+                (end.index, end.vs),
                 match state {
                     State::Next(func) => func,
                     _ => unreachable!("cannot be ok/nok"),
@@ -140,11 +142,12 @@ pub fn divide_events(
     let mut old_prev: Option<usize> = None;
 
     while subindex < child_events.len() {
+        let current = &child_events[subindex].point;
+        let end = &events[link_index + 1].point;
+
         // Find the first event that starts after the end we’re looking
         // for.
-        if child_events[subindex].event_type == EventType::Enter
-            && child_events[subindex].point.index >= events[link_index + 1].point.index
-        {
+        if current.index > end.index || (current.index == end.index && current.vs > end.vs) {
             slices.push((link_index, slice_start));
             slice_start = subindex;
             link_index = events[link_index].link.as_ref().unwrap().next.unwrap();
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7b8c9a5..3cdd2d3 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -17,7 +17,6 @@ use crate::content;
 use crate::parser::ParseState;
 use crate::token::{Token, VOID_TOKENS};
 use crate::util::edit_map::EditMap;
-use std::str;
 
 /// Embedded content type.
 #[derive(Debug, Clone, PartialEq)]
@@ -473,6 +472,8 @@ pub struct TokenizeState<'a> {
     /// To do.
     pub document_container_stack: Vec<ContainerState>,
     /// To do.
+    pub document_exits: Vec<Option<Vec<Event>>>,
+    /// To do.
     pub document_continued: usize,
     /// To do.
     pub document_paragraph_before: bool,
@@ -607,6 +608,7 @@ impl<'a> Tokenizer<'a> {
             tokenize_state: TokenizeState {
                 connect: false,
                 document_container_stack: vec![],
+                document_exits: vec![],
                 document_continued: 0,
                 document_paragraph_before: false,
                 document_data_index: None,
@@ -897,16 +899,18 @@ impl<'a> Tokenizer<'a> {
     /// This is set up to support repeatedly calling `feed`, and thus streaming
     /// markdown into the state machine, and normally pauses after feeding.
     // Note: if needed: accept `vs`?
-    pub fn push(&mut self, min: usize, max: usize, name: StateName) -> State {
+    pub fn push(&mut self, min: (usize, usize), max: (usize, usize), name: StateName) -> State {
         debug_assert!(!self.resolved, "cannot feed after drain");
+        // debug_assert!(min >= self.point.index, "cannot move backwards");
 
-        if min > self.point.index {
-            self.move_to((min, 0));
+
+        if min.0 > self.point.index || (min.0 == self.point.index && min.1 > self.point.vs) {
+            self.move_to(min);
         }
 
         let mut state = State::Next(name);
 
-        while self.point.index < max {
+        while self.point.index < max.0 || (self.point.index == max.0 && self.point.vs < max.1) {
             match state {
                 State::Ok | State::Nok => {
                     if let Some(attempt) = self.attempts.pop() {
@@ -1080,14 +1084,7 @@ fn feed_action_impl(
         None
     };
 
-    log::debug!(
-        "feed: `{:?}` to {:?}",
-        byte.map_or_else(
-            || "eof".to_string(),
-            |d| str::from_utf8(&[d]).unwrap().to_string()
-        ),
-        name
-    );
+    log::debug!("feed: `{:?}` to {:?}", byte, name);
     tokenizer.expect(byte);
     call_impl(tokenizer, name)
 }
diff --git a/tests/misc_tabs.rs b/tests/misc_tabs.rs
index c5e5c43..da54e59 100644
--- a/tests/misc_tabs.rs
+++ b/tests/misc_tabs.rs
@@ -275,4 +275,12 @@ fn tabs_virtual_spaces() {
         "<pre><code> x\n</code></pre>\n",
         "should strip 3 spaces from an initial tab in fenced code if the opening fence is indented as such"
     );
+
+    assert_eq!(
+        micromark("-\ta\n\n\tb"),
+        "<ul>\n<li>\n<pre><code>a\n</code></pre>\n<pre><code>\tb\n</code></pre>\n</li>\n</ul>",
+        // To do: CM.js does not output the tab before `b`. See if that makes sense?
+        // "<ul>\n<li>\n<pre><code>a\n</code></pre>\n<pre><code>b\n</code></pre>\n</li>\n</ul>",
+        "should support a part of a tab as a container, and the rest of a tab as flow"
+    );
 }
-- 
cgit