about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Titus Wormer <tituswormer@gmail.com> 2022-08-11 11:01:49 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-08-11 11:01:49 +0200
commit 053a2603e4bd5ec9caf40617b52136e5ef3fcf0a (patch)
tree 14719bc0759a3a9039e88368d3c10ace5075e906
parent 30e5f806277d14d5dcab708ccd0ce07a4894c1f9 (diff)
download: markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.tar.gz
markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.tar.bz2
markdown-rs-053a2603e4bd5ec9caf40617b52136e5ef3fcf0a.zip
Add improved container exit injection
-rw-r--r-- src/compiler.rs 69
-rw-r--r-- src/content/document.rs 173
-rw-r--r-- src/subtokenize.rs 13
-rw-r--r-- src/tokenizer.rs 23
-rw-r--r-- tests/misc_tabs.rs 8
5 files changed, 137 insertions, 149 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 57ab40a..4a9ec36 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -479,48 +479,63 @@ fn on_enter_list(context: &mut CompileContext) {
} else {
balance -= 1;
- // Blank line directly in list or directly in list item,
- // but not a blank line after an empty list item.
if balance < 3 && event.token_type == Token::BlankLineEnding {
- let mut at_marker = false;
+ // Blank line directly after a prefix:
+ //
+ // ```markdown
+ // > | -␊
+ // ^
+ // | a
+ // ```
+ let mut at_prefix = false;
+ // Blank line directly after item, which is just a prefix.
+ //
+ // ```markdown
+ // > | -␊
+ // ^
+ // | - a
+ // ```
+ let mut at_empty_list_item = false;
+ // Blank line at block quote prefix:
+ //
+ // ```markdown
+ // > | * >␊
+ // ^
+ // | * a
+ // ```
+ let mut at_empty_block_quote = false;
- if balance == 2 {
+ if balance == 1 {
let mut before = index - 2;
- if events[before].token_type == Token::SpaceOrTab {
- before -= 2;
- }
-
- if events[before].token_type == Token::ListItemPrefix {
- at_marker = true;
- }
- }
+ if events[before].token_type == Token::ListItem {
+ before -= 1;
- let mut at_empty_list_item = false;
- let mut at_empty_block_quote = false;
+ if events[before].token_type == Token::SpaceOrTab {
+ before -= 2;
+ }
- if balance == 1 {
+ if events[before].token_type == Token::BlockQuote
+ && events[before - 1].token_type == Token::BlockQuotePrefix
+ {
+ at_empty_block_quote = true;
+ } else if events[before].token_type == Token::ListItemPrefix {
+ at_empty_list_item = true;
+ }
+ }
+ } else {
let mut before = index - 2;
if events[before].token_type == Token::SpaceOrTab {
before -= 2;
}
- if events[before].token_type == Token::ListItem
- && events[before - 1].token_type == Token::ListItemPrefix
- {
- at_empty_list_item = true;
- }
-
- if events[before].token_type == Token::ListItem
- && events[before - 1].token_type == Token::BlockQuote
- && events[before - 2].token_type == Token::BlockQuotePrefix
- {
- at_empty_block_quote = true;
+ if events[before].token_type == Token::ListItemPrefix {
+ at_prefix = true;
}
}
- if !at_marker && !at_empty_list_item && !at_empty_block_quote {
+ if !at_prefix && !at_empty_list_item && !at_empty_block_quote {
loose = true;
break;
}
diff --git a/src/content/document.rs b/src/content/document.rs
index 73c9803..98f8a7d 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -56,7 +56,11 @@ enum Phase {
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
let mut tokenizer = Tokenizer::new(point, parse_state);
- let state = tokenizer.push(0, parse_state.bytes.len(), StateName::DocumentStart);
+ let state = tokenizer.push(
+ (0, 0),
+ (parse_state.bytes.len(), 0),
+ StateName::DocumentStart,
+ );
tokenizer.flush(state, true);
let mut index = 0;
@@ -347,8 +351,13 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
_ => unreachable!("expected state name"),
};
- // To do: handle VS?
- let state = child.push(child.point.index, tokenizer.point.index, name);
+ tokenizer.tokenize_state.document_exits.push(None);
+
+ let state = child.push(
+ (child.point.index, child.point.vs),
+ (tokenizer.point.index, tokenizer.point.vs),
+ name,
+ );
let paragraph = matches!(state, State::Next(StateName::ParagraphInside))
|| (!child.events.is_empty()
@@ -408,69 +417,8 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
}
if !stack_close.is_empty() {
- let mut inject_index = tokenizer.events.len();
-
- // Move past the current data to find the last container start if we’re
- // closing due to a potential lazy flow that was not lazy.
- if *phase == Phase::After {
- inject_index -= 2;
- }
-
- // Move past the container starts to find the last data if we’re
- // closing due to a different container or lazy flow like above.
- if *phase != Phase::Eof {
- while inject_index > 0 {
- let event = &tokenizer.events[inject_index - 1];
-
- if event.token_type == Token::Data {
- break;
- }
-
- inject_index -= 1;
- }
- }
-
- // Move past data starts that are just whitespace only without
- // container starts.
- while inject_index > 0 {
- let event = &tokenizer.events[inject_index - 1];
-
- if event.token_type == Token::Data {
- if event.event_type == EventType::Exit {
- let slice = Slice::from_position(
- tokenizer.parse_state.bytes,
- &Position::from_exit_event(&tokenizer.events, inject_index - 1),
- );
- let bytes = slice.bytes;
- let mut whitespace = true;
- let mut index = 0;
- while index < bytes.len() {
- match bytes[index] {
- b'\t' | b'\n' | b'\r' | b' ' => index += 1,
- _ => {
- whitespace = false;
- break;
- }
- }
- }
-
- if !whitespace {
- break;
- }
- }
- } else {
- break;
- }
-
- inject_index -= 1;
- }
-
- let ref_point = if inject_index == tokenizer.events.len() {
- tokenizer.point.clone()
- } else {
- tokenizer.events[inject_index].point.clone()
- };
-
+ let index = tokenizer.tokenize_state.document_exits.len()
+ - (if *phase == Phase::After { 2 } else { 1 });
let mut exits = Vec::with_capacity(stack_close.len());
while !stack_close.is_empty() {
@@ -483,7 +431,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
exits.push(Event {
event_type: EventType::Exit,
token_type: token_type.clone(),
- point: ref_point.clone(),
+ point: tokenizer.point.clone(),
link: None,
});
@@ -503,18 +451,49 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
debug_assert!(found, "expected to find container token to exit");
}
- tokenizer.map.add(inject_index, 0, exits);
+ if let Some(ref mut list) = tokenizer.tokenize_state.document_exits[index] {
+ list.append(&mut exits);
+ } else {
+ tokenizer.tokenize_state.document_exits[index] = Some(exits);
+ }
}
child.interrupt = false;
}
-// Inject the container events.
+// Inject everything together.
fn resolve(tokenizer: &mut Tokenizer) {
let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
- // To do: see if we can do this less.
- tokenizer.map.consume(&mut tokenizer.events);
+ // First, add the container exits into `child`.
+ let mut child_index = 0;
+ let mut line = 0;
+
+ while child_index < child.events.len() {
+ let event = &child.events[child_index];
+
+ if event.event_type == EventType::Enter
+ && (event.token_type == Token::LineEnding || event.token_type == Token::BlankLineEnding)
+ {
+ if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
+ let mut exit_index = 0;
+ while exit_index < exits.len() {
+ exits[exit_index].point = event.point.clone();
+ exit_index += 1;
+ }
+
+ child.map.add(child_index, 0, exits);
+ }
+
+ line += 1;
+ }
+
+ child_index += 1;
+ }
+
+ child.map.consume(&mut child.events);
+
+ // Now, add all child events into our parent document tokenizer.
divide_events(
&mut tokenizer.map,
&tokenizer.events,
@@ -522,43 +501,29 @@ fn resolve(tokenizer: &mut Tokenizer) {
&mut child.events,
);
- tokenizer
- .resolvers
- .append(&mut child.resolvers.split_off(0));
- tokenizer
- .resolver_ids
- .append(&mut child.resolver_ids.split_off(0));
-
- // To do: see if we can do this less.
+ // Replace the flow data with actual events.
tokenizer.map.consume(&mut tokenizer.events);
- let mut index = 0;
- let mut last_eol_enter: Option<usize> = None;
- while index < tokenizer.events.len() {
- let event = &tokenizer.events[index];
-
- if event.event_type == EventType::Exit {
- if event.token_type == Token::BlockQuote || event.token_type == Token::ListItem {
- if let Some(inject) = last_eol_enter {
- let point = tokenizer.events[inject].point.clone();
- let mut clone = event.clone();
- clone.point = point;
- // Inject a fixed exit.
- tokenizer.map.add(inject, 0, vec![clone]);
- // Remove this exit.
- tokenizer.map.add(index, 1, vec![]);
- }
- } else if event.token_type == Token::LineEnding
- || event.token_type == Token::BlankLineEnding
- {
- last_eol_enter = Some(index - 1);
- } else {
- last_eol_enter = None;
+ // Now, add some final container exits due to the EOF.
+ // We can’t inject them into the child earlier, as they are “outside” its
+ // linked data.
+ if line < tokenizer.tokenize_state.document_exits.len() {
+ if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
+ let mut exit_index = 0;
+ while exit_index < exits.len() {
+ exits[exit_index].point = tokenizer.point.clone();
+ exit_index += 1;
}
- }
- index += 1;
+ tokenizer.events.append(&mut exits);
+ }
}
- tokenizer.map.consume(&mut tokenizer.events);
+ // Add the resolvers from child.
+ tokenizer
+ .resolvers
+ .append(&mut child.resolvers.split_off(0));
+ tokenizer
+ .resolver_ids
+ .append(&mut child.resolver_ids.split_off(0));
}
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index e0465a0..3d923d3 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -94,9 +94,11 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool {
tokenizer.define_skip(enter.point.clone());
}
+ let end = &events[index + 1].point;
+
state = tokenizer.push(
- enter.point.index,
- events[index + 1].point.index,
+ (enter.point.index, enter.point.vs),
+ (end.index, end.vs),
match state {
State::Next(func) => func,
_ => unreachable!("cannot be ok/nok"),
@@ -140,11 +142,12 @@ pub fn divide_events(
let mut old_prev: Option<usize> = None;
while subindex < child_events.len() {
+ let current = &child_events[subindex].point;
+ let end = &events[link_index + 1].point;
+
// Find the first event that starts after the end we’re looking
// for.
- if child_events[subindex].event_type == EventType::Enter
- && child_events[subindex].point.index >= events[link_index + 1].point.index
- {
+ if current.index > end.index || (current.index == end.index && current.vs > end.vs) {
slices.push((link_index, slice_start));
slice_start = subindex;
link_index = events[link_index].link.as_ref().unwrap().next.unwrap();
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 7b8c9a5..3cdd2d3 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -17,7 +17,6 @@ use crate::content;
use crate::parser::ParseState;
use crate::token::{Token, VOID_TOKENS};
use crate::util::edit_map::EditMap;
-use std::str;
/// Embedded content type.
#[derive(Debug, Clone, PartialEq)]
@@ -473,6 +472,8 @@ pub struct TokenizeState<'a> {
/// To do.
pub document_container_stack: Vec<ContainerState>,
/// To do.
+ pub document_exits: Vec<Option<Vec<Event>>>,
+ /// To do.
pub document_continued: usize,
/// To do.
pub document_paragraph_before: bool,
@@ -607,6 +608,7 @@ impl<'a> Tokenizer<'a> {
tokenize_state: TokenizeState {
connect: false,
document_container_stack: vec![],
+ document_exits: vec![],
document_continued: 0,
document_paragraph_before: false,
document_data_index: None,
@@ -897,16 +899,18 @@ impl<'a> Tokenizer<'a> {
/// This is set up to support repeatedly calling `feed`, and thus streaming
/// markdown into the state machine, and normally pauses after feeding.
// Note: if needed: accept `vs`?
- pub fn push(&mut self, min: usize, max: usize, name: StateName) -> State {
+ pub fn push(&mut self, min: (usize, usize), max: (usize, usize), name: StateName) -> State {
debug_assert!(!self.resolved, "cannot feed after drain");
+
// debug_assert!(min >= self.point.index, "cannot move backwards");
- if min > self.point.index {
- self.move_to((min, 0));
+
+ if min.0 > self.point.index || (min.0 == self.point.index && min.1 > self.point.vs) {
+ self.move_to(min);
}
let mut state = State::Next(name);
- while self.point.index < max {
+ while self.point.index < max.0 || (self.point.index == max.0 && self.point.vs < max.1) {
match state {
State::Ok | State::Nok => {
if let Some(attempt) = self.attempts.pop() {
@@ -1080,14 +1084,7 @@ fn feed_action_impl(
None
};
- log::debug!(
- "feed: `{:?}` to {:?}",
- byte.map_or_else(
- || "eof".to_string(),
- |d| str::from_utf8(&[d]).unwrap().to_string()
- ),
- name
- );
+ log::debug!("feed: `{:?}` to {:?}", byte, name);
tokenizer.expect(byte);
call_impl(tokenizer, name)
}
diff --git a/tests/misc_tabs.rs b/tests/misc_tabs.rs
index c5e5c43..da54e59 100644
--- a/tests/misc_tabs.rs
+++ b/tests/misc_tabs.rs
@@ -275,4 +275,12 @@ fn tabs_virtual_spaces() {
"<pre><code> x\n</code></pre>\n",
"should strip 3 spaces from an initial tab in fenced code if the opening fence is indented as such"
);
+
+ assert_eq!(
+ micromark("-\ta\n\n\tb"),
+ "<ul>\n<li>\n<p>a</p>\n<p>\tb</p>\n</li>\n</ul>",
+ // To do: CM.js does not output the tab before `b`. See if that makes sense?
+ // "<ul>\n<li>\n<p>a</p>\n<p>b</p>\n</li>\n</ul>",
+ "should support a part of a tab as a container, and the rest of a tab as flow"
+ );
}