author    Titus Wormer <tituswormer@gmail.com>  2022-08-10 13:44:09 +0200
committer Titus Wormer <tituswormer@gmail.com>  2022-08-10 13:44:09 +0200
commit    30e5f806277d14d5dcab708ccd0ce07a4894c1f9 (patch)
tree      6a7ab8f589c9144b4efcd7ec66b12913e0d16fdc /src
parent    29e08c059addb2529637613f7122c573cfc46f6a (diff)
Refactor some code for document parsing
Diffstat (limited to 'src')
 -rw-r--r-- src/construct/list.rs   |  10
 -rw-r--r-- src/content/document.rs | 392
 -rw-r--r-- src/tokenizer.rs        |  16
 3 files changed, 188 insertions(+), 230 deletions(-)
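
At its core, the commit drops the tokenizer's separate `container: Option<ContainerState>` slot (see the `src/tokenizer.rs` hunks below) and instead leaves the current container inside `tokenize_state.document_container_stack`, reached through the `document_continued` index. A minimal sketch of the before/after shapes, borrowing field names from this diff but eliding everything else (the `current_container` helper is hypothetical, for illustration only):

```rust
#[derive(Debug)]
#[allow(dead_code)]
enum Container {
    BlockQuote,
    ListItem,
}

#[derive(Debug)]
struct ContainerState {
    kind: Container,
    blank_initial: bool,
    size: usize,
}

// Before: the container being worked on was moved out of the stack into
// its own slot, and had to be re-inserted when the attempt finished.
#[allow(dead_code)]
struct Before {
    container: Option<ContainerState>,
    document_container_stack: Vec<ContainerState>,
}

// After: the container stays in the stack; `document_continued` (how many
// containers already matched on this line) doubles as the index of the
// one currently being continued or created.
struct After {
    document_container_stack: Vec<ContainerState>,
    document_continued: usize,
}

impl After {
    fn current_container(&mut self) -> &mut ContainerState {
        &mut self.document_container_stack[self.document_continued]
    }
}

fn main() {
    let mut state = After {
        document_container_stack: vec![ContainerState {
            kind: Container::BlockQuote,
            blank_initial: false,
            size: 0,
        }],
        document_continued: 0,
    };
    state.current_container().size = 2;
    println!("{:?}", state.document_container_stack);
}
```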
diff --git a/src/construct/list.rs b/src/construct/list.rs
index 36c1dac..d726c73 100644
--- a/src/construct/list.rs
+++ b/src/construct/list.rs
@@ -275,7 +275,9 @@ pub fn after(tokenizer: &mut Tokenizer) -> State {
prefix += 1;
}
- let container = tokenizer.container.as_mut().unwrap();
+ let container = &mut tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued];
+
container.blank_initial = blank;
container.size = prefix;
@@ -309,7 +311,8 @@ pub fn cont_start(tokenizer: &mut Tokenizer) -> State {
/// | b
/// ```
pub fn cont_blank(tokenizer: &mut Tokenizer) -> State {
- let container = tokenizer.container.as_ref().unwrap();
+ let container = &mut tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued];
let size = container.size;
if container.blank_initial {
@@ -329,7 +332,8 @@ pub fn cont_blank(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
pub fn cont_filled(tokenizer: &mut Tokenizer) -> State {
- let container = tokenizer.container.as_mut().unwrap();
+ let container = &mut tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued];
let size = container.size;
container.blank_initial = false;
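
All three `list.rs` hunks apply the same substitution: borrow the container in place by index instead of unwrapping the old side-channel `Option` that had to mirror the stack. A reduced, runnable sketch of the pattern, assuming a stripped-down `ContainerState`; `cont_filled` here is a stand-in, not the real signature:

```rust
#[derive(Debug)]
struct ContainerState {
    blank_initial: bool,
    size: usize,
}

// Reach the container by indexing the shared stack with `continued`,
// as the hunks above now do via `tokenize_state`.
fn cont_filled(stack: &mut [ContainerState], continued: usize) -> usize {
    let container = &mut stack[continued];
    // Mirrors `container.blank_initial = false;` above: a filled line
    // means the list item no longer counts as initially blank.
    container.blank_initial = false;
    container.size
}

fn main() {
    let mut stack = vec![ContainerState { blank_initial: true, size: 4 }];
    let size = cont_filled(&mut stack, 0);
    println!("size = {size}, {stack:?}");
}
```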
diff --git a/src/content/document.rs b/src/content/document.rs
index d47a31a..73c9803 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -105,29 +105,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
tokenizer.point.clone(),
tokenizer.parse_state,
)));
- tokenizer.tokenize_state.document_child_state = Some(State::Next(StateName::FlowStart));
+
tokenizer.attempt(
StateName::BomStart,
- State::Next(StateName::DocumentLineStart),
- State::Next(StateName::DocumentLineStart),
+ State::Next(StateName::DocumentContainerExistingBefore),
+ State::Next(StateName::DocumentContainerExistingBefore),
)
}
-/// Start of a line.
-//
-/// ```markdown
-/// > | * a
-/// ^
-/// > | > b
-/// ^
-/// ```
-pub fn line_start(tokenizer: &mut Tokenizer) -> State {
- tokenizer.tokenize_state.document_continued = 0;
- // Containers would only be interrupting if we’ve continued.
- tokenizer.interrupt = false;
- State::Retry(StateName::DocumentContainerExistingBefore)
-}
-
/// Before existing containers.
//
/// ```markdown
@@ -140,20 +125,16 @@ pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
if tokenizer.tokenize_state.document_continued
< tokenizer.tokenize_state.document_container_stack.len()
{
- let container = tokenizer
- .tokenize_state
- .document_container_stack
- .remove(tokenizer.tokenize_state.document_continued);
- let name = match container.kind {
- Container::BlockQuote => StateName::BlockQuoteContStart,
- Container::ListItem => StateName::ListContStart,
- };
+ let container = &tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued];
- tokenizer.container = Some(container);
tokenizer.attempt(
- name,
+ match container.kind {
+ Container::BlockQuote => StateName::BlockQuoteContStart,
+ Container::ListItem => StateName::ListContStart,
+ },
State::Next(StateName::DocumentContainerExistingAfter),
- State::Next(StateName::DocumentContainerExistingMissing),
+ State::Next(StateName::DocumentContainerNewBefore),
)
}
// Otherwise, check new containers.
@@ -162,22 +143,6 @@ pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
}
}
-/// At a missing, existing containers.
-//
-/// ```markdown
-/// | * a
-/// > | > b
-/// ^
-/// ```
-pub fn container_existing_missing(tokenizer: &mut Tokenizer) -> State {
- let container = tokenizer.container.take().unwrap();
- tokenizer
- .tokenize_state
- .document_container_stack
- .insert(tokenizer.tokenize_state.document_continued, container);
- State::Retry(StateName::DocumentContainerNewBefore)
-}
-
/// After an existing container.
//
/// ```markdown
@@ -186,11 +151,6 @@ pub fn container_existing_missing(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
- let container = tokenizer.container.take().unwrap();
- tokenizer
- .tokenize_state
- .document_container_stack
- .insert(tokenizer.tokenize_state.document_continued, container);
tokenizer.tokenize_state.document_continued += 1;
State::Retry(StateName::DocumentContainerExistingBefore)
}
@@ -209,33 +169,34 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
if tokenizer.tokenize_state.document_continued
== tokenizer.tokenize_state.document_container_stack.len()
{
- tokenizer.interrupt = tokenizer
- .tokenize_state
- .child_tokenizer
- .as_ref()
- .unwrap()
- .interrupt;
+ let child = tokenizer.tokenize_state.child_tokenizer.as_ref().unwrap();
+
+ tokenizer.interrupt = child.interrupt;
// …and if we’re in a concrete construct, new containers can’t “pierce”
// into them.
- if tokenizer
- .tokenize_state
- .child_tokenizer
- .as_ref()
- .unwrap()
- .concrete
- {
+ if child.concrete {
return State::Retry(StateName::DocumentContainersAfter);
}
}
// Check for a new container.
// Block quote?
- tokenizer.container = Some(ContainerState {
- kind: Container::BlockQuote,
- blank_initial: false,
- size: 0,
- });
+ // Add a new container at the end of the stack.
+ let tail = tokenizer.tokenize_state.document_container_stack.len();
+ tokenizer
+ .tokenize_state
+ .document_container_stack
+ .push(ContainerState {
+ kind: Container::BlockQuote,
+ blank_initial: false,
+ size: 0,
+ });
+ // Swap the existing container with the new one.
+ tokenizer
+ .tokenize_state
+ .document_container_stack
+ .swap(tokenizer.tokenize_state.document_continued, tail);
tokenizer.attempt(
StateName::BlockQuoteStart,
@@ -247,19 +208,34 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
/// To do.
pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {
// List item?
- tokenizer.container = Some(ContainerState {
+ // We replace the empty block quote container for this new list one.
+ tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued] = ContainerState {
kind: Container::ListItem,
blank_initial: false,
size: 0,
- });
+ };
tokenizer.attempt(
StateName::ListStart,
State::Next(StateName::DocumentContainerNewAfter),
- State::Next(StateName::DocumentContainersAfter),
+ State::Next(StateName::DocumentContainerNewBeforeNotList),
)
}
+/// To do.
+pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State {
+ // It wasn’t a new block quote or a list.
+ // Swap the new container (in the middle) with the existing one (at the end).
+ // Drop what was in the middle.
+ tokenizer
+ .tokenize_state
+ .document_container_stack
+ .swap_remove(tokenizer.tokenize_state.document_continued);
+
+ State::Retry(StateName::DocumentContainersAfter)
+}
+
/// After a new container.
///
/// ```markdown
@@ -269,7 +245,13 @@ pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State
/// ^
/// ```
pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
- let container = tokenizer.container.take().unwrap();
+ // It was a new block quote or a list.
+ // Swap the new container (in the middle) with the existing one (at the end).
+ // Take the new container.
+ let container = tokenizer
+ .tokenize_state
+ .document_container_stack
+ .swap_remove(tokenizer.tokenize_state.document_continued);
// If we did not continue all existing containers, and there is a new one,
// close the flow and those containers.
@@ -279,13 +261,11 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
exit_containers(tokenizer, &Phase::Prefix);
}
- // Try another new container.
tokenizer
.tokenize_state
.document_container_stack
.push(container);
tokenizer.tokenize_state.document_continued += 1;
- tokenizer.tokenize_state.document_interrupt_before = false;
tokenizer.interrupt = false;
State::Retry(StateName::DocumentContainerNewBefore)
}
@@ -299,19 +279,18 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
- if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
- child.lazy = tokenizer.tokenize_state.document_continued
- != tokenizer.tokenize_state.document_container_stack.len();
- child.interrupt = tokenizer.tokenize_state.document_interrupt_before;
- child.define_skip(tokenizer.point.clone());
- }
+ let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
+
+ child.lazy = tokenizer.tokenize_state.document_continued
+ != tokenizer.tokenize_state.document_container_stack.len();
+ child.define_skip(tokenizer.point.clone());
match tokenizer.current {
// Note: EOL is part of data.
None => State::Retry(StateName::DocumentFlowEnd),
Some(_) => {
let current = tokenizer.events.len();
- let previous = tokenizer.tokenize_state.document_data_index.take();
+ let previous = tokenizer.tokenize_state.document_data_index;
if let Some(previous) = previous {
tokenizer.events[previous].link.as_mut().unwrap().next = Some(current);
}
@@ -357,57 +336,38 @@ pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
/// ^ ^
/// ```
pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
- let mut paragraph = false;
- let mut interrupt = false;
+ let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
+ let state = tokenizer
+ .tokenize_state
+ .document_child_state
+ .unwrap_or(State::Next(StateName::FlowStart));
- // We have new data.
- // Note that everything except for a `null` is data.
- if tokenizer.events.len() > 1
- && tokenizer.events[tokenizer.events.len() - 1].token_type == Token::Data
- {
- let position = Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
+ let name = match state {
+ State::Next(name) => name,
+ _ => unreachable!("expected state name"),
+ };
- let state = tokenizer
- .tokenize_state
- .document_child_state
- .take()
- .unwrap_or(State::Next(StateName::FlowStart));
+ // To do: handle VS?
+ let state = child.push(child.point.index, tokenizer.point.index, name);
- let name = match state {
- State::Next(name) => name,
- _ => unreachable!("expected state name"),
- };
+ let paragraph = matches!(state, State::Next(StateName::ParagraphInside))
+ || (!child.events.is_empty()
+ && child.events
+ [skip::opt_back(&child.events, child.events.len() - 1, &[Token::LineEnding])]
+ .token_type
+ == Token::Paragraph);
- if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
- // To do: handle VS?
- // if position.start.vs > 0 {
- // }
- let state = child.push(position.start.index, position.end.index, name);
-
- interrupt = child.interrupt;
- paragraph = matches!(state, State::Next(StateName::ParagraphInside))
- || (!child.events.is_empty()
- && child.events[skip::opt_back(
- &child.events,
- child.events.len() - 1,
- &[Token::LineEnding],
- )]
- .token_type
- == Token::Paragraph);
-
- tokenizer.tokenize_state.document_child_state = Some(state);
-
- if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before {
- tokenizer.tokenize_state.document_continued =
- tokenizer.tokenize_state.document_container_stack.len();
- }
+ tokenizer.tokenize_state.document_child_state = Some(state);
- if tokenizer.tokenize_state.document_continued
- != tokenizer.tokenize_state.document_container_stack.len()
- {
- exit_containers(tokenizer, &Phase::After);
- }
- }
+ if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before {
+ tokenizer.tokenize_state.document_continued =
+ tokenizer.tokenize_state.document_container_stack.len();
+ }
+
+ if tokenizer.tokenize_state.document_continued
+ != tokenizer.tokenize_state.document_container_stack.len()
+ {
+ exit_containers(tokenizer, &Phase::After);
}
match tokenizer.current {
@@ -418,9 +378,11 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
State::Ok
}
Some(_) => {
+ tokenizer.tokenize_state.document_continued = 0;
tokenizer.tokenize_state.document_paragraph_before = paragraph;
- tokenizer.tokenize_state.document_interrupt_before = interrupt;
- State::Retry(StateName::DocumentLineStart)
+ // Containers would only be interrupting if we’ve continued.
+ tokenizer.interrupt = false;
+ State::Retry(StateName::DocumentContainerExistingBefore)
}
}
}
@@ -432,124 +394,124 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
.document_container_stack
.split_off(tokenizer.tokenize_state.document_continued);
- // So, we’re at the end of a line, but we need to close the *previous* line.
- if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
- if *phase != Phase::After {
- let state = tokenizer
- .tokenize_state
- .document_child_state
- .take()
- .unwrap_or(State::Next(StateName::FlowStart));
+ let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
- child.flush(state, false);
- }
-
- if !stack_close.is_empty() {
- let mut inject_index = tokenizer.events.len();
-
- // Move past the current data to find the last container start if we’re
- // closing due to a potential lazy flow that was not lazy.
- if *phase == Phase::After {
- inject_index -= 2;
- }
+ // Flush if needed.
+ if *phase != Phase::After {
+ let state = tokenizer
+ .tokenize_state
+ .document_child_state
+ .take()
+ .unwrap_or(State::Next(StateName::FlowStart));
- // Move past the container starts to find the last data if we’re
- // closing due to a different container or lazy flow like above.
- if *phase == Phase::After || *phase == Phase::Prefix {
- while inject_index > 0 {
- let event = &tokenizer.events[inject_index - 1];
+ child.flush(state, false);
+ }
- if event.token_type == Token::Data {
- break;
- }
+ if !stack_close.is_empty() {
+ let mut inject_index = tokenizer.events.len();
- inject_index -= 1;
- }
- }
+ // Move past the current data to find the last container start if we’re
+ // closing due to a potential lazy flow that was not lazy.
+ if *phase == Phase::After {
+ inject_index -= 2;
+ }
- // Move past data starts that are just whitespace only without
- // container starts.
+ // Move past the container starts to find the last data if we’re
+ // closing due to a different container or lazy flow like above.
+ if *phase != Phase::Eof {
while inject_index > 0 {
let event = &tokenizer.events[inject_index - 1];
if event.token_type == Token::Data {
- if event.event_type == EventType::Exit {
- let slice = Slice::from_position(
- tokenizer.parse_state.bytes,
- &Position::from_exit_event(&tokenizer.events, inject_index - 1),
- );
- let bytes = slice.bytes;
- let mut whitespace = true;
- let mut index = 0;
- while index < bytes.len() {
- match bytes[index] {
- b'\t' | b'\n' | b'\r' | b' ' => index += 1,
- _ => {
- whitespace = false;
- break;
- }
- }
- }
-
- if !whitespace {
- break;
- }
- }
- } else {
break;
}
inject_index -= 1;
}
+ }
- let ref_point = if inject_index == tokenizer.events.len() {
- tokenizer.point.clone()
+ // Move past data starts that are just whitespace only without
+ // container starts.
+ while inject_index > 0 {
+ let event = &tokenizer.events[inject_index - 1];
+
+ if event.token_type == Token::Data {
+ if event.event_type == EventType::Exit {
+ let slice = Slice::from_position(
+ tokenizer.parse_state.bytes,
+ &Position::from_exit_event(&tokenizer.events, inject_index - 1),
+ );
+ let bytes = slice.bytes;
+ let mut whitespace = true;
+ let mut index = 0;
+ while index < bytes.len() {
+ match bytes[index] {
+ b'\t' | b'\n' | b'\r' | b' ' => index += 1,
+ _ => {
+ whitespace = false;
+ break;
+ }
+ }
+ }
+
+ if !whitespace {
+ break;
+ }
+ }
} else {
- tokenizer.events[inject_index].point.clone()
- };
+ break;
+ }
- let mut exits = Vec::with_capacity(stack_close.len());
+ inject_index -= 1;
+ }
- while !stack_close.is_empty() {
- let container = stack_close.pop().unwrap();
- let token_type = match container.kind {
- Container::BlockQuote => Token::BlockQuote,
- Container::ListItem => Token::ListItem,
- };
+ let ref_point = if inject_index == tokenizer.events.len() {
+ tokenizer.point.clone()
+ } else {
+ tokenizer.events[inject_index].point.clone()
+ };
- exits.push(Event {
- event_type: EventType::Exit,
- token_type: token_type.clone(),
- point: ref_point.clone(),
- link: None,
- });
+ let mut exits = Vec::with_capacity(stack_close.len());
- let mut stack_index = tokenizer.stack.len();
- let mut found = false;
+ while !stack_close.is_empty() {
+ let container = stack_close.pop().unwrap();
+ let token_type = match container.kind {
+ Container::BlockQuote => Token::BlockQuote,
+ Container::ListItem => Token::ListItem,
+ };
- while stack_index > 0 {
- stack_index -= 1;
+ exits.push(Event {
+ event_type: EventType::Exit,
+ token_type: token_type.clone(),
+ point: ref_point.clone(),
+ link: None,
+ });
- if tokenizer.stack[stack_index] == token_type {
- tokenizer.stack.remove(stack_index);
- found = true;
- break;
- }
- }
+ let mut stack_index = tokenizer.stack.len();
+ let mut found = false;
+
+ while stack_index > 0 {
+ stack_index -= 1;
- debug_assert!(found, "expected to find container token to exit");
+ if tokenizer.stack[stack_index] == token_type {
+ tokenizer.stack.remove(stack_index);
+ found = true;
+ break;
+ }
}
- tokenizer.map.add(inject_index, 0, exits);
+ debug_assert!(found, "expected to find container token to exit");
}
+
+ tokenizer.map.add(inject_index, 0, exits);
}
- tokenizer.tokenize_state.document_interrupt_before = false;
+ child.interrupt = false;
}
// Inject the container events.
fn resolve(tokenizer: &mut Tokenizer) {
- let mut child = tokenizer.tokenize_state.child_tokenizer.take().unwrap();
+ let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
// To do: see if we can do this less.
tokenizer.map.consume(&mut tokenizer.events);
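
The trickiest piece above is the new-container dance in `container_new_before`, `container_new_before_not_list`, and `container_new_after`: a candidate is pushed onto the tail of the stack and swapped into slot `document_continued`, and `swap_remove` later undoes that cheaply on both the miss and the hit path. A standalone sketch of the sequence, with string labels standing in for `ContainerState` and `truncate` standing in for `exit_containers`:

```rust
fn main() {
    // `continued` containers at the front already matched this line; the
    // candidate must land at index `continued`.
    let mut stack = vec!["matched", "old-unmatched"];
    let continued = 1;

    // container_new_before: push the candidate at the tail, then swap it
    // into place, moving the not-yet-closed old container to the end.
    let tail = stack.len();
    stack.push("candidate");
    stack.swap(continued, tail);
    assert_eq!(stack, ["matched", "candidate", "old-unmatched"]);

    // Miss (container_new_before_not_list): the candidate matched
    // nothing, so drop it; swap_remove puts the old container back.
    let mut miss = stack.clone();
    miss.swap_remove(continued);
    assert_eq!(miss, ["matched", "old-unmatched"]);

    // Hit (container_new_after): take the candidate out the same way,
    // close the remaining old containers, then push it back on top.
    let candidate = stack.swap_remove(continued);
    stack.truncate(continued); // stands in for `exit_containers`
    stack.push(candidate);
    assert_eq!(stack, ["matched", "candidate"]);
}
```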
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index dff97dd..7b8c9a5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -203,12 +203,11 @@ pub enum StateName {
DestinationRawEscape,
DocumentStart,
- DocumentLineStart,
DocumentContainerExistingBefore,
DocumentContainerExistingAfter,
- DocumentContainerExistingMissing,
DocumentContainerNewBefore,
DocumentContainerNewBeforeNotBlockQuote,
+ DocumentContainerNewBeforeNotList,
DocumentContainerNewAfter,
DocumentContainersAfter,
DocumentFlowInside,
@@ -476,8 +475,6 @@ pub struct TokenizeState<'a> {
/// To do.
pub document_continued: usize,
/// To do.
- pub document_interrupt_before: bool,
- /// To do.
pub document_paragraph_before: bool,
/// To do.
pub document_data_index: Option<usize>,
@@ -575,8 +572,6 @@ pub struct Tokenizer<'a> {
///
/// Used when tokenizing [text content][crate::content::text].
pub media_list: Vec<Media>,
- /// Current container state.
- pub container: Option<ContainerState>,
/// Whether we would be interrupting something.
///
/// Used when tokenizing [flow content][crate::content::flow].
@@ -613,7 +608,6 @@ impl<'a> Tokenizer<'a> {
connect: false,
document_container_stack: vec![],
document_continued: 0,
- document_interrupt_before: false,
document_paragraph_before: false,
document_data_index: None,
document_child_state: None,
@@ -647,7 +641,6 @@ impl<'a> Tokenizer<'a> {
label_start_stack: vec![],
label_start_list_loose: vec![],
media_list: vec![],
- container: None,
interrupt: false,
concrete: false,
lazy: false,
@@ -1200,16 +1193,15 @@ fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State {
StateName::DestinationRawEscape => construct::partial_destination::raw_escape,
StateName::DocumentStart => content::document::start,
- StateName::DocumentLineStart => content::document::line_start,
StateName::DocumentContainerExistingBefore => content::document::container_existing_before,
StateName::DocumentContainerExistingAfter => content::document::container_existing_after,
- StateName::DocumentContainerExistingMissing => {
- content::document::container_existing_missing
- }
StateName::DocumentContainerNewBefore => content::document::container_new_before,
StateName::DocumentContainerNewBeforeNotBlockQuote => {
content::document::container_new_before_not_block_quote
}
+ StateName::DocumentContainerNewBeforeNotList => {
+ content::document::container_new_before_not_list
+ }
StateName::DocumentContainerNewAfter => content::document::container_new_after,
StateName::DocumentContainersAfter => content::document::containers_after,
StateName::DocumentFlowEnd => content::document::flow_end,
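
For orientation: `StateName` is a flat enum of every state, and `call_impl` maps each name to its function, which is how the renamed and added states above get wired up. A minimal sketch of that dispatch style with a hypothetical two-state machine (not the real `Tokenizer`):

```rust
#[derive(Clone, Copy, Debug)]
enum StateName {
    DocumentStart,
    DocumentContainerExistingBefore,
}

#[derive(Debug)]
enum State {
    Next(StateName),
    Ok,
}

struct Tokenizer {}

fn start(_tokenizer: &mut Tokenizer) -> State {
    State::Next(StateName::DocumentContainerExistingBefore)
}

fn container_existing_before(_tokenizer: &mut Tokenizer) -> State {
    State::Ok
}

// Mirrors `call_impl` in src/tokenizer.rs: one match arm per state name,
// each yielding a plain function pointer.
fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State {
    let func = match name {
        StateName::DocumentStart => start,
        StateName::DocumentContainerExistingBefore => container_existing_before,
    };
    func(tokenizer)
}

fn main() {
    let mut tokenizer = Tokenizer {};
    let mut state = call_impl(&mut tokenizer, StateName::DocumentStart);
    while let State::Next(name) = state {
        state = call_impl(&mut tokenizer, name);
    }
    println!("{state:?}"); // Ok
}
```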