//! The document content type.
//!
//! **Document** represents the containers, such as block quotes and lists,
//! which structure the document and contain other sections.
//!
//! The constructs found in document are:
//!
//! * [Block quote][crate::construct::block_quote]
//! * [List][crate::construct::list]

use crate::parser::ParseState;
use crate::subtokenize::subtokenize;
use crate::token::Token;
use crate::tokenizer::{
    Container, ContainerState, ContentType, Event, EventType, Link, Point, State, StateName,
    Tokenizer,
};
use crate::util::{
    normalize_identifier::normalize_identifier,
    skip,
    slice::{Position, Slice},
};

/// Phases where we can exit containers.
#[derive(Debug, PartialEq)]
enum Phase {
    /// After parsing a line of lazy flow which resulted in something that
    /// exits containers before the line.
    ///
    /// ```markdown
    ///   | * a
    /// > | ```js
    ///     ^
    ///   | b
    ///   | ```
    /// ```
    After,
    /// When a new container replaces an existing container.
    ///
    /// ```markdown
    ///   | * a
    /// > | > b
    ///     ^
    /// ```
    Prefix,
    /// After everything.
    ///
    /// ```markdown
    /// > | * a
    ///       ^
    /// ```
    Eof,
}

/// Turn `bytes` as the document content type into events.
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
    let mut tokenizer = Tokenizer::new(point, parse_state);

    let state = tokenizer.push(0, parse_state.bytes.len(), StateName::DocumentStart);
    tokenizer.flush(state, true);

    let mut index = 0;
    let mut definitions = vec![];

    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.event_type == EventType::Exit
            && event.token_type == Token::DefinitionLabelString
        {
            // Note: we don’t care about virtual spaces, so `as_str` is fine.
            let id = normalize_identifier(
                Slice::from_position(
                    tokenizer.parse_state.bytes,
                    &Position::from_exit_event(&tokenizer.events, index),
                )
                .as_str(),
            );

            if !definitions.contains(&id) {
                definitions.push(id);
            }
        }

        index += 1;
    }

    let mut events = tokenizer.events;

    parse_state.definitions = definitions;

    while !subtokenize(&mut events, parse_state) {}

    events
}

/// At the beginning.
///
/// Perhaps a BOM?
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.child_tokenizer = Some(Box::new(Tokenizer::new(
        tokenizer.point.clone(),
        tokenizer.parse_state,
    )));
    tokenizer.tokenize_state.document_child_state = Some(State::Fn(StateName::FlowStart));

    tokenizer.attempt(
        StateName::BomStart,
        State::Fn(StateName::DocumentLineStart),
        State::Fn(StateName::DocumentLineStart),
    )
}

/// Start of a line.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn line_start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.document_continued = 0;
    // Containers would only be interrupting if we’ve continued.
    tokenizer.interrupt = false;
    container_existing_before(tokenizer)
}

/// Before existing containers.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
    // If there are more existing containers, check whether the next one continues.
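    // The container under scrutiny is temporarily removed from the stack while
    // its continuation construct runs; `container_existing_after` (on success)
    // or `container_existing_missing` (on failure) reinserts it at the same
    // index.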
    if tokenizer.tokenize_state.document_continued
        < tokenizer.tokenize_state.document_container_stack.len()
    {
        let container = tokenizer
            .tokenize_state
            .document_container_stack
            .remove(tokenizer.tokenize_state.document_continued);
        let state_name = match container.kind {
            Container::BlockQuote => StateName::BlockQuoteContStart,
            Container::ListItem => StateName::ListContStart,
        };

        tokenizer.container = Some(container);
        tokenizer.attempt(
            state_name,
            State::Fn(StateName::DocumentContainerExistingAfter),
            State::Fn(StateName::DocumentContainerExistingMissing),
        )
    }
    // Otherwise, check new containers.
    else {
        container_new_before(tokenizer)
    }
}

/// At a missing, existing container.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
pub fn container_existing_missing(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();
    tokenizer
        .tokenize_state
        .document_container_stack
        .insert(tokenizer.tokenize_state.document_continued, container);
    container_new_before(tokenizer)
}

/// After an existing container.
///
/// ```markdown
///   | * a
/// > |   b
///       ^
/// ```
pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();
    tokenizer
        .tokenize_state
        .document_container_stack
        .insert(tokenizer.tokenize_state.document_continued, container);
    tokenizer.tokenize_state.document_continued += 1;
    container_existing_before(tokenizer)
}

/// Before a new container.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
    // If we have completely continued, restore the flow’s past `interrupt`
    // status.
    if tokenizer.tokenize_state.document_continued
        == tokenizer.tokenize_state.document_container_stack.len()
    {
        tokenizer.interrupt = tokenizer
            .tokenize_state
            .child_tokenizer
            .as_ref()
            .unwrap()
            .interrupt;

        // …and if we’re in a concrete construct, new containers can’t “pierce”
        // into them.
        if tokenizer
            .tokenize_state
            .child_tokenizer
            .as_ref()
            .unwrap()
            .concrete
        {
            return containers_after(tokenizer);
        }
    }

    // Check for a new container.
    // Block quote?
    tokenizer.container = Some(ContainerState {
        kind: Container::BlockQuote,
        blank_initial: false,
        size: 0,
    });

    tokenizer.attempt(
        StateName::BlockQuoteStart,
        State::Fn(StateName::DocumentContainerNewAfter),
        State::Fn(StateName::DocumentContainerNewBeforeNotBlockQuote),
    )
}

/// Before a new container, but not at a block quote: try a list item instead.
pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {
    // List item?
    tokenizer.container = Some(ContainerState {
        kind: Container::ListItem,
        blank_initial: false,
        size: 0,
    });

    tokenizer.attempt(
        StateName::ListStart,
        State::Fn(StateName::DocumentContainerNewAfter),
        State::Fn(StateName::DocumentContainersAfter),
    )
}

/// After a new container.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();

    // If we did not continue all existing containers, and there is a new one,
    // close the flow and those containers.
    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        exit_containers(tokenizer, &Phase::Prefix);
    }

    // Try another new container.
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(container);
    tokenizer.tokenize_state.document_continued += 1;
    tokenizer.tokenize_state.document_interrupt_before = false;
    tokenizer.interrupt = false;
    container_new_before(tokenizer)
}

/// After containers, before flow.
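///
/// The rest of the line is passed to a child tokenizer running the flow
/// content type; the pieces of each line are linked together as chunks of
/// `Data` so that flow constructs can span lines.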
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
    if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
        child.lazy = tokenizer.tokenize_state.document_continued
            != tokenizer.tokenize_state.document_container_stack.len();
        child.interrupt = tokenizer.tokenize_state.document_interrupt_before;
        child.define_skip(tokenizer.point.clone());
    }

    match tokenizer.current {
        // Note: EOL is part of data.
        None => flow_end(tokenizer),
        Some(_) => {
            let current = tokenizer.events.len();
            let previous = tokenizer.tokenize_state.document_data_index.take();
            if let Some(previous) = previous {
                tokenizer.events[previous].link.as_mut().unwrap().next = Some(current);
            }
            tokenizer.tokenize_state.document_data_index = Some(current);
            tokenizer.enter_with_link(
                Token::Data,
                Some(Link {
                    previous,
                    next: None,
                    content_type: ContentType::Flow,
                }),
            );
            flow_inside(tokenizer)
        }
    }
}

/// In flow: consume the rest of the line as data.
pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None => {
            tokenizer.exit(Token::Data);
            flow_end(tokenizer)
        }
        // Note: EOL is part of data.
        Some(b'\n') => {
            tokenizer.consume();
            tokenizer.exit(Token::Data);
            State::Fn(StateName::DocumentFlowEnd)
        }
        Some(_) => {
            tokenizer.consume();
            State::Fn(StateName::DocumentFlowInside)
        }
    }
}

/// After flow (after eol or at eof).
///
/// ```markdown
///   | * a
/// > | > b
///     ^  ^
/// ```
pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
    let mut paragraph = false;
    let mut interrupt = false;

    // We have new data.
    // Note that everything except for a `null` is data.
    if tokenizer.events.len() > 1
        && tokenizer.events[tokenizer.events.len() - 1].token_type == Token::Data
    {
        let position = Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);

        let state = tokenizer
            .tokenize_state
            .document_child_state
            .take()
            .unwrap_or(State::Fn(StateName::FlowStart));

        let state_name = match state {
            State::Fn(state_name) => state_name,
            _ => unreachable!("expected state name"),
        };

        if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
            // To do: handle VS?
            // if position.start.vs > 0 {
            // }
            let state = child.push(position.start.index, position.end.index, state_name);

            interrupt = child.interrupt;
            paragraph = matches!(state, State::Fn(StateName::ParagraphInside))
                || (!child.events.is_empty()
                    && child.events[skip::opt_back(
                        &child.events,
                        child.events.len() - 1,
                        &[Token::LineEnding],
                    )]
                    .token_type
                        == Token::Paragraph);

            tokenizer.tokenize_state.document_child_state = Some(state);

            if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before {
                tokenizer.tokenize_state.document_continued =
                    tokenizer.tokenize_state.document_container_stack.len();
            }

            if tokenizer.tokenize_state.document_continued
                != tokenizer.tokenize_state.document_container_stack.len()
            {
                exit_containers(tokenizer, &Phase::After);
            }
        }
    }

    match tokenizer.current {
        None => {
            tokenizer.tokenize_state.document_continued = 0;
            exit_containers(tokenizer, &Phase::Eof);
            resolve(tokenizer);
            State::Ok
        }
        Some(_) => {
            tokenizer.tokenize_state.document_paragraph_before = paragraph;
            tokenizer.tokenize_state.document_interrupt_before = interrupt;
            line_start(tokenizer)
        }
    }
}

/// Close containers (and flow if needed).
fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
    let mut stack_close = tokenizer
        .tokenize_state
        .document_container_stack
        .split_off(tokenizer.tokenize_state.document_continued);

    // So, we’re at the end of a line, but we need to close the *previous* line.
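    // Unless we’re closing containers before the current line (`Phase::After`),
    // the child flow tokenizer is flushed first, so that open flow constructs
    // end before the container exits are injected.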
    if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
        if *phase != Phase::After {
            let state = tokenizer
                .tokenize_state
                .document_child_state
                .take()
                .unwrap_or(State::Fn(StateName::FlowStart));

            child.flush(state, false);
        }

        if !stack_close.is_empty() {
            let mut inject_index = tokenizer.events.len();

            // Move past the current data to find the last container start if we’re
            // closing due to a potential lazy flow that was not lazy.
            if *phase == Phase::After {
                inject_index -= 2;
            }

            // Move past the container starts to find the last data if we’re
            // closing due to a different container or lazy flow like above.
            if *phase == Phase::After || *phase == Phase::Prefix {
                while inject_index > 0 {
                    let event = &tokenizer.events[inject_index - 1];

                    if event.token_type == Token::Data {
                        break;
                    }

                    inject_index -= 1;
                }
            }

            // Move past data starts that are just whitespace only without
            // container starts.
            while inject_index > 0 {
                let event = &tokenizer.events[inject_index - 1];

                if event.token_type == Token::Data {
                    if event.event_type == EventType::Exit {
                        let slice = Slice::from_position(
                            tokenizer.parse_state.bytes,
                            &Position::from_exit_event(&tokenizer.events, inject_index - 1),
                        );
                        let bytes = slice.bytes;
                        let mut whitespace = true;
                        let mut index = 0;
                        while index < bytes.len() {
                            match bytes[index] {
                                b'\t' | b'\n' | b'\r' | b' ' => index += 1,
                                _ => {
                                    whitespace = false;
                                    break;
                                }
                            }
                        }

                        if !whitespace {
                            break;
                        }
                    }
                } else {
                    break;
                }

                inject_index -= 1;
            }

            let ref_point = if inject_index == tokenizer.events.len() {
                tokenizer.point.clone()
            } else {
                tokenizer.events[inject_index].point.clone()
            };

            let mut exits = Vec::with_capacity(stack_close.len());

            while !stack_close.is_empty() {
                let container = stack_close.pop().unwrap();
                let token_type = match container.kind {
                    Container::BlockQuote => Token::BlockQuote,
                    Container::ListItem => Token::ListItem,
                };

                exits.push(Event {
                    event_type: EventType::Exit,
                    token_type: token_type.clone(),
                    point: ref_point.clone(),
                    link: None,
                });

                let mut stack_index = tokenizer.stack.len();
                let mut found = false;

                while stack_index > 0 {
                    stack_index -= 1;

                    if tokenizer.stack[stack_index] == token_type {
                        tokenizer.stack.remove(stack_index);
                        found = true;
                        break;
                    }
                }

                debug_assert!(found, "expected to find container token to exit");
            }

            tokenizer.map.add(inject_index, 0, exits);
        }
    }

    tokenizer.tokenize_state.document_interrupt_before = false;
}

/// Inject the container events.
fn resolve(tokenizer: &mut Tokenizer) {
    let mut child = tokenizer.tokenize_state.child_tokenizer.take().unwrap();

    child.map.consume(&mut child.events);

    // To do: see if we can do this less.
    tokenizer.map.consume(&mut tokenizer.events);

    let mut link_index = skip::to(&tokenizer.events, 0, &[Token::Data]);
    // To do: share this code with `subtokenize`.
    // Now, loop through all subevents to figure out which parts
    // belong where and fix deep links.
    let mut subindex = 0;
    let mut slices = vec![];
    let mut slice_start = 0;
    let mut old_prev: Option<usize> = None;

    while subindex < child.events.len() {
        // Find the first event that starts after the end we’re looking
        // for.
        if child.events[subindex].event_type == EventType::Enter
            && child.events[subindex].point.index >= tokenizer.events[link_index + 1].point.index
        {
            slices.push((link_index, slice_start));
            slice_start = subindex;
            link_index = tokenizer.events[link_index]
                .link
                .as_ref()
                .unwrap()
                .next
                .unwrap();
        }

        // Fix sublinks.
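        // Splicing subevents into the parent event list shifts indices, so the
        // `previous`/`next` fields of links in the subevents must be rebased
        // to keep chains of linked `Data` chunks intact.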
        if let Some(sublink_curr) = &child.events[subindex].link {
            if sublink_curr.previous.is_some() {
                let old_prev = old_prev.unwrap();
                let prev_event = &mut child.events[old_prev];
                // The `index` in `events` where the current link is,
                // minus one to get the previous link,
                // minus 2 events (the enter and exit) for each removed
                // link.
                let new_link = if slices.is_empty() {
                    old_prev + link_index + 2
                } else {
                    old_prev + link_index - (slices.len() - 1) * 2
                };
                prev_event.link.as_mut().unwrap().next = Some(new_link);
            }
        }

        // If there is a `next` link in the subevents, we have to change
        // its `previous` index to account for the shifted events.
        // If it points to a next event, we also change the next event’s
        // reference back to *this* event.
        if let Some(sublink_curr) = &child.events[subindex].link {
            if let Some(next) = sublink_curr.next {
                let sublink_next = child.events[next].link.as_mut().unwrap();

                old_prev = sublink_next.previous;

                sublink_next.previous = sublink_next
                    .previous
                    // The `index` in `events` where the current link is,
                    // minus 2 events (the enter and exit) for each removed
                    // link.
                    .map(|previous| previous + link_index - (slices.len() * 2));
            }
        }

        subindex += 1;
    }

    if !child.events.is_empty() {
        slices.push((link_index, slice_start));
    }

    // Finally, inject the subevents.
    let mut index = slices.len();

    while index > 0 {
        index -= 1;
        let start = slices[index].0;
        tokenizer.map.add(
            start,
            if start == tokenizer.events.len() { 0 } else { 2 },
            child.events.split_off(slices[index].1),
        );
    }
    // To do: share the above code with `subtokenize`.

    let mut resolvers = child.resolvers.split_off(0);
    let mut resolver_ids = child.resolver_ids.split_off(0);
    tokenizer.resolvers.append(&mut resolvers);
    tokenizer.resolver_ids.append(&mut resolver_ids);

    // To do: see if we can do this less.
    tokenizer.map.consume(&mut tokenizer.events);

    let mut index = 0;
    let mut last_eol_enter: Option<usize> = None;

    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.event_type == EventType::Exit {
            if event.token_type == Token::BlockQuote || event.token_type == Token::ListItem {
                if let Some(inject) = last_eol_enter {
                    let point = tokenizer.events[inject].point.clone();
                    let mut clone = event.clone();
                    clone.point = point;
                    // Inject a fixed exit.
                    tokenizer.map.add(inject, 0, vec![clone]);
                    // Remove this exit.
                    tokenizer.map.add(index, 1, vec![]);
                }
            } else if event.token_type == Token::LineEnding
                || event.token_type == Token::BlankLineEnding
            {
                last_eol_enter = Some(index - 1);
            } else {
                last_eol_enter = None;
            }
        }

        index += 1;
    }

    tokenizer.map.consume(&mut tokenizer.events);
}