//! The document content type.
//!
//! **Document** represents the containers, such as block quotes and lists,
//! which structure the document and contain other sections.
//!
//! The constructs found in the document are:
//!
//! *   [Block quote][crate::construct::block_quote]
//! *   [List][crate::construct::list]
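//!
//! For example, both containers occur, nested, in:
//!
//! ```markdown
//! > * a
//! ```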

use crate::parser::ParseState;
use crate::subtokenize::subtokenize;
use crate::token::Token;
use crate::tokenizer::{
    Container, ContainerState, ContentType, Event, EventType, Link, Point, State, StateName,
    Tokenizer,
};
use crate::util::{
    normalize_identifier::normalize_identifier,
    skip,
    slice::{Position, Slice},
};

/// Phases where we can exit containers.
#[derive(Debug, PartialEq)]
enum Phase {
    /// After parsing a line of lazy flow which resulted in something that
    /// exits containers before the line.
    ///
    /// ```markdown
    ///   | * a
    /// > | ```js
    ///          ^
    ///   | b
    ///   | ```
    /// ```
    After,
    /// When a new container replaces an existing container.
    ///
    /// ```markdown
    ///   | * a
    /// > | > b
    ///     ^
    /// ```
    Prefix,
    /// After everything.
    ///
    /// ```markdown
    /// > | * a
    ///        ^
    /// ```
    Eof,
}

/// Turn the bytes in `parse_state`, as the document content type, into events.
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
    let mut tokenizer = Tokenizer::new(point, parse_state);

    let state = tokenizer.push(0, parse_state.bytes.len(), StateName::DocumentStart);
    tokenizer.flush(state, true);

    let mut index = 0;
    let mut definitions = vec![];

    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString {
            // Note: we don’t care about virtual spaces, so `as_str` is fine.
            let id = normalize_identifier(
                Slice::from_position(
                    tokenizer.parse_state.bytes,
                    &Position::from_exit_event(&tokenizer.events, index),
                )
                .as_str(),
            );

            if !definitions.contains(&id) {
                definitions.push(id);
            }
        }

        index += 1;
    }

    let mut events = tokenizer.events;

    parse_state.definitions = definitions;

    while !subtokenize(&mut events, parse_state) {}

    events
}

/// At the beginning.
///
/// Perhaps a BOM?
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.child_tokenizer = Some(Box::new(Tokenizer::new(
        tokenizer.point.clone(),
        tokenizer.parse_state,
    )));
    tokenizer.tokenize_state.document_child_state = Some(State::Fn(StateName::FlowStart));
    tokenizer.attempt_opt(StateName::BomStart, StateName::DocumentLineStart)
}

/// Start of a line.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn line_start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.document_continued = 0;
    // Containers would only be interrupting if we’ve continued.
    tokenizer.interrupt = false;
    container_existing_before(tokenizer)
}

/// Before existing containers.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
    // If there are more existing containers, check whether the next one continues.
    if tokenizer.tokenize_state.document_continued
        < tokenizer.tokenize_state.document_container_stack.len()
    {
        let container = tokenizer
            .tokenize_state
            .document_container_stack
            .remove(tokenizer.tokenize_state.document_continued);
        let state_name = match container.kind {
            Container::BlockQuote => StateName::BlockQuoteContStart,
            Container::ListItem => StateName::ListContStart,
        };

        tokenizer.container = Some(container);
        tokenizer.attempt(state_name, |ok| {
            State::Fn(if ok {
                StateName::DocumentContainerExistingAfter
            } else {
                StateName::DocumentContainerExistingMissing
            })
        })
    }
    // Otherwise, check new containers.
    else {
        container_new_before(tokenizer)
    }
}

/// At a missing, existing container.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
pub fn container_existing_missing(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();
    tokenizer
        .tokenize_state
        .document_container_stack
        .insert(tokenizer.tokenize_state.document_continued, container);
    container_new_before(tokenizer)
}

/// After an existing container.
///
/// ```markdown
///   | * a
/// > |   b
///       ^
/// ```
pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();
    tokenizer
        .tokenize_state
        .document_container_stack
        .insert(tokenizer.tokenize_state.document_continued, container);
    tokenizer.tokenize_state.document_continued += 1;
    container_existing_before(tokenizer)
}

/// Before a new container.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
    // If we have completely continued, restore the flow’s past `interrupt`
    // status.
    if tokenizer.tokenize_state.document_continued
        == tokenizer.tokenize_state.document_container_stack.len()
    {
        tokenizer.interrupt = tokenizer
            .tokenize_state
            .child_tokenizer
            .as_ref()
            .unwrap()
            .interrupt;

        // …and if we’re in a concrete construct, new containers can’t “pierce”
        // into them.
        if tokenizer
            .tokenize_state
            .child_tokenizer
            .as_ref()
            .unwrap()
            .concrete
        {
            return containers_after(tokenizer);
        }
    }

    // Check for a new container.
    // Block quote?
    tokenizer.container = Some(ContainerState {
        kind: Container::BlockQuote,
        blank_initial: false,
        size: 0,
    });

    tokenizer.attempt(StateName::BlockQuoteStart, |ok| {
        State::Fn(if ok {
            StateName::DocumentContainerNewAfter
        } else {
            StateName::DocumentContainerNewBeforeNotBlockQuote
        })
    })
}

/// Before a new container, where a block quote did not start: check whether a
/// list item starts instead.
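///
/// ```markdown
/// > | * a
///     ^
/// ```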
pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {
    // List item?
    tokenizer.container = Some(ContainerState {
        kind: Container::ListItem,
        blank_initial: false,
        size: 0,
    });

    tokenizer.attempt(StateName::ListStart, |ok| {
        State::Fn(if ok {
            StateName::DocumentContainerNewAfter
        } else {
            StateName::DocumentContainersAfter
        })
    })
}

/// After a new container.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();

    // If we did not continue all existing containers, and there is a new one,
    // close the flow and those containers.
    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        exit_containers(tokenizer, &Phase::Prefix);
    }

    // Try another new container.
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(container);
    tokenizer.tokenize_state.document_continued += 1;
    tokenizer.tokenize_state.document_interrupt_before = false;
    tokenizer.interrupt = false;
    container_new_before(tokenizer)
}

/// After containers, before flow.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
    if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
        child.lazy = tokenizer.tokenize_state.document_continued
            != tokenizer.tokenize_state.document_container_stack.len();
        child.interrupt = tokenizer.tokenize_state.document_interrupt_before;
        child.define_skip(tokenizer.point.clone());
    }

    match tokenizer.current {
        // Note: EOL is part of data.
        None => flow_end(tokenizer),
        Some(_) => {
            let current = tokenizer.events.len();
            let previous = tokenizer.tokenize_state.document_data_index.take();
            if let Some(previous) = previous {
                tokenizer.events[previous].link.as_mut().unwrap().next = Some(current);
            }
            tokenizer.tokenize_state.document_data_index = Some(current);
            tokenizer.enter_with_link(
                Token::Data,
                Some(Link {
                    previous,
                    next: None,
                    content_type: ContentType::Flow,
                }),
            );
            flow_inside(tokenizer)
        }
    }
}

/// In flow data on a line (the eol is part of the data).
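///
/// ```markdown
/// > | * ab
///       ^^
/// ```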
pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None => {
            tokenizer.exit(Token::Data);
            flow_end(tokenizer)
        }
        // Note: EOL is part of data.
        Some(b'\n') => {
            tokenizer.consume();
            tokenizer.exit(Token::Data);
            State::Fn(StateName::DocumentFlowEnd)
        }
        Some(_) => {
            tokenizer.consume();
            State::Fn(StateName::DocumentFlowInside)
        }
    }
}

/// After flow (after eol or at eof).
///
/// ```markdown
///   | * a
/// > | > b
///     ^  ^
/// ```
pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
    let mut paragraph = false;
    let mut interrupt = false;

    // We have new data.
    // Note that everything except for a `null` is data.
    if tokenizer.events.len() > 1
        && tokenizer.events[tokenizer.events.len() - 1].token_type == Token::Data
    {
        let position = Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);

        let state = tokenizer
            .tokenize_state
            .document_child_state
            .take()
            .unwrap_or(State::Fn(StateName::FlowStart));

        let state_name = match state {
            State::Fn(state_name) => state_name,
            _ => unreachable!("expected state name"),
        };

        if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
            // To do: handle VS?
            // if position.start.vs > 0 {
            // }
            let state = child.push(position.start.index, position.end.index, state_name);

            interrupt = child.interrupt;
            paragraph = matches!(state, State::Fn(StateName::ParagraphInside))
                || (!child.events.is_empty()
                    && child.events[skip::opt_back(
                        &child.events,
                        child.events.len() - 1,
                        &[Token::LineEnding],
                    )]
                    .token_type
                        == Token::Paragraph);

            tokenizer.tokenize_state.document_child_state = Some(state);

            if child.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before {
                tokenizer.tokenize_state.document_continued =
                    tokenizer.tokenize_state.document_container_stack.len();
            }

            if tokenizer.tokenize_state.document_continued
                != tokenizer.tokenize_state.document_container_stack.len()
            {
                exit_containers(tokenizer, &Phase::After);
            }
        }
    }

    match tokenizer.current {
        None => {
            tokenizer.tokenize_state.document_continued = 0;
            exit_containers(tokenizer, &Phase::Eof);
            resolve(tokenizer);
            State::Ok
        }
        Some(_) => {
            tokenizer.tokenize_state.document_paragraph_before = paragraph;
            tokenizer.tokenize_state.document_interrupt_before = interrupt;
            line_start(tokenizer)
        }
    }
}

/// Close containers (and flow if needed).
fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
    let mut stack_close = tokenizer
        .tokenize_state
        .document_container_stack
        .split_off(tokenizer.tokenize_state.document_continued);

    // So, we’re at the end of a line, but we need to close the *previous* line.
    if let Some(ref mut child) = tokenizer.tokenize_state.child_tokenizer {
        if *phase != Phase::After {
            let state = tokenizer
                .tokenize_state
                .document_child_state
                .take()
                .unwrap_or(State::Fn(StateName::FlowStart));

            child.flush(state, false);
        }

        if !stack_close.is_empty() {
            let mut inject_index = tokenizer.events.len();

            // Move past the current data to find the last container start if we’re
            // closing due to a potential lazy flow that was not lazy.
            if *phase == Phase::After {
                inject_index -= 2;
            }

            // Move past the container starts to find the last data if we’re
            // closing due to a different container or lazy flow like above.
            if *phase == Phase::After || *phase == Phase::Prefix {
                while inject_index > 0 {
                    let event = &tokenizer.events[inject_index - 1];

                    if event.token_type == Token::Data {
                        break;
                    }

                    inject_index -= 1;
                }
            }

            // Move past data starts that are whitespace only, without
            // container starts.
            while inject_index > 0 {
                let event = &tokenizer.events[inject_index - 1];

                if event.token_type == Token::Data {
                    if event.event_type == EventType::Exit {
                        let slice = Slice::from_position(
                            tokenizer.parse_state.bytes,
                            &Position::from_exit_event(&tokenizer.events, inject_index - 1),
                        );
                        let bytes = slice.bytes;
                        let mut whitespace = true;
                        let mut index = 0;
                        while index < bytes.len() {
                            match bytes[index] {
                                b'\t' | b'\n' | b'\r' | b' ' => index += 1,
                                _ => {
                                    whitespace = false;
                                    break;
                                }
                            }
                        }

                        if !whitespace {
                            break;
                        }
                    }
                } else {
                    break;
                }

                inject_index -= 1;
            }

            let ref_point = if inject_index == tokenizer.events.len() {
                tokenizer.point.clone()
            } else {
                tokenizer.events[inject_index].point.clone()
            };

            let mut exits = Vec::with_capacity(stack_close.len());

            while !stack_close.is_empty() {
                let container = stack_close.pop().unwrap();
                let token_type = match container.kind {
                    Container::BlockQuote => Token::BlockQuote,
                    Container::ListItem => Token::ListItem,
                };

                exits.push(Event {
                    event_type: EventType::Exit,
                    token_type: token_type.clone(),
                    point: ref_point.clone(),
                    link: None,
                });

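                // Remove the container token from the stack of open tokens:
                // it is closed by the injected exit above.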
                let mut stack_index = tokenizer.stack.len();
                let mut found = false;

                while stack_index > 0 {
                    stack_index -= 1;

                    if tokenizer.stack[stack_index] == token_type {
                        tokenizer.stack.remove(stack_index);
                        found = true;
                        break;
                    }
                }

                debug_assert!(found, "expected to find container token to exit");
            }

            tokenizer.map.add(inject_index, 0, exits);
        }
    }

    tokenizer.tokenize_state.document_interrupt_before = false;
}

/// Combine the container events (in the document tokenizer) with the flow
/// events (in the child tokenizer), and fix the placement of container exits.
fn resolve(tokenizer: &mut Tokenizer) {
    let mut child = tokenizer.tokenize_state.child_tokenizer.take().unwrap();
    child.map.consume(&mut child.events);
    // To do: see if we can do this less.
    tokenizer.map.consume(&mut tokenizer.events);

    let mut link_index = skip::to(&tokenizer.events, 0, &[Token::Data]);
    // To do: share this code with `subtokenize`.
    // Now, loop through all subevents to figure out which parts
    // belong where and fix deep links.
    let mut subindex = 0;
    let mut slices = vec![];
    let mut slice_start = 0;
    let mut old_prev: Option<usize> = None;

    while subindex < child.events.len() {
        // Find the first event that starts after the end we’re looking
        // for.
        if child.events[subindex].event_type == EventType::Enter
            && child.events[subindex].point.index >= tokenizer.events[link_index + 1].point.index
        {
            slices.push((link_index, slice_start));
            slice_start = subindex;
            link_index = tokenizer.events[link_index]
                .link
                .as_ref()
                .unwrap()
                .next
                .unwrap();
        }

        // Fix sublinks.
        if let Some(sublink_curr) = &child.events[subindex].link {
            if sublink_curr.previous.is_some() {
                let old_prev = old_prev.unwrap();
                let prev_event = &mut child.events[old_prev];
                // The `index` in `events` where the current link is,
                // minus one to get the previous link,
                // minus 2 events (the enter and exit) for each removed
                // link.
                let new_link = if slices.is_empty() {
                    old_prev + link_index + 2
                } else {
                    old_prev + link_index - (slices.len() - 1) * 2
                };
                prev_event.link.as_mut().unwrap().next = Some(new_link);
            }
        }

        // If there is a `next` link in the subevents, we have to change
        // its `previous` index to account for the shifted events.
        // If it points to a next event, we also change the next event’s
        // reference back to *this* event.
        if let Some(sublink_curr) = &child.events[subindex].link {
            if let Some(next) = sublink_curr.next {
                let sublink_next = child.events[next].link.as_mut().unwrap();

                old_prev = sublink_next.previous;

                sublink_next.previous = sublink_next
                    .previous
                    // The `index` in `events` where the current link is,
                    // minus 2 events (the enter and exit) for each removed
                    // link.
                    .map(|previous| previous + link_index - (slices.len() * 2));
            }
        }

        subindex += 1;
    }

    if !child.events.is_empty() {
        slices.push((link_index, slice_start));
    }

    // Finally, inject the subevents.
    let mut index = slices.len();

    while index > 0 {
        index -= 1;
        let start = slices[index].0;
        tokenizer.map.add(
            start,
            if start == tokenizer.events.len() {
                0
            } else {
                2
            },
            child.events.split_off(slices[index].1),
        );
    }
    // To do: share the above code with `subtokenize`.

    let mut resolvers = child.resolvers.split_off(0);
    let mut resolver_ids = child.resolver_ids.split_off(0);
    tokenizer.resolvers.append(&mut resolvers);
    tokenizer.resolver_ids.append(&mut resolver_ids);

    // To do: see if we can do this less.
    tokenizer.map.consume(&mut tokenizer.events);

    let mut index = 0;
    let mut last_eol_enter: Option<usize> = None;
    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.event_type == EventType::Exit {
            if event.token_type == Token::BlockQuote || event.token_type == Token::ListItem {
                if let Some(inject) = last_eol_enter {
                    let point = tokenizer.events[inject].point.clone();
                    let mut clone = event.clone();
                    clone.point = point;
                    // Inject a fixed exit.
                    tokenizer.map.add(inject, 0, vec![clone]);
                    // Remove this exit.
                    tokenizer.map.add(index, 1, vec![]);
                }
            } else if event.token_type == Token::LineEnding
                || event.token_type == Token::BlankLineEnding
            {
                last_eol_enter = Some(index - 1);
            } else {
                last_eol_enter = None;
            }
        }

        index += 1;
    }

    tokenizer.map.consume(&mut tokenizer.events);
}