//! The document content type.
//!
//! **Document** represents the containers, such as block quotes, list items,
//! or GFM footnotes, which structure the document and contain other sections.
//!
//! The constructs found in document are:
//!
//! *   [Block quote][crate::construct::block_quote]
//! *   [List item][crate::construct::list_item]
//! *   [GFM: Footnote definition][crate::construct::gfm_footnote_definition]
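//!
//! For example, in the following markdown, the block quote and the list item
//! are containers handled here, while the text they contain is flow content
//! passed on to a child tokenizer:
//!
//! ```markdown
//! > a
//!
//! * b
//! ```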

use crate::event::{Content, Event, Kind, Link, Name};
use crate::state::{Name as StateName, State};
use crate::subtokenize::divide_events;
use crate::tokenizer::{Container, ContainerState, Tokenizer};
use crate::util::skip;
use alloc::{boxed::Box, string::String, vec::Vec};

/// Phases where we can exit containers.
#[derive(Debug, PartialEq)]
enum Phase {
    /// After parsing a line of lazy flow which resulted in something that
    /// exits containers before the line.
    ///
    /// ```markdown
    ///   | * a
    /// > | ```js
    ///          ^
    ///   | b
    ///   | ```
    /// ```
    After,
    /// When a new container replaces an existing container.
    ///
    /// ```markdown
    ///   | * a
    /// > | > b
    ///     ^
    /// ```
    Prefix,
    /// After everything.
    ///
    /// ```markdown
    /// > | * a
    ///        ^
    /// ```
    Eof,
}

/// Start of document, at an optional BOM.
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.document_child = Some(Box::new(Tokenizer::new(
        tokenizer.point.clone(),
        tokenizer.parse_state,
    )));

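    // Whether or not a BOM is found, continue to the (optional) frontmatter.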
    tokenizer.attempt(
        State::Next(StateName::DocumentBeforeFrontmatter),
        State::Next(StateName::DocumentBeforeFrontmatter),
    );

    State::Retry(StateName::BomStart)
}

/// At optional frontmatter.
///
/// ```markdown
/// > | ---
///     ^
///   | title: Venus
///   | ---
/// ```
pub fn before_frontmatter(tokenizer: &mut Tokenizer) -> State {
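    // Whether or not frontmatter is found, continue on to the container logic.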
    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewBefore),
        State::Next(StateName::DocumentContainerNewBefore),
    );
    State::Retry(StateName::FrontmatterStart)
}

/// At optional existing containers.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
    // If there are more existing containers, check whether the next one continues.
    if tokenizer.tokenize_state.document_continued
        < tokenizer.tokenize_state.document_container_stack.len()
    {
        let container = &tokenizer.tokenize_state.document_container_stack
            [tokenizer.tokenize_state.document_continued];

        let name = match container.kind {
            Container::BlockQuote => StateName::BlockQuoteContStart,
            Container::GfmFootnoteDefinition => StateName::GfmFootnoteDefinitionContStart,
            Container::ListItem => StateName::ListItemContStart,
        };

        tokenizer.attempt(
            State::Next(StateName::DocumentContainerExistingAfter),
            State::Next(StateName::DocumentContainerNewBefore),
        );

        State::Retry(name)
    }
    // Otherwise, check new containers.
    else {
        State::Retry(StateName::DocumentContainerNewBefore)
    }
}

/// After continued existing container.
///
/// ```markdown
///   | * a
/// > |   b
///       ^
/// ```
pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.document_continued += 1;
    State::Retry(StateName::DocumentContainerExistingBefore)
}

/// At new containers.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
    // If we have completely continued, restore the flow’s past `interrupt`
    // status.
    if tokenizer.tokenize_state.document_continued
        == tokenizer.tokenize_state.document_container_stack.len()
    {
        let child = tokenizer.tokenize_state.document_child.as_ref().unwrap();

        tokenizer.interrupt = child.interrupt;

        // …and if we’re in a concrete construct, new containers can’t “pierce”
        // into them.
        if child.concrete {
            return State::Retry(StateName::DocumentContainersAfter);
        }
    }

    // Check for a new container.
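    // We try a block quote first; if that fails, the same slot is reused for a
    // list item, and then for a footnote definition.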
    // Block quote?
    // Add a new container at the end of the stack.
    let tail = tokenizer.tokenize_state.document_container_stack.len();
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(ContainerState {
            kind: Container::BlockQuote,
            blank_initial: false,
            size: 0,
        });
    // Swap the existing container with the new one.
    tokenizer
        .tokenize_state
        .document_container_stack
        .swap(tokenizer.tokenize_state.document_continued, tail);

    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewAfter),
        State::Next(StateName::DocumentContainerNewBeforeNotBlockQuote),
    );
    State::Retry(StateName::BlockQuoteStart)
}

/// At new container, but not a block quote.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {
    // List item?
    // We replace the empty block quote container with this new list item one.
    tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued] = ContainerState {
        kind: Container::ListItem,
        blank_initial: false,
        size: 0,
    };

    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewAfter),
        State::Next(StateName::DocumentContainerNewBeforeNotList),
    );
    State::Retry(StateName::ListItemStart)
}

/// At new container, but not a block quote or list item.
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State {
    // Footnote definition?
    // We replace the empty list item container with this new footnote
    // definition one.
    tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued] = ContainerState {
        kind: Container::GfmFootnoteDefinition,
        blank_initial: false,
        size: 0,
    };

    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewAfter),
        State::Next(StateName::DocumentContainerNewBeforeNotGfmFootnoteDefinition),
    );
    State::Retry(StateName::GfmFootnoteDefinitionStart)
}

/// At new container, but not a block quote, list item, or footnote definition.
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn container_new_before_not_footnote_definition(tokenizer: &mut Tokenizer) -> State {
    // It wasn’t a new block quote, list item, or footnote definition.
    // Swap the new container (in the middle) with the existing one (at the end).
    // Drop what was in the middle.
    tokenizer
        .tokenize_state
        .document_container_stack
        .swap_remove(tokenizer.tokenize_state.document_continued);

    State::Retry(StateName::DocumentContainersAfter)
}

/// After new container.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
    // It was a new block quote, list item, or footnote definition.
    // Swap the new container (in the middle) with the existing one (at the end).
    // Take the new container.
    let container = tokenizer
        .tokenize_state
        .document_container_stack
        .swap_remove(tokenizer.tokenize_state.document_continued);

    // If we did not continue all existing containers, and there is a new one,
    // close the flow and those containers.
    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        if let Err(message) = exit_containers(tokenizer, &Phase::Prefix) {
            return State::Error(message);
        }
    }

    // We are “piercing” into the flow with a new container.
    tokenizer
        .tokenize_state
        .document_child
        .as_mut()
        .unwrap()
        .pierce = true;

    tokenizer
        .tokenize_state
        .document_container_stack
        .push(container);
    tokenizer.tokenize_state.document_continued += 1;
    tokenizer.interrupt = false;
    State::Retry(StateName::DocumentContainerNewBefore)
}

/// After containers, at flow.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    child.lazy = tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len();
    child.define_skip(tokenizer.point.clone());

    match tokenizer.current {
        // Note: EOL is part of data.
        None => State::Retry(StateName::DocumentFlowEnd),
        Some(_) => {
            let current = tokenizer.events.len();
            let previous = tokenizer.tokenize_state.document_data_index;
            if let Some(previous) = previous {
                tokenizer.events[previous].link.as_mut().unwrap().next = Some(current);
            }
            tokenizer.tokenize_state.document_data_index = Some(current);
            tokenizer.enter_link(
                Name::Data,
                Link {
                    previous,
                    next: None,
                    content: Content::Flow,
                },
            );
            State::Retry(StateName::DocumentFlowInside)
        }
    }
}

/// In flow.
///
/// ```markdown
/// > | * ab
///       ^
/// ```
pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None => {
            tokenizer.exit(Name::Data);
            State::Retry(StateName::DocumentFlowEnd)
        }
        // Note: EOL is part of data.
        Some(b'\n') => {
            tokenizer.consume();
            tokenizer.exit(Name::Data);
            State::Next(StateName::DocumentFlowEnd)
        }
        Some(_) => {
            tokenizer.consume();
            State::Next(StateName::DocumentFlowInside)
        }
    }
}

/// After flow (after eol or at eof).
///
/// ```markdown
///   | * a
/// > | > b
///     ^  ^
/// ```
pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();
    let state = tokenizer
        .tokenize_state
        .document_child_state
        .take()
        .unwrap_or(State::Next(StateName::FlowStart));

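    // Reserve a slot for this line’s container exits; `exit_containers` fills it if needed.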
    tokenizer.tokenize_state.document_exits.push(None);

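    // Feed the flow data of this line (from the child’s point up to ours) to the child.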
    let state = child.push(
        (child.point.index, child.point.vs),
        (tokenizer.point.index, tokenizer.point.vs),
        state,
    );

    tokenizer.tokenize_state.document_child_state = Some(state);

    // If we’re in a lazy line, and the previous (lazy or not) line is something
    // that can be lazy, and this line is that too, allow it.
    //
    // Accept:
    //
    // ```markdown
    //   | * a
    // > | b
    //     ^
    //   | ```
    // ```
    //
    // Do not accept:
    //
    // ```markdown
    //   | * # a
    // > | b
    //     ^
    //   | ```
    // ```
    //
    // Do not accept:
    //
    // ```markdown
    //   | * a
    // > | # b
    //     ^
    //   | ```
    // ```
    let mut document_lazy_continuation_current = false;
    let mut stack_index = child.stack.len();

    // Use two algorithms: one for when we’re suspended or in multiline things
    // like definitions, another for when we fed the line ending and closed.
    while !document_lazy_continuation_current && stack_index > 0 {
        stack_index -= 1;
        let name = &child.stack[stack_index];
        if name == &Name::Content || name == &Name::GfmTableHead {
            document_lazy_continuation_current = true;
        }
    }

    // …another because we parse each “rest” line as a paragraph, and we passed
    // an EOL already.
    if !document_lazy_continuation_current && !child.events.is_empty() {
        let before = skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding]);
        let name = &child.events[before].name;
        if name == &Name::Content || name == &Name::HeadingSetextUnderline {
            document_lazy_continuation_current = true;
        }
    }

    // Reset “piercing”.
    child.pierce = false;

    if child.lazy
        && tokenizer.tokenize_state.document_lazy_accepting_before
        && document_lazy_continuation_current
    {
        tokenizer.tokenize_state.document_continued =
            tokenizer.tokenize_state.document_container_stack.len();
    }

    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        let result = exit_containers(tokenizer, &Phase::After);
        // `Phase::After` doesn’t deal with flow: it only generates exits for
        // containers.
        // And that never errors.
        debug_assert!(result.is_ok(), "did not expect error when exiting");
    }

    match tokenizer.current {
        None => {
            tokenizer.tokenize_state.document_continued = 0;
            if let Err(message) = exit_containers(tokenizer, &Phase::Eof) {
                return State::Error(message);
            }
            resolve(tokenizer);
            State::Ok
        }
        Some(_) => {
            tokenizer.tokenize_state.document_continued = 0;
            tokenizer.tokenize_state.document_lazy_accepting_before =
                document_lazy_continuation_current;
            // Containers would only be interrupting if we’ve continued.
            tokenizer.interrupt = false;
            State::Retry(StateName::DocumentContainerExistingBefore)
        }
    }
}

/// Close containers (and flow if needed).
fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) -> Result<(), String> {
    let mut stack_close = tokenizer
        .tokenize_state
        .document_container_stack
        .split_off(tokenizer.tokenize_state.document_continued);

    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // Flush if needed.
    if *phase != Phase::After {
        let state = tokenizer
            .tokenize_state
            .document_child_state
            .take()
            .unwrap_or(State::Next(StateName::FlowStart));

        child.flush(state, false)?;
    }

    if !stack_close.is_empty() {
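        // For `Phase::After`, a slot for the current line was already reserved
        // in `flow_end`, so these exits belong to the slot before it.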
        let index = tokenizer.tokenize_state.document_exits.len()
            - (if *phase == Phase::After { 2 } else { 1 });
        let mut exits = Vec::with_capacity(stack_close.len());

        while !stack_close.is_empty() {
            let container = stack_close.pop().unwrap();
            let name = match container.kind {
                Container::BlockQuote => Name::BlockQuote,
                Container::GfmFootnoteDefinition => Name::GfmFootnoteDefinition,
                Container::ListItem => Name::ListItem,
            };

            exits.push(Event {
                kind: Kind::Exit,
                name: name.clone(),
                point: tokenizer.point.clone(),
                link: None,
            });

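            // Remove the matching open container from the tokenizer’s stack.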
            let mut stack_index = tokenizer.stack.len();
            let mut found = false;

            while stack_index > 0 {
                stack_index -= 1;

                if tokenizer.stack[stack_index] == name {
                    tokenizer.stack.remove(stack_index);
                    found = true;
                    break;
                }
            }

            debug_assert!(found, "expected to find container event to exit");
        }

        debug_assert!(
            tokenizer.tokenize_state.document_exits[index].is_none(),
            "expected no exits yet"
        );
        tokenizer.tokenize_state.document_exits[index] = Some(exits);
    }

    child.interrupt = false;

    Ok(())
}

/// Inject everything together.
fn resolve(tokenizer: &mut Tokenizer) {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // First, add the container exits into `child`.
    let mut child_index = 0;
    let mut line = 0;

    while child_index < child.events.len() {
        if child.events[child_index].kind == Kind::Exit
            && matches!(
                child.events[child_index].name,
                Name::LineEnding | Name::BlankLineEnding
            )
        {
            // Inject before `Enter:LineEnding`.
            let mut inject_index = child_index - 1;
            let mut point = &child.events[inject_index].point;

            while child_index + 1 < child.events.len()
                && child.events[child_index + 1].kind == Kind::Exit
            {
                child_index += 1;
                point = &child.events[child_index].point;
                // Inject after `Exit:*`.
                inject_index = child_index + 1;
            }

            if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
                let mut exit_index = 0;
                while exit_index < exits.len() {
                    exits[exit_index].point = point.clone();
                    exit_index += 1;
                }

                child.map.add(inject_index, 0, exits);
            }

            line += 1;
        }

        child_index += 1;
    }

    child.map.consume(&mut child.events);

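    // Find the first `Data` event in our events whose link points to flow content.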
    let mut flow_index = skip::to(&tokenizer.events, 0, &[Name::Data]);
    while flow_index < tokenizer.events.len()
        // To do: use `!is_some_and()` when that’s stable.
        && (tokenizer.events[flow_index].link.is_none()
            || tokenizer.events[flow_index].link.as_ref().unwrap().content != Content::Flow)
    {
        flow_index = skip::to(&tokenizer.events, flow_index + 1, &[Name::Data]);
    }

    // Now, add all child events into our parent document tokenizer.
    divide_events(
        &mut tokenizer.map,
        &tokenizer.events,
        flow_index,
        &mut child.events,
        (0, 0),
    );

    // Replace the flow data with actual events.
    tokenizer.map.consume(&mut tokenizer.events);

    // Now, add some final container exits due to the EOF.
    // We can’t inject them into the child earlier, as they are “outside” its
    // linked data.
    if line < tokenizer.tokenize_state.document_exits.len() {
        if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
            let mut exit_index = 0;
            while exit_index < exits.len() {
                exits[exit_index].point = tokenizer.point.clone();
                exit_index += 1;
            }

            tokenizer.events.append(&mut exits);
        }
    }

    // Add the resolvers from child.
    tokenizer
        .resolvers
        .append(&mut child.resolvers.split_off(0));

    tokenizer
        .tokenize_state
        .definitions
        .append(&mut child.tokenize_state.definitions.split_off(0));
}