//! The document content type.
//!
//! **Document** represents the containers, such as block quotes and lists,
//! which structure the document and contain other sections.
//!
//! The constructs found in the document are:
//!
//! *   [Block quote][crate::construct::block_quote]
//! *   [List][crate::construct::list]
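//!
//! For example, a line such as:
//!
//! ```markdown
//! > * a
//! ```
//!
//! …is a block quote containing a list item, which in turn contains flow (a
//! paragraph with `a`).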

use crate::construct::{
    block_quote::{cont as block_quote_cont, start as block_quote},
    list::{cont as list_item_cont, start as list_item},
    partial_bom::start as bom,
};
use crate::content::flow::start as flow;
use crate::parser::ParseState;
use crate::subtokenize::subtokenize;
use crate::token::Token;
use crate::tokenizer::{Container, ContainerState, Event, EventType, Point, State, Tokenizer};
use crate::util::{
    normalize_identifier::normalize_identifier,
    skip,
    slice::{Position, Slice},
};

/// Phases where we can exit containers.
#[derive(Debug, PartialEq)]
enum Phase {
    /// After parsing a line of lazy flow which resulted in something that
    /// exits containers before the line.
    ///
    /// ```markdown
    ///   | * a
    /// > | ```js
    ///          ^
    ///   | b
    ///   | ```
    /// ```
    After,
    /// When a new container replaces an existing container.
    ///
    /// ```markdown
    ///   | * a
    /// > | > b
    ///     ^
    /// ```
    Prefix,
    /// After everything.
    ///
    /// ```markdown
    /// > | * a
    ///        ^
    /// ```
    Eof,
}

/// Turn `parse_state.bytes`, as the document content type, into events.
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
    let mut tokenizer = Tokenizer::new(point, parse_state);

    let state = tokenizer.push(0, parse_state.bytes.len(), Box::new(start));
    tokenizer.flush(state, true);

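    // Gather the normalized, deduplicated identifiers of all definitions, so
    // they can be exposed on `parse_state` before subtokenizing.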
    let mut index = 0;
    let mut definitions = vec![];

    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString {
            // Note: we don’t care about virtual spaces, so `as_str` is fine.
            let id = normalize_identifier(
                Slice::from_position(
                    tokenizer.parse_state.bytes,
                    &Position::from_exit_event(&tokenizer.events, index),
                )
                .as_str(),
            );

            if !definitions.contains(&id) {
                definitions.push(id);
            }
        }

        index += 1;
    }

    let mut events = tokenizer.events;

    parse_state.definitions = definitions;

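    // Parse the deeper content types (subtokenize the linked events) until
    // everything is done.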
    while !subtokenize(&mut events, parse_state) {}

    events
}

/// At the beginning.
///
/// Perhaps a BOM?
///
/// ```markdown
/// > | a
///     ^
/// ```
fn start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.attempt_opt(bom, line_start)(tokenizer)
}

/// Start of a line.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
fn line_start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.document_continued = 0;
    tokenizer.tokenize_state.document_index = tokenizer.events.len();
    tokenizer
        .tokenize_state
        .document_inject
        .push((vec![], vec![]));
    // Containers would only be interrupting if we’ve continued.
    tokenizer.interrupt = false;
    container_existing_before(tokenizer)
}

/// Before existing containers.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
    // If there are more existing containers, check whether the next one continues.
    if tokenizer.tokenize_state.document_continued
        < tokenizer.tokenize_state.document_container_stack.len()
    {
        let container = tokenizer
            .tokenize_state
            .document_container_stack
            .remove(tokenizer.tokenize_state.document_continued);
        let cont = match container.kind {
            Container::BlockQuote => block_quote_cont,
            Container::ListItem => list_item_cont,
        };

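        // Move the container onto the tokenizer for the attempt;
        // `container_existing_after` / `container_existing_missing` put it
        // back on the stack.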
        tokenizer.container = Some(container);
        tokenizer.attempt(cont, |ok| {
            Box::new(if ok {
                container_existing_after
            } else {
                container_existing_missing
            })
        })(tokenizer)
    }
    // Otherwise, check new containers.
    else {
        container_new_before(tokenizer)
    }
}

/// At a missing continuation of an existing container.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
fn container_existing_missing(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();
    tokenizer
        .tokenize_state
        .document_container_stack
        .insert(tokenizer.tokenize_state.document_continued, container);
    container_new_before(tokenizer)
}

/// After an existing container.
///
/// ```markdown
///   | * a
/// > |   b
///       ^
/// ```
fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();
    tokenizer
        .tokenize_state
        .document_container_stack
        .insert(tokenizer.tokenize_state.document_continued, container);
    tokenizer.tokenize_state.document_continued += 1;
    container_existing_before(tokenizer)
}

/// Before a new container.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
fn container_new_before(tokenizer: &mut Tokenizer) -> State {
    // If we have completely continued, restore the flow’s past `interrupt`
    // status.
    if tokenizer.tokenize_state.document_continued
        == tokenizer.tokenize_state.document_container_stack.len()
    {
        tokenizer.interrupt = tokenizer.tokenize_state.document_interrupt_before;

        // …and if we’re in a concrete construct, new containers can’t
        // “pierce” into it.
        if tokenizer.concrete {
            return containers_after(tokenizer);
        }
    }

    // Check for a new container.
    // Block quote?
    tokenizer.container = Some(ContainerState {
        kind: Container::BlockQuote,
        blank_initial: false,
        size: 0,
    });

    tokenizer.attempt(block_quote, |ok| {
        Box::new(if ok {
            container_new_after
        } else {
            container_new_before_not_blockquote
        })
    })(tokenizer)
}

/// Before a new container, when the block quote attempt did not match; try a
/// list item instead.
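///
/// ```markdown
/// > | * a
///     ^
/// ```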
fn container_new_before_not_blockquote(tokenizer: &mut Tokenizer) -> State {
    // List item?
    tokenizer.container = Some(ContainerState {
        kind: Container::ListItem,
        blank_initial: false,
        size: 0,
    });

    tokenizer.attempt(list_item, |ok| {
        Box::new(if ok {
            container_new_after
        } else {
            containers_after
        })
    })(tokenizer)
}

/// After a new container.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
fn container_new_after(tokenizer: &mut Tokenizer) -> State {
    let container = tokenizer.container.take().unwrap();

    // Remove the container token from the open-token stack.
    // We’ll properly add exits at different points manually.
    let token_type = match container.kind {
        Container::BlockQuote => Token::BlockQuote,
        Container::ListItem => Token::ListItem,
    };

    let mut stack_index = tokenizer.stack.len();
    let mut found = false;

    while stack_index > 0 {
        stack_index -= 1;

        if tokenizer.stack[stack_index] == token_type {
            tokenizer.stack.remove(stack_index);
            found = true;
            break;
        }
    }

    debug_assert!(found, "expected to find container token to exit");

    // If we did not continue all existing containers, and there is a new one,
    // close the flow and those containers.
    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        exit_containers(tokenizer, &Phase::Prefix);
    }

    // Try another new container.
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(container);
    tokenizer.tokenize_state.document_continued += 1;
    tokenizer.tokenize_state.document_interrupt_before = false;
    tokenizer.interrupt = false;
    container_new_before(tokenizer)
}

/// After containers, before flow.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
fn containers_after(tokenizer: &mut Tokenizer) -> State {
    // Store the container events we parsed.
    tokenizer
        .tokenize_state
        .document_inject
        .last_mut()
        .unwrap()
        .0
        .append(
            &mut tokenizer
                .events
                .split_off(tokenizer.tokenize_state.document_index),
        );

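    // The line is lazy when not every existing container was continued.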
    tokenizer.lazy = tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len();
    tokenizer.interrupt = tokenizer.tokenize_state.document_interrupt_before;
    tokenizer.define_skip_current();

    let state = tokenizer
        .tokenize_state
        .document_next
        .take()
        .unwrap_or_else(|| Box::new(flow));

    // Parse flow, pausing after eols.
    tokenizer.go_until(
        state,
        |code| matches!(code, Some(b'\n')),
        |state| Box::new(|t| flow_end(t, state)),
    )(tokenizer)
}

/// After flow (after eol or at eof).
///
/// ```markdown
///   | * a
/// > | > b
///     ^  ^
/// ```
fn flow_end(tokenizer: &mut Tokenizer, result: State) -> State {
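    // Whether the last flow construct (ignoring the final line ending) was a
    // paragraph.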
    let paragraph = !tokenizer.events.is_empty()
        && tokenizer.events[skip::opt_back(
            &tokenizer.events,
            tokenizer.events.len() - 1,
            &[Token::LineEnding],
        )]
        .token_type
            == Token::Paragraph;

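    // A lazy line that continues a paragraph also continues all containers.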
    if tokenizer.lazy && paragraph && tokenizer.tokenize_state.document_paragraph_before {
        tokenizer.tokenize_state.document_continued =
            tokenizer.tokenize_state.document_container_stack.len();
    }

    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        exit_containers(tokenizer, &Phase::After);
    }

    match result {
        State::Ok => {
            if !tokenizer.tokenize_state.document_container_stack.is_empty() {
                tokenizer.tokenize_state.document_continued = 0;
                exit_containers(tokenizer, &Phase::Eof);
            }

            resolve(tokenizer);
            State::Ok
        }
        State::Nok => unreachable!("unexpected `nok` from flow"),
        State::Fn(func) => {
            tokenizer.tokenize_state.document_paragraph_before = paragraph;
            tokenizer.tokenize_state.document_interrupt_before = tokenizer.interrupt;
            tokenizer.tokenize_state.document_next = Some(func);
            line_start(tokenizer)
        }
    }
}

/// Close containers (and flow if needed).
fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
    let mut stack_close = tokenizer
        .tokenize_state
        .document_container_stack
        .split_off(tokenizer.tokenize_state.document_continued);

    // So, we’re at the end of a line, but we need to close the *previous* line.
    if *phase != Phase::Eof {
        tokenizer.define_skip_current();
        let mut current_events = tokenizer
            .events
            .split_off(tokenizer.tokenize_state.document_index);
        let state = tokenizer
            .tokenize_state
            .document_next
            .take()
            .unwrap_or_else(|| Box::new(flow));
        tokenizer.flush(State::Fn(state), false);

        if *phase == Phase::Prefix {
            tokenizer.tokenize_state.document_index = tokenizer.events.len();
        }

        tokenizer.events.append(&mut current_events);
    }

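    // Create exit events for the closed containers, innermost first.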
    let mut exits = Vec::with_capacity(stack_close.len());

    while !stack_close.is_empty() {
        let container = stack_close.pop().unwrap();
        let token_type = match container.kind {
            Container::BlockQuote => Token::BlockQuote,
            Container::ListItem => Token::ListItem,
        };

        exits.push(Event {
            event_type: EventType::Exit,
            token_type: token_type.clone(),
            // Note: positions are fixed later.
            point: tokenizer.point.clone(),
            link: None,
        });
    }

    let index =
        tokenizer.tokenize_state.document_inject.len() - (if *phase == Phase::Eof { 1 } else { 2 });
    tokenizer.tokenize_state.document_inject[index]
        .1
        .append(&mut exits);
    tokenizer.tokenize_state.document_interrupt_before = false;
}

/// Inject the container events.
fn resolve(tokenizer: &mut Tokenizer) {
    let mut index = 0;
    let mut inject = tokenizer.tokenize_state.document_inject.split_off(0);
    inject.reverse();
    let mut first_line_ending_in_run = None;

    while let Some((before, mut after)) = inject.pop() {
        if !before.is_empty() {
            first_line_ending_in_run = None;
            tokenizer.map.add(index, 0, before);
        }

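        // Walk to the end of this line of flow events, tracking where a
        // trailing run of (blank) line endings starts, so that container
        // exits can be injected before it.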
        while index < tokenizer.events.len() {
            let event = &tokenizer.events[index];

            if event.token_type == Token::LineEnding || event.token_type == Token::BlankLineEnding {
                if event.event_type == EventType::Enter {
                    first_line_ending_in_run = first_line_ending_in_run.or(Some(index));
                } else {
                    index += 1;
                    break;
                }
            } else if event.token_type == Token::SpaceOrTab {
                // Empty to allow whitespace in blank lines.
            } else if first_line_ending_in_run.is_some() {
                first_line_ending_in_run = None;
            }

            index += 1;
        }

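        // Stamp the exits with the point where that run started (if any),
        // or the current point otherwise, and inject them there.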
        let point_rel = if let Some(index) = first_line_ending_in_run {
            &tokenizer.events[index].point
        } else {
            &tokenizer.point
        };

        let close_index = first_line_ending_in_run.unwrap_or(index);

        let mut subevent_index = 0;
        while subevent_index < after.len() {
            after[subevent_index].point = point_rel.clone();
            subevent_index += 1;
        }

        tokenizer.map.add(close_index, 0, after);
    }

    tokenizer.map.consume(&mut tokenizer.events);
}