aboutsummaryrefslogblamecommitdiffstats
path: root/src/content/document.rs
blob: 76d510a2f0828a71eb8ec4276d380300b88e1d4a (plain) (tree)
1
2
3
4
5
6
7
8
9







                                                                           
                                      
 
                       

                                                                  
                              



                                        
                        
                       
                                                                                  
  

                                               
         
                             
  
 
                                        
                           


























                                                                          

 
                                   
                     
                                                 
                     
                                                       
                 
                                              
                                          






                                                               
                       


                                                          

                                                                           
 
                                                                             
                                 

                      
                                 



                                             
                                                                                                    
                                                                               
                                          
                                     
                                                
                                                                         
                 
                          

              

                                           
             




                   
                                      
 
                                          
 
                                                   
 
          

 








                                               
                                                

 






                    
                                              
                             
                 
                     
                       
                             
                                
                                
                      
      
                               

 







                    
                                                                           


                                        

                                                                  
                                              

 
                               





               
                                                                                          
                                                                                   
                                          

                                                          

                                                      

          
                                              

                                           
                                                               
                    
                                                                 
             
                     


                                       
                                             


     






                                      
                                                                                           
                                                        
                                                 
                                         

 






                                
                                                                                         
                                                        
                                                 
                        
                                              
 
 







                           
                                                                                 

                                                                             
                                           



                                                                                         
                               
                                                     
         

     







                                               

                                              
                                                      
                
                                  







                                                           






                                                        
                             
              
         
                 

 







                          
                                                                                    

                                                        

                                                               


                                                   

      

                                                
 

                           
 




                                                       

     
                                                                     
 



                                                                              

     


                                 
                                  
                                
                                         

 







                                  
                                                                                 




                                            
                                                             
 
                                                        
                                                
                                    
 
                          




                                      
                                           

                                                                 

 






                                     
                                                                                        








                                                
                                                             
                                          

     

                                                               

     
                                      
                                                
 
                  
                      


                                                                     

             
                                          
                  
         
                                                                 

                             
                                       



         

                                          

                              
                  
                   
                                                               

                                                                                  
                             
                                        
                                                                        

                                                                                     
                                                
 
                                    





                                                     
                                                          

                                   
                                                   


                                                       

          




                                               
                       
           

     
                                                                               
                                            
                                  



        
                               
                                                                
                      

                                              
                                            
 


                                                        
                                                
         
 






                                                                                                    
                               
                          
                 



                                                             
             

                       

         




                                                                       
 




                                                                    


                                
                                                 
     
 
                                                 
 
//! The document content type.
//!
//! **Document** represents the containers, such as block quotes and lists,
//! which structure the document and contain other sections.
//!
//! The constructs found in document are:
//!
//! *   [Block quote][crate::construct::block_quote]
//! *   [List][crate::construct::list]

use crate::construct::{
    block_quote::{cont as block_quote_cont, start as block_quote},
    list::{cont as list_item_const, start as list_item},
    partial_bom::start as bom,
};
use crate::content::flow::start as flow;
use crate::parser::ParseState;
use crate::subtokenize::subtokenize;
use crate::token::Token;
use crate::tokenizer::{
    Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer,
};
use crate::util::{
    normalize_identifier::normalize_identifier,
    skip,
    slice::{Position, Slice},
};

/// Phases where we can exit containers.
///
/// This is a small, fieldless marker that is only ever compared by value, so
/// it is cheap to copy and has a total equality.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Phase {
    /// After parsing a line of lazy flow which resulted in something that
    /// exits containers before the line.
    ///
    /// ```markdown
    ///   | * a
    /// > | ```js
    ///          ^
    ///   | b
    ///   | ```
    /// ```
    After,
    /// When a new container replaces an existing container.
    ///
    /// ```markdown
    ///   | * a
    /// > | > b
    ///     ^
    /// ```
    Prefix,
    /// After everything.
    ///
    /// ```markdown
    /// > | * a
    ///        ^
    /// ```
    Eof,
}

/// State needed to parse document.
///
/// A single value is created in `start` and is then moved from one state
/// function to the next while lines are processed.
struct DocumentInfo {
    /// Number of containers that have continued.
    ///
    /// Reset to `0` at the start of every line and incremented for each
    /// existing container that continues and for each new container.
    continued: usize,
    /// Index into `tokenizer.events` we need to track.
    ///
    /// Set to the number of events at the start of a line; the events from
    /// this index on are split off as the container events of that line.
    index: usize,
    /// Events of containers added back later.
    ///
    /// One pair is pushed per line: the first list receives the container
    /// events parsed at the start of the line, the second receives container
    /// exits. Both are put back into the event list by `resolve`.
    inject: Vec<(Vec<Event>, Vec<Event>)>,
    /// The value of the previous line of flow’s `interrupt`.
    interrupt_before: bool,
    /// Whether the previous line of flow was a paragraph.
    paragraph_before: bool,
    /// Current containers, in the order in which they were opened.
    stack: Vec<ContainerState>,
    /// Current flow state function.
    ///
    /// Starts out as `flow` and is replaced by the state that flow returned
    /// at the end of a line, so the next line resumes where flow paused.
    next: Box<StateFn>,
}

/// Turn the bytes of `parse_state` into events, as the document content type.
///
/// Runs a tokenizer created at `point` over all bytes, stores the normalized
/// identifier of every definition label (each one once, in order of first
/// appearance) in `parse_state.definitions`, and then calls `subtokenize` on
/// the events until it returns `true`.
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
    let mut tokenizer = Tokenizer::new(point, parse_state);

    let state = tokenizer.push(0, parse_state.bytes.len(), Box::new(before));
    tokenizer.flush(state, true);

    let mut definitions = vec![];

    for (index, event) in tokenizer.events.iter().enumerate() {
        let closes_definition_label = event.event_type == EventType::Exit
            && event.token_type == Token::DefinitionLabelString;

        if !closes_definition_label {
            continue;
        }

        // Note: we don’t care about virtual spaces, so `as_str` is fine.
        let id = normalize_identifier(
            Slice::from_position(
                tokenizer.parse_state.bytes,
                &Position::from_exit_event(&tokenizer.events, index),
            )
            .as_str(),
        );

        if !definitions.contains(&id) {
            definitions.push(id);
        }
    }

    // Take the events out of the tokenizer before touching `parse_state`
    // again.
    let mut events = tokenizer.events;

    parse_state.definitions = definitions;

    while !subtokenize(&mut events, parse_state) {}

    events
}

/// At the very start of the document.
///
/// Attempts the byte order mark (BOM) construct through `attempt_opt`, with
/// `start` as the state that follows.
///
/// ```markdown
/// > | a
///     ^
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
    let bom_then_start = tokenizer.attempt_opt(bom, start);
    bom_then_start(tokenizer)
}

/// Before the document.
///
/// Creates the initial document state (no containers, nothing to inject,
/// `flow` as the state to parse flow with) and goes to the first line.
///
/// ```markdown
/// > | * a
///     ^
///   | > b
/// ```
fn start(tokenizer: &mut Tokenizer) -> State {
    line_start(
        tokenizer,
        DocumentInfo {
            continued: 0,
            index: 0,
            inject: vec![],
            interrupt_before: false,
            paragraph_before: false,
            stack: vec![],
            next: Box::new(flow),
        },
    )
}

/// At the start of a line.
///
/// Resets the per-line state: no container has continued yet, the container
/// events of this line start at the current end of the event list, and a
/// fresh pair of lists is added to collect what is injected for this line.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
fn line_start(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
    // Containers would only be interrupting if we’ve continued.
    tokenizer.interrupt = false;

    info.continued = 0;
    info.index = tokenizer.events.len();
    info.inject.push((Vec::new(), Vec::new()));

    container_existing_before(tokenizer, info)
}

/// Before existing containers.
///
/// Takes the next container that has not been checked yet off the stack and
/// attempts its continuation construct; when every existing container has
/// been checked, moves on to looking for new containers.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
fn container_existing_before(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
    // Nothing left to continue: check new containers.
    if info.continued >= info.stack.len() {
        return container_new_before(tokenizer, info);
    }

    // Temporarily hand the container to the tokenizer; the follow-up states
    // put it back on the stack.
    let container = info.stack.remove(info.continued);
    let cont = match container.kind {
        Container::BlockQuote => block_quote_cont,
        Container::ListItem => list_item_const,
    };
    tokenizer.container = Some(container);

    tokenizer.attempt(cont, move |ok| {
        Box::new(move |t| {
            if ok {
                container_existing_after(t, info)
            } else {
                container_existing_missing(t, info)
            }
        })
    })(tokenizer)
}

/// At an existing container that did not continue.
///
/// Puts the container back on the stack at the position it was taken from,
/// without counting it as continued, and checks for new containers.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
fn container_existing_missing(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
    let unmatched = tokenizer.container.take().unwrap();
    let slot = info.continued;
    info.stack.insert(slot, unmatched);

    container_new_before(tokenizer, info)
}

/// After an existing container that continued.
///
/// Puts the container back on the stack at the position it was taken from,
/// counts it as continued, and checks the next existing container.
///
/// ```markdown
///   | * a
/// > |   b
///       ^
/// ```
fn container_existing_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
    let matched = tokenizer.container.take().unwrap();
    let slot = info.continued;
    info.stack.insert(slot, matched);
    info.continued = slot + 1;

    container_existing_before(tokenizer, info)
}

/// Before a new container.
///
/// Attempts a block quote first and a list item second; when neither
/// matches, continues with flow through `containers_after`.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
fn container_new_before(tokenizer: &mut Tokenizer, info: DocumentInfo) -> State {
    // If we have completely continued, restore the flow’s past `interrupt`
    // status.
    if info.continued == info.stack.len() {
        tokenizer.interrupt = info.interrupt_before;

        // …and if we’re in a concrete construct, new containers can’t “pierce”
        // into them.
        if tokenizer.concrete {
            return containers_after(tokenizer, info);
        }
    }

    // Check for a new container.
    // Block quote?
    // A fresh container state is handed to the tokenizer before the attempt;
    // `container_new_after` takes it back out when the attempt succeeds.
    tokenizer.container = Some(ContainerState {
        kind: Container::BlockQuote,
        blank_initial: false,
        size: 0,
    });

    tokenizer.attempt(block_quote, move |ok| {
        if ok {
            Box::new(|t| container_new_after(t, info))
        } else {
            Box::new(|tokenizer| {
                // List item?
                // Replaces the block quote state set above.
                tokenizer.container = Some(ContainerState {
                    kind: Container::ListItem,
                    blank_initial: false,
                    size: 0,
                });

                tokenizer.attempt(list_item, |ok| {
                    Box::new(move |t| {
                        if ok {
                            container_new_after(t, info)
                        } else {
                            // No new container at all: on to flow.
                            containers_after(t, info)
                        }
                    })
                })(tokenizer)
            })
        }
    })(tokenizer)
}

/// After a new container.
///
/// Registers the container that was just opened: its token is taken off the
/// tokenizer’s stack, existing containers that did not continue are closed,
/// and the container is pushed as continued. Then looks for another new
/// container.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
fn container_new_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
    let container = tokenizer.container.take().unwrap();

    let token_type = match container.kind {
        Container::BlockQuote => Token::BlockQuote,
        Container::ListItem => Token::ListItem,
    };

    // Remove from the event stack (the last matching entry).
    // We’ll properly add exits at different points manually.
    let open = tokenizer
        .stack
        .iter()
        .rposition(|token| *token == token_type);

    if let Some(stack_index) = open {
        tokenizer.stack.remove(stack_index);
    }

    debug_assert!(open.is_some(), "expected to find container token to exit");

    // If we did not continue all existing containers, and there is a new one,
    // close the flow and those containers.
    if info.stack.len() != info.continued {
        info = exit_containers(tokenizer, info, &Phase::Prefix);
    }

    // A new container never interrupts.
    tokenizer.interrupt = false;
    info.interrupt_before = false;

    // Try another new container.
    info.stack.push(container);
    info.continued += 1;
    container_new_before(tokenizer, info)
}

/// After containers, before flow.
///
/// Moves the container events of this line out of the event list (they are
/// injected again later) and parses flow up to and including the next eol.
///
/// ```markdown
/// > | * a
///       ^
/// > | > b
///       ^
/// ```
fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State {
    // Store the container events we parsed.
    let mut container_events = tokenizer.events.split_off(info.index);
    let (before, _) = info.inject.last_mut().unwrap();
    before.append(&mut container_events);

    // The line is lazy when not every existing container continued.
    tokenizer.lazy = info.stack.len() != info.continued;
    tokenizer.interrupt = info.interrupt_before;
    tokenizer.define_skip_current();

    // Take the state to resume flow with, leaving `flow` as a placeholder.
    let state = std::mem::replace(&mut info.next, Box::new(flow));

    // Parse flow, pausing after eols.
    tokenizer.go_until(
        state,
        |code| matches!(code, Some(b'\n')),
        move |state| Box::new(move |t| flow_end(t, info, state)),
    )(tokenizer)
}

/// After flow (after eol or at eof).
///
/// Closes containers that did not continue (unless the line was a lazy
/// paragraph continuation), remembers what this line of flow was, and then
/// either moves to the next line or, when flow is done, closes everything
/// that is still open and injects the container events.
///
/// ```markdown
///   | * a
/// > | > b
///     ^  ^
/// ```
fn flow_end(tokenizer: &mut Tokenizer, mut info: DocumentInfo, result: State) -> State {
    // Whether the last event, looking back past line endings, is a paragraph.
    let paragraph = if tokenizer.events.is_empty() {
        false
    } else {
        let last = tokenizer.events.len() - 1;
        let index = skip::opt_back(&tokenizer.events, last, &[Token::LineEnding]);
        tokenizer.events[index].token_type == Token::Paragraph
    };

    // A lazy line that continues a paragraph keeps all containers open.
    let lazy_paragraph = tokenizer.lazy && info.paragraph_before && paragraph;

    if lazy_paragraph {
        info.continued = info.stack.len();
    }

    if info.stack.len() != info.continued {
        info = exit_containers(tokenizer, info, &Phase::After);
    }

    info.paragraph_before = paragraph;
    info.interrupt_before = tokenizer.interrupt;

    match result {
        // Flow paused at an eol: resume it on the next line.
        State::Fn(func) => {
            info.next = func;
            line_start(tokenizer, info)
        }
        State::Nok => unreachable!("unexpected `nok` from flow"),
        // Flow is done: close every remaining container and finish up.
        State::Ok => {
            if !info.stack.is_empty() {
                info.continued = 0;
                info = exit_containers(tokenizer, info, &Phase::Eof);
            }

            resolve(tokenizer, &mut info);
            State::Ok
        }
    }
}

/// Close containers (and flow if needed).
fn exit_containers(
    tokenizer: &mut Tokenizer,
    mut info: DocumentInfo,
    phase: &Phase,
) -> DocumentInfo {
    let mut stack_close = info.stack.split_off(info.continued);

    // So, we’re at the end of a line, but we need to close the *previous* line.
    if *phase != Phase::Eof {
        tokenizer.define_skip_current();
        let mut current_events = tokenizer.events.split_off(info.index);
        let next = info.next;
        info.next = Box::new(flow); // This is weird but Rust needs a function there.
        tokenizer.flush(State::Fn(next), false);

        if *phase == Phase::Prefix {
            info.index = tokenizer.events.len();
        }

        tokenizer.events.append(&mut current_events);
    }

    let mut exits = Vec::with_capacity(stack_close.len());

    while !stack_close.is_empty() {
        let container = stack_close.pop().unwrap();
        let token_type = match container.kind {
            Container::BlockQuote => Token::BlockQuote,
            Container::ListItem => Token::ListItem,
        };

        exits.push(Event {
            event_type: EventType::Exit,
            token_type: token_type.clone(),
            // Note: positions are fixed later.
            point: tokenizer.point.clone(),
            link: None,
        });
    }

    let index = info.inject.len() - (if *phase == Phase::Eof { 1 } else { 2 });
    info.inject[index].1.append(&mut exits);
    info.interrupt_before = false;

    info
}

/// Inject the container events.
///
/// Walks the event list line by line (a line ends at the exit of a line
/// ending) together with the per-line pairs in `info.inject`: the first list
/// of a pair is added at the start of its line, the second list (container
/// exits) at the end of it, or at the first line ending of a trailing run of
/// line endings and whitespace if there is one.
fn resolve(tokenizer: &mut Tokenizer, info: &mut DocumentInfo) {
    let mut index = 0;
    let mut first_line_ending_in_run: Option<usize> = None;

    for (before, mut after) in info.inject.split_off(0) {
        if !before.is_empty() {
            first_line_ending_in_run = None;
            tokenizer.map.add(index, 0, before);
        }

        // Move to just past the end of this line.
        while index < tokenizer.events.len() {
            let event = &tokenizer.events[index];
            let line_ending = event.token_type == Token::LineEnding
                || event.token_type == Token::BlankLineEnding;

            if line_ending {
                if event.event_type != EventType::Enter {
                    index += 1;
                    break;
                }

                if first_line_ending_in_run.is_none() {
                    first_line_ending_in_run = Some(index);
                }
            } else if event.token_type != Token::SpaceOrTab {
                // Anything but whitespace (which is allowed in blank lines)
                // ends the run.
                first_line_ending_in_run = None;
            }

            index += 1;
        }

        // Where to close, and which point the exits get.
        let (close_index, point_rel) = match first_line_ending_in_run {
            Some(line_ending) => (line_ending, &tokenizer.events[line_ending].point),
            None => (index, &tokenizer.point),
        };

        for exit in &mut after {
            exit.point = point_rel.clone();
        }

        tokenizer.map.add(close_index, 0, after);
    }

    tokenizer.map.consume(&mut tokenizer.events);
}