aboutsummaryrefslogblamecommitdiffstats
path: root/src/subtokenize.rs
blob: 4a29a01fe6b43112d8f6eafa0c87eed270aaab5b (plain) (tree)
1
2
3
4
5
6
7
8
9
                                                                   


                                                                                
                      
                              
 
          
                                                                              

                            



                                                                                 
                        



                                   

                                                      
                                                               



                                                    
                         





                                                                                 
                                                                                  
                          

                        




                                                         
 


                                                                             
                                                               
                                       



                                                           
 


                                                                          
 




                                                          
                                                                                             
















                                                                          
                                                               



                                                           
                                                               












                                                                                                 















                                                                                  


                              
                                                                                   
                                                       
         
























                                                                                     

     
                  
 
use crate::content::{string::start as string, text::start as text};
use crate::tokenizer::{
    Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer,
};
use crate::util::span;
use std::collections::HashMap;

/// To do.
pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
    let mut events = events;
    let mut index = 0;
    // Map of first chunks its tokenizer.
    let mut head_to_tokenizer: HashMap<usize, Tokenizer> = HashMap::new();
    // Map of chunks to their head and corresponding range of events.
    let mut link_to_info: HashMap<usize, (usize, usize, usize)> = HashMap::new();
    let mut done = true;

    while index < events.len() {
        let event = &events[index];

        // Find each first opening chunk.
        if (event.token_type == TokenType::ChunkString
                || event.token_type == TokenType::ChunkText) &&
            event.event_type == EventType::Enter &&
            // No need to enter linked events again.
            event.previous == None
        {
            done = false;
            // Index into `events` pointing to a chunk.
            let mut index_opt: Option<usize> = Some(index);
            // Subtokenizer.
            let mut tokenizer = Tokenizer::new(event.point.clone(), event.index);
            // Substate.
            let mut result: StateFnResult = (
                State::Fn(Box::new(if event.token_type == TokenType::ChunkString {
                    string
                } else {
                    text
                })),
                None,
            );
            // Indices into `codes` of each end of chunk.
            let mut ends: Vec<usize> = vec![];

            // Loop through chunks to pass them in order to the subtokenizer.
            while let Some(index_ptr) = index_opt {
                let enter = &events[index_ptr];
                assert_eq!(enter.event_type, EventType::Enter);
                let span = span::Span {
                    start_index: enter.index,
                    end_index: events[index_ptr + 1].index,
                };
                ends.push(span.end_index);

                if enter.previous != None {
                    tokenizer.define_skip(&enter.point, span.start_index);
                }

                let func: Box<StateFn> = match result.0 {
                    State::Fn(func) => func,
                    _ => unreachable!("cannot be ok/nok"),
                };

                result = tokenizer.feed(span::codes(codes, &span), func, enter.next == None);

                if let Some(ref x) = result.1 {
                    if !x.is_empty() {
                        // To do: handle?
                        unreachable!("subtokenize:remainder {:?}", x);
                    }
                }

                index_opt = enter.next;
            }

            // Now, loop through all subevents (and `ends`), to figure out
            // which parts belong where.
            // Current index.
            let mut subindex = 0;
            // Index into subevents that starts the current slice.
            let mut last_start = 0;
            // Counter into `ends`: the linked token we are at.
            let mut end_index = 0;
            let mut index_opt: Option<usize> = Some(index);

            while subindex < tokenizer.events.len() {
                let subevent = &mut tokenizer.events[subindex];

                // Find the first event that starts after the end we’re looking
                // for.
                // To do: is this logic correct?
                if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] {
                    let link = index_opt.unwrap();
                    link_to_info.insert(link, (index, last_start, subindex));

                    last_start = subindex;
                    end_index += 1;
                    index_opt = events[link].next;
                }

                // If there is a `next` link in the subevents, we have to change
                // its index to account for the shifted events.
                // If it points to a next event, we also change the next event’s
                // reference back to *this* event.
                if let Some(next) = subevent.next {
                    // The `index` in `events` where the current link is,
                    // minus 2 events (the enter and exit) for each removed
                    // link.
                    let shift = index_opt.unwrap() - (end_index * 2);

                    subevent.next = Some(next + shift);
                    let next_ev = &mut tokenizer.events[next];
                    let previous = next_ev.previous.unwrap();
                    next_ev.previous = Some(previous + shift);
                }

                subindex += 1;
            }

            link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex));
            head_to_tokenizer.insert(index, tokenizer);
        }

        index += 1;
    }

    // Now that we fed everything into a tokenizer, and we know which parts
    // belong where, the final task is to splice the events from each
    // tokenizer into the current events.
    // To do: instead of splicing, it might be possible to create a new `events`
    // from each slice and slices from events?
    let mut index = events.len() - 1;

    while index > 0 {
        let slice_opt = link_to_info.get(&index);

        if let Some(slice) = slice_opt {
            let (head, start, end) = *slice;
            // If there’s a slice at this index, it must also point to a head,
            // and that head must have a tokenizer.
            let tokenizer = head_to_tokenizer.get(&head).unwrap();

            // To do: figure out a way that moves instead of clones?
            events.splice(index..(index + 2), tokenizer.events[start..end].to_vec());
        }

        index -= 1;
    }

    (events, done)
}