use crate::content::{string::start as string, text::start as text}; use crate::tokenizer::{ Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer, }; use crate::util::span; use std::collections::HashMap; /// To do. pub fn subtokenize(events: Vec, codes: &[Code]) -> (Vec, bool) { let mut events = events; let mut index = 0; // Map of first chunks its tokenizer. let mut head_to_tokenizer: HashMap = HashMap::new(); // Map of chunks to their head and corresponding range of events. let mut link_to_info: HashMap = HashMap::new(); let mut done = true; if events.is_empty() { return (events, true); } while index < events.len() { let event = &events[index]; // Find each first opening chunk. if (event.token_type == TokenType::ChunkString || event.token_type == TokenType::ChunkText) && event.event_type == EventType::Enter && // No need to enter linked events again. event.previous == None { done = false; // Index into `events` pointing to a chunk. let mut index_opt: Option = Some(index); // Subtokenizer. let mut tokenizer = Tokenizer::new(event.point.clone(), event.index); // Substate. let mut result: StateFnResult = ( State::Fn(Box::new(if event.token_type == TokenType::ChunkString { string } else { text })), None, ); // Indices into `codes` of each end of chunk. let mut ends: Vec = vec![]; // Loop through chunks to pass them in order to the subtokenizer. while let Some(index_ptr) = index_opt { let enter = &events[index_ptr]; assert_eq!(enter.event_type, EventType::Enter); let span = span::Span { start_index: enter.index, end_index: events[index_ptr + 1].index, }; ends.push(span.end_index); if enter.previous != None { tokenizer.define_skip(&enter.point, span.start_index); } let func: Box = match result.0 { State::Fn(func) => func, _ => unreachable!("cannot be ok/nok"), }; result = tokenizer.feed(span::codes(codes, &span), func, enter.next == None); assert!(result.1.is_none(), "expected no remainder"); index_opt = enter.next; } // Now, loop through all subevents (and `ends`), to figure out // which parts belong where. // Current index. let mut subindex = 0; // Index into subevents that starts the current slice. let mut last_start = 0; // Counter into `ends`: the linked token we are at. let mut end_index = 0; let mut index_opt: Option = Some(index); while subindex < tokenizer.events.len() { let subevent = &mut tokenizer.events[subindex]; // Find the first event that starts after the end we’re looking // for. // To do: is this logic correct? if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] { let link = index_opt.unwrap(); link_to_info.insert(link, (index, last_start, subindex)); last_start = subindex; end_index += 1; index_opt = events[link].next; } // If there is a `next` link in the subevents, we have to change // its index to account for the shifted events. // If it points to a next event, we also change the next event’s // reference back to *this* event. if let Some(next) = subevent.next { // The `index` in `events` where the current link is, // minus 2 events (the enter and exit) for each removed // link. let shift = index_opt.unwrap() - (end_index * 2); subevent.next = Some(next + shift); let next_ev = &mut tokenizer.events[next]; let previous = next_ev.previous.unwrap(); next_ev.previous = Some(previous + shift); } subindex += 1; } link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex)); head_to_tokenizer.insert(index, tokenizer); } index += 1; } // Now that we fed everything into a tokenizer, and we know which parts // belong where, the final task is to splice the events from each // tokenizer into the current events. // To do: instead of splicing, it might be possible to create a new `events` // from each slice and slices from events? let mut index = events.len() - 1; while index > 0 { let slice_opt = link_to_info.get(&index); if let Some(slice) = slice_opt { let (head, start, end) = *slice; // If there’s a slice at this index, it must also point to a head, // and that head must have a tokenizer. let tokenizer = head_to_tokenizer.get(&head).unwrap(); // To do: figure out a way that moves instead of clones? events.splice(index..(index + 2), tokenizer.events[start..end].to_vec()); } index -= 1; } (events, done) }