diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-10 16:29:56 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-10 16:29:56 +0200 |
commit | 5133042973f31a3992f216e591d840bb491bfd45 (patch) | |
tree | 810a44ac1d98f65dd2eedd0d9e8387eac0753e25 /src/subtokenize.rs | |
parent | 021d5f989ae41ae39a9b937b498141d9dc70d894 (diff) | |
download | markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.gz markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.bz2 markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.zip |
Add proper support for subtokenization
- Add “content” content type
- Add paragraph
- Add skips
- Add linked tokens
Diffstat (limited to 'src/subtokenize.rs')
-rw-r--r-- | src/subtokenize.rs | 166 |
1 file changed, 116 insertions, 50 deletions
diff --git a/src/subtokenize.rs b/src/subtokenize.rs index c1a8435..adf843f 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -1,66 +1,132 @@ -use crate::content::string::string; -use crate::tokenizer::{Code, Event, EventType, TokenType}; +use crate::content::content::start as content; +use crate::content::string::start as string; +use crate::tokenizer::{ + Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer, +}; use crate::util::{slice_codes, Span}; +use std::collections::HashMap; pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> Vec<Event> { let mut events = events; let mut index = 0; - - // println!("before"); - // while index < events.len() { - // let event = &events[index]; - // println!( - // "ev1: {:?} {:?} {:?}", - // event.event_type, event.token_type, index - // ); - // index += 1; - // } - // - // index = 0; - // - // println!("change"); + // Map of first chunks its tokenizer. + let mut head_to_tokenizer: HashMap<usize, Tokenizer> = HashMap::new(); + // Map of chunks to their head and corresponding range of events. + let mut link_to_info: HashMap<usize, (usize, usize, usize)> = HashMap::new(); while index < events.len() { let event = &events[index]; - // println!( - // "ev2: {:?} {:?} {:?}", - // event.event_type, event.token_type, index - // ); + // Find each first opening chunk. + if (event.token_type == TokenType::ChunkString + || event.token_type == TokenType::ContentChunk) && + event.event_type == EventType::Enter && + // No need to enter linked events again. + event.previous == None + { + // Index into `events` pointing to a chunk. + let mut index_opt: Option<usize> = Some(index); + // Subtokenizer. + let mut tokenizer = Tokenizer::new(event.point.clone(), event.index); + // Substate. + let mut result: StateFnResult = ( + State::Fn(Box::new(if event.token_type == TokenType::ContentChunk { + content + } else { + string + })), + None, + ); + // Indices into `codes` of each end of chunk. 
+ let mut ends: Vec<usize> = vec![]; - if event.event_type == EventType::Enter && event.token_type == TokenType::ChunkString { - let exit = &events[index + 1]; + // Loop through chunks to pass them in order to the subtokenizer. + while let Some(index_ptr) = index_opt { + let enter = &events[index_ptr]; + let span = Span { + start_index: enter.index, + end_index: events[index_ptr + 1].index, + }; + ends.push(span.end_index); - assert_eq!( - exit.event_type, - EventType::Exit, - "expected `enter` of `{:?}` to be follow by an `exit` event", - event.token_type - ); - assert_eq!( - exit.token_type, event.token_type, - "expected `exit` of `{:?}` to follow its `enter` event", - event.token_type - ); + if enter.previous != None { + tokenizer.define_skip(&enter.point, span.start_index); + } - let subevents = string( - slice_codes( - codes, - &Span { - start_index: event.index, - end_index: exit.index, - }, - ), - event.point.clone(), - event.index, - ); - let len = subevents.len(); - // To do: recursion needed? - events.splice(index..(index + 2), subevents); - index += len; - } else { - index += 1; + let func: Box<StateFn> = match result.0 { + State::Fn(func) => func, + _ => unreachable!("cannot be ok/nok"), + }; + + result = tokenizer.feed(slice_codes(codes, &span), func, enter.next == None); + + if let Some(ref x) = result.1 { + if !x.is_empty() { + // To do: handle? + unreachable!("subtokenize:remainder {:?}", x); + } + } + + index_opt = enter.next; + } + + // Now, loop through all subevents (and `ends`), to figure out + // which parts belong where. + // Current index. + let mut subindex = 0; + // Index into subevents that starts the current slice. + let mut last_start = 0; + // Counter into `ends`. + let mut end_index = 0; + let mut index_opt: Option<usize> = Some(index); + + while subindex < tokenizer.events.len() { + let subevent = &tokenizer.events[subindex]; + + // Find the first event that starts after the end we’re looking + // for. 
+ // To do: is this logic correct? + if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] { + let link = index_opt.unwrap(); + link_to_info.insert(link, (index, last_start, subindex)); + + last_start = subindex; + end_index += 1; + index_opt = events[link].next; + } + + subindex += 1; + } + + let link = index_opt.unwrap(); + link_to_info.insert(link, (index, last_start, subindex)); + head_to_tokenizer.insert(index, tokenizer); } + + index += 1; + } + + // Now that we fed everything into a tokenizer, and we know which parts + // belong where, the final task is to splice the events from each + // tokenizer into the current events. + // To do: instead of splicing, it might be possible to create a new `events` + // from each slice and slices from events? + let mut index = events.len() - 1; + + while index > 0 { + let slice_opt = link_to_info.get(&index); + + if let Some(slice) = slice_opt { + let (head, start, end) = *slice; + // If there’s a slice at this index, it must also point to a head, + // and that head must have a tokenizer. + let tokenizer = head_to_tokenizer.get(&head).unwrap(); + + // To do: figure out a way that moves instead of clones? + events.splice(index..(index + 2), tokenizer.events[start..end].to_vec()); + } + + index -= 1; } events |