From dfd11b1bc155ae1fba9975a90c2dc83dc07697b4 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Tue, 28 Jun 2022 14:18:17 +0200 Subject: Fix jumps in `edit_map` * Use resolve more often (e.g., heading (atx, setext)) * Fix to link whole phrasing (e.g., one big chunk of text in heading (atx, setext), titles, labels) * Replace `ChunkText`, `ChunkString`, with `event.content_type: Option<ContentType>` * Refactor to externalize `edit_map` from `label` --- src/subtokenize.rs | 200 ++++++++++++++++++++++++++--------------------------- 1 file changed, 99 insertions(+), 101 deletions(-) (limited to 'src/subtokenize.rs') diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 58db3c6..92ada04 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -9,8 +9,7 @@ //! * …must occur on [`Enter`][EventType::Enter] events only //! * …must occur on void events (they are followed by their corresponding //! [`Exit`][EventType::Exit] event) -//! * …must be headed by a [`ChunkString`][TokenType::ChunkString] or -//! [`ChunkText`][TokenType::ChunkText] event +//! * …must have `content_type` field to define the kind of subcontent //! //! Links will then be passed through a tokenizer for the corresponding content //! type by `subtokenize`. @@ -21,15 +20,13 @@ //! us from doing so due to definitions, which can occur after references, and //! thus the whole document needs to be parsed up to the level of definitions, //! before any level that can include references can be parsed. -//! -//! /// To do: could we do without `HashMap`, so we don’t need `std`? use std::collections::HashMap; use crate::content::{string::start as string, text::start as text}; use crate::parser::ParseState; -use crate::tokenizer::{Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{ContentType, Event, EventType, State, StateFn, StateFnResult, Tokenizer}; use crate::util::span; /// Create a link between two [`Event`][]s. 
@@ -44,19 +41,19 @@ pub fn link(events: &mut [Event], index: usize) { /// To do pub fn link_to(events: &mut [Event], pevious: usize, next: usize) { let prev = &mut events[pevious]; - // To do: force chunks? - // assert!( - // prev.token_type == TokenType::ChunkString || prev.token_type == TokenType::ChunkText, - // "{:?}", - // prev.token_type.to_owned() - // ); + assert!( + prev.content_type.is_some(), + "expected `content_type` on previous" + ); assert_eq!(prev.event_type, EventType::Enter); prev.next = Some(next); let prev_ref = &events[pevious]; let prev_exit_ref = &events[pevious + 1]; + let curr_ref = &events[next]; assert_eq!(prev_exit_ref.event_type, EventType::Exit); assert_eq!(prev_exit_ref.token_type, prev_ref.token_type); + assert_eq!(curr_ref.content_type, prev_ref.content_type); let curr = &mut events[next]; assert_eq!(curr.event_type, EventType::Enter); @@ -83,103 +80,104 @@ pub fn subtokenize(mut events: Vec, parse_state: &ParseState) -> (Vec = Some(index); - // Subtokenizer. - let mut tokenizer = Tokenizer::new(event.point.clone(), event.index, parse_state); - // Substate. - let mut result: StateFnResult = ( - State::Fn(Box::new(if event.token_type == TokenType::ChunkString { - string - } else { - text - })), - None, - ); - // Indices into `codes` of each end of chunk. - let mut ends: Vec = vec![]; - - // Loop through chunks to pass them in order to the subtokenizer. 
- while let Some(index_ptr) = index_opt { - let enter = &events[index_ptr]; - assert_eq!(enter.event_type, EventType::Enter); - let span = span::Span { - start_index: enter.index, - end_index: events[index_ptr + 1].index, - }; - ends.push(span.end_index); - - if enter.previous != None { - tokenizer.define_skip(&enter.point, span.start_index); - } - - let func: Box = match result.0 { - State::Fn(func) => func, - _ => unreachable!("cannot be ok/nok"), - }; + if let Some(ref content_type) = event.content_type { + assert_eq!(event.event_type, EventType::Enter); - result = tokenizer.push( - span::codes(&parse_state.codes, &span), - func, - enter.next == None, + // No need to enter linked events again. + if event.previous == None { + done = false; + // Index into `events` pointing to a chunk. + let mut index_opt: Option = Some(index); + // Subtokenizer. + let mut tokenizer = Tokenizer::new(event.point.clone(), event.index, parse_state); + // Substate. + let mut result: StateFnResult = ( + State::Fn(Box::new(if *content_type == ContentType::String { + string + } else { + text + })), + None, ); - assert!(result.1.is_none(), "expected no remainder"); - index_opt = enter.next; - } - - // Now, loop through all subevents (and `ends`), to figure out - // which parts belong where. - // Current index. - let mut subindex = 0; - // Index into subevents that starts the current slice. - let mut last_start = 0; - // Counter into `ends`: the linked token we are at. - let mut end_index = 0; - let mut index_opt: Option = Some(index); - - while subindex < tokenizer.events.len() { - let subevent = &mut tokenizer.events[subindex]; - - // Find the first event that starts after the end we’re looking - // for. - // To do: is this logic correct? 
- if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] { - let link = index_opt.unwrap(); - link_to_info.insert(link, (index, last_start, subindex)); - - last_start = subindex; - end_index += 1; - index_opt = events[link].next; + // Indices into `codes` of each end of chunk. + let mut ends: Vec = vec![]; + + // Loop through chunks to pass them in order to the subtokenizer. + while let Some(index_ptr) = index_opt { + let enter = &events[index_ptr]; + assert_eq!(enter.event_type, EventType::Enter); + let span = span::Span { + start_index: enter.index, + end_index: events[index_ptr + 1].index, + }; + ends.push(span.end_index); + + if enter.previous != None { + tokenizer.define_skip(&enter.point, span.start_index); + } + + let func: Box = match result.0 { + State::Fn(func) => func, + _ => unreachable!("cannot be ok/nok"), + }; + + result = tokenizer.push( + span::codes(&parse_state.codes, &span), + func, + enter.next == None, + ); + assert!(result.1.is_none(), "expected no remainder"); + index_opt = enter.next; } - // If there is a `next` link in the subevents, we have to change - // its index to account for the shifted events. - // If it points to a next event, we also change the next event’s - // reference back to *this* event. - if let Some(next) = subevent.next { - // The `index` in `events` where the current link is, - // minus 2 events (the enter and exit) for each removed - // link. - let shift = index_opt.unwrap() - (end_index * 2); - - subevent.next = Some(next + shift); - let next_ev = &mut tokenizer.events[next]; - let previous = next_ev.previous.unwrap(); - next_ev.previous = Some(previous + shift); + // Now, loop through all subevents (and `ends`), to figure out + // which parts belong where. + // Current index. + let mut subindex = 0; + // Index into subevents that starts the current slice. + let mut last_start = 0; + // Counter into `ends`: the linked token we are at. 
+ let mut end_index = 0; + let mut index_opt: Option = Some(index); + + while subindex < tokenizer.events.len() { + let subevent = &mut tokenizer.events[subindex]; + + // Find the first event that starts after the end we’re looking + // for. + // To do: is this logic correct? + if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] + { + let link = index_opt.unwrap(); + link_to_info.insert(link, (index, last_start, subindex)); + + last_start = subindex; + end_index += 1; + index_opt = events[link].next; + } + + // If there is a `next` link in the subevents, we have to change + // its index to account for the shifted events. + // If it points to a next event, we also change the next event’s + // reference back to *this* event. + if let Some(next) = subevent.next { + // The `index` in `events` where the current link is, + // minus 2 events (the enter and exit) for each removed + // link. + let shift = index_opt.unwrap() - (end_index * 2); + + subevent.next = Some(next + shift); + let next_ev = &mut tokenizer.events[next]; + let previous = next_ev.previous.unwrap(); + next_ev.previous = Some(previous + shift); + } + + subindex += 1; } - subindex += 1; + link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex)); + head_to_tokenizer.insert(index, tokenizer); } - - link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex)); - head_to_tokenizer.insert(index, tokenizer); } index += 1; -- cgit