about summary refs log tree commit diff stats
path: root/src/subtokenize.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/subtokenize.rs')
-rw-r--r-- src/subtokenize.rs 200
1 files changed, 99 insertions, 101 deletions
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index 58db3c6..92ada04 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -9,8 +9,7 @@
//! * …must occur on [`Enter`][EventType::Enter] events only
//! * …must occur on void events (they are followed by their corresponding
//! [`Exit`][EventType::Exit] event)
-//! * …must be headed by a [`ChunkString`][TokenType::ChunkString] or
-//! [`ChunkText`][TokenType::ChunkText] event
+//! * …must have a `content_type` field to define the kind of subcontent
//!
//! Links will then be passed through a tokenizer for the corresponding content
//! type by `subtokenize`.
@@ -21,15 +20,13 @@
//! us from doing so due to definitions, which can occur after references, and
//! thus the whole document needs to be parsed up to the level of definitions,
//! before any level that can include references can be parsed.
-//!
-//! <!-- To do: `ChunkFlow` when it exists. -->
/// To do: could we do without `HashMap`, so we don’t need `std`?
use std::collections::HashMap;
use crate::content::{string::start as string, text::start as text};
use crate::parser::ParseState;
-use crate::tokenizer::{Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{ContentType, Event, EventType, State, StateFn, StateFnResult, Tokenizer};
use crate::util::span;
/// Create a link between two [`Event`][]s.
@@ -44,19 +41,19 @@ pub fn link(events: &mut [Event], index: usize) {
/// To do
pub fn link_to(events: &mut [Event], pevious: usize, next: usize) {
let prev = &mut events[pevious];
- // To do: force chunks?
- // assert!(
- // prev.token_type == TokenType::ChunkString || prev.token_type == TokenType::ChunkText,
- // "{:?}",
- // prev.token_type.to_owned()
- // );
+ assert!(
+ prev.content_type.is_some(),
+ "expected `content_type` on previous"
+ );
assert_eq!(prev.event_type, EventType::Enter);
prev.next = Some(next);
let prev_ref = &events[pevious];
let prev_exit_ref = &events[pevious + 1];
+ let curr_ref = &events[next];
assert_eq!(prev_exit_ref.event_type, EventType::Exit);
assert_eq!(prev_exit_ref.token_type, prev_ref.token_type);
+ assert_eq!(curr_ref.content_type, prev_ref.content_type);
let curr = &mut events[next];
assert_eq!(curr.event_type, EventType::Enter);
@@ -83,103 +80,104 @@ pub fn subtokenize(mut events: Vec<Event>, parse_state: &ParseState) -> (Vec<Eve
let event = &events[index];
// Find each first opening chunk.
- if (event.token_type == TokenType::ChunkString
- || event.token_type == TokenType::ChunkText) &&
- event.event_type == EventType::Enter &&
- // No need to enter linked events again.
- event.previous == None
- {
- done = false;
- // Index into `events` pointing to a chunk.
- let mut index_opt: Option<usize> = Some(index);
- // Subtokenizer.
- let mut tokenizer = Tokenizer::new(event.point.clone(), event.index, parse_state);
- // Substate.
- let mut result: StateFnResult = (
- State::Fn(Box::new(if event.token_type == TokenType::ChunkString {
- string
- } else {
- text
- })),
- None,
- );
- // Indices into `codes` of each end of chunk.
- let mut ends: Vec<usize> = vec![];
-
- // Loop through chunks to pass them in order to the subtokenizer.
- while let Some(index_ptr) = index_opt {
- let enter = &events[index_ptr];
- assert_eq!(enter.event_type, EventType::Enter);
- let span = span::Span {
- start_index: enter.index,
- end_index: events[index_ptr + 1].index,
- };
- ends.push(span.end_index);
-
- if enter.previous != None {
- tokenizer.define_skip(&enter.point, span.start_index);
- }
-
- let func: Box<StateFn> = match result.0 {
- State::Fn(func) => func,
- _ => unreachable!("cannot be ok/nok"),
- };
+ if let Some(ref content_type) = event.content_type {
+ assert_eq!(event.event_type, EventType::Enter);
- result = tokenizer.push(
- span::codes(&parse_state.codes, &span),
- func,
- enter.next == None,
+ // No need to enter linked events again.
+ if event.previous == None {
+ done = false;
+ // Index into `events` pointing to a chunk.
+ let mut index_opt: Option<usize> = Some(index);
+ // Subtokenizer.
+ let mut tokenizer = Tokenizer::new(event.point.clone(), event.index, parse_state);
+ // Substate.
+ let mut result: StateFnResult = (
+ State::Fn(Box::new(if *content_type == ContentType::String {
+ string
+ } else {
+ text
+ })),
+ None,
);
- assert!(result.1.is_none(), "expected no remainder");
- index_opt = enter.next;
- }
-
- // Now, loop through all subevents (and `ends`), to figure out
- // which parts belong where.
- // Current index.
- let mut subindex = 0;
- // Index into subevents that starts the current slice.
- let mut last_start = 0;
- // Counter into `ends`: the linked token we are at.
- let mut end_index = 0;
- let mut index_opt: Option<usize> = Some(index);
-
- while subindex < tokenizer.events.len() {
- let subevent = &mut tokenizer.events[subindex];
-
- // Find the first event that starts after the end we’re looking
- // for.
- // To do: is this logic correct?
- if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] {
- let link = index_opt.unwrap();
- link_to_info.insert(link, (index, last_start, subindex));
-
- last_start = subindex;
- end_index += 1;
- index_opt = events[link].next;
+ // Indices into `codes` of each end of chunk.
+ let mut ends: Vec<usize> = vec![];
+
+ // Loop through chunks to pass them in order to the subtokenizer.
+ while let Some(index_ptr) = index_opt {
+ let enter = &events[index_ptr];
+ assert_eq!(enter.event_type, EventType::Enter);
+ let span = span::Span {
+ start_index: enter.index,
+ end_index: events[index_ptr + 1].index,
+ };
+ ends.push(span.end_index);
+
+ if enter.previous != None {
+ tokenizer.define_skip(&enter.point, span.start_index);
+ }
+
+ let func: Box<StateFn> = match result.0 {
+ State::Fn(func) => func,
+ _ => unreachable!("cannot be ok/nok"),
+ };
+
+ result = tokenizer.push(
+ span::codes(&parse_state.codes, &span),
+ func,
+ enter.next == None,
+ );
+ assert!(result.1.is_none(), "expected no remainder");
+ index_opt = enter.next;
}
- // If there is a `next` link in the subevents, we have to change
- // its index to account for the shifted events.
- // If it points to a next event, we also change the next event’s
- // reference back to *this* event.
- if let Some(next) = subevent.next {
- // The `index` in `events` where the current link is,
- // minus 2 events (the enter and exit) for each removed
- // link.
- let shift = index_opt.unwrap() - (end_index * 2);
-
- subevent.next = Some(next + shift);
- let next_ev = &mut tokenizer.events[next];
- let previous = next_ev.previous.unwrap();
- next_ev.previous = Some(previous + shift);
+ // Now, loop through all subevents (and `ends`), to figure out
+ // which parts belong where.
+ // Current index.
+ let mut subindex = 0;
+ // Index into subevents that starts the current slice.
+ let mut last_start = 0;
+ // Counter into `ends`: the linked token we are at.
+ let mut end_index = 0;
+ let mut index_opt: Option<usize> = Some(index);
+
+ while subindex < tokenizer.events.len() {
+ let subevent = &mut tokenizer.events[subindex];
+
+ // Find the first event that starts after the end we’re looking
+ // for.
+ // To do: is this logic correct?
+ if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index]
+ {
+ let link = index_opt.unwrap();
+ link_to_info.insert(link, (index, last_start, subindex));
+
+ last_start = subindex;
+ end_index += 1;
+ index_opt = events[link].next;
+ }
+
+ // If there is a `next` link in the subevents, we have to change
+ // its index to account for the shifted events.
+ // If it points to a next event, we also change the next event’s
+ // reference back to *this* event.
+ if let Some(next) = subevent.next {
+ // The `index` in `events` where the current link is,
+ // minus 2 events (the enter and exit) for each removed
+ // link.
+ let shift = index_opt.unwrap() - (end_index * 2);
+
+ subevent.next = Some(next + shift);
+ let next_ev = &mut tokenizer.events[next];
+ let previous = next_ev.previous.unwrap();
+ next_ev.previous = Some(previous + shift);
+ }
+
+ subindex += 1;
}
- subindex += 1;
+ link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex));
+ head_to_tokenizer.insert(index, tokenizer);
}
-
- link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex));
- head_to_tokenizer.insert(index, tokenizer);
}
index += 1;