//! Deal with content in other content.
//!
//! To deal with content in content, *you* (a `micromark-rs` contributor) add
//! information on events.
//! Events are a flat list, but they can be connected to each other by setting
//! `previous` and `next` links.
//! These links:
//!
//! *   …must occur on [`Enter`][EventType::Enter] events only
//! *   …must occur on void events (they are followed by their corresponding
//!     [`Exit`][EventType::Exit] event)
//! *   …must be headed by a [`ChunkString`][TokenType::ChunkString] or
//!     [`ChunkText`][TokenType::ChunkText] event
//!
//! Links will then be passed through a tokenizer for the corresponding content
//! type by `subtokenize`.
//! The subevents they result in are split up into slots for each linked token
//! and replace those links.
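//!
//! For example (a hypothetical sketch of the fields on [`Event`][]), two
//! linked `ChunkText` chunks look like this before subtokenization:
//!
//! ```text
//! 0: Enter ChunkText (previous: None,    next: Some(2))
//! 1: Exit  ChunkText
//! 2: Enter ChunkText (previous: Some(0), next: None)
//! 3: Exit  ChunkText
//! ```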
//!
//! Subevents are not immediately subtokenized again because markdown prevents
//! us from doing so due to definitions, which can occur after references, and
//! thus the whole document needs to be parsed up to the level of definitions,
//! before any level that can include references can be parsed.
//!
//! <!-- To do: `ChunkFlow` when it exists. -->

/// To do: could we do without `HashMap`, so we don’t need `std`?
use std::collections::HashMap;

use crate::content::{string::start as string, text::start as text};
use crate::tokenizer::{
    Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer,
};
use crate::util::span;

/// Create a link between two [`Event`][]s.
///
/// Arbitrary (void) events can be linked together.
/// This optimizes for the common case where the token at `index` is connected
/// to the previous void token.
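///
/// A minimal sketch (hypothetical indices; `events[2]`/`events[3]` are the
/// enter/exit pair of the previous void chunk, `events[4]` the newly entered
/// one):
///
/// ```ignore
/// link(&mut events, 4);
/// // Now `events[2].next == Some(4)` and `events[4].previous == Some(2)`.
/// ```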
pub fn link(events: &mut [Event], index: usize) {
    let prev = &mut events[index - 2];
    assert_eq!(prev.event_type, EventType::Enter);
    prev.next = Some(index);

    let prev_ref = &events[index - 2];
    let prev_exit_ref = &events[index - 1];
    assert_eq!(prev_exit_ref.event_type, EventType::Exit);
    assert_eq!(prev_exit_ref.token_type, prev_ref.token_type);

    let curr = &mut events[index];
    assert_eq!(curr.event_type, EventType::Enter);
    curr.previous = Some(index - 2);
    // Note: the exit of this event may not exist, so don’t check for that.
}

/// Parse linked events.
///
/// Supposed to be called repeatedly; the second field of the returned tuple
/// is `true` when done.
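///
/// A sketch of the expected driver loop (hypothetical caller; the real loop
/// lives with the parser):
///
/// ```ignore
/// let mut state = (events, false);
/// while !state.1 {
///     state = subtokenize(state.0, &codes);
/// }
/// let (events, _) = state;
/// ```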
pub fn subtokenize(mut events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
    let mut index = 0;
    // Map of first chunks to their tokenizer.
    let mut head_to_tokenizer: HashMap<usize, Tokenizer> = HashMap::new();
    // Map of chunks to their head and corresponding range of events.
    let mut link_to_info: HashMap<usize, (usize, usize, usize)> = HashMap::new();
    let mut done = true;

    if events.is_empty() {
        return (events, true);
    }
    while index < events.len() {
        let event = &events[index];

        // Find each first opening chunk.
        if (event.token_type == TokenType::ChunkString
            || event.token_type == TokenType::ChunkText)
            && event.event_type == EventType::Enter
            // No need to enter linked events again.
            && event.previous == None
        {
            done = false;
            // Index into `events` pointing to a chunk.
            let mut index_opt: Option<usize> = Some(index);
            // Subtokenizer.
            let mut tokenizer = Tokenizer::new(event.point.clone(), event.index);
            // Substate.
            let mut result: StateFnResult = (
                State::Fn(Box::new(if event.token_type == TokenType::ChunkString {
                    string
                } else {
                    text
                })),
                None,
            );
            // Indices into `codes` of each end of chunk.
            let mut ends: Vec<usize> = vec![];

            // Loop through chunks to pass them in order to the subtokenizer.
            while let Some(index_ptr) = index_opt {
                let enter = &events[index_ptr];
                assert_eq!(enter.event_type, EventType::Enter);
                let span = span::Span {
                    start_index: enter.index,
                    end_index: events[index_ptr + 1].index,
                };
                ends.push(span.end_index);

                if enter.previous != None {
                    tokenizer.define_skip(&enter.point, span.start_index);
                }

                let func: Box<StateFn> = match result.0 {
                    State::Fn(func) => func,
                    _ => unreachable!("cannot be ok/nok"),
                };

                result = tokenizer.feed(span::codes(codes, &span), func, enter.next == None);
                assert!(result.1.is_none(), "expected no remainder");
                index_opt = enter.next;
            }

            // Now, loop through all subevents (and `ends`), to figure out
            // which parts belong where.
            // Current index.
            let mut subindex = 0;
            // Index into subevents that starts the current slice.
            let mut last_start = 0;
            // Counter into `ends`: the linked token we are at.
            let mut end_index = 0;
            let mut index_opt: Option<usize> = Some(index);

            while subindex < tokenizer.events.len() {
                let subevent = &mut tokenizer.events[subindex];

                // Find the first event that starts after the end we’re looking
                // for.
                // To do: is this logic correct?
                if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] {
                    let link = index_opt.unwrap();
                    link_to_info.insert(link, (index, last_start, subindex));
                    last_start = subindex;
                    end_index += 1;
                    index_opt = events[link].next;
                }

                // If there is a `next` link in the subevents, we have to change
                // its index to account for the shifted events.
                // If it points to a next event, we also change the next event’s
                // reference back to *this* event.
                if let Some(next) = subevent.next {
                    // The `index` in `events` where the current link is,
                    // minus 2 events (the enter and exit) for each removed
                    // link.
                    let shift = index_opt.unwrap() - (end_index * 2);
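                    // For example (hypothetical numbers): if the current link
                    // sits at `events[10]` and `end_index == 2` (two earlier
                    // links, so 4 events will be replaced), subevent links
                    // shift by `10 - 2 * 2 == 6`.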
                    subevent.next = Some(next + shift);
                    let next_ev = &mut tokenizer.events[next];
                    let previous = next_ev.previous.unwrap();
                    next_ev.previous = Some(previous + shift);
                }

                subindex += 1;
            }

            link_to_info.insert(index_opt.unwrap(), (index, last_start, subindex));
            head_to_tokenizer.insert(index, tokenizer);
        }

        index += 1;
    }
    // Now that we fed everything into a tokenizer, and we know which parts
    // belong where, the final task is to splice the events from each
    // tokenizer into the current events.
    // To do: instead of splicing, it might be possible to create a new `events`
    // from each slice and slices from events?
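    // For example (hypothetical): if `link_to_info` maps `events[8]` to
    // `(head, 3, 7)`, the chunk’s enter/exit pair at `events[8..10]` is
    // replaced by `tokenizer.events[3..7]` below.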
    let mut index = events.len() - 1;

    while index > 0 {
        let slice_opt = link_to_info.get(&index);

        if let Some(slice) = slice_opt {
            let (head, start, end) = *slice;
            // If there’s a slice at this index, it must also point to a head,
            // and that head must have a tokenizer.
            let tokenizer = head_to_tokenizer.get(&head).unwrap();
            // To do: figure out a way that moves instead of clones?
            events.splice(index..(index + 2), tokenizer.events[start..end].to_vec());
        }

        index -= 1;
    }

    (events, done)
}