path: root/src/tokenizer.rs
author	Titus Wormer <tituswormer@gmail.com>	2022-08-11 13:31:20 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-08-11 13:31:20 +0200
commit	2d35cbfceace81a217cd0fbdae7a8777c7a6465e (patch)
tree	e5e69d44c5c00d1dc70f4e3a227f67fd5c771389 /src/tokenizer.rs
parent	053a2603e4bd5ec9caf40617b52136e5ef3fcf0a (diff)
Refactor internal docs, code style of tokenizer
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r--	src/tokenizer.rs	518
1 file changed, 256 insertions(+), 262 deletions(-)
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3cdd2d3..04a8cc3 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -29,11 +29,16 @@ pub enum ContentType {
Text,
}
-/// To do.
+/// How to handle a byte.
#[derive(Debug, PartialEq)]
pub enum ByteAction {
+ /// This is a normal byte.
+ ///
+ /// Includes replaced bytes.
Normal(u8),
+ /// This is a new byte.
Insert(u8),
+ /// This byte must be ignored.
Ignore,
}
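
A quick standalone sketch of how these variants are meant to be consumed: normal and inserted bytes are fed to state functions, while ignored bytes are skipped. (Illustration only; the real feeding logic lives in `push_impl` further down in this diff.)

```rust
// Standalone sketch; mirrors the enum above.
#[derive(Debug, PartialEq)]
enum ByteAction {
    Normal(u8),
    Insert(u8),
    Ignore,
}

// What a state function would be fed for a given action: normal and
// inserted bytes pass through, ignored bytes yield nothing to feed.
fn byte_to_feed(action: ByteAction) -> Option<u8> {
    match action {
        ByteAction::Normal(byte) | ByteAction::Insert(byte) => Some(byte),
        ByteAction::Ignore => None,
    }
}
```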
@@ -84,22 +89,6 @@ pub struct Event {
pub link: Option<Link>,
}
-#[derive(Debug, PartialEq)]
-enum AttemptKind {
- Attempt,
- Check,
-}
-
-/// To do.
-#[derive(Debug)]
-struct Attempt {
- /// To do.
- ok: State,
- nok: State,
- kind: AttemptKind,
- state: Option<InternalState>,
-}
-
/// Callback that can be registered and is called when the tokenizer is done.
///
/// Resolvers are supposed to change the list of events, because parsing is
@@ -107,6 +96,7 @@ struct Attempt {
/// the compiler and other users.
pub type Resolver = dyn FnOnce(&mut Tokenizer);
+/// Names of functions to move to.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum StateName {
AttentionStart,
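
Aside: the `Resolver` type above is just `dyn FnOnce(&mut Tokenizer)`. A hedged sketch of registering one (`register_resolver` is assumed from elsewhere in this file, and the closure body is illustrative):

```rust
// Hypothetical registration: resolvers run once, after tokenizing,
// and may only rearrange `tokenizer.events`.
tokenizer.register_resolver(
    "example".to_string(),
    Box::new(|tokenizer| {
        log::debug!("resolving {:?} events", tokenizer.events.len());
    }),
);
```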
@@ -447,62 +437,73 @@ pub struct ContainerState {
pub size: usize,
}
+/// Different kinds of attempts.
+#[derive(Debug, PartialEq)]
+enum AttemptKind {
+ /// Discard what was tokenized when unsuccessful.
+ Attempt,
+ /// Always discard what was tokenized.
+ Check,
+}
+
+/// How to handle [`State::Ok`][] or [`State::Nok`][].
+#[derive(Debug)]
+struct Attempt {
+ /// Where to go when successful.
+ ok: State,
+ /// Where to go when unsuccessful.
+ nok: State,
+ /// Kind of attempt.
+ kind: AttemptKind,
+ /// If needed, the progress to revert to.
+ ///
+ /// There is no need to revert an [`AttemptKind::Attempt`] whose `nok`
+ /// is [`State::Nok`][]: that means it is used in *another* attempt,
+ /// which will receive the `Nok` and has to handle it.
+ progress: Option<Progress>,
+}
+
/// The internal state of a tokenizer, not to be confused with states from
/// the state machine. This instead is all the information about where we
/// currently are and what’s going on.
#[derive(Debug, Clone)]
-struct InternalState {
- /// Length of `events`. We only add to events, so reverting will just pop stuff off.
+struct Progress {
+ /// Length of `events`.
+ ///
+ /// It’s not allowed to remove events, so reverting will just pop stuff off.
events_len: usize,
- /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt.
+ /// Length of the stack.
+ ///
+ /// It’s not allowed to decrease the stack in an attempt.
stack_len: usize,
/// Previous code.
previous: Option<u8>,
/// Current code.
current: Option<u8>,
- /// Current relative and absolute position in the file.
+ /// Current place in the file.
point: Point,
}
-/// To do
+/// A lot of shared fields used to tokenize things.
#[allow(clippy::struct_excessive_bools)]
pub struct TokenizeState<'a> {
- /// To do.
- pub connect: bool,
- /// To do.
+ // A couple of complex fields used to tokenize the document.
+ /// Tokenizer, used to tokenize flow in the document.
+ pub document_child: Option<Box<Tokenizer<'a>>>,
+ /// State, used to tokenize containers.
+ pub document_child_state: Option<State>,
+ /// Stack of currently active containers.
pub document_container_stack: Vec<ContainerState>,
- /// To do.
- pub document_exits: Vec<Option<Vec<Event>>>,
- /// To do.
+ /// How many active containers continued.
pub document_continued: usize,
- /// To do.
- pub document_paragraph_before: bool,
- /// To do.
+ /// Index of last `data`.
pub document_data_index: Option<usize>,
- /// To do.
- pub document_child_state: Option<State>,
- /// To do.
- pub child_tokenizer: Option<Box<Tokenizer<'a>>>,
- /// To do.
- pub marker: u8,
- /// To do.
- pub marker_other: u8,
- /// To do.
- pub prefix: usize,
- /// To do.
- pub return_state: Option<StateName>,
- /// To do.
- pub seen: bool,
- /// To do.
- pub size: usize,
- /// To do.
- pub size_other: usize,
- /// To do.
- pub start: usize,
- /// To do.
- pub end: usize,
- /// To do.
- pub stop: &'static [u8],
+ /// Container exits by line number.
+ pub document_exits: Vec<Option<Vec<Event>>>,
+ /// Whether the previous flow was a paragraph.
+ pub document_paragraph_before: bool,
+
+ // A couple of very frequent settings for parsing whitespace.
pub space_or_tab_eol_content_type: Option<ContentType>,
pub space_or_tab_eol_connect: bool,
pub space_or_tab_eol_ok: bool,
@@ -512,11 +513,50 @@ pub struct TokenizeState<'a> {
pub space_or_tab_max: usize,
pub space_or_tab_size: usize,
pub space_or_tab_token: Token,
- /// To do.
+
+ // A couple of media-related fields.
+ /// Stack of label starts that could form images and links.
+ ///
+ /// Used when tokenizing [text content][crate::content::text].
+ pub label_start_stack: Vec<LabelStart>,
+ /// Stack of label starts that cannot form images and links.
+ ///
+ /// Used when tokenizing [text content][crate::content::text].
+ pub label_start_list_loose: Vec<LabelStart>,
+ /// Stack of images and links.
+ ///
+ /// Used when tokenizing [text content][crate::content::text].
+ pub media_list: Vec<Media>,
+
+ /// Whether to connect tokens.
+ pub connect: bool,
+ /// Marker.
+ pub marker: u8,
+ /// Secondary marker.
+ pub marker_b: u8,
+ /// Several markers.
+ pub markers: &'static [u8],
+ /// Whether something was seen.
+ pub seen: bool,
+ /// Size.
+ pub size: usize,
+ /// Secondary size.
+ pub size_b: usize,
+ /// Tertiary size.
+ pub size_c: usize,
+ /// Index.
+ pub start: usize,
+ /// Index.
+ pub end: usize,
+ /// Slot for a token type.
pub token_1: Token,
+ /// Slot for a token type.
pub token_2: Token,
+ /// Slot for a token type.
pub token_3: Token,
+ /// Slot for a token type.
pub token_4: Token,
+ /// Slot for a token type.
pub token_5: Token,
}
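
The generic slots above (`marker`, `size`, the `token_*` fields, and their `_b`/`_c` variants) are scratch space shared by every construct: each construct sets what it needs on entry and resets it on exit. A hedged sketch of that pattern, reusing names that appear in this file (`Token::Data`, `StateName::AttentionStart`) purely as placeholders:

```rust
// Hypothetical construct entry state: park choices in the shared
// scratch space, then dispatch. A matching exit state would reset
// these fields to their defaults.
fn before(tokenizer: &mut Tokenizer) -> State {
    tokenizer.tokenize_state.token_1 = Token::Data; // placeholder
    tokenizer.tokenize_state.marker = b'*';
    tokenizer.tokenize_state.size = 0;
    State::Retry(StateName::AttentionStart)
}
```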
@@ -525,9 +565,9 @@ pub struct TokenizeState<'a> {
pub struct Tokenizer<'a> {
/// Jump between line endings.
column_start: Vec<(usize, usize)>,
- // First line.
+ /// First line where this tokenizer starts.
first_line: usize,
- /// First point after the last line ending.
+ /// Current point after the last line ending (excluding jump).
line_start: Point,
/// Track whether the current byte is already consumed (`true`) or expected
/// to be consumed (`false`).
@@ -536,7 +576,7 @@ pub struct Tokenizer<'a> {
consumed: bool,
/// Track whether this tokenizer is done.
resolved: bool,
- /// To do.
+ /// Stack of how to handle attempts.
attempts: Vec<Attempt>,
/// Current byte.
pub current: Option<u8>,
@@ -544,7 +584,7 @@ pub struct Tokenizer<'a> {
pub previous: Option<u8>,
/// Current relative and absolute place in the file.
pub point: Point,
- /// Semantic labels of one or more codes in `codes`.
+ /// Semantic labels.
pub events: Vec<Event>,
/// Hierarchy of semantic labels.
///
@@ -559,20 +599,8 @@ pub struct Tokenizer<'a> {
pub resolver_ids: Vec<String>,
/// Shared parsing state across tokenizers.
pub parse_state: &'a ParseState<'a>,
- /// To do.
+ /// A lot of shared fields used to tokenize things.
pub tokenize_state: TokenizeState<'a>,
- /// Stack of label (start) that could form images and links.
- ///
- /// Used when tokenizing [text content][crate::content::text].
- pub label_start_stack: Vec<LabelStart>,
- /// Stack of label (start) that cannot form images and links.
- ///
- /// Used when tokenizing [text content][crate::content::text].
- pub label_start_list_loose: Vec<LabelStart>,
- /// Stack of images and links.
- ///
- /// Used when tokenizing [text content][crate::content::text].
- pub media_list: Vec<Media>,
/// Whether we would be interrupting something.
///
/// Used when tokenizing [flow content][crate::content::flow].
@@ -613,17 +641,19 @@ impl<'a> Tokenizer<'a> {
document_paragraph_before: false,
document_data_index: None,
document_child_state: None,
- child_tokenizer: None,
+ document_child: None,
marker: 0,
- marker_other: 0,
- prefix: 0,
+ marker_b: 0,
+ markers: &[],
seen: false,
size: 0,
- size_other: 0,
+ size_b: 0,
+ size_c: 0,
start: 0,
end: 0,
- stop: &[],
- return_state: None,
+ label_start_stack: vec![],
+ label_start_list_loose: vec![],
+ media_list: vec![],
space_or_tab_eol_content_type: None,
space_or_tab_eol_connect: false,
space_or_tab_eol_ok: false,
@@ -640,15 +670,11 @@ impl<'a> Tokenizer<'a> {
token_5: Token::Data,
},
map: EditMap::new(),
- label_start_stack: vec![],
- label_start_list_loose: vec![],
- media_list: vec![],
interrupt: false,
concrete: false,
lazy: false,
- // Assume about 10 resolvers.
- resolvers: Vec::with_capacity(10),
- resolver_ids: Vec::with_capacity(10),
+ resolvers: vec![],
+ resolver_ids: vec![],
}
}
@@ -698,7 +724,7 @@ impl<'a> Tokenizer<'a> {
}
/// Prepare for the next byte to get consumed.
- pub fn expect(&mut self, byte: Option<u8>) {
+ fn expect(&mut self, byte: Option<u8>) {
debug_assert!(self.consumed, "expected previous byte to be consumed");
self.consumed = false;
self.current = byte;
@@ -721,7 +747,7 @@ impl<'a> Tokenizer<'a> {
}
/// Move to the next (virtual) byte.
- pub fn move_one(&mut self) {
+ fn move_one(&mut self) {
match byte_action(self.parse_state.bytes, &self.point) {
ByteAction::Ignore => {
self.point.index += 1;
@@ -756,7 +782,7 @@ impl<'a> Tokenizer<'a> {
}
/// Move (virtual) bytes.
- pub fn move_to(&mut self, to: (usize, usize)) {
+ fn move_to(&mut self, to: (usize, usize)) {
let (to_index, to_vs) = to;
while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs {
self.move_one();
@@ -838,9 +864,9 @@ impl<'a> Tokenizer<'a> {
});
}
- /// Capture the internal state.
- fn capture(&mut self) -> InternalState {
- InternalState {
+ /// Capture the tokenizer progress.
+ fn capture(&mut self) -> Progress {
+ Progress {
previous: self.previous,
current: self.current,
point: self.point.clone(),
@@ -849,8 +875,8 @@ impl<'a> Tokenizer<'a> {
}
}
- /// Apply the internal state.
- fn free(&mut self, previous: InternalState) {
+ /// Apply tokenizer progress.
+ fn free(&mut self, previous: Progress) {
self.previous = previous.previous;
self.current = previous.current;
self.point = previous.point;
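
In other words, `capture`/`free` implement a save point: events and the stack only grow during an attempt, so a revert is truncation plus restoring the point and the previous/current bytes. A minimal standalone sketch of that invariant (simplified types, not the crate's exact fields):

```rust
// Simplified save point over append-only vectors.
struct SavePoint {
    events_len: usize,
    stack_len: usize,
}

fn capture(events: &[&str], stack: &[&str]) -> SavePoint {
    SavePoint {
        events_len: events.len(),
        stack_len: stack.len(),
    }
}

fn free(save: &SavePoint, events: &mut Vec<&str>, stack: &mut Vec<&str>) {
    // Nothing is removed while an attempt runs, so truncating
    // restores the earlier state exactly.
    events.truncate(save.events_len);
    stack.truncate(save.stack_len);
}
```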
@@ -866,123 +892,168 @@ impl<'a> Tokenizer<'a> {
self.stack.truncate(previous.stack_len);
}
- /// Parse with `name` and its future states, to check if it result in
- /// [`State::Ok`][] or [`State::Nok`][], revert on both cases, and then
- /// call `done` with whether it was successful or not.
- ///
- /// This captures the current state of the tokenizer, returns a wrapped
- /// state that captures all codes and feeds them to `name` and its
- /// future states until it yields `State::Ok` or `State::Nok`.
- /// It then applies the captured state, calls `done`, and feeds all
- /// captured codes to its future states.
+ /// Parse with `name` and its future states, to see if that results in
+ /// [`State::Ok`][] or [`State::Nok`][], then revert in both cases.
pub fn check(&mut self, name: StateName, ok: State, nok: State) -> State {
- attempt_impl(self, name, ok, nok, AttemptKind::Check)
+ // When checking, always capture (and restore): both `State::Ok` and
+ // `State::Nok` revert.
+ let progress = Some(self.capture());
+
+ self.attempts.push(Attempt {
+ kind: AttemptKind::Check,
+ progress,
+ ok,
+ nok,
+ });
+
+ call_impl(self, name)
}
- /// Parse with `name` and its future states, to check if it results in
- /// [`State::Ok`][] or [`State::Nok`][], revert on the case of
- /// `State::Nok`, and then call `done` with whether it was successful or
- /// not.
- ///
- /// This captures the current state of the tokenizer, returns a wrapped
- /// state that captures all codes and feeds them to `name` and its
- /// future states until it yields `State::Ok`, at which point it calls
- /// `done` and yields its result.
- /// If instead `State::Nok` was yielded, the captured state is applied,
- /// `done` is called, and all captured codes are fed to its future states.
+ /// Parse with `name` and its future states, to see if that results in
+ /// [`State::Ok`][] or [`State::Nok`][], then revert in the case of
+ /// `State::Nok`.
pub fn attempt(&mut self, name: StateName, ok: State, nok: State) -> State {
- attempt_impl(self, name, ok, nok, AttemptKind::Attempt)
- }
+ // Capture (and restore) progress, except when `nok` is `State::Nok`:
+ // in that case the parent attempt will capture and restore.
+ let progress = if nok == State::Nok {
+ None
+ } else {
+ Some(self.capture())
+ };
- /// Feed a list of `codes` into `start`.
- ///
- /// This is set up to support repeatedly calling `feed`, and thus streaming
- /// markdown into the state machine, and normally pauses after feeding.
- // Note: if needed: accept `vs`?
- pub fn push(&mut self, min: (usize, usize), max: (usize, usize), name: StateName) -> State {
- debug_assert!(!self.resolved, "cannot feed after drain");
+ self.attempts.push(Attempt {
+ kind: AttemptKind::Attempt,
+ progress,
+ ok,
+ nok,
+ });
- // debug_assert!(min >= self.point.index, "cannot move backwards");
+ call_impl(self, name)
+ }
- if min.0 > self.point.index || (min.0 == self.point.index && min.1 > self.point.vs) {
- self.move_to(min);
- }
+ /// Tokenize: feed the bytes between `from` and `to` into the state machine.
+ pub fn push(&mut self, from: (usize, usize), to: (usize, usize), state: State) -> State {
+ push_impl(self, from, to, state, false)
+ }
- let mut state = State::Next(name);
+ /// Flush: feed to the end, expecting everything to resolve to
+ /// [`State::Ok`][], and optionally run resolvers.
+ pub fn flush(&mut self, state: State, resolve: bool) {
+ let to = (self.point.index, self.point.vs);
+ push_impl(self, to, to, state, true);
- while self.point.index < max.0 || (self.point.index == max.0 && self.point.vs < max.1) {
- match state {
- State::Ok | State::Nok => {
- if let Some(attempt) = self.attempts.pop() {
- state = attempt_done_impl(self, attempt, state);
- } else {
- break;
- }
- }
- State::Next(name) => {
- let action = byte_action(self.parse_state.bytes, &self.point);
- state = feed_action_impl(self, &Some(action), name);
- }
- State::Retry(name) => {
- log::debug!(" retry {:?}", name);
- state = call_impl(self, name);
- }
+ if resolve {
+ self.resolved = true;
+
+ while !self.resolvers.is_empty() {
+ let resolver = self.resolvers.remove(0);
+ resolver(self);
}
+
+ self.map.consume(&mut self.events);
}
+ }
+}
- state
+/// Move back past ignored bytes.
+fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) {
+ while point.index > 0 {
+ point.index -= 1;
+ let action = byte_action(tokenizer.parse_state.bytes, point);
+ if !matches!(action, ByteAction::Ignore) {
+ point.index += 1;
+ break;
+ }
}
+}
- /// Flush the tokenizer.
- pub fn flush(&mut self, mut state: State, resolve: bool) {
- let max = self.point.index;
+/// Run the tokenizer.
+fn push_impl(
+ tokenizer: &mut Tokenizer,
+ from: (usize, usize),
+ to: (usize, usize),
+ mut state: State,
+ flush: bool,
+) -> State {
+ debug_assert!(!tokenizer.resolved, "cannot feed after drain");
+ debug_assert!(
+ from.0 > tokenizer.point.index
+ || (from.0 == tokenizer.point.index && from.1 >= tokenizer.point.vs),
+ "cannot move backwards"
+ );
+
+ tokenizer.move_to(from);
+
+ loop {
+ match state {
+ State::Ok | State::Nok => {
+ if let Some(attempt) = tokenizer.attempts.pop() {
+ if attempt.kind == AttemptKind::Check || state == State::Nok {
+ if let Some(progress) = attempt.progress {
+ tokenizer.free(progress);
+ }
+ }
- self.consumed = true;
+ tokenizer.consumed = true;
- loop {
- match state {
- State::Ok | State::Nok => {
- if let Some(attempt) = self.attempts.pop() {
- state = attempt_done_impl(self, attempt, state);
+ let next = if state == State::Ok {
+ attempt.ok
} else {
- break;
- }
- }
- State::Next(name) => {
- // We sometimes move back when flushing, so then we use those codes.
- state = feed_action_impl(
- self,
- &if self.point.index == max {
- None
- } else {
- Some(byte_action(self.parse_state.bytes, &self.point))
- },
- name,
- );
- }
- State::Retry(name) => {
- log::debug!(" retry {:?}", name);
- state = call_impl(self, name);
+ attempt.nok
+ };
+
+ log::debug!("attempt: `{:?}` -> `{:?}`", state, next);
+ state = next;
+ } else {
+ break;
}
}
- }
-
- self.consumed = true;
- debug_assert!(matches!(state, State::Ok), "must be ok");
+ State::Next(name) => {
+ let action = if tokenizer.point.index < to.0
+ || (tokenizer.point.index == to.0 && tokenizer.point.vs < to.1)
+ {
+ Some(byte_action(tokenizer.parse_state.bytes, &tokenizer.point))
+ } else if flush {
+ None
+ } else {
+ break;
+ };
- if resolve {
- self.resolved = true;
+ if let Some(ByteAction::Ignore) = action {
+ tokenizer.move_one();
+ } else {
+ let byte =
+ if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action {
+ Some(byte)
+ } else {
+ None
+ };
- while !self.resolvers.is_empty() {
- let resolver = self.resolvers.remove(0);
- resolver(self);
+ log::debug!("feed: `{:?}` to {:?}", byte, name);
+ tokenizer.expect(byte);
+ state = call_impl(tokenizer, name);
+ };
+ }
+ State::Retry(name) => {
+ log::debug!("retry: {:?}", name);
+ state = call_impl(tokenizer, name);
}
-
- self.map.consume(&mut self.events);
}
}
+
+ tokenizer.consumed = true;
+
+ if flush {
+ debug_assert!(matches!(state, State::Ok), "must be ok");
+ } else {
+ debug_assert!(matches!(state, State::Next(_)), "must have a next state");
+ }
+
+ state
}
+/// Figure out how to handle a byte.
fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
if point.index < bytes.len() {
let byte = bytes[point.index];
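
Taken together, `attempt`/`check` are now declarative: the caller pushes a frame with its `ok`/`nok` destinations and immediately dispatches into the sub-parser; `push_impl` later pops the frame when the sub-parser yields `Ok` or `Nok`. A hedged sketch of a caller (the state names are illustrative, not necessarily real variants):

```rust
// Hypothetical state function: try one construct, fall back to another.
fn flow_start(tokenizer: &mut Tokenizer) -> State {
    tokenizer.attempt(
        // Sub-parser to try first.
        StateName::BlankLineStart,
        // On `State::Ok`: keep what was tokenized and continue here.
        State::Next(StateName::FlowAfter),
        // On `State::Nok`: events are reverted, try a paragraph instead.
        State::Next(StateName::ParagraphStart),
    )
}
```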
@@ -1024,73 +1095,8 @@ fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
}
}
-/// Internal utility to wrap states to also capture codes.
-///
-/// Recurses into itself.
-/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
-fn attempt_impl(
- tokenizer: &mut Tokenizer,
- name: StateName,
- ok: State,
- nok: State,
- kind: AttemptKind,
-) -> State {
- // Always capture (and restore) when checking.
- // No need to capture (and restore) when `nok` is `State::Nok`, because the
- // parent attempt will do it.
- let state = if kind == AttemptKind::Check || nok != State::Nok {
- Some(tokenizer.capture())
- } else {
- None
- };
-
- tokenizer.attempts.push(Attempt {
- ok,
- nok,
- kind,
- state,
- });
-
- call_impl(tokenizer, name)
-}
-
-fn attempt_done_impl(tokenizer: &mut Tokenizer, attempt: Attempt, state: State) -> State {
- if attempt.kind == AttemptKind::Check || state == State::Nok {
- if let Some(state) = attempt.state {
- tokenizer.free(state);
- }
- }
-
- tokenizer.consumed = true;
- if state == State::Ok {
- attempt.ok
- } else {
- attempt.nok
- }
-}
-
-fn feed_action_impl(
- tokenizer: &mut Tokenizer,
- action: &Option<ByteAction>,
- name: StateName,
-) -> State {
- if let Some(ByteAction::Ignore) = action {
- tokenizer.move_one();
- State::Next(name)
- } else {
- let byte = if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action {
- Some(*byte)
- } else {
- None
- };
-
- log::debug!("feed: `{:?}` to {:?}", byte, name);
- tokenizer.expect(byte);
- call_impl(tokenizer, name)
- }
-}
-
#[allow(clippy::too_many_lines)]
+/// Call the corresponding function for a state name.
fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State {
let func = match name {
StateName::AttentionStart => construct::attention::start,
@@ -1422,15 +1428,3 @@ fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State {
func(tokenizer)
}
-
-fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) {
- // Move back past ignored bytes.
- while point.index > 0 {
- point.index -= 1;
- let action = byte_action(tokenizer.parse_state.bytes, point);
- if !matches!(action, ByteAction::Ignore) {
- point.index += 1;
- break;
- }
- }
-}
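
Finally, since `push` and `flush` now share `push_impl`, the contract is: feeding a range must pause on a `State::Next`, while flushing must resolve everything to `State::Ok` (the two `debug_assert!`s at the bottom of `push_impl` encode exactly that). A hedged sketch of a driver (the entry state and indices are assumptions):

```rust
// Hypothetical driver: feed every byte, then flush and resolve.
let state = tokenizer.push(
    (0, 0),                                // from: (index, virtual spaces)
    (parse_state.bytes.len(), 0),          // to
    State::Next(StateName::DocumentStart), // assumed entry state
);
tokenizer.flush(state, true); // `true`: also run resolvers
```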