//! The tokenizer glues states from the state machine together. //! //! It facilitates everything needed to turn codes into tokens and events with //! a state machine. //! It also enables logic needed for parsing markdown, such as an [`attempt`][] //! to parse something, which can succeed or, when unsuccessful, revert the //! attempt. //! Similarly, a [`check`][] exists, which does the same as an `attempt` but //! reverts even if successful. //! //! [`attempt`]: Tokenizer::attempt //! [`check`]: Tokenizer::check use crate::parser::ParseState; use crate::token::{Token, VOID_TOKENS}; /// Embedded content type. #[derive(Debug, Clone, Copy, PartialEq)] pub enum ContentType { /// Represents [flow content][crate::content::flow]. Flow, /// Represents [text content][crate::content::text]. Text, /// Represents [string content][crate::content::string]. String, } /// Enum representing a character code. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Code { /// End of the input stream (called eof). None, /// Used to make parsing line endings easier as it represents both /// `Code::Char('\r')` and `Code::Char('\n')` combined. CarriageReturnLineFeed, /// the expansion of a tab (`Code::Char('\t')`), depending on where the tab /// ocurred, it’s followed by 0 to 3 (both inclusive) `Code::VirtualSpace`s. VirtualSpace, /// The most frequent variant of this enum is `Code::Char(char)`, which just /// represents a char, but micromark adds meaning to certain other values. Char(char), } /// A location in the document (`line`/`column`/`offset`). /// /// The interface for the location in the document comes from unist `Point`: /// . #[derive(Debug, Clone, PartialEq)] pub struct Point { /// 1-indexed line number. pub line: usize, /// 1-indexed column number. /// This is increases up to a tab stop for tabs. /// Some editors count tabs as 1 character, so this position is not always /// the same as editors. pub column: usize, /// 0-indexed position in the document. pub offset: usize, } /// Possible event types. #[derive(Debug, PartialEq, Clone)] pub enum EventType { /// The start of something. Enter, /// The end of something. Exit, } /// Something semantic happening somewhere. #[derive(Debug, Clone)] pub struct Event { pub event_type: EventType, pub token_type: Token, pub point: Point, pub index: usize, pub previous: Option, pub next: Option, pub content_type: Option, } /// The essence of the state machine are functions: `StateFn`. /// It’s responsible for dealing with that single passed [`Code`][]. /// It yields a [`StateFnResult`][]. pub type StateFn = dyn FnOnce(&mut Tokenizer, Code) -> StateFnResult; /// Each [`StateFn`][] yields something back: primarily the state. /// In certain cases, it can also yield back up parsed codes that were passed down. pub type StateFnResult = (State, Option>); /// Callback that can be registered and is called when the tokenizer is done. /// /// Resolvers are supposed to change the list of events, because parsing is /// sometimes messy, and they help expose a cleaner interface of events to /// the compiler and other users. pub type Resolver = dyn FnOnce(&mut Tokenizer) -> Vec; /// The result of a state. pub enum State { /// There is a future state: a boxed [`StateFn`][] to pass the next code to. Fn(Box), /// The state is successful. Ok, /// The state is not successful. Nok, } /// Loose label starts we found. #[derive(Debug)] pub struct LabelStart { /// Indices of where the label starts and ends in `events`. pub start: (usize, usize), /// A boolean used internally to figure out if a label start link can’t be /// used (because links in links are incorrect). pub inactive: bool, /// A boolean used internally to figure out if a label is balanced: they’re /// not media, it’s just balanced braces. pub balanced: bool, } /// Media we found. #[derive(Debug)] pub struct Media { /// Indices of where the media’s label start starts and ends in `events`. pub start: (usize, usize), /// Indices of where the media’s label end starts and ends in `events`. pub end: (usize, usize), /// Identifier pub id: String, } /// Supported containers. #[derive(Debug, PartialEq)] pub enum Container { BlockQuote, ListItem, } /// Info used to tokenize the current container. /// /// This info is shared between the initial construct and its continuation. /// It’s only used for list items. #[derive(Debug)] pub struct ContainerState { /// Kind. pub kind: Container, /// Whether the first line was blank. pub blank_initial: bool, /// The size of the initial construct. pub size: usize, } /// The internal state of a tokenizer, not to be confused with states from the /// state machine, this instead is all the information about where we currently /// are and what’s going on. #[derive(Debug, Clone)] struct InternalState { /// Length of `events`. We only add to events, so reverting will just pop stuff off. events_len: usize, /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt. stack_len: usize, /// Previous code. previous: Code, /// Current code. current: Code, /// `index` in codes of the current code. index: usize, /// Current relative and absolute position in the file. point: Point, } // #[derive(Debug)] /// A tokenizer itself. #[allow(clippy::struct_excessive_bools)] pub struct Tokenizer<'a> { /// Jump between line endings. column_start: Vec>, // First line. line_start: usize, /// Track whether a character is expected to be consumed, and whether it’s /// actually consumed /// /// Tracked to make sure everything’s valid. consumed: bool, /// Track whether this tokenizer is done. drained: bool, /// Semantic labels of one or more codes in `codes`. pub events: Vec, /// Hierarchy of semantic labels. /// /// Tracked to make sure everything’s valid. pub stack: Vec, /// Previous character code. pub previous: Code, /// Current character code. current: Code, /// `index` in codes of the current code. pub index: usize, /// Current relative and absolute place in the file. pub point: Point, /// List of attached resolvers, which will be called when done feeding, /// to clean events. resolvers: Vec>, /// List of names associated with attached resolvers. resolver_ids: Vec, /// Shared parsing state across tokenizers. pub parse_state: &'a ParseState, /// Stack of label (start) that could form images and links. /// /// Used when tokenizing [text content][crate::content::text]. pub label_start_stack: Vec, /// Stack of label (start) that cannot form images and links. /// /// Used when tokenizing [text content][crate::content::text]. pub label_start_list_loose: Vec, /// Stack of images and links. /// /// Used when tokenizing [text content][crate::content::text]. pub media_list: Vec, /// Whether we would be interrupting something. /// /// Used when tokenizing [flow content][crate::content::flow]. pub interrupt: bool, /// Whether containers cannot “pierce” into the current construct. /// /// Used when tokenizing [document content][crate::content::document]. pub concrete: bool, /// Whether this line is lazy. /// /// The previous line was a paragraph, and this line’s containers did not /// match. pub lazy: bool, /// Current container state. pub container: Option, } impl<'a> Tokenizer<'a> { /// Create a new tokenizer. pub fn new(point: Point, index: usize, parse_state: &'a ParseState) -> Tokenizer { Tokenizer { previous: Code::None, current: Code::None, // To do: reserve size when feeding? column_start: vec![], line_start: point.line, index, consumed: true, drained: false, point, stack: vec![], events: vec![], parse_state, label_start_stack: vec![], label_start_list_loose: vec![], media_list: vec![], interrupt: false, concrete: false, lazy: false, container: None, resolvers: vec![], resolver_ids: vec![], } } /// Register a resolver. pub fn register_resolver(&mut self, id: String, resolver: Box) { if !self.resolver_ids.contains(&id) { self.resolver_ids.push(id); self.resolvers.push(resolver); } } /// Register a resolver, before others. pub fn register_resolver_before(&mut self, id: String, resolver: Box) { if !self.resolver_ids.contains(&id) { self.resolver_ids.push(id); self.resolvers.insert(0, resolver); } } /// Prepare for a next code to get consumed. pub fn expect(&mut self, code: Code, force: bool) { if !force { assert!(self.consumed, "expected previous character to be consumed"); } self.consumed = false; self.current = code; } /// Define a jump between two places. pub fn define_skip(&mut self, point: &Point, index: usize) { define_skip_current_impl(self, point.line, (point.column, point.offset, index)); } /// Define the current place as a jump between two places. pub fn define_skip_current(&mut self) { define_skip_current_impl( self, self.point.line, (self.point.column, self.point.offset, self.index), ); } /// Increment the current positional info if we’re right after a line /// ending, which has a skip defined. fn account_for_potential_skip(&mut self) { let at = self.point.line - self.line_start; if self.point.column == 1 && at < self.column_start.len() { match &self.column_start[at] { None => {} Some((column, offset, index)) => { self.point.column = *column; self.point.offset = *offset; self.index = *index; } }; } } /// Consume the current character. /// Each [`StateFn`][] is expected to call this to signal that this code is /// used, or call a next `StateFn`. pub fn consume(&mut self, code: Code) { assert_eq!( code, self.current, "expected given code to equal expected code" ); log::debug!("consume: `{:?}` ({:?})", code, self.point); assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned"); self.index += 1; match code { Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { self.point.line += 1; self.point.column = 1; self.point.offset += if code == Code::CarriageReturnLineFeed { 2 } else { 1 }; self.account_for_potential_skip(); log::debug!("position: after eol: `{:?}`", self.point); } Code::VirtualSpace => { // Empty. } _ => { self.point.column += 1; self.point.offset += 1; } } self.previous = code; // Mark as consumed. self.consumed = true; } /// Mark the start of a semantic label. pub fn enter(&mut self, token_type: Token) { self.enter_with_content(token_type, None); } pub fn enter_with_content(&mut self, token_type: Token, content_type: Option) { log::debug!("enter: `{:?}` ({:?})", token_type, self.point); self.events.push(Event { event_type: EventType::Enter, token_type: token_type.clone(), point: self.point.clone(), index: self.index, previous: None, next: None, content_type, }); self.stack.push(token_type); } /// Mark the end of a semantic label. pub fn exit(&mut self, token_type: Token) { let current_token = self.stack.pop().expect("cannot close w/o open tokens"); assert_eq!( current_token, token_type, "expected exit token to match current token" ); let previous = self.events.last().expect("cannot close w/o open event"); let mut index = self.index; let mut point = self.point.clone(); assert!( current_token != previous.token_type || previous.index != index, "expected non-empty token" ); if VOID_TOKENS.iter().any(|d| d == &token_type) { assert!( current_token == previous.token_type, "expected token to be void (`{:?}`), instead of including `{:?}`", current_token, previous.token_type ); } // A bit weird, but if we exit right after a line ending, we *don’t* want to consider // potential skips. if matches!( self.previous, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') ) { point.column = 1; point.offset = previous.point.offset + if self.previous == Code::CarriageReturnLineFeed { 2 } else { 1 }; index = previous.index + 1; } log::debug!("exit: `{:?}` ({:?})", token_type, point); self.events.push(Event { event_type: EventType::Exit, token_type, point, index, previous: None, next: None, content_type: None, }); } /// Capture the internal state. fn capture(&mut self) -> InternalState { InternalState { index: self.index, previous: self.previous, current: self.current, point: self.point.clone(), events_len: self.events.len(), stack_len: self.stack.len(), } } /// Apply the internal state. fn free(&mut self, previous: InternalState) { self.index = previous.index; self.previous = previous.previous; self.current = previous.current; self.point = previous.point; assert!( self.events.len() >= previous.events_len, "expected to restore less events than before" ); self.events.truncate(previous.events_len); assert!( self.stack.len() >= previous.stack_len, "expected to restore less stack items than before" ); self.stack.truncate(previous.stack_len); } /// Parse with `state_fn` and its future states, switching to `ok` when /// successful, and passing [`State::Nok`][] back up if it occurs. /// /// This function does not capture the current state, in case of /// `State::Nok`, as it is assumed that this `go` is itself wrapped in /// another `attempt`. #[allow(clippy::unused_self)] pub fn go( &mut self, state_fn: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, after: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, ) -> Box { attempt_impl( state_fn, |_code| false, vec![], |result: (Vec, Vec), ok, tokenizer: &mut Tokenizer, _state| { if ok { feed_impl(tokenizer, &if ok { result.1 } else { result.0 }, after) } else { (State::Nok, None) } }, ) } /// Like `go`, but this lets you *hijack* back to some other state after a /// certain code. #[allow(clippy::unused_self)] pub fn go_until( &mut self, state_fn: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, until: impl FnMut(Code) -> bool + 'static, done: impl FnOnce(StateFnResult) -> StateFnResult + 'static, ) -> Box { attempt_impl( state_fn, until, vec![], |result: (Vec, Vec), _ok, tokenizer: &mut Tokenizer, state| { tokenizer.consumed = true; done(check_statefn_result((state, Some(result.1)))) }, ) } /// Parse with `state_fn` and its future states, to check if it result in /// [`State::Ok`][] or [`State::Nok`][], revert on both cases, and then /// call `done` with whether it was successful or not. /// /// This captures the current state of the tokenizer, returns a wrapped /// state that captures all codes and feeds them to `state_fn` and its /// future states until it yields `State::Ok` or `State::Nok`. /// It then applies the captured state, calls `done`, and feeds all /// captured codes to its future states. pub fn check( &mut self, state_fn: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, done: impl FnOnce(bool) -> Box + 'static, ) -> Box { let previous = self.capture(); attempt_impl( state_fn, |_code| false, vec![], |result: (Vec, Vec), ok, tokenizer: &mut Tokenizer, _state| { tokenizer.free(previous); feed_impl(tokenizer, &result.0, done(ok)) }, ) } /// Parse with `state_fn` and its future states, to check if it results in /// [`State::Ok`][] or [`State::Nok`][], revert on the case of /// `State::Nok`, and then call `done` with whether it was successful or /// not. /// /// This captures the current state of the tokenizer, returns a wrapped /// state that captures all codes and feeds them to `state_fn` and its /// future states until it yields `State::Ok`, at which point it calls /// `done` and yields its result. /// If instead `State::Nok` was yielded, the captured state is applied, /// `done` is called, and all captured codes are fed to its future states. pub fn attempt( &mut self, state_fn: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, done: impl FnOnce(bool) -> Box + 'static, ) -> Box { let previous = self.capture(); attempt_impl( state_fn, |_code| false, vec![], |result: (Vec, Vec), ok, tokenizer: &mut Tokenizer, _state| { if !ok { tokenizer.free(previous); } let codes = if ok { result.1 } else { result.0 }; log::debug!( "attempt: {:?}, codes: {:?}, at {:?}", ok, codes, tokenizer.point ); feed_impl(tokenizer, &codes, done(ok)) }, ) } /// Just like [`attempt`][Tokenizer::attempt], but many. pub fn attempt_n( &mut self, mut state_fns: Vec>, done: impl FnOnce(bool) -> Box + 'static, ) -> Box { if state_fns.is_empty() { done(false) } else { let state_fn = state_fns.remove(0); self.attempt(state_fn, move |ok| { if ok { done(ok) } else { Box::new(|t, code| t.attempt_n(state_fns, done)(t, code)) } }) } } /// Just like [`attempt`][Tokenizer::attempt], but for when you don’t care /// about `ok`. pub fn attempt_opt( &mut self, state_fn: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, after: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, ) -> Box { self.attempt(state_fn, |_ok| Box::new(after)) } /// Feed a list of `codes` into `start`. /// /// This is set up to support repeatedly calling `feed`, and thus streaming /// markdown into the state machine, and normally pauses after feeding. pub fn push( &mut self, codes: &[Code], start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, drain: bool, ) -> StateFnResult { assert!(!self.drained, "cannot feed after drain"); let mut result = feed_impl(self, codes, start); if drain { let func = match result.0 { State::Fn(func) => func, _ => unreachable!("expected next state"), }; result = flush_impl(self, func); self.drained = true; while !self.resolvers.is_empty() { let resolver = self.resolvers.remove(0); self.events = resolver(self); } } result } /// Flush the tokenizer. pub fn flush( &mut self, start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, ) -> StateFnResult { flush_impl(self, start) } } /// Internal utility to wrap states to also capture codes. /// /// Recurses into itself. /// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check]. fn attempt_impl( state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, mut pause: impl FnMut(Code) -> bool + 'static, mut codes: Vec, done: impl FnOnce((Vec, Vec), bool, &mut Tokenizer, State) -> StateFnResult + 'static, ) -> Box { Box::new(|tokenizer, code| { if !codes.is_empty() && pause(tokenizer.previous) { return done( (codes, vec![code]), false, tokenizer, State::Fn(Box::new(state)), ); } let (next, remainder) = check_statefn_result(state(tokenizer, code)); match code { Code::None => {} _ => { codes.push(code); } } if let Some(ref list) = remainder { assert!( list.len() <= codes.len(), "`remainder` must be less than or equal to `codes`" ); } match next { State::Ok => { let remaining = if let Some(x) = remainder { x } else { vec![] }; check_statefn_result(done((codes, remaining), true, tokenizer, next)) } State::Nok => check_statefn_result(done((codes, vec![]), false, tokenizer, next)), State::Fn(func) => { assert!(remainder.is_none(), "expected no remainder"); check_statefn_result((State::Fn(attempt_impl(func, pause, codes, done)), None)) } } }) } /// Feed a list of `codes` into `start`. fn feed_impl( tokenizer: &mut Tokenizer, codes: &[Code], start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, ) -> StateFnResult { let codes = codes; let mut state = State::Fn(Box::new(start)); let mut index = 0; tokenizer.consumed = true; while index < codes.len() { let code = codes[index]; match state { State::Nok | State::Ok => { break; } State::Fn(func) => { log::debug!("main: passing: `{:?}`", code); tokenizer.expect(code, false); let (next, remainder) = check_statefn_result(func(tokenizer, code)); state = next; index = index + 1 - (if let Some(ref x) = remainder { x.len() } else { 0 }); } } } // Yield to a higher loop. check_statefn_result((state, Some(codes[index..].to_vec()))) } /// Flush `start`: pass `eof`s to it until done. fn flush_impl( tokenizer: &mut Tokenizer, start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, ) -> StateFnResult { let mut state = State::Fn(Box::new(start)); tokenizer.consumed = true; loop { // Feed EOF. match state { State::Ok | State::Nok => break, State::Fn(func) => { let code = Code::None; log::debug!("main: passing eof"); tokenizer.expect(code, false); let (next, remainder) = check_statefn_result(func(tokenizer, code)); assert!(remainder.is_none(), "expected no remainder"); state = next; } } } match state { State::Ok => {} _ => unreachable!("expected final state to be `State::Ok`"), } check_statefn_result((state, None)) } /// Define a jump between two places. /// /// This defines how much columns, offsets, and the `index` are increased when /// consuming a line ending. fn define_skip_current_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize, usize)) { log::debug!("position: define skip: {:?} -> ({:?})", line, info); let at = line - tokenizer.line_start; if at + 1 > tokenizer.column_start.len() { tokenizer.column_start.resize(at, None); tokenizer.column_start.push(Some(info)); } else { tokenizer.column_start[at] = Some(info); } tokenizer.account_for_potential_skip(); } /// Check a [`StateFnResult`][], make sure its valid (that there are no bugs), /// and clean a final eof passed back in `remainder`. fn check_statefn_result(result: StateFnResult) -> StateFnResult { let (state, mut remainder) = result; // Remove an eof. // For convencience, feeding back an eof is allowed, but cleaned here. // Most states handle eof and eol in the same branch, and hence pass // all back. // This might not be needed, because if EOF is passed back, we’re at the EOF. // But they’re not supposed to be in codes, so here we remove them. if let Some(ref mut list) = remainder { if Some(&Code::None) == list.last() { list.pop(); } if list.is_empty() { return (state, None); } } (state, remainder) }