//! The tokenizer glues states from the state machine together.
//!
//! It facilitates everything needed to turn codes into tokens and events
//! with a state machine.
//! It also enables logic needed for parsing markdown, such as an [`attempt`][]
//! to parse something, which can succeed or, when unsuccessful, revert the
//! attempt.
//! Similarly, a [`check`][] exists, which does the same as an `attempt` but
//! reverts even if successful.
//!
//! [`attempt`]: Tokenizer::attempt
//! [`check`]: Tokenizer::check

use crate::constant::TAB_SIZE;
use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS};
use crate::parser::ParseState;
use crate::resolve::{call as call_resolve, Name as ResolveName};
use crate::state::{call, State};
use crate::util::edit_map::EditMap;

/// Info used to tokenize the current container.
///
/// This info is shared between the initial construct and its continuation.
/// It’s only used for list items.
#[derive(Debug)]
pub struct ContainerState {
    /// Kind.
    pub kind: Container,
    /// Whether the first line was blank.
    pub blank_initial: bool,
    /// The size of the initial construct.
    pub size: usize,
}

/// How to handle a byte.
#[derive(Debug, PartialEq)]
enum ByteAction {
    /// This is a normal byte.
    ///
    /// Includes replaced bytes.
    Normal(u8),
    /// This is a new byte.
    Insert(u8),
    /// This byte must be ignored.
    Ignore,
}

/// Supported containers.
#[derive(Debug, PartialEq)]
pub enum Container {
    BlockQuote,
    ListItem,
}

/// Loose label starts we found.
#[derive(Debug)]
pub struct LabelStart {
    /// Indices of where the label starts and ends in `events`.
    pub start: (usize, usize),
    /// A boolean used internally to figure out if a (link) label start
    /// can’t be used anymore (because it would contain another link).
    /// That link start is still looking for a balanced closing bracket though,
    /// so we can’t remove it just yet.
    pub inactive: bool,
}

/// Valid label.
#[derive(Debug)]
pub struct Label {
    /// Indices of label start.
    pub start: (usize, usize),
    /// Indices of label end.
    pub end: (usize, usize),
}

/// Different kinds of attempts.
#[derive(Debug, PartialEq)]
enum AttemptKind {
    /// Discard what was tokenized when unsuccessful.
    Attempt,
    /// Discard always.
    Check,
}

/// How to handle [`State::Ok`][] or [`State::Nok`][].
#[derive(Debug)]
struct Attempt {
    /// Where to go to when successful.
    ok: State,
    /// Where to go to when unsuccessful.
    nok: State,
    /// Kind of attempt.
    kind: AttemptKind,
    /// If needed, the progress to revert to.
    ///
    /// It is not needed to discard an [`AttemptKind::Attempt`] that has a
    /// `nok` of [`State::Nok`][], because that means it is used in *another*
    /// attempt, which will receive that `Nok`, and has to handle it.
    progress: Option<Progress>,
}

/// The internal state of a tokenizer.
///
/// Not to be confused with states from the state machine: this instead is all
/// the information about where we currently are and what’s going on.
#[derive(Debug, Clone)]
struct Progress {
    /// Length of `events`.
    ///
    /// It’s not allowed to remove events, so reverting will just pop stuff off.
    events_len: usize,
    /// Length of the stack.
    ///
    /// It’s not allowed to decrease the stack in an attempt.
    stack_len: usize,
    /// Previous code.
    previous: Option<u8>,
    /// Current code.
    current: Option<u8>,
    /// Current place in the file.
    point: Point,
}
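// A minimal sketch (illustration only, not part of this module) of how a
// construct typically drives `attempt`: the two outcome states are stacked
// first, then the tokenizer retries into the sub-state machine, which
// eventually yields `State::Ok` or `State::Nok`. `StateName` stands for the
// state-name enum in `crate::state` (constructs usually alias it), and the
// `Hypothetical*` variants are placeholders, not real states:
//
//     fn hypothetical_start(tokenizer: &mut Tokenizer) -> State {
//         tokenizer.attempt(
//             State::Next(StateName::HypotheticalAfterSuccess),
//             State::Next(StateName::HypotheticalAfterFailure),
//         );
//         State::Retry(StateName::HypotheticalInner)
//     }
//
// A `check` is driven the same way, but the tokenizer restores the saved
// `Progress` even when the inner state machine succeeds.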
/// A lot of shared fields used to tokenize things.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug)]
pub struct TokenizeState<'a> {
    // Couple of complex fields used to tokenize the document.
    /// Tokenizer, used to tokenize flow in document.
    pub document_child: Option<Box<Tokenizer<'a>>>,
    /// State, used to tokenize containers.
    pub document_child_state: Option<State>,
    /// Stack of currently active containers.
    pub document_container_stack: Vec<ContainerState>,
    /// How many active containers continued.
    pub document_continued: usize,
    /// Index of last `data`.
    pub document_data_index: Option<usize>,
    /// Container exits by line number.
    pub document_exits: Vec<Option<Vec<Event>>>,
    /// Whether the previous flow was a paragraph.
    pub document_paragraph_before: bool,

    // Couple of very frequent settings for parsing whitespace.
    pub space_or_tab_eol_content: Option<Content>,
    pub space_or_tab_eol_connect: bool,
    pub space_or_tab_eol_ok: bool,
    pub space_or_tab_connect: bool,
    pub space_or_tab_content: Option<Content>,
    pub space_or_tab_min: usize,
    pub space_or_tab_max: usize,
    pub space_or_tab_size: usize,
    pub space_or_tab_token: Name,

    // Couple of media-related fields.
    /// List of usable label starts.
    ///
    /// Used when tokenizing [text content][crate::construct::text].
    pub label_starts: Vec<LabelStart>,
    /// List of unusable label starts.
    ///
    /// Used when tokenizing [text content][crate::construct::text].
    pub label_starts_loose: Vec<LabelStart>,
    /// Stack of images and links.
    ///
    /// Used when tokenizing [text content][crate::construct::text].
    pub labels: Vec<Label>,