//! A tokenizer glues states from the state machine together.
//!
//! It facilitates everything needed to turn bytes into events with a state
//! machine.
//! It also enables the logic needed for parsing markdown, such as an
//! [`attempt`][] to try and parse something, which can succeed or, when
//! unsuccessful, revert the attempt.
//!
//! [`attempt`]: Tokenizer::attempt

use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS};
use crate::parser::ParseState;
use crate::resolve::{call as call_resolve, Name as ResolveName};
use crate::state::{call, State};
use crate::subtokenize::Subresult;
#[cfg(feature = "log")]
use crate::util::char::format_byte_opt;
use crate::util::{constant::TAB_SIZE, edit_map::EditMap};
use alloc::{boxed::Box, string::String, vec, vec::Vec};

/// Containers.
///
/// Containers are found when tokenizing
/// [document content][crate::construct::document].
/// They parse a portion at the start of one or more lines.
/// The rest of those lines is a different content type (specifically, flow),
/// which they “contain”.
#[derive(Debug, Eq, PartialEq)]
pub enum Container {
    /// [Block quote][crate::construct::block_quote].
    BlockQuote,
    /// [List item][crate::construct::list_item].
    ListItem,
    /// [GFM: Footnote definition][crate::construct::gfm_footnote_definition].
    GfmFootnoteDefinition,
}

/// Info used to tokenize a container.
///
/// Practically, these fields are only used for list items.
#[derive(Debug)]
pub struct ContainerState {
    /// Kind.
    pub kind: Container,
    /// Whether the first line was blank.
    pub blank_initial: bool,
    /// Size.
    pub size: usize,
}

/// How to handle a byte.
#[derive(Debug, PartialEq)]
enum ByteAction {
    /// This is a normal byte.
    ///
    /// Includes replaced bytes.
    Normal(u8),
    /// This byte must be ignored.
    Ignore,
    /// This is a new byte.
    Insert(u8),
}

/// Label start kind.
#[derive(Debug, PartialEq, Eq)]
pub enum LabelKind {
    /// Label (image) start.
    ///
    /// ```markdown
    /// > | a ![b] c
    ///       ^^
    /// ```
    ///
    /// Construct: [Label start (image)][crate::construct::label_start_image].
    Image,
    /// Label (link) start.
    ///
    /// ```markdown
    /// > | a [b] c
    ///       ^
    /// ```
    ///
    /// Construct: [Label start (link)][crate::construct::label_start_link].
    Link,
    /// GFM: Label (footnote) start.
    ///
    /// ```markdown
    /// > | a [^b] c
    ///       ^^
    /// ```
    ///
    /// Construct: [GFM: Label start (footnote)][crate::construct::gfm_label_start_footnote].
    GfmFootnote,
    /// GFM: Label (footnote) start, not matching a footnote definition, so
    /// handled as a label (link) start.
    ///
    /// ```markdown
    /// > | a [^b](c) d
    ///       ^^
    /// ```
    ///
    /// Construct: [Label end][crate::construct::label_end].
    GfmUndefinedFootnote,
}

/// Label start, looking for an end.
#[derive(Debug)]
pub struct LabelStart {
    /// Kind of start.
    pub kind: LabelKind,
    /// Indices of where the label starts and ends in `events`.
    pub start: (usize, usize),
    /// A boolean used internally to figure out if a (link) label start can’t
    /// be used anymore (because it would contain another link).
    /// That link start is still looking for a balanced closing bracket though,
    /// so we can’t remove it just yet.
    pub inactive: bool,
}

/// Valid label.
#[derive(Debug)]
pub struct Label {
    /// Kind of label.
    pub kind: LabelKind,
    /// Indices of label start.
    pub start: (usize, usize),
    /// Indices of label end.
    pub end: (usize, usize),
}

/// Different kinds of attempts.
#[derive(Debug, PartialEq)]
enum AttemptKind {
    /// Discard what was tokenized when unsuccessful.
    Attempt,
    /// Discard always.
    Check,
}
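
// A rough sketch (illustrative, not part of this module) of how a construct
// typically uses the attempt mechanism mentioned in the module docs: it
// registers where to continue on `State::Ok` and on `State::Nok`, then
// retries another state. `StateName::ExampleAfter` and
// `StateName::ExampleInside` are placeholder names assumed for illustration:
//
//     tokenizer.attempt(
//         State::Next(StateName::ExampleAfter), // continue here on `State::Ok`.
//         State::Nok,                           // give up on `State::Nok`.
//     );
//     State::Retry(StateName::ExampleInside)
//
// When the attempted states fail, the tokenizer reverts the events, the
// stack, and the current point to the stored `Progress` below.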

/// How to handle [`State::Ok`][] or [`State::Nok`][].
#[derive(Debug)]
struct Attempt {
    /// Where to go to when successful.
    ok: State,
    /// Where to go to when unsuccessful.
    nok: State,
    /// Kind of attempt.
    kind: AttemptKind,
    /// If needed, the progress to revert to.
    ///
    /// It is not needed to discard an [`AttemptKind::Attempt`] that has a
    /// `nok` of [`State::Nok`][], because that means it is used in *another*
    /// attempt, which will receive that `Nok`, and has to handle it.
    progress: Option<Progress>,
}

/// The internal state of a tokenizer.
///
/// Not to be confused with states from the state machine, this instead is all
/// the information on where we currently are and what’s going on.
#[derive(Clone, Debug)]
struct Progress {
    /// Length of `events`.
    ///
    /// It’s not allowed to remove events, so reverting will just pop stuff off.
    events_len: usize,
    /// Length of the stack.
    ///
    /// It’s not allowed to decrease the stack in an attempt.
    stack_len: usize,
    /// Previous code.
    previous: Option<u8>,
    /// Current code.
    current: Option<u8>,
    /// Current place in the file.
    point: Point,
}

/// A lot of shared fields used to tokenize things.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug)]
pub struct TokenizeState<'a> {
    // Couple complex fields used to tokenize the document.
    /// Tokenizer, used to tokenize flow in document.
    pub document_child: Option<Box<Tokenizer<'a>>>,
    /// State, used to tokenize containers.
    pub document_child_state: Option<State>,
    /// Stack of currently active containers.
    pub document_container_stack: Vec<ContainerState>,
    /// How many active containers continued.
    pub document_continued: usize,
    /// Index of last `data`.
    pub document_data_index: Option<usize>,
    /// Container exits by line number.
    pub document_exits: Vec<Option<Vec<Event>>>,
    /// Whether the previous flow was a paragraph or a definition.
    pub document_lazy_accepting_before: bool,
    /// Whether this is the first paragraph (potentially after definitions) in
    /// a list item.
    /// Used for GFM task list items.
    pub document_at_first_paragraph_of_list_item: bool,

    // Couple of very frequent settings for parsing whitespace.
    pub space_or_tab_eol_content: Option<Content>,
    pub space_or_tab_eol_connect: bool,
    pub space_or_tab_eol_ok: bool,
    pub space_or_tab_connect: bool,
    pub space_or_tab_content: Option<Content>,
    pub space_or_tab_min: usize,
    pub space_or_tab_max: usize,
    pub space_or_tab_size: usize,
    pub space_or_tab_token: Name,

    // Couple of media related fields.
    /// List of usable label starts.
    ///
    /// Used when tokenizing [text content][crate::construct::text].
    pub label_starts: Vec<LabelStart>,
    /// List of unusable label starts.
    ///
    /// Used when tokenizing [text content][crate::construct::text].
    pub label_starts_loose: Vec<LabelStart>,
    /// Stack of images and links.
    ///
    /// Used when tokenizing [text content][crate::construct::text].
    pub labels: Vec<Label>,