Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 134 |
1 file changed, 38 insertions, 96 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index baad6ed..b48351d 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,6 +1,6 @@
 //! The tokenizer glues states from the state machine together.
 //!
-//! It facilitates everything needed to turn codes into tokens and events with
+//! It facilitates everything needed to turn codes into tokens and with
 //! a state machine.
 //! It also enables logic needed for parsing markdown, such as an [`attempt`][]
 //! to parse something, which can succeed or, when unsuccessful, revert the
@@ -12,22 +12,11 @@
 //! [`check`]: Tokenizer::check
 
 use crate::constant::TAB_SIZE;
+use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS};
 use crate::parser::ParseState;
-use crate::state::{call, Name, State};
-use crate::token::{Token, VOID_TOKENS};
+use crate::state::{call, Name as StateName, State};
 use crate::util::edit_map::EditMap;
 
-/// Embedded content type.
-#[derive(Debug, Clone, PartialEq)]
-pub enum ContentType {
-    /// Represents [flow content][crate::content::flow].
-    Flow,
-    /// Represents [string content][crate::content::string].
-    String,
-    /// Represents [text content][crate::content::text].
-    Text,
-}
-
 /// How to handle a byte.
 #[derive(Debug, PartialEq)]
 pub enum ByteAction {
@@ -41,53 +30,6 @@ pub enum ByteAction {
     Ignore,
 }
 
-/// A location in the document (`line`/`column`/`offset`).
-///
-/// The interface for the location in the document comes from unist `Point`:
-/// <https://github.com/syntax-tree/unist#point>.
-#[derive(Debug, Clone)]
-pub struct Point {
-    /// 1-indexed line number.
-    pub line: usize,
-    /// 1-indexed column number.
-    /// This is increases up to a tab stop for tabs.
-    /// Some editors count tabs as 1 character, so this position is not the
-    /// same as editors.
-    pub column: usize,
-    /// 0-indexed position in the document.
-    ///
-    /// Also an `index` into `bytes`.
-    pub index: usize,
-    /// Virtual step on the same `index`.
-    pub vs: usize,
-}
-
-/// Possible event types.
-#[derive(Debug, PartialEq, Clone)]
-pub enum EventType {
-    /// The start of something.
-    Enter,
-    /// The end of something.
-    Exit,
-}
-
-/// A link to another event.
-#[derive(Debug, Clone)]
-pub struct Link {
-    pub previous: Option<usize>,
-    pub next: Option<usize>,
-    pub content_type: ContentType,
-}
-
-/// Something semantic happening somewhere.
-#[derive(Debug, Clone)]
-pub struct Event {
-    pub event_type: EventType,
-    pub token_type: Token,
-    pub point: Point,
-    pub link: Option<Link>,
-}
-
 /// Callback that can be registered and is called when the tokenizer is done.
 ///
 /// Resolvers are supposed to change the list of events, because parsing is
@@ -205,15 +147,15 @@ pub struct TokenizeState<'a> {
     pub document_paragraph_before: bool,
 
     // Couple of very frequent settings for parsing whitespace.
-    pub space_or_tab_eol_content_type: Option<ContentType>,
+    pub space_or_tab_eol_content_type: Option<Content>,
     pub space_or_tab_eol_connect: bool,
     pub space_or_tab_eol_ok: bool,
     pub space_or_tab_connect: bool,
-    pub space_or_tab_content_type: Option<ContentType>,
+    pub space_or_tab_content_type: Option<Content>,
     pub space_or_tab_min: usize,
     pub space_or_tab_max: usize,
     pub space_or_tab_size: usize,
-    pub space_or_tab_token: Token,
+    pub space_or_tab_token: Name,
 
     // Couple of media related fields.
     /// Stack of label (start) that could form images and links.
@@ -250,15 +192,15 @@ pub struct TokenizeState<'a> {
     /// Index.
     pub end: usize,
     /// Slot for a token type.
-    pub token_1: Token,
+    pub token_1: Name,
     /// Slot for a token type.
-    pub token_2: Token,
+    pub token_2: Name,
     /// Slot for a token type.
-    pub token_3: Token,
+    pub token_3: Name,
     /// Slot for a token type.
-    pub token_4: Token,
+    pub token_4: Name,
     /// Slot for a token type.
-    pub token_5: Token,
+    pub token_5: Name,
 }
 
 /// A tokenizer itself.
@@ -290,7 +232,7 @@ pub struct Tokenizer<'a> {
     /// Hierarchy of semantic labels.
     ///
     /// Tracked to make sure everything’s valid.
-    pub stack: Vec<Token>,
+    pub stack: Vec<Name>,
     /// Edit map, to batch changes.
     pub map: EditMap,
     /// List of attached resolvers, which will be called when done feeding,
@@ -363,12 +305,12 @@ impl<'a> Tokenizer<'a> {
                 space_or_tab_min: 0,
                 space_or_tab_max: 0,
                 space_or_tab_size: 0,
-                space_or_tab_token: Token::SpaceOrTab,
-                token_1: Token::Data,
-                token_2: Token::Data,
-                token_3: Token::Data,
-                token_4: Token::Data,
-                token_5: Token::Data,
+                space_or_tab_token: Name::SpaceOrTab,
+                token_1: Name::Data,
+                token_2: Name::Data,
+                token_3: Name::Data,
+                token_4: Name::Data,
+                token_5: Name::Data,
             },
             map: EditMap::new(),
             interrupt: false,
@@ -491,13 +433,13 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Mark the start of a semantic label.
-    pub fn enter(&mut self, token_type: Token) {
-        self.enter_with_link(token_type, None);
+    pub fn enter(&mut self, name: Name) {
+        self.enter_with_link(name, None);
     }
 
-    pub fn enter_with_content(&mut self, token_type: Token, content_type_opt: Option<ContentType>) {
+    pub fn enter_with_content(&mut self, name: Name, content_type_opt: Option<Content>) {
         self.enter_with_link(
-            token_type,
+            name,
             content_type_opt.map(|content_type| Link {
                 content_type,
                 previous: None,
@@ -506,26 +448,26 @@ impl<'a> Tokenizer<'a> {
         );
     }
 
-    pub fn enter_with_link(&mut self, token_type: Token, link: Option<Link>) {
+    pub fn enter_with_link(&mut self, name: Name, link: Option<Link>) {
         let mut point = self.point.clone();
         move_point_back(self, &mut point);
 
-        log::debug!("enter: `{:?}`", token_type);
+        log::debug!("enter: `{:?}`", name);
         self.events.push(Event {
-            event_type: EventType::Enter,
-            token_type: token_type.clone(),
+            kind: Kind::Enter,
+            name: name.clone(),
             point,
             link,
         });
-        self.stack.push(token_type);
+        self.stack.push(name);
     }
 
     /// Mark the end of a semantic label.
-    pub fn exit(&mut self, token_type: Token) {
+    pub fn exit(&mut self, name: Name) {
         let current_token = self.stack.pop().expect("cannot close w/o open tokens");
 
         debug_assert_eq!(
-            current_token, token_type,
+            current_token, name,
             "expected exit token to match current token"
         );
 
@@ -533,18 +475,18 @@
         let mut point = self.point.clone();
 
         debug_assert!(
-            current_token != previous.token_type
+            current_token != previous.name
                 || previous.point.index != point.index
                 || previous.point.vs != point.vs,
             "expected non-empty token"
        );
 
-        if VOID_TOKENS.iter().any(|d| d == &token_type) {
+        if VOID_EVENTS.iter().any(|d| d == &name) {
             debug_assert!(
-                current_token == previous.token_type,
+                current_token == previous.name,
                 "expected token to be void (`{:?}`), instead of including `{:?}`",
                 current_token,
-                previous.token_type
+                previous.name
             );
         }
 
@@ -556,10 +498,10 @@
             move_point_back(self, &mut point);
         }
 
-        log::debug!("exit: `{:?}`", token_type);
+        log::debug!("exit: `{:?}`", name);
         self.events.push(Event {
-            event_type: EventType::Exit,
-            token_type,
+            kind: Kind::Exit,
+            name,
             point,
             link: None,
         });
@@ -595,7 +537,7 @@ impl<'a> Tokenizer<'a> {
 
     /// Parse with `name` and its future states, to see if that results in
     /// [`State::Ok`][] or [`State::Nok`][], then revert in both cases.
-    pub fn check(&mut self, name: Name, ok: State, nok: State) -> State {
+    pub fn check(&mut self, name: StateName, ok: State, nok: State) -> State {
         // Always capture (and restore) when checking.
         // No need to capture (and restore) when `nok` is `State::Nok`, because the
         // parent attempt will do it.
@@ -614,7 +556,7 @@ impl<'a> Tokenizer<'a> {
     /// Parse with `name` and its future states, to see if that results in
     /// [`State::Ok`][] or [`State::Nok`][], revert in the case of
     /// `State::Nok`.
-    pub fn attempt(&mut self, name: Name, ok: State, nok: State) -> State {
+    pub fn attempt(&mut self, name: StateName, ok: State, nok: State) -> State {
         // Always capture (and restore) when checking.
         // No need to capture (and restore) when `nok` is `State::Nok`, because the
        // parent attempt will do it.
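In short, the change folds `ContentType`, `EventType`, `Point`, `Link`, and `Event` into the new `event` module, renames `ContentType` to `Content` and `EventType` to `Kind`, and replaces the old `Token` type with `event::Name`, while `state::Name` is now imported as `StateName` to avoid the clash. A minimal sketch of how balanced `enter`/`exit` calls look under the renamed API; the state function below is hypothetical, though `Name::Data` and `consume` do appear in the crate:

    use crate::event::Name;
    use crate::state::State;
    use crate::tokenizer::Tokenizer;

    /// Hypothetical state function: open a token, eat one byte, close it.
    fn data_sketch(tokenizer: &mut Tokenizer) -> State {
        tokenizer.enter(Name::Data); // pushes an Event with Kind::Enter, pushes `stack`
        tokenizer.consume();         // take the current byte (assumed helper)
        tokenizer.exit(Name::Data);  // pops `stack`, pushes an Event with Kind::Exit
        State::Ok
    }

`exit` still debug-asserts that the popped name matches the one given, that the span is non-empty, and that an event whose name is listed in `VOID_EVENTS` (formerly `VOID_TOKENS`) contains nothing.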
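`enter_with_content` remains sugar over `enter_with_link`: it wraps the given `Content` (formerly `ContentType`) in a fresh `Link` whose `previous` and `next` are `None`, so chunks of embedded content can be connected and tokenized in a later pass. A sketch, assuming a data token whose bytes should later be parsed as text content:

    use crate::event::{Content, Name};
    use crate::tokenizer::Tokenizer;

    /// Hypothetical helper: open a data token linked to text content.
    fn open_text_chunk(tokenizer: &mut Tokenizer) {
        // Equivalent to enter_with_link(Name::Data, Some(Link {
        //     previous: None, next: None, content_type: Content::Text })).
        tokenizer.enter_with_content(Name::Data, Some(Content::Text));
    }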
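The speculative helpers keep their semantics: `check` parses ahead with `name` and reverts whether the result is `State::Ok` or `State::Nok`, while `attempt` keeps the work on `Ok` and reverts only on `Nok`. Only the alias changed: `state::Name` is now imported as `StateName` so that `event::Name` can take the unqualified name. A sketch of the calling convention; both `Thing*` state names are placeholders, while `State::Next` is the crate's existing way to continue at a named state:

    use crate::state::{Name as StateName, State};
    use crate::tokenizer::Tokenizer;

    /// Hypothetical state function: try one construct, fall back to another.
    fn before_thing(tokenizer: &mut Tokenizer) -> State {
        tokenizer.attempt(
            StateName::ThingStart,              // run this state speculatively
            State::Next(StateName::ThingAfter), // continue here on `Ok`
            State::Next(StateName::ThingElse),  // continue here after reverting on `Nok`
        )
    }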