From 6dc2011d69c85820feddf6799142d304cc2eeb29 Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Fri, 12 Aug 2022 17:28:19 +0200
Subject: Refactor to improve entering

---
 src/construct/code_fenced.rs              | 20 ++++++++++--
 src/construct/heading_atx.rs              | 11 +++++--
 src/construct/html_flow.rs                |  2 +-
 src/construct/paragraph.rs                | 11 +++++--
 src/construct/partial_destination.rs      | 20 ++++++++++--
 src/construct/partial_label.rs            | 13 ++++++--
 src/construct/partial_space_or_tab.rs     | 28 ++++++++++------
 src/construct/partial_space_or_tab_eol.rs | 53 +++++++++++++------------------
 src/construct/partial_title.rs            | 13 ++++++--
 src/content/document.rs                   |  8 ++---
 src/event.rs                              |  2 +-
 src/subtokenize.rs                        |  8 ++---
 src/tokenizer.rs                          | 51 +++++++++++++----------------
 13 files changed, 144 insertions(+), 96 deletions(-)

diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 56a2a04..be0542a 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -103,7 +103,7 @@
 
 use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};
 use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
-use crate::event::{Content, Name};
+use crate::event::{Content, Link, Name};
 use crate::state::{Name as StateName, State};
 use crate::tokenizer::Tokenizer;
 use crate::util::slice::{Position, Slice};
@@ -223,7 +223,14 @@ pub fn info_before(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             tokenizer.enter(Name::CodeFencedFenceInfo);
-            tokenizer.enter_with_content(Name::Data, Some(Content::String));
+            tokenizer.enter_link(
+                Name::Data,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::String,
+                },
+            );
             State::Retry(StateName::CodeFencedInfo)
         }
     }
@@ -281,7 +288,14 @@ pub fn meta_before(tokenizer: &mut Tokenizer) -> State {
         None | Some(b'\n') => State::Retry(StateName::CodeFencedInfoBefore),
         _ => {
             tokenizer.enter(Name::CodeFencedFenceMeta);
-            tokenizer.enter_with_content(Name::Data, Some(Content::String));
+            tokenizer.enter_link(
+                Name::Data,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::String,
+                },
+            );
             State::Retry(StateName::CodeFencedMeta)
         }
     }
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index f75805a..22b93db 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -56,7 +56,7 @@
 
 use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE};
 use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
-use crate::event::{Content, Event, Kind, Name};
+use crate::event::{Content, Event, Kind, Link, Name};
 use crate::resolve::Name as ResolveName;
 use crate::state::{Name as StateName, State};
 use crate::tokenizer::Tokenizer;
@@ -157,7 +157,14 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
             State::Retry(StateName::HeadingAtxSequenceFurther)
         }
         Some(_) => {
-            tokenizer.enter_with_content(Name::Data, Some(Content::Text));
+            tokenizer.enter_link(
+                Name::Data,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::Text,
+                },
+            );
             State::Retry(StateName::HeadingAtxData)
         }
     }
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 2d685b6..123e1a3 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -145,7 +145,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
                     usize::MAX
                 },
                 connect: false,
-                content_type: None,
+                content: None,
             },
         ))
     } else {
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index c956a2c..e9fd377 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -32,7 +32,7 @@
 //! [code_text]: crate::construct::code_text
 //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element
 
-use crate::event::{Content, Kind, Name};
+use crate::event::{Content, Kind, Link, Name};
 use crate::resolve::Name as ResolveName;
 use crate::state::{Name as StateName, State};
 use crate::tokenizer::Tokenizer;
@@ -49,7 +49,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
         None | Some(b'\n') => unreachable!("unexpected eol/eof"),
         _ => {
             tokenizer.enter(Name::Paragraph);
-            tokenizer.enter_with_content(Name::Data, Some(Content::Text));
+            tokenizer.enter_link(
+                Name::Data,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::Text,
+                },
+            );
             State::Retry(StateName::ParagraphInside)
         }
     }
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 29cb5c4..d2477ab 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -71,7 +71,7 @@
 //! [label_end]: crate::construct::label_end
 //! [sanitize_uri]: crate::util::sanitize_uri
 
-use crate::event::{Content, Name};
+use crate::event::{Content, Link, Name};
 use crate::state::{Name as StateName, State};
 use crate::tokenizer::Tokenizer;
 
@@ -99,7 +99,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
             tokenizer.enter(tokenizer.tokenize_state.token_1.clone());
             tokenizer.enter(tokenizer.tokenize_state.token_4.clone());
             tokenizer.enter(tokenizer.tokenize_state.token_5.clone());
-            tokenizer.enter_with_content(Name::Data, Some(Content::String));
+            tokenizer.enter_link(
+                Name::Data,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::String,
+                },
+            );
             State::Retry(StateName::DestinationRaw)
         }
     }
@@ -121,7 +128,14 @@ pub fn enclosed_before(tokenizer: &mut Tokenizer) -> State {
         State::Ok
     } else {
         tokenizer.enter(tokenizer.tokenize_state.token_5.clone());
-        tokenizer.enter_with_content(Name::Data, Some(Content::String));
+        tokenizer.enter_link(
+            Name::Data,
+            Link {
+                previous: None,
+                next: None,
+                content: Content::String,
+            },
+        );
         State::Retry(StateName::DestinationEnclosed)
     }
 }
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index a1667e1..20a7b15 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -60,7 +60,7 @@
 
 use crate::constant::LINK_REFERENCE_SIZE_MAX;
 use crate::construct::partial_space_or_tab_eol::{space_or_tab_eol_with_options, Options};
-use crate::event::{Content, Name};
+use crate::event::{Content, Link, Name};
 use crate::state::{Name as StateName, State};
 use crate::subtokenize::link;
 use crate::tokenizer::Tokenizer;
@@ -110,7 +110,7 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
         State::Retry(space_or_tab_eol_with_options(
             tokenizer,
             Options {
-                content_type: Some(Content::String),
+                content: Some(Content::String),
                 connect: tokenizer.tokenize_state.connect,
             },
         ))
@@ -127,7 +127,14 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
             State::Ok
         }
         _ => {
-            tokenizer.enter_with_content(Name::Data, Some(Content::String));
+            tokenizer.enter_link(
+                Name::Data,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::String,
+                },
+            );
 
             if tokenizer.tokenize_state.connect {
                 let index = tokenizer.events.len() - 1;
diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs
index 9637373..43cfd45 100644
--- a/src/construct/partial_space_or_tab.rs
+++ b/src/construct/partial_space_or_tab.rs
@@ -4,7 +4,7 @@
 //!
 //! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js)
 
-use crate::event::{Content, Name};
+use crate::event::{Content, Link, Name};
 use crate::state::{Name as StateName, State};
 use crate::subtokenize::link;
 use crate::tokenizer::Tokenizer;
@@ -21,7 +21,7 @@ pub struct Options {
     /// Connect this whitespace to the previous.
     pub connect: bool,
     /// Embedded content type to use.
-    pub content_type: Option<Content>,
+    pub content: Option<Content>,
 }
 
 /// One or more `space_or_tab`.
@@ -45,7 +45,7 @@ pub fn space_or_tab_min_max(tokenizer: &mut Tokenizer, min: usize, max: usize) -
             kind: Name::SpaceOrTab,
             min,
             max,
-            content_type: None,
+            content: None,
             connect: false,
         },
     )
@@ -54,7 +54,7 @@
 /// `space_or_tab`, with the given options.
 pub fn space_or_tab_with_options(tokenizer: &mut Tokenizer, options: Options) -> StateName {
     tokenizer.tokenize_state.space_or_tab_connect = options.connect;
-    tokenizer.tokenize_state.space_or_tab_content_type = options.content_type;
+    tokenizer.tokenize_state.space_or_tab_content = options.content;
     tokenizer.tokenize_state.space_or_tab_min = options.min;
     tokenizer.tokenize_state.space_or_tab_max = options.max;
     tokenizer.tokenize_state.space_or_tab_token = options.kind;
@@ -71,15 +71,23 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
     if tokenizer.tokenize_state.space_or_tab_max > 0
         && matches!(tokenizer.current, Some(b'\t' | b' '))
     {
-        tokenizer.enter_with_content(
-            tokenizer.tokenize_state.space_or_tab_token.clone(),
-            tokenizer.tokenize_state.space_or_tab_content_type.clone(),
-        );
+        if let Some(ref content) = tokenizer.tokenize_state.space_or_tab_content {
+            tokenizer.enter_link(
+                tokenizer.tokenize_state.space_or_tab_token.clone(),
+                Link {
+                    previous: None,
+                    next: None,
+                    content: content.clone(),
+                },
+            );
+        } else {
+            tokenizer.enter(tokenizer.tokenize_state.space_or_tab_token.clone());
+        }
 
         if tokenizer.tokenize_state.space_or_tab_connect {
             let index = tokenizer.events.len() - 1;
             link(&mut tokenizer.events, index);
-        } else if tokenizer.tokenize_state.space_or_tab_content_type.is_some() {
+        } else if tokenizer.tokenize_state.space_or_tab_content.is_some() {
             tokenizer.tokenize_state.space_or_tab_connect = true;
         }
 
@@ -127,7 +135,7 @@ pub fn after(tokenizer: &mut Tokenizer) -> State {
         State::Nok
     };
     tokenizer.tokenize_state.space_or_tab_connect = false;
-    tokenizer.tokenize_state.space_or_tab_content_type = None;
+    tokenizer.tokenize_state.space_or_tab_content = None;
     tokenizer.tokenize_state.space_or_tab_size = 0;
     tokenizer.tokenize_state.space_or_tab_max = 0;
     tokenizer.tokenize_state.space_or_tab_min = 0;
diff --git a/src/construct/partial_space_or_tab_eol.rs b/src/construct/partial_space_or_tab_eol.rs
index 08f4bf2..b38bc64 100644
--- a/src/construct/partial_space_or_tab_eol.rs
+++ b/src/construct/partial_space_or_tab_eol.rs
@@ -7,7 +7,7 @@
 use crate::construct::partial_space_or_tab::{
     space_or_tab_with_options, Options as SpaceOrTabOptions,
 };
-use crate::event::{Content, Name};
+use crate::event::{Content, Link, Name};
 use crate::state::{Name as StateName, State};
 use crate::subtokenize::link;
 use crate::tokenizer::Tokenizer;
@@ -18,7 +18,7 @@ pub struct Options {
     /// Connect this whitespace to the previous.
     pub connect: bool,
     /// Embedded content type to use.
-    pub content_type: Option<Content>,
+    pub content: Option<Content>,
 }
 
 /// `space_or_tab`, or optionally `space_or_tab`, one `eol`, and
@@ -31,7 +31,7 @@ pub fn space_or_tab_eol(tokenizer: &mut Tokenizer) -> StateName {
     space_or_tab_eol_with_options(
         tokenizer,
         Options {
-            content_type: None,
+            content: None,
             connect: false,
         },
     )
@@ -39,7 +39,7 @@
 
 /// `space_or_tab_eol`, with the given options.
 pub fn space_or_tab_eol_with_options(tokenizer: &mut Tokenizer, options: Options) -> StateName {
-    tokenizer.tokenize_state.space_or_tab_eol_content_type = options.content_type;
+    tokenizer.tokenize_state.space_or_tab_eol_content = options.content;
     tokenizer.tokenize_state.space_or_tab_eol_connect = options.connect;
     StateName::SpaceOrTabEolStart
 }
@@ -65,10 +65,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
                 kind: Name::SpaceOrTab,
                 min: 1,
                 max: usize::MAX,
-                content_type: tokenizer
-                    .tokenize_state
-                    .space_or_tab_eol_content_type
-                    .clone(),
+                content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(),
                 connect: tokenizer.tokenize_state.space_or_tab_eol_connect,
             },
         ))
@@ -86,11 +83,7 @@
 pub fn after_first(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.space_or_tab_eol_ok = true;
 
-    if tokenizer
-        .tokenize_state
-        .space_or_tab_eol_content_type
-        .is_some()
-    {
+    if tokenizer.tokenize_state.space_or_tab_eol_content.is_some() {
         tokenizer.tokenize_state.space_or_tab_eol_connect = true;
     }
 
@@ -111,22 +104,23 @@
 /// ```
 pub fn at_eol(tokenizer: &mut Tokenizer) -> State {
     if let Some(b'\n') = tokenizer.current {
-        tokenizer.enter_with_content(
-            Name::LineEnding,
-            tokenizer
-                .tokenize_state
-                .space_or_tab_eol_content_type
-                .clone(),
-        );
+        if let Some(ref content) = tokenizer.tokenize_state.space_or_tab_eol_content {
+            tokenizer.enter_link(
+                Name::LineEnding,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: content.clone(),
+                },
+            );
+        } else {
+            tokenizer.enter(Name::LineEnding);
+        }
 
         if tokenizer.tokenize_state.space_or_tab_eol_connect {
             let index = tokenizer.events.len() - 1;
             link(&mut tokenizer.events, index);
-        } else if tokenizer
-            .tokenize_state
-            .space_or_tab_eol_content_type
-            .is_some()
-        {
+        } else if tokenizer.tokenize_state.space_or_tab_eol_content.is_some() {
             tokenizer.tokenize_state.space_or_tab_eol_connect = true;
         }
 
@@ -135,7 +129,7 @@
         State::Next(StateName::SpaceOrTabEolAfterEol)
     } else {
         let ok = tokenizer.tokenize_state.space_or_tab_eol_ok;
-        tokenizer.tokenize_state.space_or_tab_eol_content_type = None;
+        tokenizer.tokenize_state.space_or_tab_eol_content = None;
         tokenizer.tokenize_state.space_or_tab_eol_connect = false;
         tokenizer.tokenize_state.space_or_tab_eol_ok = false;
         if ok {
@@ -167,10 +161,7 @@ pub fn after_eol(tokenizer: &mut Tokenizer) -> State {
                 kind: Name::SpaceOrTab,
                 min: 1,
                 max: usize::MAX,
-                content_type: tokenizer
-                    .tokenize_state
-                    .space_or_tab_eol_content_type
-                    .clone(),
+                content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(),
                 connect: tokenizer.tokenize_state.space_or_tab_eol_connect,
             },
         ))
@@ -187,7 +178,7 @@
 ///     ^
 /// ```
 pub fn after_more(tokenizer: &mut Tokenizer) -> State {
-    tokenizer.tokenize_state.space_or_tab_eol_content_type = None;
+    tokenizer.tokenize_state.space_or_tab_eol_content = None;
     tokenizer.tokenize_state.space_or_tab_eol_connect = false;
     tokenizer.tokenize_state.space_or_tab_eol_ok = false;
 
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index b97243e..93dbd28 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -31,7 +31,7 @@
 //! [label_end]: crate::construct::label_end
 
 use crate::construct::partial_space_or_tab_eol::{space_or_tab_eol_with_options, Options};
-use crate::event::{Content, Name};
+use crate::event::{Content, Link, Name};
 use crate::state::{Name as StateName, State};
 use crate::subtokenize::link;
 use crate::tokenizer::Tokenizer;
@@ -106,7 +106,7 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
         State::Retry(space_or_tab_eol_with_options(
             tokenizer,
             Options {
-                content_type: Some(Content::String),
+                content: Some(Content::String),
                 connect: tokenizer.tokenize_state.connect,
             },
         ))
@@ -118,7 +118,14 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
             State::Retry(StateName::TitleBegin)
         }
         Some(_) => {
-            tokenizer.enter_with_content(Name::Data, Some(Content::String));
+            tokenizer.enter_link(
+                Name::Data,
+                Link {
+                    previous: None,
+                    next: None,
+                    content: Content::String,
+                },
+            );
 
             if tokenizer.tokenize_state.connect {
                 let index = tokenizer.events.len() - 1;
diff --git a/src/content/document.rs b/src/content/document.rs
index 59e6e7c..41d60e2 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -281,13 +281,13 @@ pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
         tokenizer.events[previous].link.as_mut().unwrap().next = Some(current);
     }
     tokenizer.tokenize_state.document_data_index = Some(current);
-    tokenizer.enter_with_link(
+    tokenizer.enter_link(
         Name::Data,
-        Some(Link {
+        Link {
             previous,
             next: None,
-            content_type: Content::Flow,
-        }),
+            content: Content::Flow,
+        },
     );
     State::Retry(StateName::DocumentFlowInside)
 }
diff --git a/src/event.rs b/src/event.rs
index 51ecd86..be32b5b 100644
--- a/src/event.rs
+++ b/src/event.rs
@@ -1892,7 +1892,7 @@ pub enum Content {
 pub struct Link {
     pub previous: Option<usize>,
     pub next: Option<usize>,
-    pub content_type: Content,
+    pub content: Content,
 }
 
 /// Place in the document.
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index 432c198..f55c790 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -9,7 +9,7 @@
 //! * …must occur on [`Enter`][Kind::Enter] events only
 //! * …must occur on void events (they are followed by their corresponding
 //!   [`Exit`][Kind::Exit] event)
-//! * …must have `content_type` field to define the kind of subcontent
+//! * …must have `link` field
 //!
 //! Links will then be passed through a tokenizer for the corresponding content
 //! type by `subtokenize`.
@@ -53,8 +53,8 @@ pub fn link_to(events: &mut [Event], pevious: usize, next: usize) {
     link_next.previous = Some(pevious);
 
     debug_assert_eq!(
-        events[pevious].link.as_ref().unwrap().content_type,
-        events[next].link.as_ref().unwrap().content_type
+        events[pevious].link.as_ref().unwrap().content,
+        events[next].link.as_ref().unwrap().content
     );
 }
 
@@ -80,7 +80,7 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool {
             // Subtokenizer.
             let mut tokenizer = Tokenizer::new(event.point.clone(), parse_state);
             // Substate.
-            let mut state = State::Next(if link.content_type == Content::String {
+            let mut state = State::Next(if link.content == Content::String {
                 StateName::StringStart
             } else {
                 StateName::TextStart
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 4a9fa01..dcd34ac 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -141,11 +141,11 @@ pub struct TokenizeState<'a> {
     pub document_paragraph_before: bool,
 
     // Couple of very frequent settings for parsing whitespace.
-    pub space_or_tab_eol_content_type: Option<Content>,
+    pub space_or_tab_eol_content: Option<Content>,
     pub space_or_tab_eol_connect: bool,
     pub space_or_tab_eol_ok: bool,
     pub space_or_tab_connect: bool,
-    pub space_or_tab_content_type: Option<Content>,
+    pub space_or_tab_content: Option<Content>,
     pub space_or_tab_min: usize,
     pub space_or_tab_max: usize,
     pub space_or_tab_size: usize,
@@ -289,11 +289,11 @@ impl<'a> Tokenizer<'a> {
                 size: 0,
                 size_b: 0,
                 size_c: 0,
-                space_or_tab_eol_content_type: None,
+                space_or_tab_eol_content: None,
                 space_or_tab_eol_connect: false,
                 space_or_tab_eol_ok: false,
                 space_or_tab_connect: false,
-                space_or_tab_content_type: None,
+                space_or_tab_content: None,
                 space_or_tab_min: 0,
                 space_or_tab_max: 0,
                 space_or_tab_size: 0,
@@ -423,34 +423,12 @@ impl<'a> Tokenizer<'a> {
 
     /// Mark the start of a semantic label.
     pub fn enter(&mut self, name: Name) {
-        self.enter_with_link(name, None);
-    }
-
-    /// Enter with a content type.
-    pub fn enter_with_content(&mut self, name: Name, content_type_opt: Option<Content>) {
-        self.enter_with_link(
-            name,
-            content_type_opt.map(|content_type| Link {
-                content_type,
-                previous: None,
-                next: None,
-            }),
-        );
+        enter_impl(self, name, None);
     }
 
     /// Enter with a link.
-    pub fn enter_with_link(&mut self, name: Name, link: Option<Link>) {
-        let mut point = self.point.clone();
-        move_point_back(self, &mut point);
-
-        log::debug!("enter: `{:?}`", name);
-        self.events.push(Event {
-            kind: Kind::Enter,
-            name: name.clone(),
-            point,
-            link,
-        });
-        self.stack.push(name);
+    pub fn enter_link(&mut self, name: Name, link: Link) {
+        enter_impl(self, name, Some(link));
     }
 
     /// Mark the end of a semantic label.
@@ -597,6 +575,21 @@ fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) {
     }
 }
 
+/// Enter.
+fn enter_impl(tokenizer: &mut Tokenizer, name: Name, link: Option<Link>) {
+    let mut point = tokenizer.point.clone();
+    move_point_back(tokenizer, &mut point);
+
+    log::debug!("enter: `{:?}`", name);
+    tokenizer.stack.push(name.clone());
+    tokenizer.events.push(Event {
+        kind: Kind::Enter,
+        name,
+        point,
+        link,
+    });
+}
+
 /// Run the tokenizer.
 fn push_impl(
     tokenizer: &mut Tokenizer,
-- 
cgit