From 90969231bfcdfcd09bae646abba17d832b633376 Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Thu, 11 Aug 2022 17:00:07 +0200
Subject: Refactor to handle definitions when parsing

---
 src/construct/definition.rs | 37 ++++++++++++++++++++++++++++++++++---
 src/content/document.rs     | 37 +++++++------------------------------
 src/tokenizer.rs            | 14 +++++++++-----
 3 files changed, 50 insertions(+), 38 deletions(-)

diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 6f63c79..ee930b1 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -98,7 +98,11 @@ use crate::construct::partial_space_or_tab_eol::space_or_tab_eol;
 use crate::event::Name;
 use crate::state::{Name as StateName, State};
 use crate::tokenizer::Tokenizer;
-use crate::util::skip::opt_back as skip_opt_back;
+use crate::util::{
+    normalize_identifier::normalize_identifier,
+    skip,
+    slice::{Position, Slice},
+};
 
 /// At the start of a definition.
 ///
@@ -110,7 +114,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
     // Do not interrupt paragraphs (but do follow definitions).
     let possible = !tokenizer.interrupt
         || (!tokenizer.events.is_empty()
-            && tokenizer.events[skip_opt_back(
+            && tokenizer.events[skip::opt_back(
                 &tokenizer.events,
                 tokenizer.events.len() - 1,
                 &[Name::LineEnding, Name::SpaceOrTab],
@@ -165,6 +169,12 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.token_2 = Name::Data;
     tokenizer.tokenize_state.token_3 = Name::Data;
 
+    tokenizer.tokenize_state.end = skip::to_back(
+        &tokenizer.events,
+        tokenizer.events.len() - 1,
+        &[Name::DefinitionLabelString],
+    );
+
     match tokenizer.current {
         Some(b':') => {
             tokenizer.enter(Name::DefinitionMarker);
@@ -239,6 +249,7 @@ pub fn destination_missing(tokenizer: &mut Tokenizer) -> State {
     tokenizer.tokenize_state.token_4 = Name::Data;
     tokenizer.tokenize_state.token_5 = Name::Data;
     tokenizer.tokenize_state.size_b = 0;
+    tokenizer.tokenize_state.end = 0;
     State::Nok
 }
 
@@ -271,11 +282,31 @@ pub fn after_whitespace(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None | Some(b'\n') => {
             tokenizer.exit(Name::Definition);
+
+            // Note: we don’t care about uniqueness.
+            // It’s likely that that doesn’t happen very frequently.
+            // It is more likely that it wastes precious time.
+            tokenizer.tokenize_state.definitions.push(
+                // Note: we don’t care about virtual spaces, so `as_str` is fine.
+                normalize_identifier(
+                    Slice::from_position(
+                        tokenizer.parse_state.bytes,
+                        &Position::from_exit_event(&tokenizer.events, tokenizer.tokenize_state.end),
+                    )
+                    .as_str(),
+                ),
+            );
+
+            tokenizer.tokenize_state.end = 0;
+
             // You’d be interrupting.
             tokenizer.interrupt = true;
             State::Ok
         }
-        _ => State::Nok,
+        _ => {
+            tokenizer.tokenize_state.end = 0;
+            State::Nok
+        },
     }
 }
 
diff --git a/src/content/document.rs b/src/content/document.rs
index b990ba5..f2890f3 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -13,11 +13,7 @@ use crate::parser::ParseState;
 use crate::state::{Name as StateName, State};
 use crate::subtokenize::{divide_events, subtokenize};
 use crate::tokenizer::{Container, ContainerState, Tokenizer};
-use crate::util::{
-    normalize_identifier::normalize_identifier,
-    skip,
-    slice::{Position, Slice},
-};
+use crate::util::skip;
 
 /// Phases where we can exit containers.
 #[derive(Debug, PartialEq)]
@@ -61,33 +57,9 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
     );
 
     tokenizer.flush(state, true);
 
-    let mut index = 0;
-    let mut definitions = vec![];
-
-    while index < tokenizer.events.len() {
-        let event = &tokenizer.events[index];
-
-        if event.kind == Kind::Exit && event.name == Name::DefinitionLabelString {
-            // Note: we don’t care about virtual spaces, so `as_str` is fine.
-            let id = normalize_identifier(
-                Slice::from_position(
-                    tokenizer.parse_state.bytes,
-                    &Position::from_exit_event(&tokenizer.events, index),
-                )
-                .as_str(),
-            );
-
-            if !definitions.contains(&id) {
-                definitions.push(id);
-            }
-        }
-
-        index += 1;
-    }
-
     let mut events = tokenizer.events;
 
-    parse_state.definitions = definitions;
+    parse_state.definitions = tokenizer.tokenize_state.definitions;
 
     while !subtokenize(&mut events, parse_state) {}
@@ -531,4 +503,9 @@ fn resolve(tokenizer: &mut Tokenizer) {
     tokenizer
         .resolvers
         .append(&mut child.resolvers.split_off(0));
+
+    tokenizer
+        .tokenize_state
+        .definitions
+        .append(&mut child.tokenize_state.definitions.split_off(0));
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 8ff19c3..fdca6c5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -165,6 +165,9 @@ pub struct TokenizeState<'a> {
     /// Used when tokenizing [text content][crate::content::text].
     pub media_list: Vec<Media>,
 
+    /// List of defined identifiers.
+    pub definitions: Vec<String>,
+
     /// Whether to connect tokens.
     pub connect: bool,
     /// Marker.
@@ -274,18 +277,18 @@ impl<'a> Tokenizer<'a> {
             document_data_index: None,
             document_child_state: None,
             document_child: None,
+            definitions: vec![],
+            end: 0,
+            label_start_stack: vec![],
+            label_start_list_loose: vec![],
             marker: 0,
             marker_b: 0,
             markers: &[],
+            media_list: vec![],
             seen: false,
             size: 0,
             size_b: 0,
             size_c: 0,
-            start: 0,
-            end: 0,
-            label_start_stack: vec![],
-            label_start_list_loose: vec![],
-            media_list: vec![],
             space_or_tab_eol_content_type: None,
             space_or_tab_eol_connect: false,
             space_or_tab_eol_ok: false,
@@ -295,6 +298,7 @@
             space_or_tab_max: 0,
             space_or_tab_size: 0,
             space_or_tab_token: Name::SpaceOrTab,
+            start: 0,
             token_1: Name::Data,
             token_2: Name::Data,
             token_3: Name::Data,
--
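
The shape of the change, for readers skimming the diff: instead of scanning every event for `DefinitionLabelString` exits after the document has been tokenized (the loop removed from `src/content/document.rs`), each definition construct now records its own normalized identifier into `tokenize_state.definitions` the moment it parses successfully, and child tokenizers hand their lists up in `resolve`. The following is a minimal, self-contained sketch of that single-pass pattern, not code from this repository: the names `normalize`, `TokenizeState`, and `on_definition` are illustrative stand-ins, and `normalize` only approximates `normalize_identifier` (the real helper also case-folds Unicode).

// Sketch: collect normalized definition identifiers while parsing,
// rather than in a second pass over all events afterwards.
// Stand-in names; not the crate's API.

/// Rough stand-in for `normalize_identifier`: collapse interior
/// whitespace and lowercase (the real helper also case-folds Unicode).
fn normalize(label: &str) -> String {
    label.split_whitespace().collect::<Vec<_>>().join(" ").to_lowercase()
}

#[derive(Debug, Default)]
struct TokenizeState {
    /// Identifiers of definitions seen so far. Duplicates are pushed
    /// as-is: as the commit's comment notes, deduplicating here would
    /// likely waste more time than the rare duplicate costs.
    definitions: Vec<String>,
}

impl TokenizeState {
    /// Called once per successfully parsed definition, at parse time.
    fn on_definition(&mut self, raw_label: &str) {
        self.definitions.push(normalize(raw_label));
    }
}

fn main() {
    let mut state = TokenizeState::default();
    state.on_definition("Foo");
    state.on_definition("  foo\n\tBAR ");
    // One entry per definition, normalized; duplicates kept.
    assert_eq!(state.definitions, ["foo", "foo bar"]);
}

The payoff mirrors the patch itself: the work happens where the information is already at hand (the just-parsed label), `document()` no longer needs `normalize_identifier`, `Slice`, or `Position`, and the uniqueness check is dropped deliberately rather than paid on every label.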