From f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 28 Jul 2022 16:48:00 +0200 Subject: Refactor to work on `char`s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, a custom char implementation was used. This was easier to work with, as sometimes “virtual” characters are injected, or characters are ignored. This replaces that with working on actual `char`s. In the hope of in the future working on `u8`s, even. This simplifies the state machine somewhat, as only `\n` is fed, regardless of whether it was a CRLF, CR, or LF. It also feeds `' '` instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event. --- src/content/document.rs | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'src/content/document.rs') diff --git a/src/content/document.rs b/src/content/document.rs index 32b32ba..2924f6c 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -17,12 +17,12 @@ use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::token::Token; use crate::tokenizer::{ - Code, Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, + Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, }; use crate::util::{ normalize_identifier::normalize_identifier, skip, - span::{from_exit_event, serialize}, + slice::{Position, Slice}, }; /// Phases where we can exit containers. 
@@ -78,7 +78,7 @@ struct DocumentInfo { pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { let mut tokenizer = Tokenizer::new(point, parse_state); - let state = tokenizer.push(0, parse_state.codes.len(), Box::new(start)); + let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before)); tokenizer.flush(state, true); let mut index = 0; @@ -88,13 +88,14 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { let event = &tokenizer.events[index]; if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString { + // To do: when we operate on u8, we can use a `to_str` here as we + // don't need virtual spaces. let id = normalize_identifier( - serialize( - &parse_state.codes, - &from_exit_event(&tokenizer.events, index), - false, + &Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, index), ) - .as_str(), + .serialize(), ); if !definitions.contains(&id) { @@ -114,6 +115,26 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { events } +/// At the beginning. +/// +/// Perhaps a BOM? +/// +/// ```markdown +/// > | a +/// ^ +/// ``` +fn before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some('\u{FEFF}') => { + tokenizer.enter(Token::ByteOrderMark); + tokenizer.consume(); + tokenizer.exit(Token::ByteOrderMark); + State::Fn(Box::new(start)) + } + _ => start(tokenizer), + } +} + /// Before document. // /// ```markdown @@ -337,7 +358,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State // Parse flow, pausing after eols. tokenizer.go_until( state, - |code| matches!(code, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')), + |code| matches!(code, Some('\n')), move |state| Box::new(move |t| flow_end(t, info, state)), )(tokenizer) } -- cgit