diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-28 16:48:00 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-28 16:48:00 +0200 |
commit | f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 (patch) | |
tree | c1ac3f22473bd79566d835b2474d2ae9e00d6c55 /src/content/document.rs | |
parent | d729b07712ca9cc91e68af1776dac9d7008a90cb (diff) | |
download | markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.gz markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.bz2 markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.zip |
Refactor to work on `char`s
Previously, a custom char implementation was used.
This was easier to work with, as sometimes “virtual” characters are injected,
or characters are ignored.
This replaces that with working on actual `char`s.
In the hope of, in the future, working on `u8`s, even.
This simplifies the state machine somewhat, as only `\n` is fed, regardless of
whether it was a CRLF, CR, or LF.
It also feeds `' '` instead of virtual spaces.
The BOM, if present, is now available as a `ByteOrderMark` event.
Diffstat (limited to 'src/content/document.rs')
-rw-r--r-- | src/content/document.rs | 39 |
1 file changed, 30 insertions, 9 deletions
diff --git a/src/content/document.rs b/src/content/document.rs index 32b32ba..2924f6c 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -17,12 +17,12 @@ use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::token::Token; use crate::tokenizer::{ - Code, Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, + Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, }; use crate::util::{ normalize_identifier::normalize_identifier, skip, - span::{from_exit_event, serialize}, + slice::{Position, Slice}, }; /// Phases where we can exit containers. @@ -78,7 +78,7 @@ struct DocumentInfo { pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let mut tokenizer = Tokenizer::new(point, parse_state); - let state = tokenizer.push(0, parse_state.codes.len(), Box::new(start)); + let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before)); tokenizer.flush(state, true); let mut index = 0; @@ -88,13 +88,14 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let event = &tokenizer.events[index]; if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString { + // To do: when we operate on u8, we can use a `to_str` here as we + // don‘t need virtual spaces. let id = normalize_identifier( - serialize( - &parse_state.codes, - &from_exit_event(&tokenizer.events, index), - false, + &Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, index), ) - .as_str(), + .serialize(), ); if !definitions.contains(&id) { @@ -114,6 +115,26 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { events } +/// At the beginning. +/// +/// Perhaps a BOM? 
+/// +/// ```markdown +/// > | a +/// ^ +/// ``` +fn before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some('\u{FEFF}') => { + tokenizer.enter(Token::ByteOrderMark); + tokenizer.consume(); + tokenizer.exit(Token::ByteOrderMark); + State::Fn(Box::new(start)) + } + _ => start(tokenizer), + } +} + /// Before document. // /// ```markdown @@ -337,7 +358,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State // Parse flow, pausing after eols. tokenizer.go_until( state, - |code| matches!(code, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')), + |code| matches!(code, Some('\n')), move |state| Box::new(move |t| flow_end(t, info, state)), )(tokenizer) } |