diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-28 16:48:00 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-28 16:48:00 +0200 |
commit | f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 (patch) | |
tree | c1ac3f22473bd79566d835b2474d2ae9e00d6c55 /src/content | |
parent | d729b07712ca9cc91e68af1776dac9d7008a90cb (diff) | |
download | markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.gz markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.bz2 markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.zip |
Refactor to work on `char`s
Previously, a custom char implementation was used.
This was easier to work with, as sometimes “virtual” characters are injected,
or characters are ignored.
This replaces that with working on actual `char`s.
In the hope of, in the future, working on `u8`s, even.
This simplifies the state machine somewhat, as only `\n` is fed, regardless of
whether it was a CRLF, CR, or LF.
It also feeds `' '` instead of virtual spaces.
The BOM, if present, is now available as a `ByteOrderMark` event.
Diffstat (limited to 'src/content')
-rw-r--r-- | src/content/document.rs | 39 | ||||
-rw-r--r-- | src/content/flow.rs | 14 | ||||
-rw-r--r-- | src/content/string.rs | 6 | ||||
-rw-r--r-- | src/content/text.rs | 24 |
4 files changed, 52 insertions, 31 deletions
diff --git a/src/content/document.rs b/src/content/document.rs index 32b32ba..2924f6c 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -17,12 +17,12 @@ use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::token::Token; use crate::tokenizer::{ - Code, Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, + Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, }; use crate::util::{ normalize_identifier::normalize_identifier, skip, - span::{from_exit_event, serialize}, + slice::{Position, Slice}, }; /// Phases where we can exit containers. @@ -78,7 +78,7 @@ struct DocumentInfo { pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let mut tokenizer = Tokenizer::new(point, parse_state); - let state = tokenizer.push(0, parse_state.codes.len(), Box::new(start)); + let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before)); tokenizer.flush(state, true); let mut index = 0; @@ -88,13 +88,14 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let event = &tokenizer.events[index]; if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString { + // To do: when we operate on u8, we can use a `to_str` here as we + // don’t need virtual spaces. let id = normalize_identifier( - serialize( - &parse_state.codes, - &from_exit_event(&tokenizer.events, index), - false, + &Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, index), ) - .as_str(), + .serialize(), ); if !definitions.contains(&id) { @@ -114,6 +115,26 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { events } +/// At the beginning. +/// +/// Perhaps a BOM?
+/// +/// ```markdown /// > | a /// ^ /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some('\u{FEFF}') => { tokenizer.enter(Token::ByteOrderMark); tokenizer.consume(); tokenizer.exit(Token::ByteOrderMark); State::Fn(Box::new(start)) } _ => start(tokenizer), } } + /// Before document. // /// ```markdown @@ -337,7 +358,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State // Parse flow, pausing after eols. tokenizer.go_until( state, - |code| matches!(code, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')), + |code| matches!(code, Some('\n')), move |state| Box::new(move |t| flow_end(t, info, state)), )(tokenizer) } diff --git a/src/content/flow.rs b/src/content/flow.rs index ea09cd9..09c4e2c 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -27,7 +27,7 @@ use crate::construct::{ thematic_break::start as thematic_break, }; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Before flow.
/// @@ -41,7 +41,7 @@ use crate::tokenizer::{Code, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt(blank_line, |ok| { Box::new(if ok { blank_line_after } else { initial_before }) })(tokenizer), @@ -62,7 +62,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn initial_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt_n( vec![ Box::new(code_indented), @@ -87,8 +87,8 @@ fn initial_before(tokenizer: &mut Tokenizer) -> State { /// ``` fn blank_line_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None => State::Ok, + Some('\n') => { tokenizer.enter(Token::BlankLineEnding); tokenizer.consume(); tokenizer.exit(Token::BlankLineEnding); @@ -111,8 +111,8 @@ fn blank_line_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None => State::Ok, + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); diff --git a/src/content/string.rs b/src/content/string.rs index c6c0094..8bc2b91 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -16,9 +16,9 @@ use crate::construct::{ character_escape::start as character_escape, character_reference::start as character_reference, partial_data::start as data, partial_whitespace::create_resolve_whitespace, }; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; -const MARKERS: [Code; 2] = [Code::Char('&'), Code::Char('\\')]; +const MARKERS: [char; 2] = ['&', '\\']; /// Start of string.
pub fn start(tokenizer: &mut Tokenizer) -> State { @@ -32,7 +32,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// Before string. fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt_n( vec![Box::new(character_reference), Box::new(character_escape)], |ok| Box::new(if ok { before } else { before_data }), diff --git a/src/content/text.rs b/src/content/text.rs index 4248053..ebdf888 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -28,18 +28,18 @@ use crate::construct::{ label_start_image::start as label_start_image, label_start_link::start as label_start_link, partial_data::start as data, partial_whitespace::create_resolve_whitespace, }; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; -const MARKERS: [Code; 9] = [ - Code::Char('!'), // `label_start_image` - Code::Char('&'), // `character_reference` - Code::Char('*'), // `attention` - Code::Char('<'), // `autolink`, `html_text` - Code::Char('['), // `label_start_link` - Code::Char('\\'), // `character_escape`, `hard_break_escape` - Code::Char(']'), // `label_end` - Code::Char('_'), // `attention` - Code::Char('`'), // `code_text` +const MARKERS: [char; 9] = [ + '!', // `label_start_image` + '&', // `character_reference` + '*', // `attention` + '<', // `autolink`, `html_text` + '[', // `label_start_link` + '\\', // `character_escape`, `hard_break_escape` + ']', // `label_end` + '_', // `attention` + '`', // `code_text` ]; /// Start of text. @@ -57,7 +57,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// Before text. pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt_n( vec![ Box::new(attention), |