Diffstat
 src/construct/label_end.rs          |  2
 src/construct/label_start_image.rs  |  2
 src/construct/label_start_link.rs   |  2
 src/construct/mod.rs                |  2
 src/construct/partial_data.rs       | 51
 src/construct/partial_whitespace.rs | 56
 src/content/string.rs               | 17
 src/content/text.rs                 | 22
 src/tokenizer.rs                    |  9
9 files changed, 143 insertions, 20 deletions
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 888355b..0da12b8 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -510,7 +510,7 @@ fn ok(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
     info.media.end.1 = tokenizer.events.len() - 1;
 
     tokenizer.media_list.push(info.media);
-    tokenizer.register_resolver("media".to_string(), Box::new(resolve_media));
+    tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media));
 
     (State::Ok, Some(vec![code]))
 }
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index 7725334..a45205a 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -67,7 +67,7 @@ pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
                 balanced: false,
                 inactive: false,
             });
-            tokenizer.register_resolver("media".to_string(), Box::new(resolve_media));
+            tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media));
             (State::Ok, None)
         }
         _ => (State::Nok, None),
diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs
index 46d7c9c..6c4d7ae 100644
--- a/src/construct/label_start_link.rs
+++ b/src/construct/label_start_link.rs
@@ -49,7 +49,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
                 balanced: false,
                 inactive: false,
             });
-            tokenizer.register_resolver("media".to_string(), Box::new(resolve_media));
+            tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media));
             (State::Ok, None)
         }
         _ => (State::Nok, None),
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 8565b2f..9e3dfb0 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -44,6 +44,7 @@
 //! * [label][partial_label]
 //! * [space or tab][partial_space_or_tab]
 //! * [title][partial_title]
+//! * [whitespace][partial_whitespace]
 //!
 //! Each construct maintained here is explained with a BNF diagram.
 //! For example, the docs for [character escape][character_escape] contain:
@@ -83,4 +84,5 @@ pub mod partial_destination;
 pub mod partial_label;
 pub mod partial_space_or_tab;
 pub mod partial_title;
+pub mod partial_whitespace;
 pub mod thematic_break;
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index d83787a..9f99570 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -8,7 +8,8 @@
 
 // To do: pass token types in?
 
-use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::edit_map::EditMap;
 
 /// At the beginning of data.
 ///
@@ -39,7 +40,10 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnRe
             tokenizer.exit(TokenType::LineEnding);
             (State::Fn(Box::new(|t, c| at_break(t, c, stop))), None)
         }
-        _ if stop.contains(&code) => (State::Ok, Some(vec![code])),
+        _ if stop.contains(&code) => {
+            tokenizer.register_resolver("data".to_string(), Box::new(resolve));
+            (State::Ok, Some(vec![code]))
+        }
         _ => {
             tokenizer.enter(TokenType::Data);
             data(tokenizer, code, stop)
@@ -67,3 +71,46 @@ fn data(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult
         (State::Fn(Box::new(|t, c| data(t, c, stop))), None)
     }
 }
+
+/// Merge adjacent data events.
+pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
+    let mut edit_map = EditMap::new();
+    let len = tokenizer.events.len();
+    let mut index = 0;
+
+    // Loop through events and merge adjacent data events.
+    while index < len {
+        let event = &tokenizer.events[index];
+
+        if event.event_type == EventType::Enter && event.token_type == TokenType::Data {
+            let exit_index = index + 1;
+            let mut exit_far_index = exit_index;
+
+            // Find multiple `data` events.
+            while exit_far_index + 1 < len
+                && tokenizer.events[exit_far_index + 1].token_type == TokenType::Data
+            {
+                exit_far_index += 2;
+            }
+
+            if exit_far_index > exit_index {
+                edit_map.add(exit_index, exit_far_index - exit_index, vec![]);
+
+                // Change positional info.
+                let exit_far = &tokenizer.events[exit_far_index];
+                let point_end = exit_far.point.clone();
+                let index_end = exit_far.index;
+                let exit = &mut tokenizer.events[exit_index];
+                exit.point = point_end;
+                exit.index = index_end;
+                index = exit_far_index;
+
+                continue;
+            }
+        }
+
+        index += 1;
+    }
+
+    edit_map.consume(&mut tokenizer.events)
+}
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
new file mode 100644
index 0000000..9a7a54d
--- /dev/null
+++ b/src/construct/partial_whitespace.rs
@@ -0,0 +1,56 @@
+//! Trailing whitespace occurs in [string][] and [text][].
+//!
+//! It occurs at the start or end of the whole, or around line endings.
+//! This whitespace is ignored
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: the start and end here count as an eol.
+//! whitespace ::= 0.*space_or_tab eol 0.*space_or_tab
+//! ```
+//!
+//! ## References
+//!
+//! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js)
+//!
+//! [string]: crate::content::string
+//! [text]: crate::content::text
+
+use super::partial_space_or_tab::space_or_tab;
+use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
+
+/// Parse initial or final whitespace.
+pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.go(
+        // Nothing if there’s no whitespace.
+        space_or_tab(),
+        if matches!(
+            tokenizer.previous,
+            Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+        ) {
+            // If there’s whitespace, and we were at an eol/eof, `ok`
+            ok
+        } else {
+            // If there’s whitespace, and we were not at an eol/eof, there must be one here.
+            at_eol
+        },
+    )(tokenizer, code)
+}
+
+/// After whitespace, at an eol/eof.
+fn at_eol(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if matches!(
+        code,
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+    ) {
+        ok(tokenizer, code)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// Fine.
+fn ok(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    (State::Ok, Some(vec![code]))
+}
diff --git a/src/content/string.rs b/src/content/string.rs
index 53e88b1..cc8ee53 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -14,13 +14,16 @@
 use crate::construct::{
     character_escape::start as character_escape,
     character_reference::start as character_reference,
-    partial_data::start as data,
+    partial_data::start as data, partial_whitespace::whitespace,
 };
 use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
 
-const MARKERS: [Code; 2] = [
-    Code::Char('&'), // `character_reference`
-    Code::Char('\\'), // `character_escape`
+const MARKERS: [Code; 5] = [
+    Code::VirtualSpace, // `whitespace`
+    Code::Char('\t'), // `whitespace`
+    Code::Char(' '), // `whitespace`
+    Code::Char('&'), // `character_reference`
+    Code::Char('\\'), // `character_escape`
 ];
 
 /// Before string.
@@ -34,7 +37,11 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
         _ => tokenizer.attempt_n(
-            vec![Box::new(character_reference), Box::new(character_escape)],
+            vec![
+                Box::new(character_reference),
+                Box::new(character_escape),
+                Box::new(whitespace),
+            ],
             |ok| Box::new(if ok { start } else { before_data }),
         )(tokenizer, code),
     }
 }
diff --git a/src/content/text.rs b/src/content/text.rs
index 183072e..c3f4e1b 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -24,18 +24,21 @@ use crate::construct::{
     hard_break_trailing::start as hard_break_trailing, html_text::start as html_text,
     label_end::start as label_end, label_start_image::start as label_start_image,
     label_start_link::start as label_start_link, partial_data::start as data,
+    partial_whitespace::whitespace,
 };
 use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
 
-const MARKERS: [Code; 8] = [
-    Code::Char(' '), // `hard_break_trailing`
-    Code::Char('!'), // `label_start_image`
-    Code::Char('&'), // `character_reference`
-    Code::Char('<'), // `autolink`, `html_text`
-    Code::Char('['), // `label_start_link`
-    Code::Char('\\'), // `character_escape`, `hard_break_escape`
-    Code::Char(']'), // `label_end`
-    Code::Char('`'), // `code_text`
+const MARKERS: [Code; 10] = [
+    Code::VirtualSpace, // `whitespace`
+    Code::Char('\t'), // `whitespace`
+    Code::Char(' '), // `hard_break_trailing`, `whitespace`
+    Code::Char('!'), // `label_start_image`
+    Code::Char('&'), // `character_reference`
+    Code::Char('<'), // `autolink`, `html_text`
+    Code::Char('['), // `label_start_link`
+    Code::Char('\\'), // `character_escape`, `hard_break_escape`
+    Code::Char(']'), // `label_end`
+    Code::Char('`'), // `code_text`
 ];
 
 /// Before text.
@@ -62,6 +65,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
                 Box::new(label_end),
                 Box::new(label_start_image),
                 Box::new(label_start_link),
+                Box::new(whitespace),
             ],
             |ok| Box::new(if ok { start } else { before_data }),
         )(tokenizer, code),
     }
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index fe69366..817c1de 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1796,6 +1796,13 @@ impl<'a> Tokenizer<'a> {
         }
     }
 
+    pub fn register_resolver_before(&mut self, id: String, resolver: Box<Resolver>) {
+        if !self.resolver_ids.contains(&id) {
+            self.resolver_ids.push(id);
+            self.resolvers.insert(0, resolver);
+        }
+    }
+
     /// Prepare for a next code to get consumed.
     fn expect(&mut self, code: Code) {
         assert!(self.consumed, "expected previous character to be consumed");
@@ -1901,7 +1908,7 @@
         let point = self.point.clone();
 
         assert!(
-            current_token != previous.token_type || previous.point != point,
+            current_token != previous.token_type || previous.index != self.index,
             "expected non-empty token"
         );
 
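The ordering change in `src/tokenizer.rs` above is the core of this diff: `register_resolver` appends a resolver, while the new `register_resolver_before` prepends it, so the media resolver now runs ahead of the data-merging resolver that this diff also adds. Below is a minimal, self-contained sketch of that ordering behaviour; it uses a made-up `Registry` type and a simplified `Resolver` signature, not the crate's real `Tokenizer` or `Resolver`.

```rust
// Minimal sketch (assumed, simplified types): models how `register_resolver`
// appends while `register_resolver_before` prepends, so the prepended
// resolver runs first even though it was registered later.
type Resolver = Box<dyn Fn() -> &'static str>;

struct Registry {
    resolver_ids: Vec<String>,
    resolvers: Vec<Resolver>,
}

impl Registry {
    fn new() -> Self {
        Registry {
            resolver_ids: vec![],
            resolvers: vec![],
        }
    }

    // Like `register_resolver` in the diff: appended, runs after earlier ones.
    fn register_resolver(&mut self, id: String, resolver: Resolver) {
        if !self.resolver_ids.contains(&id) {
            self.resolver_ids.push(id);
            self.resolvers.push(resolver);
        }
    }

    // Like the new `register_resolver_before`: prepended, runs before the rest.
    fn register_resolver_before(&mut self, id: String, resolver: Resolver) {
        if !self.resolver_ids.contains(&id) {
            self.resolver_ids.push(id);
            self.resolvers.insert(0, resolver);
        }
    }

    // Run every registered resolver in order and collect what it returns.
    fn run_all(&self) -> Vec<&'static str> {
        self.resolvers.iter().map(|resolver| resolver()).collect()
    }
}

fn main() {
    let mut registry = Registry::new();
    registry.register_resolver("data".to_string(), Box::new(|| "data"));
    registry.register_resolver_before("media".to_string(), Box::new(|| "media"));
    // "media" was registered second but runs first.
    assert_eq!(registry.run_all(), vec!["media", "data"]);
}
```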
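The other half of the diff is the new `resolve` in `src/construct/partial_data.rs`, which collapses runs of adjacent `Data` enter/exit pairs into one pair (the real code records the deletions in an `EditMap` and copies the far exit's `point` and `index` onto the surviving exit). The sketch below models only the merging logic on a toy event type, not the crate's `Event` or `EditMap`.

```rust
// Simplified model (assumed event type) of merging adjacent data events.
#[derive(Debug, PartialEq, Clone)]
enum Ev {
    Enter(&'static str),
    Exit(&'static str),
}

/// Merge runs of adjacent `Data` enter/exit pairs into a single pair.
fn merge_adjacent_data(events: &[Ev]) -> Vec<Ev> {
    let mut out = Vec::with_capacity(events.len());
    let mut index = 0;

    while index < events.len() {
        if events[index] == Ev::Enter("Data") {
            // Skip forward over consecutive data exit/enter pairs,
            // mirroring the `exit_far_index += 2` walk in the diff.
            let mut far = index + 1;
            while far + 1 < events.len() && events[far + 1] == Ev::Enter("Data") {
                far += 2;
            }
            out.push(Ev::Enter("Data"));
            out.push(events[far].clone()); // the last `Exit(Data)` of the run
            index = far + 1;
        } else {
            out.push(events[index].clone());
            index += 1;
        }
    }

    out
}

fn main() {
    let events = vec![
        Ev::Enter("Data"),
        Ev::Exit("Data"),
        Ev::Enter("Data"),
        Ev::Exit("Data"),
        Ev::Enter("Data"),
        Ev::Exit("Data"),
    ];
    // Three adjacent data tokens collapse into one.
    assert_eq!(
        merge_adjacent_data(&events),
        vec![Ev::Enter("Data"), Ev::Exit("Data")]
    );
}
```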