diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-26 16:37:13 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-26 16:37:13 +0200 |
commit | a6b317ac7fbc95b8584056b3cebffbf9d1bba2c6 (patch) | |
tree | fd7be2fe6d7355d3aafaf8b731f0e0b48624debc /src/construct | |
parent | 297784cb925b1196d89479fa24c898703ae598d6 (diff) | |
download | markdown-rs-a6b317ac7fbc95b8584056b3cebffbf9d1bba2c6.tar.gz markdown-rs-a6b317ac7fbc95b8584056b3cebffbf9d1bba2c6.tar.bz2 markdown-rs-a6b317ac7fbc95b8584056b3cebffbf9d1bba2c6.zip |
Refactor to drastically improve perf around whitespace
Diffstat (limited to '')
-rw-r--r-- | src/construct/hard_break_escape.rs | 5 | ||||
-rw-r--r-- | src/construct/hard_break_trailing.rs | 88 | ||||
-rw-r--r-- | src/construct/mod.rs | 5 | ||||
-rw-r--r-- | src/construct/partial_data.rs | 2 | ||||
-rw-r--r-- | src/construct/partial_space_or_tab.rs | 2 | ||||
-rw-r--r-- | src/construct/partial_whitespace.rs | 229 |
6 files changed, 195 insertions, 136 deletions
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index d45d685..40a83ef 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -27,7 +27,6 @@ //! ## Tokens //! //! * [`HardBreakEscape`][Token::HardBreakEscape] -//! * [`HardBreakEscapeMarker`][Token::HardBreakEscapeMarker] //! //! ## References //! @@ -37,7 +36,7 @@ //! [text]: crate::content::text //! [character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference -//! [hard_break_trailing]: crate::construct::hard_break_trailing +//! [hard_break_trailing]: crate::construct::partial_whitespace //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element use crate::token::Token; @@ -54,9 +53,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Code::Char('\\') if tokenizer.parse_state.constructs.hard_break_escape => { tokenizer.enter(Token::HardBreakEscape); - tokenizer.enter(Token::HardBreakEscapeMarker); tokenizer.consume(); - tokenizer.exit(Token::HardBreakEscapeMarker); State::Fn(Box::new(inside)) } _ => State::Nok, diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs deleted file mode 100644 index f0ef83b..0000000 --- a/src/construct/hard_break_trailing.rs +++ /dev/null @@ -1,88 +0,0 @@ -//! Hard break (trailing) is a construct that occurs in the [text][] content -//! type. -//! -//! They’re formed with the following BNF: -//! -//! ```bnf -//! ; Restriction: followed by a line ending (that is part of the construct -//! ; instead of ending it). -//! hard_break_trailing ::= 2*' ' -//! ``` -//! -//! The minimum number of the spaces is defined in -//! [`HARD_BREAK_PREFIX_SIZE_MIN`][hard_break_prefix_size_min]. -//! -//! Hard breaks in markdown relate to the HTML element `<br>`. -//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info. -//! -//! 
It is also possible to create a hard break with a similar construct: a -//! [hard break (escape)][hard_break_escape] is a backslash followed -//! by a line ending. -//! That construct is recommended because it is similar to a -//! [character escape][character_escape] and similar to how line endings can be -//! “escaped” in other languages. -//! Trailing spaces are typically invisible in editors, or even automatically -//! removed, making hard break (trailing) hard to use. -//! -//! ## Tokens -//! -//! * [`HardBreakTrailing`][Token::HardBreakTrailing] -//! * [`HardBreakTrailingSpace`][Token::HardBreakTrailingSpace] -//! -//! ## References -//! -//! * [`lib/initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js) -//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks) -//! -//! [text]: crate::content::text -//! [hard_break_escape]: crate::construct::hard_break_escape -//! [character_escape]: crate::construct::character_escape -//! [hard_break_prefix_size_min]: crate::constant::HARD_BREAK_PREFIX_SIZE_MIN -//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element - -use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN; -use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; - -/// Start of a hard break (trailing). -/// -/// ```markdown -/// > | a␠␠ -/// ^ -/// | b -/// ``` -pub fn start(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Code::Char(' ') if tokenizer.parse_state.constructs.hard_break_trailing => { - tokenizer.enter(Token::HardBreakTrailing); - tokenizer.enter(Token::HardBreakTrailingSpace); - tokenizer.consume(); - State::Fn(Box::new(|t| inside(t, 1))) - } - _ => State::Nok, - } -} - -/// Inside the hard break (trailing). 
-/// -/// ```markdown -/// > | a␠␠ -/// ^ -/// | b -/// ``` -fn inside(tokenizer: &mut Tokenizer, size: usize) -> State { - match tokenizer.current { - Code::Char(' ') => { - tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, size + 1))) - } - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - if size >= HARD_BREAK_PREFIX_SIZE_MIN => - { - tokenizer.exit(Token::HardBreakTrailingSpace); - tokenizer.exit(Token::HardBreakTrailing); - State::Ok - } - _ => State::Nok, - } -} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index be9dfe3..569c609 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -25,7 +25,6 @@ //! * [code (text)][code_text] //! * [definition][] //! * [hard break (escape)][hard_break_escape] -//! * [hard break (trailing)][hard_break_trailing] //! * [heading (atx)][heading_atx] //! * [heading (setext)][heading_setext] //! * [html (flow)][html_flow] @@ -37,6 +36,9 @@ //! * [paragraph][] //! * [thematic break][thematic_break] //! +//! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by +//! > [whitespace][partial_whitespace]. +//! //! There are also several routines used in different places: //! //! 
* [data][partial_data] @@ -73,7 +75,6 @@ pub mod code_indented; pub mod code_text; pub mod definition; pub mod hard_break_escape; -pub mod hard_break_trailing; pub mod heading_atx; pub mod heading_setext; pub mod html_flow; diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 86492b5..4216276 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -41,7 +41,7 @@ fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { State::Fn(Box::new(move |t| at_break(t, stop))) } _ if stop.contains(&tokenizer.current) => { - tokenizer.register_resolver("data".to_string(), Box::new(resolve_data)); + tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data)); State::Ok } _ => { diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index a97ac29..5f1a917 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -98,7 +98,7 @@ pub fn space_or_tab_eol() -> Box<StateFn> { pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> { Box::new(move |tokenizer| { let mut info = EolInfo { - connect: false, + connect: options.connect, ok: false, options, }; diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index afff1c4..4c94c7d 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -1,62 +1,211 @@ //! Trailing whitespace occurs in [string][] and [text][]. //! -//! It occurs at the start or end of the whole, or around line endings. -//! This whitespace is ignored +//! It occurs around line endings, and, in the case of text content it also +//! occurs at the start or end of the whole. //! //! They’re formed with the following BNF: //! //! ```bnf -//! ; Restriction: the start and end here count as an eol. +//! ; Restriction: the start and end here count as an eol in the case of `text`. //! 
whitespace ::= 0.*space_or_tab eol 0.*space_or_tab //! ``` //! -//! This is similar to [`space_or_tab_eol`][space_or_tab_eol], with the main -//! difference that that *does not* require a line ending and parses any -//! `space_or_tab` with one line ending. -//! This instead *requires* the line ending (or eol). +//! Normally this whitespace is ignored. +//! In the case of text content, whitespace before a line ending that +//! consists solely of spaces, at least 2, forms a hard break (trailing). +//! +//! The minimum number of the spaces is defined in +//! [`HARD_BREAK_PREFIX_SIZE_MIN`][hard_break_prefix_size_min]. +//! +//! Hard breaks in markdown relate to the HTML element `<br>`. +//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info. +//! +//! It is also possible to create a hard break with a similar construct: a +//! [hard break (escape)][hard_break_escape] is a backslash followed +//! by a line ending. +//! That construct is recommended because it is similar to a +//! [character escape][character_escape] and similar to how line endings can be +//! “escaped” in other languages. +//! Trailing spaces are typically invisible in editors, or even automatically +//! removed, making hard break (trailing) hard to use. +//! ## Tokens +//! +//! * [`HardBreakTrailing`][Token::HardBreakTrailing] +//! * [`SpaceOrTab`][Token::SpaceOrTab] //! //! ## References //! //! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js) +//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks) //! //! [string]: crate::content::string //! [text]: crate::content::text -//! [space_or_tab_eol]: crate::construct::partial_space_or_tab::space_or_tab_eol - -use super::partial_space_or_tab::space_or_tab; -use crate::tokenizer::{Code, State, Tokenizer}; - -/// Parse initial or final whitespace. 
-pub fn whitespace(tokenizer: &mut Tokenizer) -> State { - tokenizer.go( - // Nothing if there’s no whitespace. - space_or_tab(), - if matches!( - tokenizer.previous, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - ) { - // If there’s whitespace, and we were at an eol/eof, `ok` - ok - } else { - // If there’s whitespace, and we were not at an eol/eof, there must be one here. - at_eol - }, - )(tokenizer) +//! [hard_break_escape]: crate::construct::hard_break_escape +//! [character_escape]: crate::construct::character_escape +//! [hard_break_prefix_size_min]: crate::constant::HARD_BREAK_PREFIX_SIZE_MIN +//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element + +use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN; +use crate::token::Token; +use crate::tokenizer::{Code, Event, EventType, Tokenizer}; +use crate::util::span; + +/// To do. +pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) { + move |t| resolve_whitespace(t, hard_break, trim_whole) } -/// After whitespace, at an eol/eof. -fn at_eol(tokenizer: &mut Tokenizer) -> State { - if matches!( - tokenizer.current, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - ) { - ok(tokenizer) - } else { - State::Nok +/// To do. +pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) { + let mut index = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.event_type == EventType::Exit && event.token_type == Token::Data { + let trim_start = (trim_whole && index == 1) + || (index > 1 && tokenizer.events[index - 2].token_type == Token::LineEnding); + let trim_end = (trim_whole && index == tokenizer.events.len() - 1) + || (index + 1 < tokenizer.events.len() + && tokenizer.events[index + 1].token_type == Token::LineEnding); + + trim_data(tokenizer, index, trim_start, trim_end, hard_break); + } + + index += 1; } } -/// Fine. 
-fn ok(_tokenizer: &mut Tokenizer) -> State { - State::Ok +/// To do. +#[allow(clippy::too_many_lines)] +fn trim_data( + tokenizer: &mut Tokenizer, + exit_index: usize, + trim_start: bool, + trim_end: bool, + hard_break: bool, +) { + let mut codes = span::codes( + &tokenizer.parse_state.codes, + &span::from_exit_event(&tokenizer.events, exit_index), + ); + + if trim_end { + let mut index = codes.len(); + let mut vs = 0; + let mut spaces_only = true; + while index > 0 { + match codes[index - 1] { + Code::Char(' ') => {} + Code::Char('\t') => spaces_only = false, + Code::VirtualSpace => { + vs += 1; + spaces_only = false; + } + _ => break, + } + + index -= 1; + } + + let diff = codes.len() - index; + let token_type = if spaces_only + && hard_break + && exit_index + 1 < tokenizer.events.len() + && diff >= HARD_BREAK_PREFIX_SIZE_MIN + { + Token::HardBreakTrailing + } else { + Token::SpaceOrTab + }; + + // The whole data is whitespace. + // We can be very fast: we only change the token types. 
+ if index == 0 { + tokenizer.events[exit_index - 1].token_type = token_type.clone(); + tokenizer.events[exit_index].token_type = token_type; + return; + } + + if diff > 0 { + let exit_point = tokenizer.events[exit_index].point.clone(); + let mut enter_point = exit_point.clone(); + enter_point.index -= diff; + enter_point.column -= diff - vs; + enter_point.offset -= diff - vs; + + tokenizer.map.add( + exit_index + 1, + 0, + vec![ + Event { + event_type: EventType::Enter, + token_type: token_type.clone(), + point: enter_point.clone(), + link: None, + }, + Event { + event_type: EventType::Exit, + token_type, + point: exit_point, + link: None, + }, + ], + ); + + tokenizer.events[exit_index].point = enter_point; + codes = &codes[..index]; + } + } + + if trim_start { + let mut index = 0; + let mut vs = 0; + while index < codes.len() { + match codes[index] { + Code::Char(' ' | '\t') => {} + Code::VirtualSpace => vs += 1, + _ => break, + } + + index += 1; + } + + // The whole data is whitespace. + // We can be very fast: we only change the token types. + if index == codes.len() { + tokenizer.events[exit_index - 1].token_type = Token::SpaceOrTab; + tokenizer.events[exit_index].token_type = Token::SpaceOrTab; + return; + } + + if index > 0 { + let enter_point = tokenizer.events[exit_index - 1].point.clone(); + let mut exit_point = enter_point.clone(); + exit_point.index += index; + exit_point.column += index - vs; + exit_point.offset += index - vs; + + tokenizer.map.add( + exit_index - 1, + 0, + vec![ + Event { + event_type: EventType::Enter, + token_type: Token::SpaceOrTab, + point: enter_point, + link: None, + }, + Event { + event_type: EventType::Exit, + token_type: Token::SpaceOrTab, + point: exit_point.clone(), + link: None, + }, + ], + ); + + tokenizer.events[exit_index - 1].point = exit_point; + } + } } |