aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-26 16:37:13 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-26 16:37:13 +0200
commita6b317ac7fbc95b8584056b3cebffbf9d1bba2c6 (patch)
treefd7be2fe6d7355d3aafaf8b731f0e0b48624debc /src/construct
parent297784cb925b1196d89479fa24c898703ae598d6 (diff)
downloadmarkdown-rs-a6b317ac7fbc95b8584056b3cebffbf9d1bba2c6.tar.gz
markdown-rs-a6b317ac7fbc95b8584056b3cebffbf9d1bba2c6.tar.bz2
markdown-rs-a6b317ac7fbc95b8584056b3cebffbf9d1bba2c6.zip
Refactor to drastically improve perf around whitespace
Diffstat (limited to 'src/construct')
-rw-r--r--src/construct/hard_break_escape.rs5
-rw-r--r--src/construct/hard_break_trailing.rs88
-rw-r--r--src/construct/mod.rs5
-rw-r--r--src/construct/partial_data.rs2
-rw-r--r--src/construct/partial_space_or_tab.rs2
-rw-r--r--src/construct/partial_whitespace.rs229
6 files changed, 195 insertions, 136 deletions
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index d45d685..40a83ef 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -27,7 +27,6 @@
//! ## Tokens
//!
//! * [`HardBreakEscape`][Token::HardBreakEscape]
-//! * [`HardBreakEscapeMarker`][Token::HardBreakEscapeMarker]
//!
//! ## References
//!
@@ -37,7 +36,7 @@
//! [text]: crate::content::text
//! [character_escape]: crate::construct::character_escape
//! [character_reference]: crate::construct::character_reference
-//! [hard_break_trailing]: crate::construct::hard_break_trailing
+//! [hard_break_trailing]: crate::construct::partial_whitespace
//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
use crate::token::Token;
@@ -54,9 +53,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
Code::Char('\\') if tokenizer.parse_state.constructs.hard_break_escape => {
tokenizer.enter(Token::HardBreakEscape);
- tokenizer.enter(Token::HardBreakEscapeMarker);
tokenizer.consume();
- tokenizer.exit(Token::HardBreakEscapeMarker);
State::Fn(Box::new(inside))
}
_ => State::Nok,
diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs
deleted file mode 100644
index f0ef83b..0000000
--- a/src/construct/hard_break_trailing.rs
+++ /dev/null
@@ -1,88 +0,0 @@
-//! Hard break (trailing) is a construct that occurs in the [text][] content
-//! type.
-//!
-//! They’re formed with the following BNF:
-//!
-//! ```bnf
-//! ; Restriction: followed by a line ending (that is part of the construct
-//! ; instead of ending it).
-//! hard_break_trailing ::= 2*' '
-//! ```
-//!
-//! The minimum number of the spaces is defined in
-//! [`HARD_BREAK_PREFIX_SIZE_MIN`][hard_break_prefix_size_min].
-//!
-//! Hard breaks in markdown relate to the HTML element `<br>`.
-//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info.
-//!
-//! It is also possible to create a hard break with a similar construct: a
-//! [hard break (escape)][hard_break_escape] is a backslash followed
-//! by a line ending.
-//! That construct is recommended because it is similar to a
-//! [character escape][character_escape] and similar to how line endings can be
-//! “escaped” in other languages.
-//! Trailing spaces are typically invisible in editors, or even automatically
-//! removed, making hard break (trailing) hard to use.
-//!
-//! ## Tokens
-//!
-//! * [`HardBreakTrailing`][Token::HardBreakTrailing]
-//! * [`HardBreakTrailingSpace`][Token::HardBreakTrailingSpace]
-//!
-//! ## References
-//!
-//! * [`lib/initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js)
-//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks)
-//!
-//! [text]: crate::content::text
-//! [hard_break_escape]: crate::construct::hard_break_escape
-//! [character_escape]: crate::construct::character_escape
-//! [hard_break_prefix_size_min]: crate::constant::HARD_BREAK_PREFIX_SIZE_MIN
-//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
-
-use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN;
-use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
-
-/// Start of a hard break (trailing).
-///
-/// ```markdown
-/// > | a␠␠
-/// ^
-/// | b
-/// ```
-pub fn start(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Code::Char(' ') if tokenizer.parse_state.constructs.hard_break_trailing => {
- tokenizer.enter(Token::HardBreakTrailing);
- tokenizer.enter(Token::HardBreakTrailingSpace);
- tokenizer.consume();
- State::Fn(Box::new(|t| inside(t, 1)))
- }
- _ => State::Nok,
- }
-}
-
-/// Inside the hard break (trailing).
-///
-/// ```markdown
-/// > | a␠␠
-/// ^
-/// | b
-/// ```
-fn inside(tokenizer: &mut Tokenizer, size: usize) -> State {
- match tokenizer.current {
- Code::Char(' ') => {
- tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, size + 1)))
- }
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- if size >= HARD_BREAK_PREFIX_SIZE_MIN =>
- {
- tokenizer.exit(Token::HardBreakTrailingSpace);
- tokenizer.exit(Token::HardBreakTrailing);
- State::Ok
- }
- _ => State::Nok,
- }
-}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index be9dfe3..569c609 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -25,7 +25,6 @@
//! * [code (text)][code_text]
//! * [definition][]
//! * [hard break (escape)][hard_break_escape]
-//! * [hard break (trailing)][hard_break_trailing]
//! * [heading (atx)][heading_atx]
//! * [heading (setext)][heading_setext]
//! * [html (flow)][html_flow]
@@ -37,6 +36,9 @@
//! * [paragraph][]
//! * [thematic break][thematic_break]
//!
+//! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
+//! > [whitespace][partial_whitespace].
+//!
//! There are also several routines used in different places:
//!
//! * [data][partial_data]
@@ -73,7 +75,6 @@ pub mod code_indented;
pub mod code_text;
pub mod definition;
pub mod hard_break_escape;
-pub mod hard_break_trailing;
pub mod heading_atx;
pub mod heading_setext;
pub mod html_flow;
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index 86492b5..4216276 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -41,7 +41,7 @@ fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State {
State::Fn(Box::new(move |t| at_break(t, stop)))
}
_ if stop.contains(&tokenizer.current) => {
- tokenizer.register_resolver("data".to_string(), Box::new(resolve_data));
+ tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data));
State::Ok
}
_ => {
diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs
index a97ac29..5f1a917 100644
--- a/src/construct/partial_space_or_tab.rs
+++ b/src/construct/partial_space_or_tab.rs
@@ -98,7 +98,7 @@ pub fn space_or_tab_eol() -> Box<StateFn> {
pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> {
Box::new(move |tokenizer| {
let mut info = EolInfo {
- connect: false,
+ connect: options.connect,
ok: false,
options,
};
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index afff1c4..4c94c7d 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -1,62 +1,211 @@
//! Trailing whitespace occurs in [string][] and [text][].
//!
-//! It occurs at the start or end of the whole, or around line endings.
-//! This whitespace is ignored
+//! It occurs around line endings and, in the case of text content, it also
+//! occurs at the start or end of the whole.
//!
//! They’re formed with the following BNF:
//!
//! ```bnf
-//! ; Restriction: the start and end here count as an eol.
+//! ; Restriction: the start and end here count as an eol in the case of `text`.
//! whitespace ::= 0.*space_or_tab eol 0.*space_or_tab
//! ```
//!
-//! This is similar to [`space_or_tab_eol`][space_or_tab_eol], with the main
-//! difference that that *does not* require a line ending and parses any
-//! `space_or_tab` with one line ending.
-//! This instead *requires* the line ending (or eol).
+//! Normally this whitespace is ignored.
+//! In the case of text content, whitespace before a line ending that
+//! consists solely of at least 2 spaces forms a hard break (trailing).
+//!
+//! The minimum number of spaces is defined in
+//! [`HARD_BREAK_PREFIX_SIZE_MIN`][hard_break_prefix_size_min].
+//!
+//! Hard breaks in markdown relate to the HTML element `<br>`.
+//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info.
+//!
+//! It is also possible to create a hard break with a similar construct: a
+//! [hard break (escape)][hard_break_escape] is a backslash followed
+//! by a line ending.
+//! That construct is recommended because it is similar to a
+//! [character escape][character_escape] and similar to how line endings can be
+//! “escaped” in other languages.
+//! Trailing spaces are typically invisible in editors, or even automatically
+//! removed, making hard break (trailing) hard to use.
+//! ## Tokens
+//!
+//! * [`HardBreakTrailing`][Token::HardBreakTrailing]
+//! * [`SpaceOrTab`][Token::SpaceOrTab]
//!
//! ## References
//!
//! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js)
+//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#hard-line-breaks)
//!
//! [string]: crate::content::string
//! [text]: crate::content::text
-//! [space_or_tab_eol]: crate::construct::partial_space_or_tab::space_or_tab_eol
-
-use super::partial_space_or_tab::space_or_tab;
-use crate::tokenizer::{Code, State, Tokenizer};
-
-/// Parse initial or final whitespace.
-pub fn whitespace(tokenizer: &mut Tokenizer) -> State {
- tokenizer.go(
- // Nothing if there’s no whitespace.
- space_or_tab(),
- if matches!(
- tokenizer.previous,
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- ) {
- // If there’s whitespace, and we were at an eol/eof, `ok`
- ok
- } else {
- // If there’s whitespace, and we were not at an eol/eof, there must be one here.
- at_eol
- },
- )(tokenizer)
+//! [hard_break_escape]: crate::construct::hard_break_escape
+//! [character_escape]: crate::construct::character_escape
+//! [hard_break_prefix_size_min]: crate::constant::HARD_BREAK_PREFIX_SIZE_MIN
+//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
+
+use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN;
+use crate::token::Token;
+use crate::tokenizer::{Code, Event, EventType, Tokenizer};
+use crate::util::span;
+
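+/// Create a resolver closure that calls [`resolve_whitespace`] with the given `hard_break` and `trim_whole` flags.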
+pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) {
+ move |t| resolve_whitespace(t, hard_break, trim_whole)
}
-/// After whitespace, at an eol/eof.
-fn at_eol(tokenizer: &mut Tokenizer) -> State {
- if matches!(
- tokenizer.current,
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- ) {
- ok(tokenizer)
- } else {
- State::Nok
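+/// Walk all events and trim whitespace at the edges of each `Data` that borders a line ending (or, with `trim_whole`, the start or end of the whole).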
+pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) {
+ let mut index = 0;
+
+ while index < tokenizer.events.len() {
+ let event = &tokenizer.events[index];
+
+ if event.event_type == EventType::Exit && event.token_type == Token::Data {
+ let trim_start = (trim_whole && index == 1)
+ || (index > 1 && tokenizer.events[index - 2].token_type == Token::LineEnding);
+ let trim_end = (trim_whole && index == tokenizer.events.len() - 1)
+ || (index + 1 < tokenizer.events.len()
+ && tokenizer.events[index + 1].token_type == Token::LineEnding);
+
+ trim_data(tokenizer, index, trim_start, trim_end, hard_break);
+ }
+
+ index += 1;
}
}
-/// Fine.
-fn ok(_tokenizer: &mut Tokenizer) -> State {
- State::Ok
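+/// Turn leading and/or trailing whitespace of one `Data` into `SpaceOrTab` events (or, for trailing spaces, `HardBreakTrailing`).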
+#[allow(clippy::too_many_lines)]
+fn trim_data(
+ tokenizer: &mut Tokenizer,
+ exit_index: usize,
+ trim_start: bool,
+ trim_end: bool,
+ hard_break: bool,
+) {
+ let mut codes = span::codes(
+ &tokenizer.parse_state.codes,
+ &span::from_exit_event(&tokenizer.events, exit_index),
+ );
+
+ if trim_end {
+ let mut index = codes.len();
+ let mut vs = 0;
+ let mut spaces_only = true;
+ while index > 0 {
+ match codes[index - 1] {
+ Code::Char(' ') => {}
+ Code::Char('\t') => spaces_only = false,
+ Code::VirtualSpace => {
+ vs += 1;
+ spaces_only = false;
+ }
+ _ => break,
+ }
+
+ index -= 1;
+ }
+
+ let diff = codes.len() - index;
+ let token_type = if spaces_only
+ && hard_break
+ && exit_index + 1 < tokenizer.events.len()
+ && diff >= HARD_BREAK_PREFIX_SIZE_MIN
+ {
+ Token::HardBreakTrailing
+ } else {
+ Token::SpaceOrTab
+ };
+
+ // The whole data is whitespace.
+ // We can be very fast: we only change the token types.
+ if index == 0 {
+ tokenizer.events[exit_index - 1].token_type = token_type.clone();
+ tokenizer.events[exit_index].token_type = token_type;
+ return;
+ }
+
+ if diff > 0 {
+ let exit_point = tokenizer.events[exit_index].point.clone();
+ let mut enter_point = exit_point.clone();
+ enter_point.index -= diff;
+ enter_point.column -= diff - vs;
+ enter_point.offset -= diff - vs;
+
+ tokenizer.map.add(
+ exit_index + 1,
+ 0,
+ vec![
+ Event {
+ event_type: EventType::Enter,
+ token_type: token_type.clone(),
+ point: enter_point.clone(),
+ link: None,
+ },
+ Event {
+ event_type: EventType::Exit,
+ token_type,
+ point: exit_point,
+ link: None,
+ },
+ ],
+ );
+
+ tokenizer.events[exit_index].point = enter_point;
+ codes = &codes[..index];
+ }
+ }
+
+ if trim_start {
+ let mut index = 0;
+ let mut vs = 0;
+ while index < codes.len() {
+ match codes[index] {
+ Code::Char(' ' | '\t') => {}
+ Code::VirtualSpace => vs += 1,
+ _ => break,
+ }
+
+ index += 1;
+ }
+
+ // The whole data is whitespace.
+ // We can be very fast: we only change the token types.
+ if index == codes.len() {
+ tokenizer.events[exit_index - 1].token_type = Token::SpaceOrTab;
+ tokenizer.events[exit_index].token_type = Token::SpaceOrTab;
+ return;
+ }
+
+ if index > 0 {
+ let enter_point = tokenizer.events[exit_index - 1].point.clone();
+ let mut exit_point = enter_point.clone();
+ exit_point.index += index;
+ exit_point.column += index - vs;
+ exit_point.offset += index - vs;
+
+ tokenizer.map.add(
+ exit_index - 1,
+ 0,
+ vec![
+ Event {
+ event_type: EventType::Enter,
+ token_type: Token::SpaceOrTab,
+ point: enter_point,
+ link: None,
+ },
+ Event {
+ event_type: EventType::Exit,
+ token_type: Token::SpaceOrTab,
+ point: exit_point.clone(),
+ link: None,
+ },
+ ],
+ );
+
+ tokenizer.events[exit_index - 1].point = exit_point;
+ }
+ }
}