aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/construct/label_end.rs2
-rw-r--r--src/construct/label_start_image.rs2
-rw-r--r--src/construct/label_start_link.rs2
-rw-r--r--src/construct/mod.rs2
-rw-r--r--src/construct/partial_data.rs51
-rw-r--r--src/construct/partial_whitespace.rs56
-rw-r--r--src/content/string.rs17
-rw-r--r--src/content/text.rs22
-rw-r--r--src/tokenizer.rs9
9 files changed, 143 insertions, 20 deletions
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 888355b..0da12b8 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -510,7 +510,7 @@ fn ok(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
info.media.end.1 = tokenizer.events.len() - 1;
tokenizer.media_list.push(info.media);
- tokenizer.register_resolver("media".to_string(), Box::new(resolve_media));
+ tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media));
(State::Ok, Some(vec![code]))
}
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index 7725334..a45205a 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -67,7 +67,7 @@ pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
balanced: false,
inactive: false,
});
- tokenizer.register_resolver("media".to_string(), Box::new(resolve_media));
+ tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media));
(State::Ok, None)
}
_ => (State::Nok, None),
diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs
index 46d7c9c..6c4d7ae 100644
--- a/src/construct/label_start_link.rs
+++ b/src/construct/label_start_link.rs
@@ -49,7 +49,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
balanced: false,
inactive: false,
});
- tokenizer.register_resolver("media".to_string(), Box::new(resolve_media));
+ tokenizer.register_resolver_before("media".to_string(), Box::new(resolve_media));
(State::Ok, None)
}
_ => (State::Nok, None),
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 8565b2f..9e3dfb0 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -44,6 +44,7 @@
//! * [label][partial_label]
//! * [space or tab][partial_space_or_tab]
//! * [title][partial_title]
+//! * [whitespace][partial_whitespace]
//!
//! Each construct maintained here is explained with a BNF diagram.
//! For example, the docs for [character escape][character_escape] contain:
@@ -83,4 +84,5 @@ pub mod partial_destination;
pub mod partial_label;
pub mod partial_space_or_tab;
pub mod partial_title;
+pub mod partial_whitespace;
pub mod thematic_break;
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index d83787a..9f99570 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -8,7 +8,8 @@
// To do: pass token types in?
-use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::edit_map::EditMap;
/// At the beginning of data.
///
@@ -39,7 +40,10 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnRe
tokenizer.exit(TokenType::LineEnding);
(State::Fn(Box::new(|t, c| at_break(t, c, stop))), None)
}
- _ if stop.contains(&code) => (State::Ok, Some(vec![code])),
+ _ if stop.contains(&code) => {
+ tokenizer.register_resolver("data".to_string(), Box::new(resolve));
+ (State::Ok, Some(vec![code]))
+ }
_ => {
tokenizer.enter(TokenType::Data);
data(tokenizer, code, stop)
@@ -67,3 +71,46 @@ fn data(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult
(State::Fn(Box::new(|t, c| data(t, c, stop))), None)
}
}
+
+/// Merge adjacent data events; returns the events produced by consuming the edit map.
+pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
+ let mut edit_map = EditMap::new();
+ let len = tokenizer.events.len();
+ let mut index = 0;
+
+ // Walk all events, looking for the enter of a `data` event that starts a run.
+ while index < len {
+ let event = &tokenizer.events[index];
+
+ if event.event_type == EventType::Enter && event.token_type == TokenType::Data {
+ let exit_index = index + 1; // Assumes the exit directly follows its enter.
+ let mut exit_far_index = exit_index;
+
+ // Extend the run while the event right after the current exit is `data` too.
+ while exit_far_index + 1 < len
+ && tokenizer.events[exit_far_index + 1].token_type == TokenType::Data
+ {
+ exit_far_index += 2; // Step over that next enter and its exit (assumed adjacent).
+ }
+
+ if exit_far_index > exit_index {
+ edit_map.add(exit_index, exit_far_index - exit_index, vec![]);
+
+ // Copy the far exit’s `point` and `index` onto the first exit of the run.
+ let exit_far = &tokenizer.events[exit_far_index];
+ let point_end = exit_far.point.clone();
+ let index_end = exit_far.index;
+ let exit = &mut tokenizer.events[exit_index];
+ exit.point = point_end;
+ exit.index = index_end;
+ index = exit_far_index; // Resume scanning at the far exit.
+
+ continue;
+ }
+ }
+
+ index += 1;
+ }
+
+ edit_map.consume(&mut tokenizer.events)
+}
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
new file mode 100644
index 0000000..9a7a54d
--- /dev/null
+++ b/src/construct/partial_whitespace.rs
@@ -0,0 +1,56 @@
+//! Trailing whitespace occurs in [string][] and [text][].
+//!
+//! It occurs at the start or end of the whole, or around line endings.
+//! This whitespace is ignored.
+//!
+//! It’s formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: the start and end here count as an eol.
+//! whitespace ::= 0.*space_or_tab eol 0.*space_or_tab
+//! ```
+//!
+//! ## References
+//!
+//! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js)
+//!
+//! [string]: crate::content::string
+//! [text]: crate::content::text
+
+use super::partial_space_or_tab::space_or_tab;
+use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
+
+/// Parse whitespace that sits right after or right before an eol/eof.
+pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.go(
+ // First parse `space_or_tab`; nothing happens here if there’s no whitespace.
+ space_or_tab(),
+ if matches!(
+ tokenizer.previous,
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ ) {
+ // Whitespace found and the previous code was an eol/eof: accept with `ok`.
+ ok
+ } else {
+ // Whitespace found but the previous code was not an eol/eof: `at_eol` requires one here.
+ at_eol
+ },
+ )(tokenizer, code)
+}
+
+/// After whitespace: continue to `ok` when `code` is an eol/eof, otherwise `State::Nok`.
+fn at_eol(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if matches!(
+ code,
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ ) {
+ ok(tokenizer, code)
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// Fine: return `State::Ok` with `code` as the only entry of the returned vec.
+fn ok(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ (State::Ok, Some(vec![code]))
+}
diff --git a/src/content/string.rs b/src/content/string.rs
index 53e88b1..cc8ee53 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -14,13 +14,16 @@
use crate::construct::{
character_escape::start as character_escape, character_reference::start as character_reference,
- partial_data::start as data,
+ partial_data::start as data, partial_whitespace::whitespace,
};
use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
-const MARKERS: [Code; 2] = [
- Code::Char('&'), // `character_reference`
- Code::Char('\\'), // `character_escape`
+const MARKERS: [Code; 5] = [
+ Code::VirtualSpace, // `whitespace`
+ Code::Char('\t'), // `whitespace`
+ Code::Char(' '), // `whitespace`
+ Code::Char('&'), // `character_reference`
+ Code::Char('\\'), // `character_escape`
];
/// Before string.
@@ -34,7 +37,11 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
_ => tokenizer.attempt_n(
- vec![Box::new(character_reference), Box::new(character_escape)],
+ vec![
+ Box::new(character_reference),
+ Box::new(character_escape),
+ Box::new(whitespace),
+ ],
|ok| Box::new(if ok { start } else { before_data }),
)(tokenizer, code),
}
diff --git a/src/content/text.rs b/src/content/text.rs
index 183072e..c3f4e1b 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -24,18 +24,21 @@ use crate::construct::{
hard_break_trailing::start as hard_break_trailing, html_text::start as html_text,
label_end::start as label_end, label_start_image::start as label_start_image,
label_start_link::start as label_start_link, partial_data::start as data,
+ partial_whitespace::whitespace,
};
use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
-const MARKERS: [Code; 8] = [
- Code::Char(' '), // `hard_break_trailing`
- Code::Char('!'), // `label_start_image`
- Code::Char('&'), // `character_reference`
- Code::Char('<'), // `autolink`, `html_text`
- Code::Char('['), // `label_start_link`
- Code::Char('\\'), // `character_escape`, `hard_break_escape`
- Code::Char(']'), // `label_end`
- Code::Char('`'), // `code_text`
+const MARKERS: [Code; 10] = [
+ Code::VirtualSpace, // `whitespace`
+ Code::Char('\t'), // `whitespace`
+ Code::Char(' '), // `hard_break_trailing`, `whitespace`
+ Code::Char('!'), // `label_start_image`
+ Code::Char('&'), // `character_reference`
+ Code::Char('<'), // `autolink`, `html_text`
+ Code::Char('['), // `label_start_link`
+ Code::Char('\\'), // `character_escape`, `hard_break_escape`
+ Code::Char(']'), // `label_end`
+ Code::Char('`'), // `code_text`
];
/// Before text.
@@ -62,6 +65,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
Box::new(label_end),
Box::new(label_start_image),
Box::new(label_start_link),
+ Box::new(whitespace),
],
|ok| Box::new(if ok { start } else { before_data }),
)(tokenizer, code),
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index fe69366..817c1de 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1796,6 +1796,13 @@ impl<'a> Tokenizer<'a> {
}
}
+ pub fn register_resolver_before(&mut self, id: String, resolver: Box<Resolver>) { // Register `resolver` under `id`, at the front of `resolvers`.
+ if !self.resolver_ids.contains(&id) { // Do nothing when `id` was registered already.
+ self.resolver_ids.push(id);
+ self.resolvers.insert(0, resolver); // Index 0: ahead of every resolver registered so far.
+ }
+ }
+
/// Prepare for a next code to get consumed.
fn expect(&mut self, code: Code) {
assert!(self.consumed, "expected previous character to be consumed");
@@ -1901,7 +1908,7 @@ impl<'a> Tokenizer<'a> {
let point = self.point.clone();
assert!(
- current_token != previous.token_type || previous.point != point,
+ current_token != previous.token_type || previous.index != self.index,
"expected non-empty token"
);