Diffstat (limited to 'src')

 src/construct/mod.rs                |   1
 src/construct/partial_bom.rs        |  12
 src/construct/partial_whitespace.rs |  16
 src/tokenizer.rs                    | 100
 src/util/slice.rs                   |  39

 5 files changed, 90 insertions(+), 78 deletions(-)
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 7b50957..cfaca0a 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -41,6 +41,7 @@
//!
//! There are also several routines used in different places:
//!
+//! * [bom][partial_bom]
//! * [data][partial_data]
//! * [destination][partial_destination]
//! * [label][partial_label]
diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs
index 155a1a3..d92c9c1 100644
--- a/src/construct/partial_bom.rs
+++ b/src/construct/partial_bom.rs
@@ -1,4 +1,14 @@
-//! To do.
+//! A byte order mark (BOM) occurs at the start of the document.
+//!
+//! It’s the three bytes 0xEF, 0xBB, and 0xBF.
+//!
+//! ## Tokens
+//!
+//! * [`ByteOrderMark`][Token::ByteOrderMark]
+//!
+//! ## References
+//!
+//! * [`micromark/lib/preprocess.js` in `micromark`](https://github.com/micromark/micromark/blob/ed23453/packages/micromark/dev/lib/preprocess.js#L54-L60)
use crate::token::Token;
use crate::tokenizer::{State, Tokenizer};
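
The BOM check itself is tiny, since the byte sequence is fixed. A minimal sketch of the idea, as a standalone helper rather than the crate's actual tokenizer states:

```rust
/// Minimal sketch of BOM detection, assuming direct access to the raw
/// input bytes; the real construct consumes these bytes one at a time
/// through the tokenizer's state machine.
fn starts_with_bom(bytes: &[u8]) -> bool {
    // The UTF-8 byte order mark is exactly 0xEF 0xBB 0xBF.
    bytes.starts_with(&[0xEF, 0xBB, 0xBF])
}

fn main() {
    assert!(starts_with_bom(b"\xEF\xBB\xBF# Hello"));
    assert!(!starts_with_bom(b"# Hello"));
}
```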
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index 4f872ba..bf3bd4d 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -47,15 +47,18 @@
use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN;
use crate::token::Token;
-use crate::tokenizer::{Event, EventType, Tokenizer};
+use crate::tokenizer::{Event, EventType, Resolver, Tokenizer};
use crate::util::slice::{Position, Slice};
-/// To do.
-pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) {
- move |t| resolve_whitespace(t, hard_break, trim_whole)
+/// Create a resolver to handle trailing whitespace in events.
+///
+/// Performing this as a resolver instead of a tokenizer improves performance
+/// *a lot*.
+pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> Box<Resolver> {
+ Box::new(move |t| resolve_whitespace(t, hard_break, trim_whole))
}
-/// To do.
+/// Resolve whitespace.
pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) {
let mut index = 0;
@@ -76,8 +79,7 @@ pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whol
}
}
-/// To do.
-#[allow(clippy::too_many_lines)]
+/// Trim a [`Data`][Token::Data] token.
fn trim_data(
tokenizer: &mut Tokenizer,
exit_index: usize,
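
Returning `Box<Resolver>` instead of `impl Fn` means the configured closure can be stored next to other resolvers in the tokenizer's `resolvers` list. A minimal sketch of that pattern, assuming `Resolver` is roughly an alias for `dyn Fn(&mut Tokenizer)`:

```rust
// Stand-in for the crate's tokenizer; the alias below is an assumption
// about the rough shape of `Resolver` in `tokenizer.rs`.
struct Tokenizer;

type Resolver = dyn Fn(&mut Tokenizer);

fn create_resolver(flag: bool) -> Box<Resolver> {
    // `move` captures the configuration so the boxed closure is 'static.
    Box::new(move |t: &mut Tokenizer| {
        let _ = (t, flag);
    })
}

fn main() {
    // Boxed resolvers are all the same type, so they can live in one list.
    let resolvers: Vec<Box<Resolver>> = vec![create_resolver(true), create_resolver(false)];
    let mut t = Tokenizer;
    for r in &resolvers {
        r(&mut t);
    }
}
```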
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9c5e9f6..9ab4309 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -26,7 +26,7 @@ pub enum ContentType {
}
#[derive(Debug, PartialEq)]
-pub enum CharAction {
+pub enum ByteAction {
Normal(u8),
Insert(u8),
Ignore,
@@ -47,10 +47,9 @@ pub struct Point {
pub column: usize,
/// 0-indexed position in the document.
///
- /// Also an `index` into `codes`.
- // To do: call it `offset`?
+ /// Also an `index` into `bytes`.
pub index: usize,
- /// To do.
+ /// Virtual step on the same `index`.
pub vs: usize,
}
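
The `vs` field models tab expansion: a single tab byte occupies several columns, so consecutive points can share one `index` while stepping through virtual spaces. A rough illustration with a simplified stand-in for `Point` (assuming `TAB_SIZE` is 4):

```rust
/// Simplified stand-in for `Point`: `index` is a byte offset, `vs`
/// distinguishes virtual steps expanded from the same byte (a tab).
#[derive(Debug)]
struct Point {
    index: usize,
    vs: usize,
}

fn main() {
    // With TAB_SIZE = 4, a tab at byte 0 spans four columns: the tab
    // byte itself, then three virtual spaces at the same index.
    let steps = [
        Point { index: 0, vs: 0 }, // the tab byte
        Point { index: 0, vs: 1 }, // virtual space
        Point { index: 0, vs: 2 }, // virtual space
        Point { index: 0, vs: 3 }, // virtual space
        Point { index: 1, vs: 0 }, // next real byte
    ];
    for p in &steps {
        println!("{:?}", p);
    }
}
```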
@@ -171,7 +170,7 @@ pub struct Tokenizer<'a> {
column_start: Vec<(usize, usize)>,
/// First line.
first_line: usize,
- /// To do.
+ /// First point after the last line ending.
line_start: Point,
/// Track whether the current byte is already consumed (`true`) or expected
/// to be consumed (`false`).
@@ -192,7 +191,7 @@ pub struct Tokenizer<'a> {
///
/// Tracked to make sure everything’s valid.
pub stack: Vec<Token>,
- /// To do.
+ /// Edit map, to batch changes.
pub map: EditMap,
/// List of attached resolvers, which will be called when done feeding,
/// to clean events.
@@ -323,15 +322,15 @@ impl<'a> Tokenizer<'a> {
/// Move to the next (virtual) byte.
pub fn move_one(&mut self) {
match byte_action(self.parse_state.bytes, &self.point) {
- CharAction::Ignore => {
+ ByteAction::Ignore => {
self.point.index += 1;
}
- CharAction::Insert(byte) => {
+ ByteAction::Insert(byte) => {
self.previous = Some(byte);
self.point.column += 1;
self.point.vs += 1;
}
- CharAction::Normal(byte) => {
+ ByteAction::Normal(byte) => {
self.previous = Some(byte);
self.point.vs = 0;
self.point.index += 1;
@@ -386,7 +385,7 @@ impl<'a> Tokenizer<'a> {
while point.index > 0 {
point.index -= 1;
let action = byte_action(self.parse_state.bytes, &point);
- if !matches!(action, CharAction::Ignore) {
+ if !matches!(action, ByteAction::Ignore) {
point.index += 1;
break;
}
@@ -439,7 +438,7 @@ impl<'a> Tokenizer<'a> {
while point.index > 0 {
point.index -= 1;
let action = byte_action(self.parse_state.bytes, &point);
- if !matches!(action, CharAction::Ignore) {
+ if !matches!(action, ByteAction::Ignore) {
point.index += 1;
break;
}
@@ -636,6 +635,7 @@ impl<'a> Tokenizer<'a> {
///
/// This is set up to support repeatedly calling `feed`, and thus streaming
/// markdown into the state machine, and normally pauses after feeding.
+ // Note: accept `vs` here if needed?
pub fn push(
&mut self,
min: usize,
@@ -644,8 +644,6 @@ impl<'a> Tokenizer<'a> {
) -> State {
debug_assert!(!self.resolved, "cannot feed after drain");
debug_assert!(min >= self.point.index, "cannot move backwards");
-
- // To do: accept `vs`?
self.move_to((min, 0));
let mut state = State::Fn(Box::new(start));
@@ -654,16 +652,11 @@ impl<'a> Tokenizer<'a> {
match state {
State::Ok | State::Nok => break,
State::Fn(func) => match byte_action(self.parse_state.bytes, &self.point) {
- CharAction::Ignore => {
+ ByteAction::Ignore => {
state = State::Fn(Box::new(func));
self.move_one();
}
- CharAction::Insert(byte) => {
- log::debug!("main: passing (fake): `{:?}` ({:?})", byte, self.point);
- self.expect(Some(byte));
- state = func(self);
- }
- CharAction::Normal(byte) => {
+ ByteAction::Insert(byte) | ByteAction::Normal(byte) => {
log::debug!("main: passing: `{:?}` ({:?})", byte, self.point);
self.expect(Some(byte));
state = func(self);
@@ -685,35 +678,30 @@ impl<'a> Tokenizer<'a> {
match state {
State::Ok | State::Nok => break,
State::Fn(func) => {
- // To do: clean this?
// We sometimes move back when flushing, so then we use those codes.
- if self.point.index == max {
- let byte = None;
- log::debug!("main: flushing eof: `{:?}` ({:?})", byte, self.point);
- self.expect(byte);
- state = func(self);
+ let action = if self.point.index == max {
+ None
} else {
- match byte_action(self.parse_state.bytes, &self.point) {
- CharAction::Ignore => {
- state = State::Fn(Box::new(func));
- self.move_one();
- }
- CharAction::Insert(byte) => {
- log::debug!(
- "main: flushing (fake): `{:?}` ({:?})",
- byte,
- self.point
- );
- self.expect(Some(byte));
- state = func(self);
- }
- CharAction::Normal(byte) => {
- log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
- self.expect(Some(byte));
- state = func(self);
- }
- }
+ Some(byte_action(self.parse_state.bytes, &self.point))
};
+
+ if let Some(ByteAction::Ignore) = action {
+ state = State::Fn(Box::new(func));
+ self.move_one();
+ } else {
+ let byte =
+ if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) =
+ action
+ {
+ Some(byte)
+ } else {
+ None
+ };
+
+ log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
+ self.expect(byte);
+ state = func(self);
+ }
}
}
}
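
The rewritten flush loop leans on or-patterns to collapse the `Insert` and `Normal` arms into one binding. A small self-contained demonstration of that binding trick (the enum is copied from this file; the surrounding flush logic is elided):

```rust
#[allow(dead_code)]
enum ByteAction {
    Normal(u8),
    Insert(u8),
    Ignore,
}

fn main() {
    // An or-pattern inside `if let` binds `byte` from either variant,
    // which is what lets the flush loop merge two arms into one.
    let action = Some(ByteAction::Insert(b'x'));
    let byte = if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action {
        Some(byte)
    } else {
        None
    };
    assert_eq!(byte, Some(b'x'));
}
```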
@@ -733,18 +721,18 @@ impl<'a> Tokenizer<'a> {
}
}
-fn byte_action(bytes: &[u8], point: &Point) -> CharAction {
+fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
if point.index < bytes.len() {
let byte = bytes[point.index];
if byte == b'\r' {
// CRLF.
if point.index < bytes.len() - 1 && bytes[point.index + 1] == b'\n' {
- CharAction::Ignore
+ ByteAction::Ignore
}
// CR.
else {
- CharAction::Normal(b'\n')
+ ByteAction::Normal(b'\n')
}
} else if byte == b'\t' {
let remainder = point.column % TAB_SIZE;
@@ -757,19 +745,17 @@ fn byte_action(bytes: &[u8], point: &Point) -> CharAction {
// On the tab itself, first send it.
if point.vs == 0 {
if vs == 0 {
- CharAction::Normal(byte)
+ ByteAction::Normal(byte)
} else {
- CharAction::Insert(byte)
+ ByteAction::Insert(byte)
}
} else if vs == 0 {
- CharAction::Normal(b' ')
+ ByteAction::Normal(b' ')
} else {
- CharAction::Insert(b' ')
+ ByteAction::Insert(b' ')
}
- }
- // VS?
- else {
- CharAction::Normal(byte)
+ } else {
+ ByteAction::Normal(byte)
}
} else {
unreachable!("out of bounds")
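
In short, `byte_action` normalizes line endings and expands tabs: the CR of a CRLF pair is ignored, a lone CR becomes `\n`, and a tab yields one `Normal` byte followed by `Insert`ed virtual spaces up to the next tab stop. A runnable sketch of just the line-ending branch (the enum mirrors the one above; tab handling is elided):

```rust
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum ByteAction {
    Normal(u8),
    Insert(u8),
    Ignore,
}

/// Sketch of the line-ending normalization only: CRLF's CR is skipped,
/// a lone CR becomes LF, everything else passes through unchanged.
fn line_ending_action(bytes: &[u8], index: usize) -> ByteAction {
    let byte = bytes[index];
    if byte == b'\r' {
        if index + 1 < bytes.len() && bytes[index + 1] == b'\n' {
            ByteAction::Ignore // CRLF: drop the CR, the LF follows.
        } else {
            ByteAction::Normal(b'\n') // Lone CR: normalize to LF.
        }
    } else {
        ByteAction::Normal(byte)
    }
}

fn main() {
    assert_eq!(line_ending_action(b"a\r\nb", 1), ByteAction::Ignore);
    assert_eq!(line_ending_action(b"a\rb", 1), ByteAction::Normal(b'\n'));
    assert_eq!(line_ending_action(b"abc", 0), ByteAction::Normal(b'a'));
}
```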
diff --git a/src/util/slice.rs b/src/util/slice.rs
index 13b664d..f287978 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -43,7 +43,11 @@ impl<'a> Position<'a> {
}
}
- /// To do.
+ /// Turn a position into indices.
+ ///
+ /// Indices are places in `bytes` where this position starts and ends.
+ ///
+ /// > 👉 **Note**: indices cannot represent virtual spaces.
pub fn to_indices(&self) -> (usize, usize) {
(self.start.index, self.end.index)
}
@@ -60,7 +64,7 @@ pub struct Slice<'a> {
}
impl<'a> Slice<'a> {
- /// Get the slice belonging to a position.
+ /// Get the slice belonging to a point.
pub fn from_point(bytes: &'a [u8], point: &Point) -> Slice<'a> {
let mut before = point.vs;
let mut start = point.index;
@@ -84,13 +88,13 @@ impl<'a> Slice<'a> {
}
}
- /// To do.
+ /// Create a slice from one index.
+ ///
+ /// Indices are places in `bytes`.
+ ///
+ /// > 👉 **Note**: indices cannot represent virtual spaces.
pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> {
- Slice {
- bytes: &bytes[index..=index],
- before: 0,
- after: 0,
- }
+ Slice::from_indices(bytes, index, index + 1)
}
/// Get the slice belonging to a position.
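
With this change, `from_index` is a thin wrapper: a one-byte slice is simply `from_indices(bytes, index, index + 1)`. A tiny check of the range equivalence it relies on:

```rust
fn main() {
    let bytes = b"markdown";
    // An inclusive one-byte range and the equivalent half-open range
    // view the same single byte, which is all `from_index` needs.
    assert_eq!(&bytes[4..=4], &bytes[4..5]);
}
```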
@@ -121,7 +125,11 @@ impl<'a> Slice<'a> {
}
}
- /// To do.
+ /// Create a slice from two indices.
+ ///
+ /// Indices are places in `bytes`.
+ ///
+ /// > 👉 **Note**: indices cannot represent virtual spaces.
pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
Slice {
bytes: &bytes[start..end],
@@ -130,12 +138,13 @@ impl<'a> Slice<'a> {
}
}
- /// To do.
+ /// Get the size of this slice, including virtual spaces.
pub fn len(&self) -> usize {
self.bytes.len() + self.before + self.after
}
- /// To do.
+ /// Get the first byte in this slice, representing a virtual space as a
+ /// space.
pub fn head(&self) -> Option<u8> {
if self.before > 0 {
Some(b' ')
@@ -146,12 +155,16 @@ impl<'a> Slice<'a> {
}
}
- // To do:
+ /// Turn the slice into a `&str`.
+ ///
+ /// Does not support virtual spaces.
pub fn as_str(&self) -> &str {
str::from_utf8(self.bytes).unwrap()
}
- /// To do.
+ /// Turn the slice into a `String`.
+ ///
+ /// Supports virtual spaces.
pub fn serialize(&self) -> String {
let mut string = String::with_capacity(self.len());
let mut index = self.before;
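
The part of `serialize` shown above starts from `self.before`, because virtual spaces on either end have to be materialized as literal spaces. A simplified model of that behavior (the real method walks `bytes` byte by byte; this sketch just concatenates):

```rust
/// Simplified model of `Slice::serialize`: virtual spaces before and
/// after the real bytes are rendered as literal spaces.
struct Slice<'a> {
    bytes: &'a [u8],
    before: usize,
    after: usize,
}

impl<'a> Slice<'a> {
    fn serialize(&self) -> String {
        let mut string = String::with_capacity(self.bytes.len() + self.before + self.after);
        string.push_str(&" ".repeat(self.before));
        string.push_str(std::str::from_utf8(self.bytes).expect("valid UTF-8"));
        string.push_str(&" ".repeat(self.after));
        string
    }
}

fn main() {
    // Two virtual spaces (from a tab) precede the word "foo".
    let slice = Slice { bytes: b"foo", before: 2, after: 0 };
    assert_eq!(slice.serialize(), "  foo");
}
```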