author    | Titus Wormer <tituswormer@gmail.com> | 2022-08-01 11:27:39 +0200
committer | Titus Wormer <tituswormer@gmail.com> | 2022-08-01 11:27:39 +0200
commit    | 1bb160f9dc45c3cdbe929e8965be69bcf8415d0c (patch)
tree      | 4e3dfd7795a15082ed5218d25f852be80f3fd89c /src
parent    | e97ad954e1468b90722cf91996d7dfc069fedf78 (diff)
download  | markdown-rs-1bb160f9dc45c3cdbe929e8965be69bcf8415d0c.tar.gz
          | markdown-rs-1bb160f9dc45c3cdbe929e8965be69bcf8415d0c.tar.bz2
          | markdown-rs-1bb160f9dc45c3cdbe929e8965be69bcf8415d0c.zip
Add missing docs, refactor some code
Diffstat (limited to 'src')
-rw-r--r-- | src/construct/mod.rs                |   1
-rw-r--r-- | src/construct/partial_bom.rs        |  12
-rw-r--r-- | src/construct/partial_whitespace.rs |  16
-rw-r--r-- | src/tokenizer.rs                    | 100
-rw-r--r-- | src/util/slice.rs                   |  39
5 files changed, 90 insertions, 78 deletions
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 7b50957..cfaca0a 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -41,6 +41,7 @@
 //!
 //! There are also several routines used in different places:
 //!
+//! * [bom][partial_bom]
 //! * [data][partial_data]
 //! * [destination][partial_destination]
 //! * [label][partial_label]

diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs
index 155a1a3..d92c9c1 100644
--- a/src/construct/partial_bom.rs
+++ b/src/construct/partial_bom.rs
@@ -1,4 +1,14 @@
-//! To do.
+//! Byte order mark occurs at the start of the document.
+//!
+//! It’s the three bytes 0xEF, 0xBB, and 0xBF.
+//!
+//! ## Tokens
+//!
+//! * [`ByteOrderMark`][Token::ByteOrderMark]
+//!
+//! ## References
+//!
+//! * [`micromark/lib/preprocess.js` in `micromark`](https://github.com/micromark/micromark/blob/ed23453/packages/micromark/dev/lib/preprocess.js#L54-L60)
 
 use crate::token::Token;
 use crate::tokenizer::{State, Tokenizer};
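The new module docs pin the construct down precisely: a UTF-8 byte order mark is the bytes 0xEF, 0xBB, 0xBF at the very start of the input. As a stand-alone sketch of that check (not the crate's actual implementation, which walks the state machine one byte at a time and emits a `ByteOrderMark` token):

```rust
/// Sketch only: the construct itself consumes one byte per state and emits
/// a `ByteOrderMark` token rather than returning a bool.
fn starts_with_bom(bytes: &[u8]) -> bool {
    bytes.starts_with(&[0xEF, 0xBB, 0xBF])
}

fn main() {
    // U+FEFF encodes to exactly EF BB BF in UTF-8.
    assert!(starts_with_bom("\u{FEFF}# hi".as_bytes()));
    assert!(!starts_with_bom(b"# hi"));
}
```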
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index 4f872ba..bf3bd4d 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -47,15 +47,18 @@
 use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN;
 use crate::token::Token;
-use crate::tokenizer::{Event, EventType, Tokenizer};
+use crate::tokenizer::{Event, EventType, Resolver, Tokenizer};
 use crate::util::slice::{Position, Slice};
 
-/// To do.
-pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) {
-    move |t| resolve_whitespace(t, hard_break, trim_whole)
+/// Create a resolver to handle trailing whitespace in events.
+///
+/// Performing this as a resolver instead of a tokenizer improves performance
+/// *a lot*.
+pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> Box<Resolver> {
+    Box::new(move |t| resolve_whitespace(t, hard_break, trim_whole))
 }
 
-/// To do.
+/// Resolve whitespace.
 pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) {
     let mut index = 0;
 
@@ -76,8 +79,7 @@ pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whol
     }
 }
 
-/// To do.
-#[allow(clippy::too_many_lines)]
+/// Trim a [`Data`][Token::Data] token.
 fn trim_data(
     tokenizer: &mut Tokenizer,
     exit_index: usize,
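The signature change from `impl Fn(&mut Tokenizer)` to `Box<Resolver>` is what allows resolvers to be stored: `impl Fn` gives every configured closure its own anonymous type, while boxing erases them to a single trait-object type that fits in a `Vec`. A self-contained sketch of the pattern, using stand-in `Tokenizer` and `Resolver` definitions (the crate's real ones are richer):

```rust
/// Stand-in for the crate's `Tokenizer`; only what the sketch needs.
struct Tokenizer {
    events: Vec<String>,
}

/// Trait-object alias mirroring the shape of the `Resolver` type the diff
/// imports; the real one operates on the crate's event list.
type Resolver = dyn Fn(&mut Tokenizer);

/// Each call captures its own configuration; boxing erases the closure's
/// anonymous type so differently-configured resolvers share one type.
fn create_resolver(label: &'static str) -> Box<Resolver> {
    Box::new(move |t| t.events.push(label.to_string()))
}

fn main() {
    let resolvers: Vec<Box<Resolver>> =
        vec![create_resolver("whitespace"), create_resolver("heading")];
    let mut tokenizer = Tokenizer { events: Vec::new() };
    for resolver in &resolvers {
        resolver(&mut tokenizer);
    }
    assert_eq!(tokenizer.events, ["whitespace", "heading"]);
}
```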
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9c5e9f6..9ab4309 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -26,7 +26,7 @@ pub enum ContentType {
 }
 
 #[derive(Debug, PartialEq)]
-pub enum CharAction {
+pub enum ByteAction {
     Normal(u8),
     Insert(u8),
     Ignore,
@@ -47,10 +47,9 @@ pub struct Point {
     pub column: usize,
     /// 0-indexed position in the document.
     ///
-    /// Also an `index` into `codes`.
-    // To do: call it `offset`?
+    /// Also an `index` into `bytes`.
     pub index: usize,
-    /// To do.
+    /// Virtual step on the same `index`.
     pub vs: usize,
 }
 
@@ -171,7 +170,7 @@ pub struct Tokenizer<'a> {
     column_start: Vec<(usize, usize)>,
     /// First line.
     first_line: usize,
-    /// To do.
+    /// First point after the last line ending.
     line_start: Point,
     /// Track whether the current byte is already consumed (`true`) or expected
     /// to be consumed (`false`).
@@ -192,7 +191,7 @@ pub struct Tokenizer<'a> {
     ///
     /// Tracked to make sure everything’s valid.
     pub stack: Vec<Token>,
-    /// To do.
+    /// Edit map, to batch changes.
    pub map: EditMap,
     /// List of attached resolvers, which will be called when done feeding,
     /// to clean events.
@@ -323,15 +322,15 @@ impl<'a> Tokenizer<'a> {
     /// Move to the next (virtual) byte.
     pub fn move_one(&mut self) {
         match byte_action(self.parse_state.bytes, &self.point) {
-            CharAction::Ignore => {
+            ByteAction::Ignore => {
                 self.point.index += 1;
             }
-            CharAction::Insert(byte) => {
+            ByteAction::Insert(byte) => {
                 self.previous = Some(byte);
                 self.point.column += 1;
                 self.point.vs += 1;
             }
-            CharAction::Normal(byte) => {
+            ByteAction::Normal(byte) => {
                 self.previous = Some(byte);
                 self.point.vs = 0;
                 self.point.index += 1;
@@ -386,7 +385,7 @@ impl<'a> Tokenizer<'a> {
         while point.index > 0 {
             point.index -= 1;
             let action = byte_action(self.parse_state.bytes, &point);
-            if !matches!(action, CharAction::Ignore) {
+            if !matches!(action, ByteAction::Ignore) {
                 point.index += 1;
                 break;
             }
@@ -439,7 +438,7 @@ impl<'a> Tokenizer<'a> {
         while point.index > 0 {
             point.index -= 1;
             let action = byte_action(self.parse_state.bytes, &point);
-            if !matches!(action, CharAction::Ignore) {
+            if !matches!(action, ByteAction::Ignore) {
                 point.index += 1;
                 break;
             }
@@ -636,6 +635,7 @@ impl<'a> Tokenizer<'a> {
     ///
     /// This is set up to support repeatedly calling `feed`, and thus streaming
     /// markdown into the state machine, and normally pauses after feeding.
+    // Note: if needed: accept `vs`?
     pub fn push(
         &mut self,
         min: usize,
@@ -644,8 +644,6 @@
     ) -> State {
         debug_assert!(!self.resolved, "cannot feed after drain");
         debug_assert!(min >= self.point.index, "cannot move backwards");
-
-        // To do: accept `vs`?
         self.move_to((min, 0));
 
         let mut state = State::Fn(Box::new(start));
@@ -654,16 +652,11 @@
         match state {
             State::Ok | State::Nok => break,
             State::Fn(func) => match byte_action(self.parse_state.bytes, &self.point) {
-                CharAction::Ignore => {
+                ByteAction::Ignore => {
                     state = State::Fn(Box::new(func));
                     self.move_one();
                 }
-                CharAction::Insert(byte) => {
-                    log::debug!("main: passing (fake): `{:?}` ({:?})", byte, self.point);
-                    self.expect(Some(byte));
-                    state = func(self);
-                }
-                CharAction::Normal(byte) => {
+                ByteAction::Insert(byte) | ByteAction::Normal(byte) => {
                     log::debug!("main: passing: `{:?}` ({:?})", byte, self.point);
                     self.expect(Some(byte));
                     state = func(self);
@@ -685,35 +678,30 @@
             match state {
                 State::Ok | State::Nok => break,
                 State::Fn(func) => {
-                    // To do: clean this?
                     // We sometimes move back when flushing, so then we use those codes.
-                    if self.point.index == max {
-                        let byte = None;
-                        log::debug!("main: flushing eof: `{:?}` ({:?})", byte, self.point);
-                        self.expect(byte);
-                        state = func(self);
+                    let action = if self.point.index == max {
+                        None
                     } else {
-                        match byte_action(self.parse_state.bytes, &self.point) {
-                            CharAction::Ignore => {
-                                state = State::Fn(Box::new(func));
-                                self.move_one();
-                            }
-                            CharAction::Insert(byte) => {
-                                log::debug!(
-                                    "main: flushing (fake): `{:?}` ({:?})",
-                                    byte,
-                                    self.point
-                                );
-                                self.expect(Some(byte));
-                                state = func(self);
-                            }
-                            CharAction::Normal(byte) => {
-                                log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
-                                self.expect(Some(byte));
-                                state = func(self);
-                            }
-                        }
+                        Some(byte_action(self.parse_state.bytes, &self.point))
                     };
+
+                    if let Some(ByteAction::Ignore) = action {
+                        state = State::Fn(Box::new(func));
+                        self.move_one();
+                    } else {
+                        let byte =
+                            if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) =
+                                action
+                            {
+                                Some(byte)
+                            } else {
+                                None
+                            };
+
+                        log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
+                        self.expect(byte);
+                        state = func(self);
+                    }
                 }
             }
         }
@@ -733,18 +721,18 @@
     }
 }
 
-fn byte_action(bytes: &[u8], point: &Point) -> CharAction {
+fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
     if point.index < bytes.len() {
         let byte = bytes[point.index];
 
         if byte == b'\r' {
             // CRLF.
             if point.index < bytes.len() - 1 && bytes[point.index + 1] == b'\n' {
-                CharAction::Ignore
+                ByteAction::Ignore
             }
             // CR.
             else {
-                CharAction::Normal(b'\n')
+                ByteAction::Normal(b'\n')
             }
         } else if byte == b'\t' {
             let remainder = point.column % TAB_SIZE;
@@ -757,19 +745,17 @@
             // On the tab itself, first send it.
             if point.vs == 0 {
                 if vs == 0 {
-                    CharAction::Normal(byte)
+                    ByteAction::Normal(byte)
                 } else {
-                    CharAction::Insert(byte)
+                    ByteAction::Insert(byte)
                 }
             } else if vs == 0 {
-                CharAction::Normal(b' ')
+                ByteAction::Normal(b' ')
             } else {
-                CharAction::Insert(b' ')
+                ByteAction::Insert(b' ')
             }
-        }
-        // VS?
-        else {
-            CharAction::Normal(byte)
+        } else {
+            ByteAction::Normal(byte)
         }
     } else {
         unreachable!("out of bounds")
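The renamed `byte_action` centralizes byte preprocessing: a CRLF pair collapses to a single line feed (the `\r` is `Ignore`d), a lone CR is normalized to `\n`, and a tab is expanded to its tab stop by `Insert`ing virtual spaces that advance `column` and `vs` but not `index`. The sketch below shows the same normalization done eagerly over a whole buffer; this is a deliberate simplification, since the crate computes actions lazily per `Point` and never materializes the expanded bytes:

```rust
const TAB_SIZE: usize = 4; // assumed here; the crate's constant lives in src/constant.rs

/// Eager version of the CRLF/tab normalization, for illustration only.
fn normalize(bytes: &[u8]) -> Vec<u8> {
    let mut out = Vec::new();
    let mut column = 1; // 1-indexed, like `Point::column`.
    let mut index = 0;

    while index < bytes.len() {
        match bytes[index] {
            // CRLF collapses to one `\n`; a lone CR also becomes `\n`.
            b'\r' => {
                if index + 1 < bytes.len() && bytes[index + 1] == b'\n' {
                    index += 1;
                }
                out.push(b'\n');
                column = 1;
            }
            b'\n' => {
                out.push(b'\n');
                column = 1;
            }
            // The tab itself first, then spaces until the next tab stop:
            // these correspond to the crate's `Insert`ed virtual spaces.
            b'\t' => {
                out.push(b'\t');
                column += 1;
                while (column - 1) % TAB_SIZE != 0 {
                    out.push(b' ');
                    column += 1;
                }
            }
            byte => {
                out.push(byte);
                column += 1;
            }
        }
        index += 1;
    }

    out
}

fn main() {
    assert_eq!(normalize(b"a\r\nb"), b"a\nb");
    assert_eq!(normalize(b"\tx"), b"\t   x"); // tab plus three virtual spaces
}
```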
diff --git a/src/util/slice.rs b/src/util/slice.rs
index 13b664d..f287978 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -43,7 +43,11 @@ impl<'a> Position<'a> {
         }
     }
 
-    /// To do.
+    /// Turn a position into indices.
+    ///
+    /// Indices are places in `bytes` where this position starts and ends.
+    ///
+    /// > 👉 **Note**: indices cannot represent virtual spaces.
     pub fn to_indices(&self) -> (usize, usize) {
         (self.start.index, self.end.index)
     }
@@ -60,7 +64,7 @@ pub struct Slice<'a> {
 }
 
 impl<'a> Slice<'a> {
-    /// Get the slice belonging to a position.
+    /// Get the slice belonging to a point.
     pub fn from_point(bytes: &'a [u8], point: &Point) -> Slice<'a> {
         let mut before = point.vs;
         let mut start = point.index;
@@ -84,13 +88,13 @@
         }
     }
 
-    /// To do.
+    /// Create a slice from one index.
+    ///
+    /// Indices are places in `bytes`.
+    ///
+    /// > 👉 **Note**: indices cannot represent virtual spaces.
     pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> {
-        Slice {
-            bytes: &bytes[index..=index],
-            before: 0,
-            after: 0,
-        }
+        Slice::from_indices(bytes, index, index + 1)
     }
 
     /// Get the slice belonging to a position.
@@ -121,7 +125,11 @@
         }
     }
 
-    /// To do.
+    /// Create a slice from two indices.
+    ///
+    /// Indices are places in `bytes`.
+    ///
+    /// > 👉 **Note**: indices cannot represent virtual spaces.
     pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
         Slice {
             bytes: &bytes[start..end],
@@ -130,12 +138,13 @@
         }
     }
 
-    /// To do.
+    /// Get the size of this slice, including virtual spaces.
     pub fn len(&self) -> usize {
         self.bytes.len() + self.before + self.after
     }
 
-    /// To do.
+    /// Get the first byte in this slice, representing a virtual space as a
+    /// space.
    pub fn head(&self) -> Option<u8> {
         if self.before > 0 {
             Some(b' ')
@@ -146,12 +155,16 @@
         }
     }
 
-    // To do:
+    /// Turn the slice into a `&str`.
+    ///
+    /// Does not support virtual spaces.
     pub fn as_str(&self) -> &str {
         str::from_utf8(self.bytes).unwrap()
     }
 
-    /// To do.
+    /// Turn the slice into a `String`.
+    ///
+    /// Support virtual spaces.
     pub fn serialize(&self) -> String {
         let mut string = String::with_capacity(self.len());
         let mut index = self.before;
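The `from_index` refactor is pure delegation: a single index is just the two-index case over `index..index + 1`. A minimal re-creation of that shape (a hypothetical stand-in type; the crate's `Slice` additionally tracks `before`/`after` virtual-space counts, which plain indices cannot represent):

```rust
/// Stand-in slice over bytes, reduced to what the example needs.
struct Slice<'a> {
    bytes: &'a [u8],
}

impl<'a> Slice<'a> {
    /// The two-index constructor does the actual work.
    fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
        Slice { bytes: &bytes[start..end] }
    }

    /// One index is just `index..index + 1`, hence the delegation.
    fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> {
        Slice::from_indices(bytes, index, index + 1)
    }

    fn as_str(&self) -> &str {
        std::str::from_utf8(self.bytes).unwrap()
    }
}

fn main() {
    let bytes = b"# markdown";
    assert_eq!(Slice::from_index(bytes, 0).as_str(), "#");
    assert_eq!(Slice::from_indices(bytes, 2, 10).as_str(), "markdown");
}
```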