From f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Thu, 28 Jul 2022 16:48:00 +0200
Subject: Refactor to work on `char`s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, a custom char implementation was used.
This was easier to work with, as sometimes “virtual” characters are injected,
or characters are ignored.

This replaces that with working on actual `char`s.
The hope is to eventually work on `u8`s, even.

This simplifies the state machine somewhat, as only `\n` is fed, regardless of
whether it was a CRLF, CR, or LF.
It also feeds `' '` instead of virtual spaces.

The BOM, if present, is now available as a `ByteOrderMark` event.
---
 src/util/codes.rs        | 125 -------------------------------------
 src/util/encode.rs       |  12 +++-
 src/util/mod.rs          |   3 +-
 src/util/sanitize_uri.rs |   2 +-
 src/util/slice.rs        | 156 +++++++++++++++++++++++++++++++++++++++++++++++
 src/util/span.rs         |  57 -----------------
 6 files changed, 167 insertions(+), 188 deletions(-)
 delete mode 100644 src/util/codes.rs
 create mode 100644 src/util/slice.rs
 delete mode 100644 src/util/span.rs

(limited to 'src/util')
diff --git a/src/util/codes.rs b/src/util/codes.rs
deleted file mode 100644
index 5006a00..0000000
--- a/src/util/codes.rs
+++ /dev/null
@@ -1,125 +0,0 @@
-//! Utilities to deal with character codes.
-
-use crate::constant::TAB_SIZE;
-use crate::tokenizer::Code;
-
-/// Turn a string into codes.
-pub fn parse(value: &str) -> Vec<Code> {
-    // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller
-    // with `Code::CarriageReturnLineFeed`.
-    let mut codes = Vec::with_capacity(value.len());
-    let mut at_start = true;
-    let mut at_carriage_return = false;
-    let mut column = 1;
-
-    for char in value.chars() {
-        if at_start {
-            at_start = false;
-
-            if char == '\u{feff}' {
-                // Ignore.
-                continue;
-            }
-        }
-
-        // Send a CRLF.
-        if at_carriage_return && '\n' == char {
-            at_carriage_return = false;
-            codes.push(Code::CarriageReturnLineFeed);
-        } else {
-            // Send the previous CR: we’re not at a next `\n`.
-            if at_carriage_return {
-                at_carriage_return = false;
-                codes.push(Code::Char('\r'));
-            }
-
-            match char {
-                // Send a replacement character.
-                '\0' => {
-                    column += 1;
-                    codes.push(Code::Char(char::REPLACEMENT_CHARACTER));
-                }
-                // Send a tab and virtual spaces.
-                '\t' => {
-                    let remainder = column % TAB_SIZE;
-                    let mut virtual_spaces = if remainder == 0 {
-                        0
-                    } else {
-                        TAB_SIZE - remainder
-                    };
-                    codes.push(Code::Char(char));
-                    column += 1;
-                    while virtual_spaces > 0 {
-                        codes.push(Code::VirtualSpace);
-                        column += 1;
-                        virtual_spaces -= 1;
-                    }
-                }
-                // Send an LF.
-                '\n' => {
-                    column = 1;
-                    codes.push(Code::Char(char));
-                }
-                // Don’t send anything yet.
-                '\r' => {
-                    column = 1;
-                    at_carriage_return = true;
-                }
-                // Send the char.
-                _ => {
-                    column += 1;
-                    codes.push(Code::Char(char));
-                }
-            }
-        };
-    }
-
-    // Send the last CR: we’re not at a next `\n`.
-    if at_carriage_return {
-        codes.push(Code::Char('\r'));
-    }
-
-    codes
-}
-
-/// Serialize codes, optionally expanding tabs.
-pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
-    let mut at_tab = false;
-    // Note: It’ll grow a bit smaller with each
-    // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false,
-    // and bigger with `Code::CarriageReturnLineFeed`,
-    let mut value = String::with_capacity(codes.len());
-
-    for code in codes {
-        let mut at_tab_next = false;
-
-        match code {
-            Code::CarriageReturnLineFeed => {
-                value.push_str("\r\n");
-            }
-            Code::Char(char) if *char == '\n' || *char == '\r' => {
-                value.push(*char);
-            }
-            Code::Char(char) if *char == '\t' => {
-                at_tab_next = true;
-                value.push(if expand_tabs { ' ' } else { *char });
-            }
-            Code::VirtualSpace => {
-                if !expand_tabs && at_tab {
-                    continue;
-                }
-                value.push(' ');
-            }
-            Code::Char(char) => {
-                value.push(*char);
-            }
-            Code::None => {
-                unreachable!("unexpected EOF code in codes");
-            }
-        }
-
-        at_tab = at_tab_next;
-    }
-
-    value
-}
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 965ea5c..91c5462 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -20,7 +20,8 @@
 /// ## References
 ///
 /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
-pub fn encode<S: Into<String>>(value: S) -> String {
+pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String {
+    let check = if encode_html { check_all } else { check_nil };
     let mut value = value.into();
 
     // It’ll grow a bit bigger for each dangerous character.
@@ -31,6 +32,7 @@ pub fn encode<S: Into<String>>(value: S) -> String {
         let dangerous = value.pop().unwrap();
         result.push_str(&value);
         result.push_str(match dangerous {
+            '\0' => "�",
             '&' => "&amp;",
             '"' => "&quot;",
             '<' => "&lt;",
@@ -45,6 +47,10 @@ pub fn encode<S: Into<String>>(value: S) -> String {
     result
 }
 
-fn check(char: char) -> bool {
-    matches!(char, '&' | '"' | '<' | '>')
+fn check_all(char: char) -> bool {
+    matches!(char, '\0' | '&' | '"' | '<' | '>')
+}
+
+fn check_nil(char: char) -> bool {
+    matches!(char, '\0')
 }
diff --git a/src/util/mod.rs b/src/util/mod.rs
index ae1add6..a01f31e 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,10 +1,9 @@
 //! Utilities used when compiling markdown.
 
-pub mod codes;
 pub mod decode_character_reference;
 pub mod edit_map;
 pub mod encode;
 pub mod normalize_identifier;
 pub mod sanitize_uri;
 pub mod skip;
-pub mod span;
+pub mod slice;
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 81450ae..8c09549 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -32,7 +32,7 @@ use crate::util::encode::encode;
 ///
 /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
 pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
-    let value = encode(normalize_uri(value));
+    let value = encode(normalize_uri(value), true);
 
     if let Some(protocols) = protocols {
         let end = value.find(|c| matches!(c, '?' | '#' | '/'));
diff --git a/src/util/slice.rs b/src/util/slice.rs
new file mode 100644
index 0000000..2134069
--- /dev/null
+++ b/src/util/slice.rs
@@ -0,0 +1,156 @@
+//! Utilities to deal with characters.
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::{Event, EventType, Point};
+
+/// A range between two places.
+#[derive(Debug)]
+pub struct Position<'a> {
+    pub start: &'a Point,
+    pub end: &'a Point,
+}
+
+impl<'a> Position<'a> {
+    /// Get a position from an exit event.
+    ///
+    /// Looks backwards for the corresponding `enter` event.
+    /// This does not support nested events (such as lists in lists).
+    ///
+    /// ## Panics
+    ///
+    /// This function panics if an enter event is given.
+    /// When `micromark` is used, this function never panics.
+    pub fn from_exit_event(events: &'a [Event], index: usize) -> Position<'a> {
+        let exit = &events[index];
+        assert_eq!(
+            exit.event_type,
+            EventType::Exit,
+            "expected `from_exit_event` to be called on `exit` event"
+        );
+        let mut enter_index = index - 1;
+
+        loop {
+            let enter = &events[enter_index];
+            if enter.event_type == EventType::Enter && enter.token_type == exit.token_type {
+                return Position {
+                    start: &enter.point,
+                    end: &exit.point,
+                };
+            }
+
+            enter_index -= 1;
+        }
+    }
+}
+
+/// Chars belonging to a range.
+///
+/// Includes information on virtual spaces before and after the chars.
+#[derive(Debug)]
+pub struct Slice<'a> {
+    pub chars: &'a [char],
+    pub before: usize,
+    pub after: usize,
+}
+
+impl<'a> Slice<'a> {
+    /// Get the slice belonging to a point.
+    pub fn from_point(list: &'a [char], point: &Point) -> Slice<'a> {
+        let mut before = point.vs;
+        let mut start = point.index;
+        let end = if start < list.len() { start + 1 } else { start };
+
+        // If we have virtual spaces before, it means we are past the actual
+        // character at that index, and those virtual spaces.
+        if before > 0 {
+            before = TAB_SIZE - before;
+            start += 1;
+        };
+
+        Slice {
+            chars: if start < end { &list[start..end] } else { &[] },
+            before,
+            after: 0,
+        }
+    }
+
+    /// Get the slice belonging to a position.
+    pub fn from_position(list: &'a [char], position: &Position) -> Slice<'a> {
+        let mut before = position.start.vs;
+        let mut after = position.end.vs;
+        let mut start = position.start.index;
+        let mut end = position.end.index;
+
+        // If we have virtual spaces before, it means we are past the actual
+        // character at that index, and those virtual spaces.
+        if before > 0 {
+            before = TAB_SIZE - before;
+            start += 1;
+        };
+
+        // If we have virtual spaces after, it means that character is included,
+        // and one less virtual space.
+        if after > 0 {
+            after -= 1;
+            end += 1;
+        }
+
+        Slice {
+            chars: &list[start..end],
+            before,
+            after,
+        }
+    }
+
+    /// Get the number of chars in the slice, counting virtual spaces too.
+    pub fn size(&self) -> usize {
+        self.chars.len() + self.before + self.after
+    }
+
+    // To do:
+    // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html>
+    // to implement an `as_str`.
+
+    /// Get the first char, if any: `' '` for a virtual space before, else the first of `chars`.
+    pub fn head(&self) -> Option<char> {
+        if self.before > 0 {
+            Some(' ')
+        } else if self.chars.is_empty() {
+            None
+        } else {
+            Some(self.chars[0])
+        }
+    }
+
+    /// Get the last char, if any: `' '` for a virtual space after, else the last of `chars`.
+    pub fn tail(&self) -> Option<char> {
+        if self.after > 0 {
+            Some(' ')
+        } else {
+            let index = self.chars.len();
+            if index > 0 {
+                Some(self.chars[index - 1])
+            } else {
+                None
+            }
+        }
+    }
+
+    /// Serialize the slice to a `String`, turning virtual spaces into `' '`.
+    pub fn serialize(&self) -> String {
+        let mut string = String::with_capacity(self.size());
+        let mut index = self.before;
+        while index > 0 {
+            string.push(' ');
+            index -= 1;
+        }
+        string.push_str(&self.chars.iter().collect::<String>());
+        index = self.after;
+        while index > 0 {
+            string.push(' ');
+            index -= 1;
+        }
+
+        string
+    }
+}
diff --git a/src/util/span.rs b/src/util/span.rs
deleted file mode 100644
index ca25924..0000000
--- a/src/util/span.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-//! Utilities to deal with semantic labels.
-
-use crate::tokenizer::{Code, Event, EventType};
-use crate::util::codes::serialize as serialize_codes;
-
-/// A struct representing the span of an opening and closing event of a token.
-#[derive(Debug)]
-pub struct Span {
-    /// Absolute offset (an `index` in `codes`) of where this span starts.
-    pub start_index: usize,
-    /// Absolute offset (an `index` in `codes`) of where this span ends.
-    pub end_index: usize,
-}
-
-/// Get a span from an event.
-///
-/// Get the span of an `exit` event, by looking backwards through the events to
-/// find the corresponding `enter` event.
-/// This assumes that tokens with the same are not nested.
-///
-/// ## Panics
-///
-/// This function panics if an enter event is given.
-/// When `micromark` is used, this function never panics.
-pub fn from_exit_event(events: &[Event], index: usize) -> Span {
-    let exit = &events[index];
-    let end_index = exit.point.index;
-    let token_type = exit.token_type.clone();
-    assert_eq!(
-        exit.event_type,
-        EventType::Exit,
-        "expected `from_exit_event` to be called on `exit` event"
-    );
-    let mut enter_index = index - 1;
-
-    loop {
-        let enter = &events[enter_index];
-        if enter.event_type == EventType::Enter && enter.token_type == token_type {
-            return Span {
-                start_index: enter.point.index,
-                end_index,
-            };
-        }
-
-        enter_index -= 1;
-    }
-}
-
-/// Serialize a span, optionally expanding tabs.
-pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {
-    serialize_codes(codes(all_codes, span), expand_tabs)
-}
-
-/// Get a slice of codes from a span.
-pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
-    &codes[span.start_index..span.end_index]
-}
-- 
cgit