Refactor to improve states

* Remove custom kind wrappers, use plain bytes instead
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
author: Titus Wormer <tituswormer@gmail.com> 2022-07-29 18:22:59 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-07-29 18:22:59 +0200
commit: 0eeff9148e327183e532752f46421a75506dd7a6 (patch)
tree: 4f0aed04f90aa759ce96a2e87aa719e7fa95c450 /src/util
parent: 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff)
download: markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.gz
markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.bz2
markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.zip
5 files changed, 91 insertions, 76 deletions
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs
index 5277f90..f8fd18f 100644
--- a/src/util/decode_character_reference.rs
+++ b/src/util/decode_character_reference.rs
@@ -57,9 +57,9 @@ pub fn decode_named(value: &str) -> String {
 /// ```rust ignore
 /// use micromark::util::decode_character_reference::decode_numeric;
 ///
-/// assert_eq!(decode_numeric("123", 10), '{');
-/// assert_eq!(decode_numeric("9", 16), '\t');
-/// assert_eq!(decode_numeric("0", 10), '�'); // Not allowed.
+/// assert_eq!(decode_numeric("123", 10), "{");
+/// assert_eq!(decode_numeric("9", 16), "\t");
+/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
 /// ```
 ///
 /// ## Panics
@@ -74,27 +74,19 @@ pub fn decode_named(value: &str) -> String {
 ///
 /// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
 /// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_numeric(value: &str, radix: u32) -> char {
-    let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int");
-
-    if
-    // C0 except for HT, LF, FF, CR, space
-    code < 0x09 ||
-    code == 0x0B ||
-    (code > 0x0D && code < 0x20) ||
-    // Control character (DEL) of the basic block and C1 controls.
-    (code > 0x7E && code < 0xA0) ||
-    // Lone high surrogates and low surrogates.
-    (code > 0xd7ff && code < 0xe000) ||
-    // Noncharacters.
-    (code > 0xfdcf && code < 0xfdf0) ||
-    ((code & 0xffff) == 0xffff) ||
-    ((code & 0xffff) == 0xfffe) ||
-    // Out of range
-    code > 0x0010_ffff
-    {
-        char::REPLACEMENT_CHARACTER
-    } else {
-        char::from_u32(code).expect("expected valid `code`")
+pub fn decode_numeric(value: &str, radix: u32) -> String {
+    if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
+        if !matches!(char,
+            // C0 except for HT, LF, FF, CR, space
+            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
+            // Control character (DEL) of c0, and C1 controls.
+            '\u{7F}'..='\u{9F}'
+            // Lone surrogates, noncharacters, and out of range are handled by
+            // Rust.
+        ) {
+            return char.to_string();
+        }
     }
+
+    char::REPLACEMENT_CHARACTER.to_string()
 }
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 91c5462..d37a2de 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -20,37 +20,33 @@
 /// ## References
 ///
 /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
-pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String {
-    let check = if encode_html { check_all } else { check_nil };
-    let mut value = value.into();
-
+pub fn encode(value: &str, encode_html: bool) -> String {
     // It’ll grow a bit bigger for each dangerous character.
     let mut result = String::with_capacity(value.len());
+    let bytes = value.as_bytes();
+    let mut index = 0;
+    let mut start = 0;
 
-    while let Some(indice) = value.find(check) {
-        let after = value.split_off(indice + 1);
-        let dangerous = value.pop().unwrap();
-        result.push_str(&value);
-        result.push_str(match dangerous {
-            '\0' => "�",
-            '&' => "&amp;",
-            '"' => "&quot;",
-            '<' => "&lt;",
-            '>' => "&gt;",
-            _ => unreachable!("xxx"),
-        });
-        value = after;
-    }
+    while index < bytes.len() {
+        let byte = bytes[index];
+        if matches!(byte, b'\0') || (encode_html && matches!(byte, b'&' | b'"' | b'<' | b'>')) {
+            result.push_str(&value[start..index]);
+            result.push_str(match byte {
+                b'\0' => "�",
+                b'&' => "&amp;",
+                b'"' => "&quot;",
+                b'<' => "&lt;",
+                b'>' => "&gt;",
+                _ => panic!("impossible"),
+            });
 
-    result.push_str(&value);
+            start = index + 1;
+        }
 
-    result
-}
+        index += 1;
+    }
 
-fn check_all(char: char) -> bool {
-    matches!(char, '\0' | '&' | '"' | '<' | '>')
-}
+    result.push_str(&value[start..]);
 
-fn check_nil(char: char) -> bool {
-    matches!(char, '\0')
+    result
 }
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index 42a2bb0..f5b12d0 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -34,25 +34,34 @@
 pub fn normalize_identifier(value: &str) -> String {
     // Note: it’ll grow a bit smaller for consecutive whitespace.
     let mut result = String::with_capacity(value.len());
-    let mut at_start = true;
-    let mut at_whitespace = true;
+    let bytes = value.as_bytes();
+    let mut in_whitespace = true;
+    let mut index = 0;
+    let mut start = 0;
 
-    // Collapse markdown whitespace and trim it.
-    for char in value.chars() {
-        match char {
-            '\t' | '\n' | '\r' | ' ' => {
-                at_whitespace = true;
+    while index < bytes.len() {
+        if matches!(bytes[index], b'\t' | b'\n' | b'\r' | b' ') {
+            // First whitespace we see after non-whitespace.
+            if !in_whitespace {
+                result.push_str(&value[start..index]);
+                in_whitespace = true;
             }
-            _ => {
-                if at_whitespace && !at_start {
-                    result.push(' ');
-                }
-
-                result.push(char);
-                at_start = false;
-                at_whitespace = false;
+        }
+        // First non-whitespace we see after whitespace.
+        else if in_whitespace {
+            if start != 0 {
+                result.push(' ');
             }
+
+            start = index;
+            in_whitespace = false;
         }
+
+        index += 1;
+    }
+
+    if !in_whitespace {
+        result.push_str(&value[start..]);
     }
 
     // Some characters are considered “uppercase”, but if their lowercase
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 8c09549..051e1e1 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -32,7 +32,7 @@ use crate::util::encode::encode;
 ///
 /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
 pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
-    let value = encode(normalize_uri(value), true);
+    let value = encode(&*normalize_uri(value), true);
 
     if let Some(protocols) = protocols {
         let end = value.find(|c| matches!(c, '?' | '#' | '/'));
diff --git a/src/util/slice.rs b/src/util/slice.rs
index cd3641e..d899dac 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -2,6 +2,7 @@
 
 use crate::constant::TAB_SIZE;
 use crate::tokenizer::{Event, EventType, Point};
+use std::str;
 
 /// A range between two places.
 #[derive(Debug)]
@@ -78,6 +79,15 @@ impl<'a> Slice<'a> {
         }
     }
 
+    /// To do.
+    pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> {
+        Slice {
+            bytes: &bytes[index..=index],
+            before: 0,
+            after: 0,
+        }
+    }
+
     /// Get the slice belonging to a position.
     pub fn from_position(bytes: &'a [u8], position: &Position) -> Slice<'a> {
         let mut before = position.start.vs;
@@ -107,14 +117,18 @@ impl<'a> Slice<'a> {
     }
 
     /// To do.
-    // To do: rename to `len`?
-    pub fn size(&self) -> usize {
-        self.bytes.len() + self.before + self.after
+    pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
+        Slice {
+            bytes: &bytes[start..end],
+            before: 0,
+            after: 0,
+        }
     }
 
-    // To do:
-    // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html>
-    // to implement an `as_str`.
+    /// To do.
+    pub fn len(&self) -> usize {
+        self.bytes.len() + self.before + self.after
+    }
 
     /// To do.
     pub fn head(&self) -> Option<u8> {
@@ -127,16 +141,20 @@ impl<'a> Slice<'a> {
         }
     }
 
+    // To do:
+    pub fn as_str(&self) -> &str {
+        str::from_utf8(self.bytes).unwrap()
+    }
+
     /// To do.
     pub fn serialize(&self) -> String {
-        let mut string = String::with_capacity(self.size());
+        let mut string = String::with_capacity(self.len());
         let mut index = self.before;
         while index > 0 {
             string.push(' ');
             index -= 1;
         }
-        // To do: invalid UTF8?
-        string.push_str(std::str::from_utf8(self.bytes).unwrap());
+        string.push_str(self.as_str());
         index = self.after;
         while index > 0 {
             string.push(' ');
author	Titus Wormer <tituswormer@gmail.com>	2022-07-29 18:22:59 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-07-29 18:22:59 +0200
commit	0eeff9148e327183e532752f46421a75506dd7a6 (patch)
tree	4f0aed04f90aa759ce96a2e87aa719e7fa95c450 /src/util
parent	148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff)
download	markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.gz markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.bz2 markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.zip