From a820d849c3e20a1d72137072d70a7c8e00306f98 Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Wed, 20 Jul 2022 17:19:17 +0200
Subject: Refactor to improve allocation around strings

---
 src/util/codes.rs                | 35 ++++++++++++++--------------
 src/util/encode.rs               | 50 ++++++++++++++++++----------------------
 src/util/normalize_identifier.rs | 13 ++++-------
 src/util/sanitize_uri.rs         | 49 +++++++++++++++++----------------------
 4 files changed, 65 insertions(+), 82 deletions(-)

(limited to 'src/util')
diff --git a/src/util/codes.rs b/src/util/codes.rs
index 9b6ad39..d35d7d9 100644
--- a/src/util/codes.rs
+++ b/src/util/codes.rs
@@ -5,19 +5,21 @@ use crate::tokenizer::Code;
 
 /// Turn a string into codes.
 pub fn parse(value: &str) -> Vec<Code> {
-    let mut codes: Vec<Code> = vec![];
+    // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller
+    // with `Code::CarriageReturnLineFeed`.
+    let mut codes: Vec<Code> = Vec::with_capacity(value.len());
     let mut at_start = true;
     let mut at_carriage_return = false;
     let mut column = 1;
 
     for char in value.chars() {
         if at_start {
+            at_start = false;
+
             if char == '\u{feff}' {
                 // Ignore.
                 continue;
             }
-
-            at_start = false;
         }
 
         // Send a CRLF.
@@ -83,34 +85,33 @@ pub fn parse(value: &str) -> Vec<Code> {
 /// Serialize codes, optionally expanding tabs.
 pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
     let mut at_tab = false;
-    let mut index = 0;
-    let mut value: Vec<char> = vec![];
+    // Note: It’ll grow a bit smaller with each
+    // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false,
+    // and bigger with `Code::CarriageReturnLineFeed`.
+    let mut value = String::with_capacity(codes.len());
 
-    while index < codes.len() {
-        let code = codes[index];
+    for code in codes {
         let mut at_tab_next = false;
 
         match code {
             Code::CarriageReturnLineFeed => {
-                value.push('\r');
-                value.push('\n');
+                value.push_str("\r\n");
             }
-            Code::Char(char) if char == '\n' || char == '\r' => {
-                value.push(char);
+            Code::Char(char) if *char == '\n' || *char == '\r' => {
+                value.push(*char);
             }
-            Code::Char(char) if char == '\t' => {
+            Code::Char(char) if *char == '\t' => {
                 at_tab_next = true;
-                value.push(if expand_tabs { ' ' } else { char });
+                value.push(if expand_tabs { ' ' } else { *char });
             }
             Code::VirtualSpace => {
                 if !expand_tabs && at_tab {
-                    index += 1;
                     continue;
                 }
                 value.push(' ');
             }
             Code::Char(char) => {
-                value.push(char);
+                value.push(*char);
             }
             Code::None => {
                 unreachable!("unexpected EOF code in codes");
@@ -118,9 +119,7 @@ pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
         }
 
         at_tab = at_tab_next;
-
-        index += 1;
     }
 
-    value.into_iter().collect()
+    value
 }
diff --git a/src/util/encode.rs b/src/util/encode.rs
index a3bd589..965ea5c 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -20,37 +20,31 @@
 /// ## References
 ///
 /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
-pub fn encode(value: &str) -> String {
-    let mut result: Vec<&str> = vec![];
-    let mut start = 0;
-    let mut index = 0;
+pub fn encode<S: Into<String>>(value: S) -> String {
+    let mut value = value.into();
 
-    for byte in value.bytes() {
-        if let Some(replacement) = match byte {
-            b'&' => Some("&amp;"),
-            b'"' => Some("&quot;"),
-            b'<' => Some("&lt;"),
-            b'>' => Some("&gt;"),
-            _ => None,
-        } {
-            if start != index {
-                result.push(&value[start..index]);
-            }
+    // It’ll grow a bit bigger for each dangerous character.
+    let mut result = String::with_capacity(value.len());
 
-            result.push(replacement);
-            start = index + 1;
-        }
-
-        index += 1;
+    while let Some(indice) = value.find(check) {
+        let after = value.split_off(indice + 1);
+        let dangerous = value.pop().unwrap();
+        result.push_str(&value);
+        result.push_str(match dangerous {
+            '&' => "&amp;",
+            '"' => "&quot;",
+            '<' => "&lt;",
+            '>' => "&gt;",
+            _ => unreachable!("xxx"),
+        });
+        value = after;
     }
 
-    if start == 0 {
-        value.to_string()
-    } else {
-        if start < index {
-            result.push(&value[start..index]);
-        }
+    result.push_str(&value);
 
-        result.join("")
-    }
+    result
+}
+
+fn check(char: char) -> bool {
+    matches!(char, '&' | '"' | '<' | '>')
 }
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index feb7239..42a2bb0 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -32,7 +32,8 @@
 /// [definition]: crate::construct::definition
 /// [label_end]: crate::construct::label_end
 pub fn normalize_identifier(value: &str) -> String {
-    let mut codes = vec![];
+    // Note: it’ll grow a bit smaller for consecutive whitespace.
+    let mut result = String::with_capacity(value.len());
     let mut at_start = true;
     let mut at_whitespace = true;
 
@@ -44,10 +45,10 @@ pub fn normalize_identifier(value: &str) -> String {
             }
             _ => {
                 if at_whitespace && !at_start {
-                    codes.push(' ');
+                    result.push(' ');
                 }
 
-                codes.push(char);
+                result.push(char);
                 at_start = false;
                 at_whitespace = false;
             }
@@ -66,9 +67,5 @@ pub fn normalize_identifier(value: &str) -> String {
     // to `SS` (U+0053 U+0053).
     // If we’d inverse the steps, for `ẞ`, we’d first uppercase without a
     // change, and then lowercase to `ß`, which would not match `ss`.
-    codes
-        .iter()
-        .collect::<String>()
-        .to_lowercase()
-        .to_uppercase()
+    result.to_lowercase().to_uppercase()
 }
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 55b15e4..81450ae 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -32,32 +32,25 @@ use crate::util::encode::encode;
 ///
 /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
 pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
-    let value = encode(&normalize_uri(value));
+    let value = encode(normalize_uri(value));
 
     if let Some(protocols) = protocols {
-        let chars: Vec<char> = value.chars().collect();
-        let mut index = 0;
-        let mut colon: Option<usize> = None;
-
-        while index < chars.len() {
-            let char = chars[index];
-
-            match char {
-                ':' => {
-                    colon = Some(index);
-                    break;
+        let end = value.find(|c| matches!(c, '?' | '#' | '/'));
+        let mut colon = value.find(|c| matches!(c, ':'));
+
+        // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
+        if let Some(end) = end {
+            if let Some(index) = colon {
+                if index > end {
+                    colon = None;
                 }
-                '?' | '#' | '/' => break,
-                _ => {}
             }
-
-            index += 1;
         }
 
-        // If there is no protocol, or the first colon is after `?`, `#`, or `/`, it’s relative.
-        // It is a protocol, it should be allowed.
+        // If there is no protocol, it’s relative, and fine.
         if let Some(colon) = colon {
-            let protocol = chars[0..colon].iter().collect::<String>().to_lowercase();
+            // If it is a protocol, it should be allowed.
+            let protocol = value[0..colon].to_lowercase();
             if !protocols.contains(&protocol.as_str()) {
                 return "".to_string();
             }
@@ -85,8 +78,9 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
 ///
 /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
 fn normalize_uri(value: &str) -> String {
-    let chars: Vec<char> = value.chars().collect();
-    let mut result: Vec<String> = vec![];
+    let chars = value.chars().collect::<Vec<_>>();
+    // Note: it’ll grow bigger for each non-ascii or non-safe character.
+    let mut result = String::with_capacity(value.len());
     let mut index = 0;
     let mut start = 0;
     let mut buff = [0; 4];
@@ -104,16 +98,15 @@ fn normalize_uri(value: &str) -> String {
             continue;
         }
 
-        // Note: Rust already takes care of lone astral surrogates.
+        // Note: Rust already takes care of lone surrogates.
         // Non-ascii or not allowed ascii.
         if char >= '\u{0080}'
             || !matches!(char, '!' | '#' | '$' | '&'..=';' | '=' | '?'..='Z' | '_' | 'a'..='z' | '~')
         {
-            result.push(chars[start..index].iter().collect::<String>());
-
+            result.push_str(&chars[start..index].iter().collect::<String>());
             char.encode_utf8(&mut buff);
-            result.push(
-                buff[0..char.len_utf8()]
+            result.push_str(
+                &buff[0..char.len_utf8()]
                     .iter()
                     .map(|&byte| format!("%{:>02X}", byte))
                     .collect::<String>(),
@@ -125,7 +118,7 @@ fn normalize_uri(value: &str) -> String {
         index += 1;
     }
 
-    result.push(chars[start..].iter().collect::<String>());
+    result.push_str(&chars[start..].iter().collect::<String>());
 
-    result.join("")
+    result
 }
-- 
cgit