Add support for normalizing identifiers

author: Titus Wormer <tituswormer@gmail.com> 2022-06-22 17:24:05 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-06-22 17:24:05 +0200
commit: 79c3275f91f1c0867a1bfba3085c0682aa5486ef (patch)
tree: be30b9a8b755bc6bc01e3f9d59e7d69c60b80b24 /src/util
parent: b0accb11f1aade55e9fc4dc0a1c1d1b8362ab5d9 (diff)
download: markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.gz
markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.bz2
markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.zip
3 files changed, 40 insertions, 0 deletions
diff --git a/src/util/encode.rs b/src/util/encode.rs
index f79c8ea..5762c22 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -21,6 +21,8 @@
 ///
 /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
 pub fn encode(value: &str) -> String {
+    // To do: replacing 4 times might just be slow.
+    // Perhaps we can walk the chars.
     value
         .replace('&', "&amp;")
         .replace('"', "&quot;")
diff --git a/src/util/mod.rs b/src/util/mod.rs
index c3db267..ee58518 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -2,5 +2,6 @@
 
 pub mod decode_character_reference;
 pub mod encode;
+pub mod normalize_identifier;
 pub mod sanitize_uri;
 pub mod span;
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
new file mode 100644
index 0000000..870fd33
--- /dev/null
+++ b/src/util/normalize_identifier.rs
@@ -0,0 +1,37 @@
+//! Utility to normalize identifiers: collapse whitespace, trim, and fold case.
+
+/// Normalize an identifier so that equivalent spellings compare equal.
+///
+/// Runs of markdown whitespace (tab, carriage return, line feed, space) are
+/// collapsed into a single space, leading and trailing whitespace is
+/// dropped, and case is folded.
+///
+/// The result is lowercase: `normalize_identifier(" Foo\tBar ")` yields
+/// `"foo bar"`.
+pub fn normalize_identifier(value: &str) -> String {
+    let mut result = String::with_capacity(value.len());
+    let mut at_whitespace = false;
+
+    for character in value.chars() {
+        if matches!(character, '\t' | '\r' | '\n' | ' ') {
+            at_whitespace = true;
+            continue;
+        }
+
+        // One space per whitespace run, and only between characters: runs at
+        // the start are skipped here and runs at the end are never flushed.
+        if at_whitespace && !result.is_empty() {
+            result.push(' ');
+        }
+
+        result.push(character);
+        at_whitespace = false;
+    }
+
+    // Lowercase *first*: some uppercase characters only reach their common
+    // form through their lowercase counterpart (`ẞ` → `ß` → `SS`).
+    // Uppercasing first leaves `ẞ` untouched and then yields `ß`, which does
+    // not equal the `ss` produced for `SS` and `ß`.
+    // The final pass keeps the lowercase output callers already receive.
+    result.to_lowercase().to_uppercase().to_lowercase()
+}
author	Titus Wormer <tituswormer@gmail.com>	2022-06-22 17:24:05 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-06-22 17:24:05 +0200
commit	79c3275f91f1c0867a1bfba3085c0682aa5486ef (patch)
tree	be30b9a8b755bc6bc01e3f9d59e7d69c60b80b24 /src/util
parent	b0accb11f1aade55e9fc4dc0a1c1d1b8362ab5d9 (diff)
download	markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.gz markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.bz2 markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.zip