/// Normalize an identifier so that labels which differ only in case or in
/// the amount of markdown whitespace compare equal.
///
/// Two things happen to `value`:
///
/// 1. Every run of markdown whitespace (tab, carriage return, line feed,
///    space) is collapsed into a single space, and leading and trailing
///    whitespace is dropped.
/// 2. The result is case folded using only the standard library's case
///    mappings.
///
/// The returned string is lowercase; for input without special case
/// mappings (such as ASCII) it is simply the collapsed, lowercased input.
///
/// ## Examples
///
/// ```text
/// normalize_identifier(" a ")      => "a"
/// normalize_identifier("a\t\r\nb") => "a b"
/// normalize_identifier("Foo  BAR") => "foo bar"
/// normalize_identifier("ẞ")        => "ss"
/// ```
pub fn normalize_identifier(value: &str) -> String {
    // The collapsed form is never longer (in bytes) than the input.
    let mut collapsed = String::with_capacity(value.len());
    // Whether whitespace was seen since the last emitted character.
    let mut pending_space = false;

    // Collapse markdown whitespace and trim it.
    for ch in value.chars() {
        if matches!(ch, '\t' | '\r' | '\n' | ' ') {
            // Whitespace before the first real character is dropped
            // (leading trim); otherwise remember that a separator is due.
            pending_space = !collapsed.is_empty();
        } else {
            // The separator is only written once another character follows,
            // which is what trims trailing whitespace.
            if pending_space {
                collapsed.push(' ');
                pending_space = false;
            }

            collapsed.push(ch);
        }
    }

    // Some characters are considered “uppercase”, but if their lowercase
    // counterpart is uppercased will result in a different uppercase
    // character (for example `ẞ` lowercases to `ß`, which uppercases to
    // `SS`).
    // Hence, to reach that common form, lowercase first and uppercase second.
    // A final lowercase pass keeps the result lowercase.
    collapsed.to_lowercase().to_uppercase().to_lowercase()
}