diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-22 17:24:05 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-22 17:24:05 +0200 |
commit | 79c3275f91f1c0867a1bfba3085c0682aa5486ef (patch) | |
tree | be30b9a8b755bc6bc01e3f9d59e7d69c60b80b24 /src/util | |
parent | b0accb11f1aade55e9fc4dc0a1c1d1b8362ab5d9 (diff) | |
download | markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.gz markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.bz2 markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.zip |
Add support for normalizing identifiers
Diffstat (limited to 'src/util')
-rw-r--r-- | src/util/encode.rs | 2 | ||||
-rw-r--r-- | src/util/mod.rs | 1 | ||||
-rw-r--r-- | src/util/normalize_identifier.rs | 37 |
3 files changed, 40 insertions, 0 deletions
diff --git a/src/util/encode.rs b/src/util/encode.rs index f79c8ea..5762c22 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -21,6 +21,8 @@ /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) pub fn encode(value: &str) -> String { + // To do: replacing 4 times might just be slow. + // Perhaps we can walk the chars. value .replace('&', "&amp;") .replace('"', "&quot;") diff --git a/src/util/mod.rs b/src/util/mod.rs index c3db267..ee58518 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -2,5 +2,6 @@ pub mod decode_character_reference; pub mod encode; +pub mod normalize_identifier; pub mod sanitize_uri; pub mod span; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs new file mode 100644 index 0000000..870fd33 --- /dev/null +++ b/src/util/normalize_identifier.rs @@ -0,0 +1,37 @@ +//! To do. + +/// To do. +pub fn normalize_identifier(value: &str) -> String { + let mut codes = vec![]; + let mut at_start = true; + let mut at_whitespace = true; + + // Collapse markdown whitespace and trim it. + for char in value.chars() { + match char { + '\t' | '\r' | '\n' | ' ' => { + at_whitespace = true; + } + _ => { + if at_whitespace && !at_start { + codes.push(' '); + } + + codes.push(char); + at_start = false; + at_whitespace = false; + } + } + } + + // To do: test if this matches unicode. + // Some characters are considered “uppercase”, but if their lowercase + // counterpart is uppercased will result in a different uppercase + // character. + // Hence, to get that form, we perform both lower- and uppercase. + codes + .iter() + .collect::<String>() + .to_uppercase() + .to_lowercase() +} |