aboutsummaryrefslogtreecommitdiffstats
path: root/src/util/normalize_identifier.rs
blob: 123a3a9ac108fc26977bec0ffa21ba7418d0d261 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
//! Utility to normalize identifiers.

/// Normalize an identifier, as found in [references][label_end] and
/// [definitions][definition], so it can be compared when matching.
///
/// This collapses whitespace found in markdown (`\t`, `\r`, `\n`, and ` `)
/// into one space, trims it (as in, dropping the first and last space),
/// and then approximates unicode case folding with three passes: lowercase,
/// uppercase, and lowercase again.
///
/// A single upper- or lowercase pass is not enough, because case mappings
/// are not symmetric:
///
/// *   U+03F4 (`ϴ`) is considered “uppercase”, but uppercasing its lowercase
///     counterpart (U+03B8 (`θ`)) results in a different uppercase character
///     (U+0398 (`Θ`)).
/// *   U+1E9E (`ẞ`) is considered “uppercase” too, so uppercasing leaves it
///     alone, whereas its lowercase counterpart (U+00DF (`ß`)) uppercases to
///     `SS`.
///     Lowercasing *before* uppercasing is what lets `ẞ`, `ß`, and `SS`
///     normalize to the same value.
///
/// The final lowercase pass does not affect which identifiers match: it only
/// keeps the result lowercase.
///
/// ## Examples
///
/// ```rust ignore
/// use micromark::util::normalize_identifier::normalize_identifier;
///
/// assert_eq!(normalize_identifier(" a "), "a");
/// assert_eq!(normalize_identifier("a\t\r\nb"), "a b");
/// assert_eq!(normalize_identifier("ПРИВЕТ"), "привет");
/// assert_eq!(normalize_identifier("Привет"), "привет");
/// assert_eq!(normalize_identifier("привет"), "привет");
/// assert_eq!(normalize_identifier("ẞ"), normalize_identifier("SS"));
/// ```
///
/// ## References
///
/// *   [`micromark-util-normalize-identifier` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-normalize-identifier)
///
/// [definition]: crate::construct::definition
/// [label_end]: crate::construct::label_end
pub fn normalize_identifier(value: &str) -> String {
    // Collapsing and trimming only ever removes bytes, so the input length
    // is an upper bound for the collapsed result.
    let mut collapsed = String::with_capacity(value.len());
    let mut pending_space = false;

    // Collapse markdown whitespace and trim it.
    for ch in value.chars() {
        match ch {
            '\t' | '\n' | '\r' | ' ' => {
                // Whitespace before the first character is dropped, so a
                // space is only owed once something was written.
                pending_space = !collapsed.is_empty();
            }
            _ => {
                // The owed space is written lazily, right before the next
                // character, so trailing whitespace never makes it out.
                if pending_space {
                    collapsed.push(' ');
                    pending_space = false;
                }

                collapsed.push(ch);
            }
        }
    }

    // Lowercase first so “uppercase” characters with a different canonical
    // uppercase form (such as `ẞ` and `ϴ`) are uppercased through their
    // lowercase counterpart; lowercase last to keep the result lowercase.
    collapsed.to_lowercase().to_uppercase().to_lowercase()
}