src/util/sanitize_uri.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

//! Utilities to make urls safe.

use crate::util::encode::encode;

/// Make a value safe for injection as a URL.
///
/// This encodes unsafe characters with percent-encoding and skips already
/// encoded sequences (see `normalize_uri` below).
/// Further unsafe characters are encoded as character references (see
/// `encode`).
///
/// Then, a vec of (lowercase) allowed protocols can be given, in which case
/// the URL is sanitized.
///
/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])`
/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`.
/// If the URL includes an unknown protocol (one not matched by `protocol`, such
/// as a dangerous example, `javascript:`), the value is ignored.
///
/// ## References
///
/// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
    let value = encode(&normalize_uri(value));

    if let Some(protocols) = protocols {
        let chars: Vec<char> = value.chars().collect();
        let mut index = 0;
        let mut colon: Option<usize> = None;

        while index < chars.len() {
            let char = chars[index];

            match char {
                ':' => {
                    colon = Some(index);
                    break;
                }
                '?' | '#' | '/' => break,
                _ => {}
            }

            index += 1;
        }

        // If there is no protocol, or the first colon is after `?`, `#`, or `/`, it’s relative.
        // It is a protocol, it should be allowed.
        if let Some(colon) = colon {
            let protocol = chars[0..colon].iter().collect::<String>().to_lowercase();
            if !protocols.contains(&protocol.as_str()) {
                return "".to_string();
            }
        }
    }

    value
}

/// Normalize a URL (such as used in definitions).
///
/// Encode unsafe characters with percent-encoding, skipping already encoded
/// sequences.
///
/// ## References
///
/// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
fn normalize_uri(value: &str) -> String {
    let chars: Vec<char> = value.chars().collect();
    let mut result: Vec<String> = vec![];
    let mut index = 0;
    let mut start = 0;
    let mut buff = [0; 4];

    while index < chars.len() {
        let char = chars[index];

        // A correct percent encoded value.
        if char == '%'
            && index + 2 < chars.len()
            && chars[index + 1].is_ascii_alphanumeric()
            && chars[index + 2].is_ascii_alphanumeric()
        {
            index += 3;
            continue;
        }

        // Note: Rust already takes care of lone astral surrogates.
        // Non-ascii or not allowed ascii.
        if char >= '\u{0080}'
            || !matches!(char, '!' | '#' | '$' | '&'..=';' | '=' | '?'..='Z' | '_' | 'a'..='z' | '~')
        {
            result.push(chars[start..index].iter().collect::<String>());

            char.encode_utf8(&mut buff);
            result.push(
                buff[0..char.len_utf8()]
                    .iter()
                    .map(|&byte| format!("%{:X}", byte))
                    .collect::<String>(),
            );

            start = index + 1;
        }

        index += 1;
    }

    result.push(chars[start..].iter().collect::<String>());

    result.join("")
}