1 files changed, 206 insertions, 0 deletions
diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs
new file mode 100644
index 0000000..75db98b
--- /dev/null
+++ b/src/util/character_reference.rs
@@ -0,0 +1,206 @@
+//! Helpers for character references.
+
+use crate::util::constant::{
+    CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+    CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+};
+use alloc::string::{String, ToString};
+use core::str;
+
+/// Decode a named character reference.
+///
+/// Takes the name from a named character reference (so without the `&` or
+/// `;`) and yields the text it stands for.
+/// The name is compared against field `0` of each tuple in
+/// [`CHARACTER_REFERENCES`][] when `html5` is `true`, and of each tuple in
+/// [`CHARACTER_REFERENCES_HTML_4`][] otherwise.
+/// Field `1` of the first tuple that matches is the result.
+///
+/// A `String` is produced rather than a `char`, as one named character
+/// reference can expand into several characters.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_named;
+///
+/// assert_eq!(decode_named("amp", true), Some("&".to_string()));
+/// assert_eq!(decode_named("AElig", true), Some("Æ".to_string()));
+/// assert_eq!(decode_named("aelig", true), Some("æ".to_string()));
+/// ```
+///
+/// ## Returns
+///
+/// `Some` with the expansion when the name is found, `None` when it is not.
+/// Names are compared with `==`, so the lookup is case-sensitive.
+/// This function does not panic on unknown names: what to do with them is
+/// left to the caller.
+///
+/// ## References
+///
+/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
+/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_named(value: &str, html5: bool) -> Option<String> {
+    let mut entries = match html5 {
+        true => CHARACTER_REFERENCES.iter(),
+        false => CHARACTER_REFERENCES_HTML_4.iter(),
+    };
+    let found = entries.find(|entry| entry.0 == value)?;
+    Some(found.1.into())
+}
+
+/// Decode a numeric character reference.
+///
+/// Takes the digits from a numeric character reference (so without the `&#`,
+/// `&#x`, or `;`) and yields the character they stand for, as a string.
+/// `radix` says how to read the digits: `10` for decimal or `16` for
+/// hexadecimal.
+///
+/// The replacement character (`�`) is returned instead when the number is:
+///
+/// *   a C0 control other than HT, LF, FF, or CR
+/// *   DEL or a C1 control
+/// *   a lone surrogate or above U+10FFFF (the values `char::from_u32`
+///     rejects)
+///
+/// Every other `char` is returned as is.
+/// Note that this includes noncharacters such as U+FFFE, as those are valid
+/// `char`s in Rust.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_numeric;
+///
+/// assert_eq!(decode_numeric("123", 10), "{");
+/// assert_eq!(decode_numeric("9", 16), "\t");
+/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
+/// ```
+///
+/// ## Panics
+///
+/// Panics if `value` cannot be read as a `u32` in the given `radix` (it is
+/// empty, contains something other than digits of that radix, or is too
+/// large), or if `radix` is one that `u32::from_str_radix` does not accept.
+/// The parser is expected to rule these cases out before calling this
+/// function.
+///
+/// ## References
+///
+/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
+/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_numeric(value: &str, radix: u32) -> String {
+    // Ruling out bad or overflowing digits is up to the caller, hence `unwrap`.
+    let code = u32::from_str_radix(value, radix).unwrap();
+    char::from_u32(code)
+        // `None` here already covers lone surrogates and out of range values.
+        .filter(|character| {
+            !matches!(*character,
+                // C0 controls, apart from HT, LF, FF, and CR.
+                '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
+                // DEL, and the C1 controls.
+                '\u{7F}'..='\u{9F}'
+            )
+        })
+        .unwrap_or(char::REPLACEMENT_CHARACTER)
+        .to_string()
+}
+
+/// Decode the value of a character reference of any kind.
+///
+/// `marker` selects the kind: `b'#'` for decimal and `b'x'` for hexadecimal
+/// (both handled by [`decode_numeric`][]), or `b'&'` for named (handled by
+/// [`decode_named`][], to which `html5` is passed along).
+///
+/// Numeric references always yield `Some`.
+/// Named references yield `None` when the name is not known.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`, and whenever
+/// [`decode_numeric`][] panics on the given `value`.
+pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> {
+    // Named references are the only kind that can fail to decode.
+    if marker == b'&' {
+        return decode_named(value, html5);
+    }
+
+    let radix = match marker {
+        b'#' => 10,
+        b'x' => 16,
+        _ => unreachable!("Unexpected marker `{}`", marker),
+    };
+    Some(decode_numeric(value, radix))
+}
+
+/// Get how long the value of a character reference is allowed to be.
+///
+/// The value is what comes after the markers (`&`, `&#`, or `&#x`) and before
+/// the `;`.
+/// `marker` selects the kind of reference: `b'#'` for decimal, `b'x'` for
+/// hexadecimal, and `b'&'` for named.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_max(marker: u8) -> usize {
+    match marker {
+        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
+        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+        other => unreachable!("Unexpected marker `{}`", other),
+    }
+}
+
+/// Get the check that says whether a byte may occur in the value of a
+/// character reference.
+///
+/// The value is what comes after the markers (`&`, `&#`, or `&#x`) and before
+/// the `;`.
+/// `marker` selects the kind of reference: decimal (`b'#'`) allows ASCII
+/// digits, hexadecimal (`b'x'`) allows ASCII hexadecimal digits, and named
+/// (`b'&'`) allows ASCII alphanumerics.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_test(marker: u8) -> fn(&u8) -> bool {
+    match marker {
+        b'#' => u8::is_ascii_digit,
+        b'x' => u8::is_ascii_hexdigit,
+        b'&' => u8::is_ascii_alphanumeric,
+        other => unreachable!("Unexpected marker `{}`", other),
+    }
+}
+
+/// Decode character references in a string.
+///
+/// Looks for named (`&name;`), decimal (`&#123;`), and hexadecimal (`&#x7B;`
+/// or `&#X7B;`) character references in `value` and replaces each one that
+/// can be decoded with its expansion.
+/// Everything else is copied over untouched.
+/// That includes candidates whose value is empty, longer than
+/// [`value_max`][] allows, not followed by `;` (also when the input simply
+/// ends), or, for named references, not known.
+///
+/// Note: this currently only supports HTML 4 references, as it’s only used for
+/// them.
+///
+/// If it’s ever needed to support HTML 5 (which is what normal markdown uses),
+/// a boolean parameter can be added here.
+///
+/// ## Panics
+///
+/// NOTE(review): numeric values are handed to [`decode`][], which panics on
+/// numbers that do not fit in a `u32`; presumably the limits from
+/// [`value_max`][] rule that out, but confirm against the constants.
+pub fn parse(value: &str) -> String {
+    let bytes = value.as_bytes();
+    let len = bytes.len();
+    // An upper bound: the result gets a bit smaller with each character
+    // reference.
+    let mut result = String::with_capacity(len);
+    // Where the text that is not yet copied into `result` starts.
+    let mut start = 0;
+    let mut index = 0;
+
+    while index < len {
+        if bytes[index] == b'&' {
+            // Figure out the kind of reference and where its value would start.
+            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' {
+                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') {
+                    (b'x', index + 3)
+                } else {
+                    (b'#', index + 2)
+                }
+            } else {
+                (b'&', index + 1)
+            };
+
+            // Measure the value: bytes allowed for this kind, up to the
+            // maximum size or the end of the input.
+            let max = value_max(marker);
+            let test = value_test(marker);
+            let mut value_index = 0;
+            while value_index < max && (value_start + value_index) < len {
+                if !test(&bytes[value_start + value_index]) {
+                    break;
+                }
+                value_index += 1;
+            }
+
+            let value_end = value_start + value_index;
+
+            // Non empty and terminated.
+            // The value can run up to the end of the input, in which case
+            // there is no byte at `value_end`, so it must be a checked access.
+            if value_index > 0 && bytes.get(value_end) == Some(&b';') {
+                if let Some(decoded) = decode(
+                    // Only ASCII bytes passed `test`, so this is valid UTF-8.
+                    str::from_utf8(&bytes[value_start..value_end]).unwrap(),
+                    marker,
+                    false,
+                ) {
+                    // Flush the text before the reference, then the expansion,
+                    // and continue right after the `;`.
+                    result.push_str(&value[start..index]);
+                    result.push_str(&decoded);
+                    start = value_end + 1;
+                    index = start;
+                    continue;
+                }
+            }
+        }
+
+        index += 1;
+    }
+
+    // Whatever is left after the last decoded reference.
+    result.push_str(&value[start..]);
+
+    result
+}