path: root/src/util/character_reference.rs



//! Helpers for character references.

use crate::util::constant::{
    CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
    CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
};
use alloc::string::String;
use core::str;

/// Decode named character references.
///
/// Turn the name coming from a named character reference (without the `&` or
/// `;`) into a string.
/// This looks the given string up at `0` in the tuples of
/// [`CHARACTER_REFERENCES`][] (or [`CHARACTER_REFERENCES_HTML_4`][]) and then
/// takes the corresponding value from `1`.
///
/// The `html5` boolean us used for named character references, and specifier
/// whether the 2125 names from HTML 5 or the 252 names from HTML 4 are
/// supported.
///
/// The result is `String` instead of `char` because named character references
/// can expand into multiple characters.
///
/// ## Examples
///
/// ```rust ignore
/// use micromark::util::decode_character_reference::decode_named;
///
/// assert_eq!(decode_named("amp", true), "&");
/// assert_eq!(decode_named("AElig", true), "Æ");
/// assert_eq!(decode_named("aelig", true), "æ");
/// ```
///
/// ## References
///
/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
pub fn decode_named(value: &str, html5: bool) -> Option<String> {
    let mut iter = if html5 {
        CHARACTER_REFERENCES.iter()
    } else {
        CHARACTER_REFERENCES_HTML_4.iter()
    };
    iter.find(|d| d.0 == value).map(|d| d.1.into())
}

/// Decode numeric character references.
///
/// Turn the number (in string form as either hexadecimal or decimal) coming
/// from a numeric character reference into a string.
/// The base of the string form must be passed as the `radix` parameter, as
/// `10` (decimal) or `16` (hexadecimal).
///
/// This returns a `String` form of the associated character or a replacement
/// character for C0 control characters (except for ASCII whitespace), C1
/// control characters, lone surrogates, noncharacters, and out of range
/// characters.
///
/// ## Examples
///
/// ```rust ignore
/// use micromark::util::decode_character_reference::decode_numeric;
///
/// assert_eq!(decode_numeric("123", 10), "{");
/// assert_eq!(decode_numeric("9", 16), "\t");
/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
/// ```
///
/// ## Panics
///
/// This function panics if a invalid string or an out of bounds valid string
/// is given.
/// It is expected that figuring out whether a number is allowed is handled in
/// the parser.
/// When `micromark` is used, this function never panics.
///
/// ## References
///
/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
pub fn decode_numeric(value: &str, radix: u32) -> String {
    if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
        if !matches!(char,
            // C0 except for HT, LF, FF, CR, space
            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
            // Control character (DEL) of c0, and C1 controls.
            '\u{7F}'..='\u{9F}'
            // Lone surrogates, noncharacters, and out of range are handled by
            // Rust.
        ) {
            return char.into();
        }
    }

    char::REPLACEMENT_CHARACTER.into()
}

/// Decode a character reference.
///
/// This turns the number (in string form as either hexadecimal or decimal) or
/// name from a character reference into a string.
///
/// The marker specifies the format: `#` for hexadecimal, `x` for decimal, and
/// `&` for named.
///
/// The `html5` boolean us used for named character references, and specifier
/// whether the 2125 names from HTML 5 or the 252 names from HTML 4 are
/// supported.
///
/// ## Panics
///
/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> {
    match marker {
        b'#' => Some(decode_numeric(value, 10)),
        b'x' => Some(decode_numeric(value, 16)),
        b'&' => decode_named(value, html5),
        _ => unreachable!("Unexpected marker `{}`", marker),
    }
}

/// Get the maximum size of a value for different kinds of references.
///
/// The value is the stuff after the markers, before the `;`.
///
/// ## Panics
///
/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
pub fn value_max(marker: u8) -> usize {
    match marker {
        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
        _ => unreachable!("Unexpected marker `{}`", marker),
    }
}

/// Get a test to check if a byte is allowed as a value for different kinds of
/// references.
///
/// The value is the stuff after the markers, before the `;`.
///
/// ## Panics
///
/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
pub fn value_test(marker: u8) -> fn(&u8) -> bool {
    match marker {
        b'&' => u8::is_ascii_alphanumeric,
        b'x' => u8::is_ascii_hexdigit,
        b'#' => u8::is_ascii_digit,
        _ => unreachable!("Unexpected marker `{}`", marker),
    }
}

/// Decode character references in a string.
///
/// > 👉 **Note**: this currently only supports the 252 named character
/// > references from HTML 4, as it’s only used for JSX.
/// >
/// > If it’s ever needed to support HTML 5 (which is what normal markdown
/// > uses), a boolean parameter can be added here.
pub fn parse(value: &str) -> String {
    let bytes = value.as_bytes();
    let mut index = 0;
    let len = bytes.len();
    // Grows a bit smaller with each character reference.
    let mut result = String::with_capacity(value.len());
    let mut start = 0;

    while index < len {
        if bytes[index] == b'&' {
            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' {
                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') {
                    (b'x', index + 3)
                } else {
                    (b'#', index + 2)
                }
            } else {
                (b'&', index + 1)
            };

            let max = value_max(marker);
            let test = value_test(marker);
            let mut value_index = 0;
            while value_index < max && (value_start + value_index) < len {
                if !test(&bytes[value_start + value_index]) {
                    break;
                }
                value_index += 1;
            }

            let value_end = value_start + value_index;

            // Non empty and terminated.
            if value_index > 0 && bytes[value_end] == b';' {
                if let Some(decoded) = decode(
                    str::from_utf8(&bytes[value_start..value_end]).unwrap(),
                    marker,
                    false,
                ) {
                    result.push_str(&value[start..index]);
                    result.push_str(&decoded);
                    start = value_end + 1;
                    index = start;
                    continue;
                }
            }
        }

        index += 1;
    }

    result.push_str(&value[start..]);

    result
}
//! Helpers for character references.

use crate::util::constant::{
    CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
    CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
};
use alloc::string::String;
use core::str;

/// Decode named character references.
///
/// Turn the name coming from a named character reference (without the `&` or
/// `;`) into a string.
/// This looks the given string up at `0` in the tuples of
/// [`CHARACTER_REFERENCES`][] (or [`CHARACTER_REFERENCES_HTML_4`][]) and then
/// takes the corresponding value from `1`.
///
/// The `html5` boolean us used for named character references, and specifier
/// whether the 2125 names from HTML 5 or the 252 names from HTML 4 are
/// supported.
///
/// The result is `String` instead of `char` because named character references
/// can expand into multiple characters.
///
/// ## Examples
///
/// ```rust ignore
/// use micromark::util::decode_character_reference::decode_named;
///
/// assert_eq!(decode_named("amp", true), "&");
/// assert_eq!(decode_named("AElig", true), "Æ");
/// assert_eq!(decode_named("aelig", true), "æ");
/// ```
///
/// ## References
///
/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
pub fn decode_named(value: &str, html5: bool) -> Option<String> {
    let mut iter = if html5 {
        CHARACTER_REFERENCES.iter()
    } else {
        CHARACTER_REFERENCES_HTML_4.iter()
    };
    iter.find(|d| d.0 == value).map(|d| d.1.into())
}

/// Decode numeric character references.
///
/// Turn the number (in string form as either hexadecimal or decimal) coming
/// from a numeric character reference into a string.
/// The base of the string form must be passed as the `radix` parameter, as
/// `10` (decimal) or `16` (hexadecimal).
///
/// This returns a `String` form of the associated character or a replacement
/// character for C0 control characters (except for ASCII whitespace), C1
/// control characters, lone surrogates, noncharacters, and out of range
/// characters.
///
/// ## Examples
///
/// ```rust ignore
/// use micromark::util::decode_character_reference::decode_numeric;
///
/// assert_eq!(decode_numeric("123", 10), "{");
/// assert_eq!(decode_numeric("9", 16), "\t");
/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
/// ```
///
/// ## Panics
///
/// This function panics if a invalid string or an out of bounds valid string
/// is given.
/// It is expected that figuring out whether a number is allowed is handled in
/// the parser.
/// When `micromark` is used, this function never panics.
///
/// ## References
///
/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
pub fn decode_numeric(value: &str, radix: u32) -> String {
    if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
        if !matches!(char,
            // C0 except for HT, LF, FF, CR, space
            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
            // Control character (DEL) of c0, and C1 controls.
            '\u{7F}'..='\u{9F}'
            // Lone surrogates, noncharacters, and out of range are handled by
            // Rust.
        ) {
            return char.into();
        }
    }

    char::REPLACEMENT_CHARACTER.into()
}

/// Decode a character reference.
///
/// This turns the number (in string form as either hexadecimal or decimal) or
/// name from a character reference into a string.
///
/// The marker specifies the format: `#` for hexadecimal, `x` for decimal, and
/// `&` for named.
///
/// The `html5` boolean us used for named character references, and specifier
/// whether the 2125 names from HTML 5 or the 252 names from HTML 4 are
/// supported.
///
/// ## Panics
///
/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> {
    match marker {
        b'#' => Some(decode_numeric(value, 10)),
        b'x' => Some(decode_numeric(value, 16)),
        b'&' => decode_named(value, html5),
        _ => unreachable!("Unexpected marker `{}`", marker),
    }
}

/// Get the maximum size of a value for different kinds of references.
///
/// The value is the stuff after the markers, before the `;`.
///
/// ## Panics
///
/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
pub fn value_max(marker: u8) -> usize {
    match marker {
        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
        _ => unreachable!("Unexpected marker `{}`", marker),
    }
}

/// Get a test to check if a byte is allowed as a value for different kinds of
/// references.
///
/// The value is the stuff after the markers, before the `;`.
///
/// ## Panics
///
/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
pub fn value_test(marker: u8) -> fn(&u8) -> bool {
    match marker {
        b'&' => u8::is_ascii_alphanumeric,
        b'x' => u8::is_ascii_hexdigit,
        b'#' => u8::is_ascii_digit,
        _ => unreachable!("Unexpected marker `{}`", marker),
    }
}

/// Decode character references in a string.
///
/// > 👉 **Note**: this currently only supports the 252 named character
/// > references from HTML 4, as it’s only used for JSX.
/// >
/// > If it’s ever needed to support HTML 5 (which is what normal markdown
/// > uses), a boolean parameter can be added here.
pub fn parse(value: &str) -> String {
    let bytes = value.as_bytes();
    let mut index = 0;
    let len = bytes.len();
    // Grows a bit smaller with each character reference.
    let mut result = String::with_capacity(value.len());
    let mut start = 0;

    while index < len {
        if bytes[index] == b'&' {
            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' {
                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') {
                    (b'x', index + 3)
                } else {
                    (b'#', index + 2)
                }
            } else {
                (b'&', index + 1)
            };

            let max = value_max(marker);
            let test = value_test(marker);
            let mut value_index = 0;
            while value_index < max && (value_start + value_index) < len {
                if !test(&bytes[value_start + value_index]) {
                    break;
                }
                value_index += 1;
            }

            let value_end = value_start + value_index;

            // Non empty and terminated.
            if value_index > 0 && bytes[value_end] == b';' {
                if let Some(decoded) = decode(
                    str::from_utf8(&bytes[value_start..value_end]).unwrap(),
                    marker,
                    false,
                ) {
                    result.push_str(&value[start..index]);
                    result.push_str(&decoded);
                    start = value_end + 1;
                    index = start;
                    continue;
                }
            }
        }

        index += 1;
    }

    result.push_str(&value[start..]);

    result
}