From b75d7976cfe8db43783b930c1f4774f2ad4936f5 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 6 Oct 2022 11:43:26 +0200 Subject: Add support for HTML 4 character references in JSX attributes --- src/util/character_reference.rs | 206 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 src/util/character_reference.rs (limited to 'src/util/character_reference.rs') diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs new file mode 100644 index 0000000..75db98b --- /dev/null +++ b/src/util/character_reference.rs @@ -0,0 +1,206 @@ +//! Helpers for character references. + +use crate::util::constant::{ + CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, +}; +use alloc::string::{String, ToString}; +use core::str; + +/// Decode named character references. +/// +/// Turn the name coming from a named character reference (without the `&` or +/// `;`) into a string. +/// This looks the given string up at `0` in the tuples of +/// [`CHARACTER_REFERENCES`][] (or [`CHARACTER_REFERENCES_HTML_4`][]) and then +/// takes the corresponding value from `1`. +/// +/// The result is `String` instead of `char` because named character references +/// can expand into multiple characters. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::decode_character_reference::decode_named; +/// +/// assert_eq!(decode_named("amp", true), "&"); +/// assert_eq!(decode_named("AElig", true), "Æ"); +/// assert_eq!(decode_named("aelig", true), "æ"); +/// ``` +/// +/// ## Panics +/// +/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is +/// given. +/// It is expected that figuring out whether a name is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. +/// +/// ## References +/// +/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_named(value: &str, html5: bool) -> Option { + let mut iter = if html5 { + CHARACTER_REFERENCES.iter() + } else { + CHARACTER_REFERENCES_HTML_4.iter() + }; + iter.find(|d| d.0 == value).map(|d| d.1.into()) +} + +/// Decode numeric character references. +/// +/// Turn the number (in string form as either hexadecimal or decimal) coming +/// from a numeric character reference into a string. +/// The base of the string form must be passed as the `radix` parameter, as +/// `10` (decimal) or `16` (hexadecimal). +/// +/// This returns a `String` form of the associated character or a replacement +/// character for C0 control characters (except for ASCII whitespace), C1 +/// control characters, lone surrogates, noncharacters, and out of range +/// characters. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::decode_character_reference::decode_numeric; +/// +/// assert_eq!(decode_numeric("123", 10), "{"); +/// assert_eq!(decode_numeric("9", 16), "\t"); +/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. +/// ``` +/// +/// ## Panics +/// +/// This function panics if a invalid string or an out of bounds valid string +/// is given. +/// It is expected that figuring out whether a number is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. +/// +/// ## References +/// +/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_numeric(value: &str, radix: u32) -> String { + if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { + if !matches!(char, + // C0 except for HT, LF, FF, CR, space + '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | + // Control character (DEL) of c0, and C1 controls. + '\u{7F}'..='\u{9F}' + // Lone surrogates, noncharacters, and out of range are handled by + // Rust. + ) { + return char.to_string(); + } + } + + char::REPLACEMENT_CHARACTER.to_string() +} + +pub fn decode(value: &str, marker: u8, html5: bool) -> Option { + match marker { + b'#' => Some(decode_numeric(value, 10)), + b'x' => Some(decode_numeric(value, 16)), + b'&' => decode_named(value, html5), + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Get the maximum size of a value for different kinds of references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. +pub fn value_max(marker: u8) -> usize { + match marker { + b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX, + b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Get a test to check if a byte is allowed as a value for different kinds of +/// references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. +pub fn value_test(marker: u8) -> fn(&u8) -> bool { + match marker { + b'&' => u8::is_ascii_alphanumeric, + b'x' => u8::is_ascii_hexdigit, + b'#' => u8::is_ascii_digit, + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Decode character references in a string. +/// +/// Note: this currently only supports HTML 4 references, as it’s only used for +/// them. +/// +/// If it’s ever needed to support HTML 5 (which is what normal markdown uses), +/// a boolean parameter can be added here. +pub fn parse(value: &str) -> String { + let bytes = value.as_bytes(); + let mut index = 0; + let len = bytes.len(); + // Grows a bit smaller with each character reference. + let mut result = String::with_capacity(value.len()); + let mut start = 0; + + while index < len { + if bytes[index] == b'&' { + let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' { + if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') { + (b'x', index + 3) + } else { + (b'#', index + 2) + } + } else { + (b'&', index + 1) + }; + + let max = value_max(marker); + let test = value_test(marker); + let mut value_index = 0; + while value_index < max && (value_start + value_index) < len { + if !test(&bytes[value_start + value_index]) { + break; + } + value_index += 1; + } + + let value_end = value_start + value_index; + + // Non empty and terminated. + if value_index > 0 && bytes[value_end] == b';' { + if let Some(decoded) = decode( + str::from_utf8(&bytes[value_start..value_end]).unwrap(), + marker, + false, + ) { + result.push_str(&value[start..index]); + result.push_str(&decoded); + start = value_end + 1; + index = start; + continue; + } + } + } + + index += 1; + } + + result.push_str(&value[start..]); + + result +} -- cgit