4 files changed, 475 insertions, 94 deletions
diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs
new file mode 100644
index 0000000..75db98b
--- /dev/null
+++ b/src/util/character_reference.rs
@@ -0,0 +1,206 @@
+//! Helpers for character references.
+
+use crate::util::constant::{
+    CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+    CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+};
+use alloc::string::{String, ToString};
+use core::str;
+
+/// Decode named character references.
+///
+/// Turn the name coming from a named character reference (without the `&` or
+/// `;`) into a string.
+/// This looks the given string up at `0` in the tuples of
+/// [`CHARACTER_REFERENCES`][] when `html5` is `true`, or of
+/// [`CHARACTER_REFERENCES_HTML_4`][] otherwise, and takes the value at `1`.
+///
+/// The result holds a `String` instead of a `char` because named character
+/// references can expand into multiple characters.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_named;
+///
+/// assert_eq!(decode_named("amp", true), Some("&".into()));
+/// assert_eq!(decode_named("AElig", false), Some("Æ".into()));
+/// assert_eq!(decode_named("apos", false), None); // Not in HTML 4.
+/// ```
+///
+/// ## Returns
+///
+/// `Some` with the decoded value when `value` is a name in the selected list,
+/// and `None` when it is not.
+/// This function does not panic on unknown names: it is up to the caller to
+/// decide what happens with a reference that cannot be decoded.
+/// Names are compared with `==`, so casing matters.
+///
+/// ## References
+///
+/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
+/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_named(value: &str, html5: bool) -> Option<String> {
+    let mut iter = if html5 {
+        CHARACTER_REFERENCES.iter()
+    } else {
+        CHARACTER_REFERENCES_HTML_4.iter()
+    };
+    iter.find(|d| d.0 == value).map(|d| d.1.into())
+}
+
+/// Decode numeric character references.
+///
+/// Turn the number (as a string of decimal or hexadecimal digits) coming from
+/// a numeric character reference into a string.
+/// The base that the digits are written in must be passed as `radix`: `10`
+/// for decimal or `16` for hexadecimal.
+///
+/// This returns the associated character as a `String`, or a replacement
+/// character (`U+FFFD`) for C0 control characters (except for HT, LF, FF,
+/// and CR), DEL, C1 control characters, lone surrogates, and out of range
+/// values.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_numeric;
+///
+/// assert_eq!(decode_numeric("123", 10), "{");
+/// assert_eq!(decode_numeric("9", 16), "\t");
+/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
+/// ```
+///
+/// ## Panics
+///
+/// This function panics if an invalid string, or a valid string whose number
+/// does not fit in a `u32`, is given.
+/// It is expected that figuring out whether a number is allowed is handled in
+/// the parser.
+/// When `micromark` is used, this function never panics.
+///
+/// ## References
+///
+/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
+/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_numeric(value: &str, radix: u32) -> String {
+    let code = u32::from_str_radix(value, radix).unwrap();
+    // Lone surrogates and out of range values are rejected by Rust itself.
+    let allowed = char::from_u32(code).filter(|&character| {
+        !matches!(
+            character,
+            // C0 controls, except for HT, LF, FF, and CR.
+            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}'
+            // DEL, followed by the C1 controls.
+            | '\u{7F}'..='\u{9F}'
+        )
+    });
+
+    // Everything that was rejected turns into U+FFFD.
+    allowed.unwrap_or(char::REPLACEMENT_CHARACTER).to_string()
+}
+
+/// Decode a decimal (`b'#'`), hexadecimal (`b'x'`), or named (`b'&'`) reference value.
+pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> {
+    match marker {
+        b'&' => decode_named(value, html5),
+        b'#' | b'x' => Some(decode_numeric(value, if marker == b'x' { 16 } else { 10 })),
+        _ => unreachable!("Unexpected marker `{}`", marker),
+    }
+}
+
+/// Look up the maximum size that the value of a given kind of reference can have.
+///
+/// A value is what sits between the opening marker(s) and the closing `;`.
+///
+/// ## Panics
+///
+/// Panics when `marker` is anything other than `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_max(marker: u8) -> usize {
+    match marker {
+        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
+        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+        other => unreachable!("Unexpected marker `{}`", other),
+    }
+}
+
+/// Pick the predicate that tells whether a byte may occur in the value of a
+/// given kind of reference.
+///
+/// A value is what sits between the opening marker(s) and the closing `;`.
+///
+/// ## Panics
+///
+/// Panics when `marker` is anything other than `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_test(marker: u8) -> fn(&u8) -> bool {
+    match marker {
+        b'#' => u8::is_ascii_digit,
+        b'x' => u8::is_ascii_hexdigit,
+        b'&' => u8::is_ascii_alphanumeric,
+        other => unreachable!("Unexpected marker `{}`", other),
+    }
+}
+
+/// Decode character references in a string.
+///
+/// References that are empty, not closed by `;`, or unknown are kept as-is.
+///
+/// Note: named references are looked up in the HTML 4 list only, as that is
+/// all this is currently used for; a boolean parameter can be added if HTML 5
+/// (which is what normal markdown uses) is ever needed.
+pub fn parse(value: &str) -> String {
+    let bytes = value.as_bytes();
+    let mut index = 0;
+    let len = bytes.len();
+    // Grows a bit smaller with each character reference.
+    let mut result = String::with_capacity(value.len());
+    let mut start = 0;
+
+    while index < len {
+        if bytes[index] == b'&' {
+            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' {
+                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') {
+                    (b'x', index + 3)
+                } else {
+                    (b'#', index + 2)
+                }
+            } else {
+                (b'&', index + 1)
+            };
+
+            let max = value_max(marker);
+            let test = value_test(marker);
+            let mut value_index = 0;
+            while value_index < max && (value_start + value_index) < len {
+                if !test(&bytes[value_start + value_index]) {
+                    break;
+                }
+                value_index += 1;
+            }
+
+            let value_end = value_start + value_index;
+
+            // Non empty and terminated (`get`: the value may run up to the end of input).
+            if value_index > 0 && bytes.get(value_end) == Some(&b';') {
+                if let Some(decoded) = decode(
+                    str::from_utf8(&bytes[value_start..value_end]).unwrap(),
+                    marker,
+                    false,
+                ) {
+                    result.push_str(&value[start..index]);
+                    result.push_str(&decoded);
+                    start = value_end + 1;
+                    index = start;
+                    continue;
+                }
+            }
+        }
+
+        index += 1;
+    }
+
+    result.push_str(&value[start..]);
+
+    result
+}
diff --git a/src/util/constant.rs b/src/util/constant.rs
index e9deac2..65704d0 100644
--- a/src/util/constant.rs
+++ b/src/util/constant.rs
@@ -2433,6 +2433,274 @@ pub const CHARACTER_REFERENCES: [(&str, &str); 2125] = [
     ("zwnj", "‌"),
 ];
 
+// Important: please touch the below lists as few times as possible to keep Git small.
+
+/// List of names and values that form named character references in HTML 4.
+///
+/// This list is normally not used in markdown, but it is used in MDX, because
+/// in JSX attribute values, only the old HTML 4 character references are
+/// supported.
+///
+/// This list is sensitive to casing.
+///
+/// ## References
+///
+/// *   [*§ 1.5.2 HTML Character References* in `JSX`](https://facebook.github.io/jsx/#sec-HTMLCharacterReference)
+pub const CHARACTER_REFERENCES_HTML_4: [(&str, &str); 252] = [
+    ("AElig", "Æ"),
+    ("Aacute", "Á"),
+    ("Acirc", "Â"),
+    ("Agrave", "À"),
+    ("Alpha", "Α"),
+    ("Aring", "Å"),
+    ("Atilde", "Ã"),
+    ("Auml", "Ä"),
+    ("Beta", "Β"),
+    ("Ccedil", "Ç"),
+    ("Chi", "Χ"),
+    ("Dagger", "‡"),
+    ("Delta", "Δ"),
+    ("ETH", "Ð"),
+    ("Eacute", "É"),
+    ("Ecirc", "Ê"),
+    ("Egrave", "È"),
+    ("Epsilon", "Ε"),
+    ("Eta", "Η"),
+    ("Euml", "Ë"),
+    ("Gamma", "Γ"),
+    ("Iacute", "Í"),
+    ("Icirc", "Î"),
+    ("Igrave", "Ì"),
+    ("Iota", "Ι"),
+    ("Iuml", "Ï"),
+    ("Kappa", "Κ"),
+    ("Lambda", "Λ"),
+    ("Mu", "Μ"),
+    ("Ntilde", "Ñ"),
+    ("Nu", "Ν"),
+    ("OElig", "Œ"),
+    ("Oacute", "Ó"),
+    ("Ocirc", "Ô"),
+    ("Ograve", "Ò"),
+    ("Omega", "Ω"),
+    ("Omicron", "Ο"),
+    ("Oslash", "Ø"),
+    ("Otilde", "Õ"),
+    ("Ouml", "Ö"),
+    ("Phi", "Φ"),
+    ("Pi", "Π"),
+    ("Prime", "″"),
+    ("Psi", "Ψ"),
+    ("Rho", "Ρ"),
+    ("Scaron", "Š"),
+    ("Sigma", "Σ"),
+    ("THORN", "Þ"),
+    ("Tau", "Τ"),
+    ("Theta", "Θ"),
+    ("Uacute", "Ú"),
+    ("Ucirc", "Û"),
+    ("Ugrave", "Ù"),
+    ("Upsilon", "Υ"),
+    ("Uuml", "Ü"),
+    ("Xi", "Ξ"),
+    ("Yacute", "Ý"),
+    ("Yuml", "Ÿ"),
+    ("Zeta", "Ζ"),
+    ("aacute", "á"),
+    ("acirc", "â"),
+    ("acute", "´"),
+    ("aelig", "æ"),
+    ("agrave", "à"),
+    ("alefsym", "ℵ"),
+    ("alpha", "α"),
+    ("amp", "&"),
+    ("and", "∧"),
+    ("ang", "∠"),
+    ("aring", "å"),
+    ("asymp", "≈"),
+    ("atilde", "ã"),
+    ("auml", "ä"),
+    ("bdquo", "„"),
+    ("beta", "β"),
+    ("brvbar", "¦"),
+    ("bull", "•"),
+    ("cap", "∩"),
+    ("ccedil", "ç"),
+    ("cedil", "¸"),
+    ("cent", "¢"),
+    ("chi", "χ"),
+    ("circ", "ˆ"),
+    ("clubs", "♣"),
+    ("cong", "≅"),
+    ("copy", "©"),
+    ("crarr", "↵"),
+    ("cup", "∪"),
+    ("curren", "¤"),
+    ("dArr", "⇓"),
+    ("dagger", "†"),
+    ("darr", "↓"),
+    ("deg", "°"),
+    ("delta", "δ"),
+    ("diams", "♦"),
+    ("divide", "÷"),
+    ("eacute", "é"),
+    ("ecirc", "ê"),
+    ("egrave", "è"),
+    ("empty", "∅"),
+    ("emsp", " "),
+    ("ensp", " "),
+    ("epsilon", "ε"),
+    ("equiv", "≡"),
+    ("eta", "η"),
+    ("eth", "ð"),
+    ("euml", "ë"),
+    ("euro", "€"),
+    ("exist", "∃"),
+    ("fnof", "ƒ"),
+    ("forall", "∀"),
+    ("frac12", "½"),
+    ("frac14", "¼"),
+    ("frac34", "¾"),
+    ("frasl", "⁄"),
+    ("gamma", "γ"),
+    ("ge", "≥"),
+    ("gt", ">"),
+    ("hArr", "⇔"),
+    ("harr", "↔"),
+    ("hearts", "♥"),
+    ("hellip", "…"),
+    ("iacute", "í"),
+    ("icirc", "î"),
+    ("iexcl", "¡"),
+    ("igrave", "ì"),
+    ("image", "ℑ"),
+    ("infin", "∞"),
+    ("int", "∫"),
+    ("iota", "ι"),
+    ("iquest", "¿"),
+    ("isin", "∈"),
+    ("iuml", "ï"),
+    ("kappa", "κ"),
+    ("lArr", "⇐"),
+    ("lambda", "λ"),
+    ("lang", "〈"),
+    ("laquo", "«"),
+    ("larr", "←"),
+    ("lceil", "⌈"),
+    ("ldquo", "“"),
+    ("le", "≤"),
+    ("lfloor", "⌊"),
+    ("lowast", "∗"),
+    ("loz", "◊"),
+    ("lrm", "‎"),
+    ("lsaquo", "‹"),
+    ("lsquo", "‘"),
+    ("lt", "<"),
+    ("macr", "¯"),
+    ("mdash", "—"),
+    ("micro", "µ"),
+    ("middot", "·"),
+    ("minus", "−"),
+    ("mu", "μ"),
+    ("nabla", "∇"),
+    ("nbsp", " "),
+    ("ndash", "–"),
+    ("ne", "≠"),
+    ("ni", "∋"),
+    ("not", "¬"),
+    ("notin", "∉"),
+    ("nsub", "⊄"),
+    ("ntilde", "ñ"),
+    ("nu", "ν"),
+    ("oacute", "ó"),
+    ("ocirc", "ô"),
+    ("oelig", "œ"),
+    ("ograve", "ò"),
+    ("oline", "‾"),
+    ("omega", "ω"),
+    ("omicron", "ο"),
+    ("oplus", "⊕"),
+    ("or", "∨"),
+    ("ordf", "ª"),
+    ("ordm", "º"),
+    ("oslash", "ø"),
+    ("otilde", "õ"),
+    ("otimes", "⊗"),
+    ("ouml", "ö"),
+    ("para", "¶"),
+    ("part", "∂"),
+    ("permil", "‰"),
+    ("perp", "⊥"),
+    ("phi", "φ"),
+    ("pi", "π"),
+    ("piv", "ϖ"),
+    ("plusmn", "±"),
+    ("pound", "£"),
+    ("prime", "′"),
+    ("prod", "∏"),
+    ("prop", "∝"),
+    ("psi", "ψ"),
+    ("quot", "\""),
+    ("rArr", "⇒"),
+    ("radic", "√"),
+    ("rang", "〉"),
+    ("raquo", "»"),
+    ("rarr", "→"),
+    ("rceil", "⌉"),
+    ("rdquo", "”"),
+    ("real", "ℜ"),
+    ("reg", "®"),
+    ("rfloor", "⌋"),
+    ("rho", "ρ"),
+    ("rlm", "‏"),
+    ("rsaquo", "›"),
+    ("rsquo", "’"),
+    ("sbquo", "‚"),
+    ("scaron", "š"),
+    ("sdot", "⋅"),
+    ("sect", "§"),
+    ("shy", "\u{AD}"),
+    ("sigma", "σ"),
+    ("sigmaf", "ς"),
+    ("sim", "∼"),
+    ("spades", "♠"),
+    ("sub", "⊂"),
+    ("sube", "⊆"),
+    ("sum", "∑"),
+    ("sup", "⊃"),
+    ("sup1", "¹"),
+    ("sup2", "²"),
+    ("sup3", "³"),
+    ("supe", "⊇"),
+    ("szlig", "ß"),
+    ("tau", "τ"),
+    ("there4", "∴"),
+    ("theta", "θ"),
+    ("thetasym", "ϑ"),
+    ("thinsp", " "),
+    ("thorn", "þ"),
+    ("tilde", "˜"),
+    ("times", "×"),
+    ("trade", "™"),
+    ("uArr", "⇑"),
+    ("uacute", "ú"),
+    ("uarr", "↑"),
+    ("ucirc", "û"),
+    ("ugrave", "ù"),
+    ("uml", "¨"),
+    ("upsih", "ϒ"),
+    ("upsilon", "υ"),
+    ("uuml", "ü"),
+    ("weierp", "℘"),
+    ("xi", "ξ"),
+    ("yacute", "ý"),
+    ("yen", "¥"),
+    ("yuml", "ÿ"),
+    ("zeta", "ζ"),
+    ("zwj", "‍"),
+    ("zwnj", "‌"),
+];
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs
deleted file mode 100644
index d4c983a..0000000
--- a/src/util/decode_character_reference.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-//! Decode character references.
-
-use crate::util::constant::CHARACTER_REFERENCES;
-use alloc::string::{String, ToString};
-
-/// Decode named character references.
-///
-/// Turn the name coming from a named character reference (without the `&` or
-/// `;`) into a string.
-/// This looks the given string up at `0` in the tuples of
-/// [`CHARACTER_REFERENCES`][] and then takes the corresponding value from `1`.
-///
-/// The result is `String` instead of `char` because named character references
-/// can expand into multiple characters.
-///
-/// ## Examples
-///
-/// ```rust ignore
-/// use micromark::util::decode_character_reference::decode_named;
-///
-/// assert_eq!(decode_named("amp"), "&");
-/// assert_eq!(decode_named("AElig"), "Æ");
-/// assert_eq!(decode_named("aelig"), "æ");
-/// ```
-///
-/// ## Panics
-///
-/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is
-/// given.
-/// It is expected that figuring out whether a name is allowed is handled in
-/// the parser.
-/// When `micromark` is used, this function never panics.
-///
-/// ## References
-///
-/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
-/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_named(value: &str) -> String {
-    let entry = CHARACTER_REFERENCES.iter().find(|d| d.0 == value);
-    let tuple = entry.expect("expected valid `name`");
-    tuple.1.to_string()
-}
-
-/// Decode numeric character references.
-///
-/// Turn the number (in string form as either hexadecimal or decimal) coming
-/// from a numeric character reference into a string.
-/// The base of the string form must be passed as the `radix` parameter, as
-/// `10` (decimal) or `16` (hexadecimal).
-///
-/// This returns a `String` form of the associated character or a replacement
-/// character for C0 control characters (except for ASCII whitespace), C1
-/// control characters, lone surrogates, noncharacters, and out of range
-/// characters.
-///
-/// ## Examples
-///
-/// ```rust ignore
-/// use micromark::util::decode_character_reference::decode_numeric;
-///
-/// assert_eq!(decode_numeric("123", 10), "{");
-/// assert_eq!(decode_numeric("9", 16), "\t");
-/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
-/// ```
-///
-/// ## Panics
-///
-/// This function panics if a invalid string or an out of bounds valid string
-/// is given.
-/// It is expected that figuring out whether a number is allowed is handled in
-/// the parser.
-/// When `micromark` is used, this function never panics.
-///
-/// ## References
-///
-/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
-/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_numeric(value: &str, radix: u32) -> String {
-    if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
-        if !matches!(char,
-            // C0 except for HT, LF, FF, CR, space
-            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
-            // Control character (DEL) of c0, and C1 controls.
-            '\u{7F}'..='\u{9F}'
-            // Lone surrogates, noncharacters, and out of range are handled by
-            // Rust.
-        ) {
-            return char.to_string();
-        }
-    }
-
-    char::REPLACEMENT_CHARACTER.to_string()
-}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index dcbf1ae..edc7e14 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,8 +1,8 @@
 //! Utilities used when processing markdown.
 
 pub mod char;
+pub mod character_reference;
 pub mod constant;
-pub mod decode_character_reference;
 pub mod edit_map;
 pub mod encode;
 pub mod gfm_tagfilter;