aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-10-06 11:43:26 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-10-06 11:43:26 +0200
commitb75d7976cfe8db43783b930c1f4774f2ad4936f5 (patch)
treed8c38c5bc6d1427b408d0b6b53aeb33f39e8d704
parentc12c31e1b2d55fa407217c0e14c51c8693f919ae (diff)
downloadmarkdown-rs-b75d7976cfe8db43783b930c1f4774f2ad4936f5.tar.gz
markdown-rs-b75d7976cfe8db43783b930c1f4774f2ad4936f5.tar.bz2
markdown-rs-b75d7976cfe8db43783b930c1f4774f2ad4936f5.zip
Add support for HTML 4 character references in JSX attributes
-rw-r--r--src/construct/character_reference.rs27
-rw-r--r--src/to_html.rs12
-rw-r--r--src/to_mdast.rs20
-rw-r--r--src/util/character_reference.rs206
-rw-r--r--src/util/constant.rs268
-rw-r--r--src/util/decode_character_reference.rs93
-rw-r--r--src/util/mod.rs2
-rw-r--r--tests/mdx_jsx_text.rs91
8 files changed, 584 insertions, 135 deletions
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index 927e3d9..d87050c 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -64,7 +64,7 @@
//! [string]: crate::construct::string
//! [text]: crate::construct::text
//! [character_escape]: crate::construct::character_reference
-//! [decode_numeric]: crate::util::decode_character_reference::decode_numeric
+//! [decode_numeric]: crate::util::character_reference::decode_numeric
//! [character_references]: crate::util::constant::CHARACTER_REFERENCES
//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
@@ -72,10 +72,7 @@ use crate::event::Name;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use crate::util::{
- constant::{
- CHARACTER_REFERENCES, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
- CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
- },
+ character_reference::{decode_named, value_max, value_test},
slice::Slice,
};
@@ -173,9 +170,8 @@ pub fn value(tokenizer: &mut Tokenizer) -> State {
tokenizer.point.index - tokenizer.tokenize_state.size,
tokenizer.point.index,
);
- let name = slice.as_str();
- if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) {
+ if decode_named(slice.as_str(), true).is_none() {
tokenizer.tokenize_state.marker = 0;
tokenizer.tokenize_state.size = 0;
return State::Nok;
@@ -192,21 +188,10 @@ pub fn value(tokenizer: &mut Tokenizer) -> State {
return State::Ok;
}
- let max = match tokenizer.tokenize_state.marker {
- b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
- b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
- b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
- _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker),
- };
- let test = match tokenizer.tokenize_state.marker {
- b'&' => u8::is_ascii_alphanumeric,
- b'x' => u8::is_ascii_hexdigit,
- b'#' => u8::is_ascii_digit,
- _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker),
- };
-
if let Some(byte) = tokenizer.current {
- if tokenizer.tokenize_state.size < max && test(&byte) {
+ if tokenizer.tokenize_state.size < value_max(tokenizer.tokenize_state.marker)
+ && value_test(tokenizer.tokenize_state.marker)(&byte)
+ {
tokenizer.tokenize_state.size += 1;
tokenizer.consume();
return State::Next(StateName::CharacterReferenceValue);
diff --git a/src/to_html.rs b/src/to_html.rs
index d7d054d..814f7cf 100644
--- a/src/to_html.rs
+++ b/src/to_html.rs
@@ -2,8 +2,8 @@
use crate::event::{Event, Kind, Name};
use crate::mdast::AlignKind;
use crate::util::{
+ character_reference::decode as decode_character_reference,
constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC},
- decode_character_reference::{decode_named, decode_numeric},
encode::encode,
gfm_tagfilter::gfm_tagfilter,
infer::{gfm_table_align, list_loose},
@@ -783,14 +783,8 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {
context.bytes,
&Position::from_exit_event(context.events, context.index),
);
- let value = slice.as_str();
-
- let value = match marker {
- b'#' => decode_numeric(value, 10),
- b'x' => decode_numeric(value, 16),
- b'&' => decode_named(value),
- _ => panic!("impossible"),
- };
+ let value = decode_character_reference(slice.as_str(), marker, true)
+ .expect("expected to parse only valid named references");
context.push(&encode(&value, context.encode_html));
}
diff --git a/src/to_mdast.rs b/src/to_mdast.rs
index c47eb22..4db76e6 100644
--- a/src/to_mdast.rs
+++ b/src/to_mdast.rs
@@ -10,7 +10,9 @@ use crate::mdast::{
};
use crate::unist::{Point, Position};
use crate::util::{
- decode_character_reference::{decode_named, decode_numeric},
+ character_reference::{
+ decode as decode_character_reference, parse as parse_character_reference,
+ },
infer::{gfm_table_align, list_item_loose, list_loose},
normalize_identifier::normalize_identifier,
slice::{Position as SlicePosition, Slice},
@@ -892,14 +894,9 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {
context.bytes,
&SlicePosition::from_exit_event(context.events, context.index),
);
- let value = slice.as_str();
-
- let value = match context.character_reference_marker {
- b'#' => decode_numeric(value, 10),
- b'x' => decode_numeric(value, 16),
- b'&' => decode_named(value),
- _ => panic!("impossible"),
- };
+ let value =
+ decode_character_reference(slice.as_str(), context.character_reference_marker, true)
+ .expect("expected to parse only valid named references");
if let Node::Text(node) = context.tail_mut() {
node.value.push_str(value.as_str());
@@ -1558,8 +1555,9 @@ fn on_exit_mdx_jsx_tag_attribute_value_literal(context: &mut CompileContext) {
.attributes
.last_mut()
{
- // To do: character references.
- node.value = Some(AttributeValue::Literal(value.to_string()));
+ node.value = Some(AttributeValue::Literal(parse_character_reference(
+ &value.to_string(),
+ )));
} else {
unreachable!("expected property")
}
diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs
new file mode 100644
index 0000000..75db98b
--- /dev/null
+++ b/src/util/character_reference.rs
@@ -0,0 +1,206 @@
+//! Helpers for character references.
+
+use crate::util::constant::{
+ CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+ CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+};
+use alloc::string::{String, ToString};
+use core::str;
+
+/// Decode a named character reference.
+///
+/// Takes the name of a named character reference (the part between `&` and
+/// `;`) and searches for it in field `0` of the tuples in
+/// [`CHARACTER_REFERENCES`][] when `html5` is `true`, or in
+/// [`CHARACTER_REFERENCES_HTML_4`][] when `html5` is `false`.
+/// On a match, field `1` of that tuple is returned as an owned string.
+///
+/// The result holds a `String` instead of a `char` because named character
+/// references can expand into multiple characters.
+///
+/// Names are compared exactly, so casing matters.
+///
+/// ## Returns
+///
+/// `Some` with the decoded text when the name is in the selected list,
+/// `None` otherwise.
+/// Unknown names do not cause a panic: callers decide how to treat them.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_named;
+///
+/// assert_eq!(decode_named("amp", true), Some("&".to_string()));
+/// assert_eq!(decode_named("AElig", true), Some("Æ".to_string()));
+/// assert_eq!(decode_named("aelig", true), Some("æ".to_string()));
+/// // Not part of the HTML 4 list.
+/// assert_eq!(decode_named("Dcaron", false), None);
+/// ```
+///
+/// ## References
+///
+/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_named(value: &str, html5: bool) -> Option<String> {
+    // Pick the lookup table first, then run one shared search over it.
+    let table: &[(&str, &str)] = if html5 {
+        &CHARACTER_REFERENCES
+    } else {
+        &CHARACTER_REFERENCES_HTML_4
+    };
+
+    table
+        .iter()
+        .find(|(name, _)| *name == value)
+        .map(|(_, decoded)| String::from(*decoded))
+}
+
+/// Decode a numeric character reference.
+///
+/// Takes the digits of a numeric character reference (in string form, either
+/// decimal or hexadecimal) and turns them into a string.
+/// The base of those digits is given by `radix`: `10` for decimal or `16` for
+/// hexadecimal.
+///
+/// The result is the matching character in `String` form, or the replacement
+/// character (`U+FFFD`) for C0 control characters (except for ASCII
+/// whitespace), DEL, C1 control characters, lone surrogates, and out of range
+/// values.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_numeric;
+///
+/// assert_eq!(decode_numeric("123", 10), "{");
+/// assert_eq!(decode_numeric("9", 16), "\t");
+/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
+/// ```
+///
+/// ## Panics
+///
+/// This function panics if `value` cannot be parsed as a `u32` in the given
+/// `radix` (invalid digits, an empty string, or a number that overflows).
+/// It is expected that figuring out whether a number is allowed is handled in
+/// the parser.
+///
+/// ## References
+///
+/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_numeric(value: &str, radix: u32) -> String {
+    let code = u32::from_str_radix(value, radix).unwrap();
+
+    // `char::from_u32` already yields `None` for lone surrogates and values
+    // that are out of range, so only the control characters are filtered
+    // here.
+    char::from_u32(code)
+        .filter(|&character| {
+            !matches!(character,
+                // C0 except for HT, LF, FF, CR, space
+                '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
+                // Control character (DEL) of c0, and C1 controls.
+                '\u{7F}'..='\u{9F}'
+            )
+        })
+        .unwrap_or(char::REPLACEMENT_CHARACTER)
+        .to_string()
+}
+
+/// Decode the value of a character reference, based on its marker.
+///
+/// `value` is the part after the markers and before the `;`.
+/// `marker` selects how it is interpreted:
+///
+/// * `b'#'` — decimal digits, decoded with [`decode_numeric`][]
+/// * `b'x'` — hexadecimal digits, decoded with [`decode_numeric`][]
+/// * `b'&'` — a name, looked up with [`decode_named`][] (`html5` is passed
+///   through and picks the list of names)
+///
+/// Numeric references always yield `Some`.
+/// Named references yield `None` when the name is unknown.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`, or if a numeric
+/// `value` cannot be parsed in the radix that belongs to `marker`.
+pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> {
+    // Named references are the only kind that can fail to decode.
+    if marker == b'&' {
+        return decode_named(value, html5);
+    }
+
+    let radix = match marker {
+        b'#' => 10,
+        b'x' => 16,
+        _ => unreachable!("Unexpected marker `{}`", marker),
+    };
+
+    Some(decode_numeric(value, radix))
+}
+
+/// Get the maximum number of bytes a reference value may have.
+///
+/// The value is the part after the markers and before the `;`.
+/// The limit depends on the kind of reference, which is identified by
+/// `marker`: `b'#'` for decimal, `b'x'` for hexadecimal, and `b'&'` for
+/// named references.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_max(marker: u8) -> usize {
+    match marker {
+        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
+        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+        other => unreachable!("Unexpected marker `{}`", other),
+    }
+}
+
+/// Get the predicate that decides whether a byte may occur in a reference
+/// value.
+///
+/// The value is the part after the markers and before the `;`.
+/// Which bytes are allowed depends on the kind of reference, identified by
+/// `marker`: ASCII digits for `b'#'`, ASCII hexadecimal digits for `b'x'`,
+/// and ASCII alphanumerics for `b'&'`.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_test(marker: u8) -> fn(&u8) -> bool {
+    match marker {
+        b'#' => u8::is_ascii_digit,
+        b'x' => u8::is_ascii_hexdigit,
+        b'&' => u8::is_ascii_alphanumeric,
+        other => unreachable!("Unexpected marker `{}`", other),
+    }
+}
+
+/// Decode character references in a string.
+///
+/// Scans `value` for `&name;`, `&#123;`, and `&#xAB;` / `&#XAB;` sequences and
+/// replaces each valid one with the text it stands for.
+/// Everything else is copied to the result unchanged, including things that
+/// merely look like references: empty values (`&;`, `&#;`), values that are
+/// too long, unknown names, and references that are not terminated by `;`
+/// (also when the input ends right after the value, as in `a &amp`).
+///
+/// Note: this currently only supports HTML 4 references, as it’s only used for
+/// them.
+///
+/// If it’s ever needed to support HTML 5 (which is what normal markdown uses),
+/// a boolean parameter can be added here.
+pub fn parse(value: &str) -> String {
+    let bytes = value.as_bytes();
+    let mut index = 0;
+    let len = bytes.len();
+    // Grows a bit smaller with each character reference.
+    let mut result = String::with_capacity(value.len());
+    // Start of the input that has not been copied into `result` yet.
+    let mut start = 0;
+
+    while index < len {
+        if bytes[index] == b'&' {
+            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' {
+                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') {
+                    (b'x', index + 3)
+                } else {
+                    (b'#', index + 2)
+                }
+            } else {
+                (b'&', index + 1)
+            };
+
+            // Take as many allowed bytes as possible, up to the maximum for
+            // this kind of reference and never past the end of the input.
+            let max = value_max(marker);
+            let test = value_test(marker);
+            let mut value_index = 0;
+            while value_index < max && (value_start + value_index) < len {
+                if !test(&bytes[value_start + value_index]) {
+                    break;
+                }
+                value_index += 1;
+            }
+
+            let value_end = value_start + value_index;
+
+            // Non empty and terminated.
+            // `value_end` equals `len` when the value runs to the end of the
+            // input, so the terminator is looked up with a checked `get`
+            // instead of indexing (which would panic).
+            if value_index > 0 && bytes.get(value_end) == Some(&b';') {
+                if let Some(decoded) = decode(
+                    // Only ASCII bytes passed `test`, so this is valid UTF-8.
+                    str::from_utf8(&bytes[value_start..value_end]).unwrap(),
+                    marker,
+                    false,
+                ) {
+                    result.push_str(&value[start..index]);
+                    result.push_str(&decoded);
+                    start = value_end + 1;
+                    index = start;
+                    continue;
+                }
+            }
+        }
+
+        index += 1;
+    }
+
+    result.push_str(&value[start..]);
+
+    result
+}
diff --git a/src/util/constant.rs b/src/util/constant.rs
index e9deac2..65704d0 100644
--- a/src/util/constant.rs
+++ b/src/util/constant.rs
@@ -2433,6 +2433,274 @@ pub const CHARACTER_REFERENCES: [(&str, &str); 2125] = [
("zwnj", "‌"),
];
+// Important: please touch the below lists as few times as possible to keep Git small.
+
+/// List of names and values that form named character references in HTML 4.
+///
+/// This list is normally not used in markdown, but it is used in MDX, because
+/// in JSX attribute values, only the old HTML 4 character references are
+/// supported.
+///
+/// This list is sensitive to casing.
+///
+/// ## References
+///
+/// * [*§ 1.5.2 HTML Character References* in `JSX`](https://facebook.github.io/jsx/#sec-HTMLCharacterReference)
+pub const CHARACTER_REFERENCES_HTML_4: [(&str, &str); 252] = [
+ ("AElig", "Æ"),
+ ("Aacute", "Á"),
+ ("Acirc", "Â"),
+ ("Agrave", "À"),
+ ("Alpha", "Α"),
+ ("Aring", "Å"),
+ ("Atilde", "Ã"),
+ ("Auml", "Ä"),
+ ("Beta", "Β"),
+ ("Ccedil", "Ç"),
+ ("Chi", "Χ"),
+ ("Dagger", "‡"),
+ ("Delta", "Δ"),
+ ("ETH", "Ð"),
+ ("Eacute", "É"),
+ ("Ecirc", "Ê"),
+ ("Egrave", "È"),
+ ("Epsilon", "Ε"),
+ ("Eta", "Η"),
+ ("Euml", "Ë"),
+ ("Gamma", "Γ"),
+ ("Iacute", "Í"),
+ ("Icirc", "Î"),
+ ("Igrave", "Ì"),
+ ("Iota", "Ι"),
+ ("Iuml", "Ï"),
+ ("Kappa", "Κ"),
+ ("Lambda", "Λ"),
+ ("Mu", "Μ"),
+ ("Ntilde", "Ñ"),
+ ("Nu", "Ν"),
+ ("OElig", "Œ"),
+ ("Oacute", "Ó"),
+ ("Ocirc", "Ô"),
+ ("Ograve", "Ò"),
+ ("Omega", "Ω"),
+ ("Omicron", "Ο"),
+ ("Oslash", "Ø"),
+ ("Otilde", "Õ"),
+ ("Ouml", "Ö"),
+ ("Phi", "Φ"),
+ ("Pi", "Π"),
+ ("Prime", "″"),
+ ("Psi", "Ψ"),
+ ("Rho", "Ρ"),
+ ("Scaron", "Š"),
+ ("Sigma", "Σ"),
+ ("THORN", "Þ"),
+ ("Tau", "Τ"),
+ ("Theta", "Θ"),
+ ("Uacute", "Ú"),
+ ("Ucirc", "Û"),
+ ("Ugrave", "Ù"),
+ ("Upsilon", "Υ"),
+ ("Uuml", "Ü"),
+ ("Xi", "Ξ"),
+ ("Yacute", "Ý"),
+ ("Yuml", "Ÿ"),
+ ("Zeta", "Ζ"),
+ ("aacute", "á"),
+ ("acirc", "â"),
+ ("acute", "´"),
+ ("aelig", "æ"),
+ ("agrave", "à"),
+ ("alefsym", "ℵ"),
+ ("alpha", "α"),
+ ("amp", "&"),
+ ("and", "∧"),
+ ("ang", "∠"),
+ ("aring", "å"),
+ ("asymp", "≈"),
+ ("atilde", "ã"),
+ ("auml", "ä"),
+ ("bdquo", "„"),
+ ("beta", "β"),
+ ("brvbar", "¦"),
+ ("bull", "•"),
+ ("cap", "∩"),
+ ("ccedil", "ç"),
+ ("cedil", "¸"),
+ ("cent", "¢"),
+ ("chi", "χ"),
+ ("circ", "ˆ"),
+ ("clubs", "♣"),
+ ("cong", "≅"),
+ ("copy", "©"),
+ ("crarr", "↵"),
+ ("cup", "∪"),
+ ("curren", "¤"),
+ ("dArr", "⇓"),
+ ("dagger", "†"),
+ ("darr", "↓"),
+ ("deg", "°"),
+ ("delta", "δ"),
+ ("diams", "♦"),
+ ("divide", "÷"),
+ ("eacute", "é"),
+ ("ecirc", "ê"),
+ ("egrave", "è"),
+ ("empty", "∅"),
+ ("emsp", " "),
+ ("ensp", " "),
+ ("epsilon", "ε"),
+ ("equiv", "≡"),
+ ("eta", "η"),
+ ("eth", "ð"),
+ ("euml", "ë"),
+ ("euro", "€"),
+ ("exist", "∃"),
+ ("fnof", "ƒ"),
+ ("forall", "∀"),
+ ("frac12", "½"),
+ ("frac14", "¼"),
+ ("frac34", "¾"),
+ ("frasl", "⁄"),
+ ("gamma", "γ"),
+ ("ge", "≥"),
+ ("gt", ">"),
+ ("hArr", "⇔"),
+ ("harr", "↔"),
+ ("hearts", "♥"),
+ ("hellip", "…"),
+ ("iacute", "í"),
+ ("icirc", "î"),
+ ("iexcl", "¡"),
+ ("igrave", "ì"),
+ ("image", "ℑ"),
+ ("infin", "∞"),
+ ("int", "∫"),
+ ("iota", "ι"),
+ ("iquest", "¿"),
+ ("isin", "∈"),
+ ("iuml", "ï"),
+ ("kappa", "κ"),
+ ("lArr", "⇐"),
+ ("lambda", "λ"),
+ ("lang", "〈"),
+ ("laquo", "«"),
+ ("larr", "←"),
+ ("lceil", "⌈"),
+ ("ldquo", "“"),
+ ("le", "≤"),
+ ("lfloor", "⌊"),
+ ("lowast", "∗"),
+ ("loz", "◊"),
+ ("lrm", "‎"),
+ ("lsaquo", "‹"),
+ ("lsquo", "‘"),
+ ("lt", "<"),
+ ("macr", "¯"),
+ ("mdash", "—"),
+ ("micro", "µ"),
+ ("middot", "·"),
+ ("minus", "−"),
+ ("mu", "μ"),
+ ("nabla", "∇"),
+ ("nbsp", " "),
+ ("ndash", "–"),
+ ("ne", "≠"),
+ ("ni", "∋"),
+ ("not", "¬"),
+ ("notin", "∉"),
+ ("nsub", "⊄"),
+ ("ntilde", "ñ"),
+ ("nu", "ν"),
+ ("oacute", "ó"),
+ ("ocirc", "ô"),
+ ("oelig", "œ"),
+ ("ograve", "ò"),
+ ("oline", "‾"),
+ ("omega", "ω"),
+ ("omicron", "ο"),
+ ("oplus", "⊕"),
+ ("or", "∨"),
+ ("ordf", "ª"),
+ ("ordm", "º"),
+ ("oslash", "ø"),
+ ("otilde", "õ"),
+ ("otimes", "⊗"),
+ ("ouml", "ö"),
+ ("para", "¶"),
+ ("part", "∂"),
+ ("permil", "‰"),
+ ("perp", "⊥"),
+ ("phi", "φ"),
+ ("pi", "π"),
+ ("piv", "ϖ"),
+ ("plusmn", "±"),
+ ("pound", "£"),
+ ("prime", "′"),
+ ("prod", "∏"),
+ ("prop", "∝"),
+ ("psi", "ψ"),
+ ("quot", "\""),
+ ("rArr", "⇒"),
+ ("radic", "√"),
+ ("rang", "〉"),
+ ("raquo", "»"),
+ ("rarr", "→"),
+ ("rceil", "⌉"),
+ ("rdquo", "”"),
+ ("real", "ℜ"),
+ ("reg", "®"),
+ ("rfloor", "⌋"),
+ ("rho", "ρ"),
+ ("rlm", "‏"),
+ ("rsaquo", "›"),
+ ("rsquo", "’"),
+ ("sbquo", "‚"),
+ ("scaron", "š"),
+ ("sdot", "⋅"),
+ ("sect", "§"),
+ ("shy", "\u{AD}"),
+ ("sigma", "σ"),
+ ("sigmaf", "ς"),
+ ("sim", "∼"),
+ ("spades", "♠"),
+ ("sub", "⊂"),
+ ("sube", "⊆"),
+ ("sum", "∑"),
+ ("sup", "⊃"),
+ ("sup1", "¹"),
+ ("sup2", "²"),
+ ("sup3", "³"),
+ ("supe", "⊇"),
+ ("szlig", "ß"),
+ ("tau", "τ"),
+ ("there4", "∴"),
+ ("theta", "θ"),
+ ("thetasym", "ϑ"),
+ ("thinsp", " "),
+ ("thorn", "þ"),
+ ("tilde", "˜"),
+ ("times", "×"),
+ ("trade", "™"),
+ ("uArr", "⇑"),
+ ("uacute", "ú"),
+ ("uarr", "↑"),
+ ("ucirc", "û"),
+ ("ugrave", "ù"),
+ ("uml", "¨"),
+ ("upsih", "ϒ"),
+ ("upsilon", "υ"),
+ ("uuml", "ü"),
+ ("weierp", "℘"),
+ ("xi", "ξ"),
+ ("yacute", "ý"),
+ ("yen", "¥"),
+ ("yuml", "ÿ"),
+ ("zeta", "ζ"),
+ ("zwj", "‍"),
+ ("zwnj", "‌"),
+];
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs
deleted file mode 100644
index d4c983a..0000000
--- a/src/util/decode_character_reference.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-//! Decode character references.
-
-use crate::util::constant::CHARACTER_REFERENCES;
-use alloc::string::{String, ToString};
-
-/// Decode named character references.
-///
-/// Turn the name coming from a named character reference (without the `&` or
-/// `;`) into a string.
-/// This looks the given string up at `0` in the tuples of
-/// [`CHARACTER_REFERENCES`][] and then takes the corresponding value from `1`.
-///
-/// The result is `String` instead of `char` because named character references
-/// can expand into multiple characters.
-///
-/// ## Examples
-///
-/// ```rust ignore
-/// use micromark::util::decode_character_reference::decode_named;
-///
-/// assert_eq!(decode_named("amp"), "&");
-/// assert_eq!(decode_named("AElig"), "Æ");
-/// assert_eq!(decode_named("aelig"), "æ");
-/// ```
-///
-/// ## Panics
-///
-/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is
-/// given.
-/// It is expected that figuring out whether a name is allowed is handled in
-/// the parser.
-/// When `micromark` is used, this function never panics.
-///
-/// ## References
-///
-/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
-/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_named(value: &str) -> String {
- let entry = CHARACTER_REFERENCES.iter().find(|d| d.0 == value);
- let tuple = entry.expect("expected valid `name`");
- tuple.1.to_string()
-}
-
-/// Decode numeric character references.
-///
-/// Turn the number (in string form as either hexadecimal or decimal) coming
-/// from a numeric character reference into a string.
-/// The base of the string form must be passed as the `radix` parameter, as
-/// `10` (decimal) or `16` (hexadecimal).
-///
-/// This returns a `String` form of the associated character or a replacement
-/// character for C0 control characters (except for ASCII whitespace), C1
-/// control characters, lone surrogates, noncharacters, and out of range
-/// characters.
-///
-/// ## Examples
-///
-/// ```rust ignore
-/// use micromark::util::decode_character_reference::decode_numeric;
-///
-/// assert_eq!(decode_numeric("123", 10), "{");
-/// assert_eq!(decode_numeric("9", 16), "\t");
-/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
-/// ```
-///
-/// ## Panics
-///
-/// This function panics if a invalid string or an out of bounds valid string
-/// is given.
-/// It is expected that figuring out whether a number is allowed is handled in
-/// the parser.
-/// When `micromark` is used, this function never panics.
-///
-/// ## References
-///
-/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
-/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_numeric(value: &str, radix: u32) -> String {
- if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
- if !matches!(char,
- // C0 except for HT, LF, FF, CR, space
- '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
- // Control character (DEL) of c0, and C1 controls.
- '\u{7F}'..='\u{9F}'
- // Lone surrogates, noncharacters, and out of range are handled by
- // Rust.
- ) {
- return char.to_string();
- }
- }
-
- char::REPLACEMENT_CHARACTER.to_string()
-}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index dcbf1ae..edc7e14 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,8 +1,8 @@
//! Utilities used when processing markdown.
pub mod char;
+pub mod character_reference;
pub mod constant;
-pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
pub mod gfm_tagfilter;
diff --git a/tests/mdx_jsx_text.rs b/tests/mdx_jsx_text.rs
index ea3502f..0a27bb2 100644
--- a/tests/mdx_jsx_text.rs
+++ b/tests/mdx_jsx_text.rs
@@ -251,6 +251,97 @@ fn mdx_jsx_text_core() -> Result<(), String> {
);
assert_eq!(
+ micromark_to_mdast("<a b='&nbsp; &amp; &copy; &AElig; &Dcaron; &frac34; &HilbertSpace; &DifferentialD; &ClockwiseContourIntegral; &ngE;' />.", &mdx)?,
+ Node::Root(Root {
+ children: vec![Node::Paragraph(Paragraph {
+ children: vec![
+ Node::MdxJsxTextElement(MdxJsxTextElement {
+ name: Some("a".to_string()),
+ attributes: vec![
+ AttributeContent::Property(MdxJsxAttribute {
+ name: "b".to_string(),
+ value: Some(AttributeValue::Literal("\u{a0} & © Æ &Dcaron; ¾ &HilbertSpace; &DifferentialD; &ClockwiseContourIntegral; &ngE;".into())),
+ }),
+ ],
+ children: vec![],
+ position: Some(Position::new(1, 1, 0, 1, 120, 119))
+ }),
+ Node::Text(Text {
+ value: ".".to_string(),
+ position: Some(Position::new(1, 120, 119, 1, 121, 120))
+ })
+ ],
+ position: Some(Position::new(1, 1, 0, 1, 121, 120))
+ })],
+ position: Some(Position::new(1, 1, 0, 1, 121, 120))
+ }),
+ "should support character references (HTML 4, named) in JSX attribute values"
+ );
+
+ assert_eq!(
+ micromark_to_mdast(
+ "<a b='&#35; &#1234; &#992; &#0;' c='&#X22; &#XD06; &#xcab;' />.",
+ &mdx
+ )?,
+ Node::Root(Root {
+ children: vec![Node::Paragraph(Paragraph {
+ children: vec![
+ Node::MdxJsxTextElement(MdxJsxTextElement {
+ name: Some("a".to_string()),
+ attributes: vec![
+ AttributeContent::Property(MdxJsxAttribute {
+ name: "b".to_string(),
+ value: Some(AttributeValue::Literal("# Ӓ Ϡ �".into())),
+ }),
+ AttributeContent::Property(MdxJsxAttribute {
+ name: "c".to_string(),
+ value: Some(AttributeValue::Literal("\" ആ ಫ".into())),
+ }),
+ ],
+ children: vec![],
+ position: Some(Position::new(1, 1, 0, 1, 63, 62))
+ }),
+ Node::Text(Text {
+ value: ".".to_string(),
+ position: Some(Position::new(1, 63, 62, 1, 64, 63))
+ })
+ ],
+ position: Some(Position::new(1, 1, 0, 1, 64, 63))
+ })],
+ position: Some(Position::new(1, 1, 0, 1, 64, 63))
+ }),
+ "should support character references (numeric) in JSX attribute values"
+ );
+
+ assert_eq!(
+ micromark_to_mdast("<a b='&nbsp &x; &#; &#x; &#987654321; &#abcdef0; &ThisIsNotDefined; &hi?;' />.", &mdx)?,
+ Node::Root(Root {
+ children: vec![Node::Paragraph(Paragraph {
+ children: vec![
+ Node::MdxJsxTextElement(MdxJsxTextElement {
+ name: Some("a".to_string()),
+ attributes: vec![
+ AttributeContent::Property(MdxJsxAttribute {
+ name: "b".to_string(),
+ value: Some(AttributeValue::Literal("&nbsp &x; &#; &#x; &#987654321; &#abcdef0; &ThisIsNotDefined; &hi?;".into())),
+ })
+ ],
+ children: vec![],
+ position: Some(Position::new(1, 1, 0, 1, 78, 77))
+ }),
+ Node::Text(Text {
+ value: ".".to_string(),
+ position: Some(Position::new(1, 78, 77, 1, 79, 78))
+ })
+ ],
+ position: Some(Position::new(1, 1, 0, 1, 79, 78))
+ })],
+ position: Some(Position::new(1, 1, 0, 1, 79, 78))
+ }),
+ "should not support things that look like character references but aren’t"
+ );
+
+ assert_eq!(
micromark_to_mdast("a </b> c", &mdx)
.err()
.unwrap(),