From b75d7976cfe8db43783b930c1f4774f2ad4936f5 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 6 Oct 2022 11:43:26 +0200 Subject: Add support for HTML 4 character references in JSX attributes --- src/construct/character_reference.rs | 27 +--- src/to_html.rs | 12 +- src/to_mdast.rs | 20 ++- src/util/character_reference.rs | 206 +++++++++++++++++++++++++ src/util/constant.rs | 268 +++++++++++++++++++++++++++++++++ src/util/decode_character_reference.rs | 93 ------------ src/util/mod.rs | 2 +- tests/mdx_jsx_text.rs | 91 +++++++++++ 8 files changed, 584 insertions(+), 135 deletions(-) create mode 100644 src/util/character_reference.rs delete mode 100644 src/util/decode_character_reference.rs diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 927e3d9..d87050c 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -64,7 +64,7 @@ //! [string]: crate::construct::string //! [text]: crate::construct::text //! [character_escape]: crate::construct::character_reference -//! [decode_numeric]: crate::util::decode_character_reference::decode_numeric +//! [decode_numeric]: crate::util::character_reference::decode_numeric //! [character_references]: crate::util::constant::CHARACTER_REFERENCES //! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state @@ -72,10 +72,7 @@ use crate::event::Name; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use crate::util::{ - constant::{ - CHARACTER_REFERENCES, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, - CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, - }, + character_reference::{decode_named, value_max, value_test}, slice::Slice, }; @@ -173,9 +170,8 @@ pub fn value(tokenizer: &mut Tokenizer) -> State { tokenizer.point.index - tokenizer.tokenize_state.size, tokenizer.point.index, ); - let name = slice.as_str(); - if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) { + if decode_named(slice.as_str(), true).is_none() { tokenizer.tokenize_state.marker = 0; tokenizer.tokenize_state.size = 0; return State::Nok; @@ -192,21 +188,10 @@ pub fn value(tokenizer: &mut Tokenizer) -> State { return State::Ok; } - let max = match tokenizer.tokenize_state.marker { - b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX, - b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, - b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, - _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker), - }; - let test = match tokenizer.tokenize_state.marker { - b'&' => u8::is_ascii_alphanumeric, - b'x' => u8::is_ascii_hexdigit, - b'#' => u8::is_ascii_digit, - _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker), - }; - if let Some(byte) = tokenizer.current { - if tokenizer.tokenize_state.size < max && test(&byte) { + if tokenizer.tokenize_state.size < value_max(tokenizer.tokenize_state.marker) + && value_test(tokenizer.tokenize_state.marker)(&byte) + { tokenizer.tokenize_state.size += 1; tokenizer.consume(); return State::Next(StateName::CharacterReferenceValue); diff --git a/src/to_html.rs b/src/to_html.rs index d7d054d..814f7cf 100644 --- a/src/to_html.rs +++ b/src/to_html.rs @@ -2,8 +2,8 @@ use crate::event::{Event, Kind, Name}; use crate::mdast::AlignKind; use crate::util::{ + character_reference::decode as decode_character_reference, constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}, - decode_character_reference::{decode_named, decode_numeric}, encode::encode, gfm_tagfilter::gfm_tagfilter, infer::{gfm_table_align, list_loose}, @@ -783,14 +783,8 @@ fn on_exit_character_reference_value(context: &mut CompileContext) { context.bytes, &Position::from_exit_event(context.events, context.index), ); - let value = slice.as_str(); - - let value = match marker { - b'#' => decode_numeric(value, 10), - b'x' => decode_numeric(value, 16), - b'&' => decode_named(value), - _ => panic!("impossible"), - }; + let value = decode_character_reference(slice.as_str(), marker, true) + .expect("expected to parse only valid named references"); context.push(&encode(&value, context.encode_html)); } diff --git a/src/to_mdast.rs b/src/to_mdast.rs index c47eb22..4db76e6 100644 --- a/src/to_mdast.rs +++ b/src/to_mdast.rs @@ -10,7 +10,9 @@ use crate::mdast::{ }; use crate::unist::{Point, Position}; use crate::util::{ - decode_character_reference::{decode_named, decode_numeric}, + character_reference::{ + decode as decode_character_reference, parse as parse_character_reference, + }, infer::{gfm_table_align, list_item_loose, list_loose}, normalize_identifier::normalize_identifier, slice::{Position as SlicePosition, Slice}, @@ -892,14 +894,9 @@ fn on_exit_character_reference_value(context: &mut CompileContext) { context.bytes, &SlicePosition::from_exit_event(context.events, context.index), ); - let value = slice.as_str(); - - let value = match context.character_reference_marker { - b'#' => decode_numeric(value, 10), - b'x' => decode_numeric(value, 16), - b'&' => decode_named(value), - _ => panic!("impossible"), - }; + let value = + decode_character_reference(slice.as_str(), context.character_reference_marker, true) + .expect("expected to parse only valid named references"); if let Node::Text(node) = context.tail_mut() { node.value.push_str(value.as_str()); @@ -1558,8 +1555,9 @@ fn on_exit_mdx_jsx_tag_attribute_value_literal(context: &mut CompileContext) { .attributes .last_mut() { - // To do: character references. - node.value = Some(AttributeValue::Literal(value.to_string())); + node.value = Some(AttributeValue::Literal(parse_character_reference( + &value.to_string(), + ))); } else { unreachable!("expected property") } diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs new file mode 100644 index 0000000..75db98b --- /dev/null +++ b/src/util/character_reference.rs @@ -0,0 +1,206 @@ +//! Helpers for character references. + +use crate::util::constant::{ + CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, +}; +use alloc::string::{String, ToString}; +use core::str; + +/// Decode named character references. +/// +/// Turn the name coming from a named character reference (without the `&` or +/// `;`) into a string. +/// This looks the given string up at `0` in the tuples of +/// [`CHARACTER_REFERENCES`][] (or [`CHARACTER_REFERENCES_HTML_4`][]) and then +/// takes the corresponding value from `1`. +/// +/// The result is `String` instead of `char` because named character references +/// can expand into multiple characters. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::decode_character_reference::decode_named; +/// +/// assert_eq!(decode_named("amp", true), "&"); +/// assert_eq!(decode_named("AElig", true), "Æ"); +/// assert_eq!(decode_named("aelig", true), "æ"); +/// ``` +/// +/// ## Panics +/// +/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is +/// given. +/// It is expected that figuring out whether a name is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. +/// +/// ## References +/// +/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_named(value: &str, html5: bool) -> Option { + let mut iter = if html5 { + CHARACTER_REFERENCES.iter() + } else { + CHARACTER_REFERENCES_HTML_4.iter() + }; + iter.find(|d| d.0 == value).map(|d| d.1.into()) +} + +/// Decode numeric character references. +/// +/// Turn the number (in string form as either hexadecimal or decimal) coming +/// from a numeric character reference into a string. +/// The base of the string form must be passed as the `radix` parameter, as +/// `10` (decimal) or `16` (hexadecimal). +/// +/// This returns a `String` form of the associated character or a replacement +/// character for C0 control characters (except for ASCII whitespace), C1 +/// control characters, lone surrogates, noncharacters, and out of range +/// characters. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::decode_character_reference::decode_numeric; +/// +/// assert_eq!(decode_numeric("123", 10), "{"); +/// assert_eq!(decode_numeric("9", 16), "\t"); +/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. +/// ``` +/// +/// ## Panics +/// +/// This function panics if a invalid string or an out of bounds valid string +/// is given. +/// It is expected that figuring out whether a number is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. +/// +/// ## References +/// +/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_numeric(value: &str, radix: u32) -> String { + if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { + if !matches!(char, + // C0 except for HT, LF, FF, CR, space + '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | + // Control character (DEL) of c0, and C1 controls. + '\u{7F}'..='\u{9F}' + // Lone surrogates, noncharacters, and out of range are handled by + // Rust. + ) { + return char.to_string(); + } + } + + char::REPLACEMENT_CHARACTER.to_string() +} + +pub fn decode(value: &str, marker: u8, html5: bool) -> Option { + match marker { + b'#' => Some(decode_numeric(value, 10)), + b'x' => Some(decode_numeric(value, 16)), + b'&' => decode_named(value, html5), + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Get the maximum size of a value for different kinds of references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. +pub fn value_max(marker: u8) -> usize { + match marker { + b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX, + b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Get a test to check if a byte is allowed as a value for different kinds of +/// references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. +pub fn value_test(marker: u8) -> fn(&u8) -> bool { + match marker { + b'&' => u8::is_ascii_alphanumeric, + b'x' => u8::is_ascii_hexdigit, + b'#' => u8::is_ascii_digit, + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Decode character references in a string. +/// +/// Note: this currently only supports HTML 4 references, as it’s only used for +/// them. +/// +/// If it’s ever needed to support HTML 5 (which is what normal markdown uses), +/// a boolean parameter can be added here. +pub fn parse(value: &str) -> String { + let bytes = value.as_bytes(); + let mut index = 0; + let len = bytes.len(); + // Grows a bit smaller with each character reference. + let mut result = String::with_capacity(value.len()); + let mut start = 0; + + while index < len { + if bytes[index] == b'&' { + let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' { + if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') { + (b'x', index + 3) + } else { + (b'#', index + 2) + } + } else { + (b'&', index + 1) + }; + + let max = value_max(marker); + let test = value_test(marker); + let mut value_index = 0; + while value_index < max && (value_start + value_index) < len { + if !test(&bytes[value_start + value_index]) { + break; + } + value_index += 1; + } + + let value_end = value_start + value_index; + + // Non empty and terminated. + if value_index > 0 && bytes[value_end] == b';' { + if let Some(decoded) = decode( + str::from_utf8(&bytes[value_start..value_end]).unwrap(), + marker, + false, + ) { + result.push_str(&value[start..index]); + result.push_str(&decoded); + start = value_end + 1; + index = start; + continue; + } + } + } + + index += 1; + } + + result.push_str(&value[start..]); + + result +} diff --git a/src/util/constant.rs b/src/util/constant.rs index e9deac2..65704d0 100644 --- a/src/util/constant.rs +++ b/src/util/constant.rs @@ -2433,6 +2433,274 @@ pub const CHARACTER_REFERENCES: [(&str, &str); 2125] = [ ("zwnj", "‌"), ]; +// Important: please touch the below lists as few times as possible to keep Git small. + +/// List of names and values that form named character reference in HTML 4. +/// +/// This list is normally not used in markdown, but it is used in MDX, because +/// in JSX attribute values, only the old HTML 4 character references are +/// supported. +/// +/// This list is sensitive to casing. +/// +/// ## References +/// +/// * [*§ 1.5.2 HTML Character References* in `JSX`](https://facebook.github.io/jsx/#sec-HTMLCharacterReference) +pub const CHARACTER_REFERENCES_HTML_4: [(&str, &str); 252] = [ + ("AElig", "Æ"), + ("Aacute", "Á"), + ("Acirc", "Â"), + ("Agrave", "À"), + ("Alpha", "Α"), + ("Aring", "Å"), + ("Atilde", "Ã"), + ("Auml", "Ä"), + ("Beta", "Β"), + ("Ccedil", "Ç"), + ("Chi", "Χ"), + ("Dagger", "‡"), + ("Delta", "Δ"), + ("ETH", "Ð"), + ("Eacute", "É"), + ("Ecirc", "Ê"), + ("Egrave", "È"), + ("Epsilon", "Ε"), + ("Eta", "Η"), + ("Euml", "Ë"), + ("Gamma", "Γ"), + ("Iacute", "Í"), + ("Icirc", "Î"), + ("Igrave", "Ì"), + ("Iota", "Ι"), + ("Iuml", "Ï"), + ("Kappa", "Κ"), + ("Lambda", "Λ"), + ("Mu", "Μ"), + ("Ntilde", "Ñ"), + ("Nu", "Ν"), + ("OElig", "Œ"), + ("Oacute", "Ó"), + ("Ocirc", "Ô"), + ("Ograve", "Ò"), + ("Omega", "Ω"), + ("Omicron", "Ο"), + ("Oslash", "Ø"), + ("Otilde", "Õ"), + ("Ouml", "Ö"), + ("Phi", "Φ"), + ("Pi", "Π"), + ("Prime", "″"), + ("Psi", "Ψ"), + ("Rho", "Ρ"), + ("Scaron", "Š"), + ("Sigma", "Σ"), + ("THORN", "Þ"), + ("Tau", "Τ"), + ("Theta", "Θ"), + ("Uacute", "Ú"), + ("Ucirc", "Û"), + ("Ugrave", "Ù"), + ("Upsilon", "Υ"), + ("Uuml", "Ü"), + ("Xi", "Ξ"), + ("Yacute", "Ý"), + ("Yuml", "Ÿ"), + ("Zeta", "Ζ"), + ("aacute", "á"), + ("acirc", "â"), + ("acute", "´"), + ("aelig", "æ"), + ("agrave", "à"), + ("alefsym", "ℵ"), + ("alpha", "α"), + ("amp", "&"), + ("and", "∧"), + ("ang", "∠"), + ("aring", "å"), + ("asymp", "≈"), + ("atilde", "ã"), + ("auml", "ä"), + ("bdquo", "„"), + ("beta", "β"), + ("brvbar", "¦"), + ("bull", "•"), + ("cap", "∩"), + ("ccedil", "ç"), + ("cedil", "¸"), + ("cent", "¢"), + ("chi", "χ"), + ("circ", "ˆ"), + ("clubs", "♣"), + ("cong", "≅"), + ("copy", "©"), + ("crarr", "↵"), + ("cup", "∪"), + ("curren", "¤"), + ("dArr", "⇓"), + ("dagger", "†"), + ("darr", "↓"), + ("deg", "°"), + ("delta", "δ"), + ("diams", "♦"), + ("divide", "÷"), + ("eacute", "é"), + ("ecirc", "ê"), + ("egrave", "è"), + ("empty", "∅"), + ("emsp", " "), + ("ensp", " "), + ("epsilon", "ε"), + ("equiv", "≡"), + ("eta", "η"), + ("eth", "ð"), + ("euml", "ë"), + ("euro", "€"), + ("exist", "∃"), + ("fnof", "ƒ"), + ("forall", "∀"), + ("frac12", "½"), + ("frac14", "¼"), + ("frac34", "¾"), + ("frasl", "⁄"), + ("gamma", "γ"), + ("ge", "≥"), + ("gt", ">"), + ("hArr", "⇔"), + ("harr", "↔"), + ("hearts", "♥"), + ("hellip", "…"), + ("iacute", "í"), + ("icirc", "î"), + ("iexcl", "¡"), + ("igrave", "ì"), + ("image", "ℑ"), + ("infin", "∞"), + ("int", "∫"), + ("iota", "ι"), + ("iquest", "¿"), + ("isin", "∈"), + ("iuml", "ï"), + ("kappa", "κ"), + ("lArr", "⇐"), + ("lambda", "λ"), + ("lang", "〈"), + ("laquo", "«"), + ("larr", "←"), + ("lceil", "⌈"), + ("ldquo", "“"), + ("le", "≤"), + ("lfloor", "⌊"), + ("lowast", "∗"), + ("loz", "◊"), + ("lrm", "‎"), + ("lsaquo", "‹"), + ("lsquo", "‘"), + ("lt", "<"), + ("macr", "¯"), + ("mdash", "—"), + ("micro", "µ"), + ("middot", "·"), + ("minus", "−"), + ("mu", "μ"), + ("nabla", "∇"), + ("nbsp", " "), + ("ndash", "–"), + ("ne", "≠"), + ("ni", "∋"), + ("not", "¬"), + ("notin", "∉"), + ("nsub", "⊄"), + ("ntilde", "ñ"), + ("nu", "ν"), + ("oacute", "ó"), + ("ocirc", "ô"), + ("oelig", "œ"), + ("ograve", "ò"), + ("oline", "‾"), + ("omega", "ω"), + ("omicron", "ο"), + ("oplus", "⊕"), + ("or", "∨"), + ("ordf", "ª"), + ("ordm", "º"), + ("oslash", "ø"), + ("otilde", "õ"), + ("otimes", "⊗"), + ("ouml", "ö"), + ("para", "¶"), + ("part", "∂"), + ("permil", "‰"), + ("perp", "⊥"), + ("phi", "φ"), + ("pi", "π"), + ("piv", "ϖ"), + ("plusmn", "±"), + ("pound", "£"), + ("prime", "′"), + ("prod", "∏"), + ("prop", "∝"), + ("psi", "ψ"), + ("quot", "\""), + ("rArr", "⇒"), + ("radic", "√"), + ("rang", "〉"), + ("raquo", "»"), + ("rarr", "→"), + ("rceil", "⌉"), + ("rdquo", "”"), + ("real", "ℜ"), + ("reg", "®"), + ("rfloor", "⌋"), + ("rho", "ρ"), + ("rlm", "‏"), + ("rsaquo", "›"), + ("rsquo", "’"), + ("sbquo", "‚"), + ("scaron", "š"), + ("sdot", "⋅"), + ("sect", "§"), + ("shy", "\u{AD}"), + ("sigma", "σ"), + ("sigmaf", "ς"), + ("sim", "∼"), + ("spades", "♠"), + ("sub", "⊂"), + ("sube", "⊆"), + ("sum", "∑"), + ("sup", "⊃"), + ("sup1", "¹"), + ("sup2", "²"), + ("sup3", "³"), + ("supe", "⊇"), + ("szlig", "ß"), + ("tau", "τ"), + ("there4", "∴"), + ("theta", "θ"), + ("thetasym", "ϑ"), + ("thinsp", " "), + ("thorn", "þ"), + ("tilde", "˜"), + ("times", "×"), + ("trade", "™"), + ("uArr", "⇑"), + ("uacute", "ú"), + ("uarr", "↑"), + ("ucirc", "û"), + ("ugrave", "ù"), + ("uml", "¨"), + ("upsih", "ϒ"), + ("upsilon", "υ"), + ("uuml", "ü"), + ("weierp", "℘"), + ("xi", "ξ"), + ("yacute", "ý"), + ("yen", "¥"), + ("yuml", "ÿ"), + ("zeta", "ζ"), + ("zwj", "‍"), + ("zwnj", "‌"), +]; + #[cfg(test)] mod tests { use super::*; diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs deleted file mode 100644 index d4c983a..0000000 --- a/src/util/decode_character_reference.rs +++ /dev/null @@ -1,93 +0,0 @@ -//! Decode character references. - -use crate::util::constant::CHARACTER_REFERENCES; -use alloc::string::{String, ToString}; - -/// Decode named character references. -/// -/// Turn the name coming from a named character reference (without the `&` or -/// `;`) into a string. -/// This looks the given string up at `0` in the tuples of -/// [`CHARACTER_REFERENCES`][] and then takes the corresponding value from `1`. -/// -/// The result is `String` instead of `char` because named character references -/// can expand into multiple characters. -/// -/// ## Examples -/// -/// ```rust ignore -/// use micromark::util::decode_character_reference::decode_named; -/// -/// assert_eq!(decode_named("amp"), "&"); -/// assert_eq!(decode_named("AElig"), "Æ"); -/// assert_eq!(decode_named("aelig"), "æ"); -/// ``` -/// -/// ## Panics -/// -/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is -/// given. -/// It is expected that figuring out whether a name is allowed is handled in -/// the parser. -/// When `micromark` is used, this function never panics. -/// -/// ## References -/// -/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) -/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_named(value: &str) -> String { - let entry = CHARACTER_REFERENCES.iter().find(|d| d.0 == value); - let tuple = entry.expect("expected valid `name`"); - tuple.1.to_string() -} - -/// Decode numeric character references. -/// -/// Turn the number (in string form as either hexadecimal or decimal) coming -/// from a numeric character reference into a string. -/// The base of the string form must be passed as the `radix` parameter, as -/// `10` (decimal) or `16` (hexadecimal). -/// -/// This returns a `String` form of the associated character or a replacement -/// character for C0 control characters (except for ASCII whitespace), C1 -/// control characters, lone surrogates, noncharacters, and out of range -/// characters. -/// -/// ## Examples -/// -/// ```rust ignore -/// use micromark::util::decode_character_reference::decode_numeric; -/// -/// assert_eq!(decode_numeric("123", 10), "{"); -/// assert_eq!(decode_numeric("9", 16), "\t"); -/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. -/// ``` -/// -/// ## Panics -/// -/// This function panics if a invalid string or an out of bounds valid string -/// is given. -/// It is expected that figuring out whether a number is allowed is handled in -/// the parser. -/// When `micromark` is used, this function never panics. -/// -/// ## References -/// -/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) -/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_numeric(value: &str, radix: u32) -> String { - if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { - if !matches!(char, - // C0 except for HT, LF, FF, CR, space - '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | - // Control character (DEL) of c0, and C1 controls. - '\u{7F}'..='\u{9F}' - // Lone surrogates, noncharacters, and out of range are handled by - // Rust. - ) { - return char.to_string(); - } - } - - char::REPLACEMENT_CHARACTER.to_string() -} diff --git a/src/util/mod.rs b/src/util/mod.rs index dcbf1ae..edc7e14 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,8 +1,8 @@ //! Utilities used when processing markdown. pub mod char; +pub mod character_reference; pub mod constant; -pub mod decode_character_reference; pub mod edit_map; pub mod encode; pub mod gfm_tagfilter; diff --git a/tests/mdx_jsx_text.rs b/tests/mdx_jsx_text.rs index ea3502f..0a27bb2 100644 --- a/tests/mdx_jsx_text.rs +++ b/tests/mdx_jsx_text.rs @@ -250,6 +250,97 @@ fn mdx_jsx_text_core() -> Result<(), String> { "should support mdx jsx (text) as `MdxJsxTextElement`s in mdast (attribute values)" ); + assert_eq!( + micromark_to_mdast(".", &mdx)?, + Node::Root(Root { + children: vec![Node::Paragraph(Paragraph { + children: vec![ + Node::MdxJsxTextElement(MdxJsxTextElement { + name: Some("a".to_string()), + attributes: vec![ + AttributeContent::Property(MdxJsxAttribute { + name: "b".to_string(), + value: Some(AttributeValue::Literal("\u{a0} & © Æ Ď ¾ ℋ ⅆ ∲ ≧̸".into())), + }), + ], + children: vec![], + position: Some(Position::new(1, 1, 0, 1, 120, 119)) + }), + Node::Text(Text { + value: ".".to_string(), + position: Some(Position::new(1, 120, 119, 1, 121, 120)) + }) + ], + position: Some(Position::new(1, 1, 0, 1, 121, 120)) + })], + position: Some(Position::new(1, 1, 0, 1, 121, 120)) + }), + "should support character references (HTML 4, named) in JSX attribute values" + ); + + assert_eq!( + micromark_to_mdast( + ".", + &mdx + )?, + Node::Root(Root { + children: vec![Node::Paragraph(Paragraph { + children: vec![ + Node::MdxJsxTextElement(MdxJsxTextElement { + name: Some("a".to_string()), + attributes: vec![ + AttributeContent::Property(MdxJsxAttribute { + name: "b".to_string(), + value: Some(AttributeValue::Literal("# Ӓ Ϡ �".into())), + }), + AttributeContent::Property(MdxJsxAttribute { + name: "c".to_string(), + value: Some(AttributeValue::Literal("\" ആ ಫ".into())), + }), + ], + children: vec![], + position: Some(Position::new(1, 1, 0, 1, 63, 62)) + }), + Node::Text(Text { + value: ".".to_string(), + position: Some(Position::new(1, 63, 62, 1, 64, 63)) + }) + ], + position: Some(Position::new(1, 1, 0, 1, 64, 63)) + })], + position: Some(Position::new(1, 1, 0, 1, 64, 63)) + }), + "should support character references (numeric) in JSX attribute values" + ); + + assert_eq!( + micromark_to_mdast(".", &mdx)?, + Node::Root(Root { + children: vec![Node::Paragraph(Paragraph { + children: vec![ + Node::MdxJsxTextElement(MdxJsxTextElement { + name: Some("a".to_string()), + attributes: vec![ + AttributeContent::Property(MdxJsxAttribute { + name: "b".to_string(), + value: Some(AttributeValue::Literal("  &x; &#; &#x; � &#abcdef0; &ThisIsNotDefined; &hi?;".into())), + }) + ], + children: vec![], + position: Some(Position::new(1, 1, 0, 1, 78, 77)) + }), + Node::Text(Text { + value: ".".to_string(), + position: Some(Position::new(1, 78, 77, 1, 79, 78)) + }) + ], + position: Some(Position::new(1, 1, 0, 1, 79, 78)) + })], + position: Some(Position::new(1, 1, 0, 1, 79, 78)) + }), + "should not support things that look like character references but aren’t" + ); + assert_eq!( micromark_to_mdast("a c", &mdx) .err() -- cgit