diff options
Diffstat (limited to '')
-rw-r--r-- | src/util/character_reference.rs | 206 | ||||
-rw-r--r-- | src/util/constant.rs | 268 | ||||
-rw-r--r-- | src/util/decode_character_reference.rs | 93 | ||||
-rw-r--r-- | src/util/mod.rs | 2 |
4 files changed, 475 insertions, 94 deletions
diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs new file mode 100644 index 0000000..75db98b --- /dev/null +++ b/src/util/character_reference.rs @@ -0,0 +1,206 @@ +//! Helpers for character references. + +use crate::util::constant::{ + CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, +}; +use alloc::string::{String, ToString}; +use core::str; + +/// Decode named character references. +/// +/// Turn the name coming from a named character reference (without the `&` or +/// `;`) into a string. +/// This looks the given string up at `0` in the tuples of +/// [`CHARACTER_REFERENCES`][] (or [`CHARACTER_REFERENCES_HTML_4`][]) and then +/// takes the corresponding value from `1`. +/// +/// The result is `String` instead of `char` because named character references +/// can expand into multiple characters. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::character_reference::decode_named; +/// +/// assert_eq!(decode_named("amp", true), Some("&".into())); +/// assert_eq!(decode_named("AElig", true), Some("Æ".into())); +/// assert_eq!(decode_named("aelig", true), Some("æ".into())); +/// ``` +/// +/// ## Returns +/// +/// `Some` with the expansion when `value` is a name in the selected list: +/// [`CHARACTER_REFERENCES`][] if `html5` is `true`, and +/// [`CHARACTER_REFERENCES_HTML_4`][] otherwise. +/// `None` when the name is not in that list. +/// This function does not panic on unknown names. 
+/// +/// ## References +/// +/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_named(value: &str, html5: bool) -> Option<String> { + let mut iter = if html5 { + CHARACTER_REFERENCES.iter() + } else { + CHARACTER_REFERENCES_HTML_4.iter() + }; + iter.find(|d| d.0 == value).map(|d| d.1.into()) +} + +/// Decode numeric character references. +/// +/// Turn the number (in string form as either hexadecimal or decimal) coming +/// from a numeric character reference into a string. +/// The base of the string form must be passed as the `radix` parameter, as +/// `10` (decimal) or `16` (hexadecimal). +/// +/// This returns a `String` form of the associated character or a replacement +/// character for C0 control characters (except for ASCII whitespace), DEL, +/// C1 control characters, lone surrogates, and out of range values. +/// Noncharacters (such as U+FFFE) are valid `char`s and are returned as is. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::character_reference::decode_numeric; +/// +/// assert_eq!(decode_numeric("123", 10), "{"); +/// assert_eq!(decode_numeric("9", 16), "\t"); +/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. +/// ``` +/// +/// ## Panics +/// +/// This function panics if an invalid string or an out of bounds valid string +/// is given. +/// It is expected that figuring out whether a number is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. 
+/// +/// ## References +/// +/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_numeric(value: &str, radix: u32) -> String { + if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { + if !matches!(char, + // C0 except for HT, LF, FF, CR, space + '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | + // Control character (DEL) of c0, and C1 controls. + '\u{7F}'..='\u{9F}' + // Lone surrogates and out of range values are rejected by + // `char::from_u32` (it yields `None`), so they fall through below. + ) { + return char.to_string(); + } + } + + char::REPLACEMENT_CHARACTER.to_string() +} + +/// Decode the value of a character reference, based on its marker. +/// +/// `marker` selects the kind of reference: `b'#'` decodes `value` as a decimal +/// number, `b'x'` decodes it as a hexadecimal number, and `b'&'` looks it up +/// as a name (in the HTML 5 list if `html5` is `true`, in the HTML 4 list +/// otherwise). +/// +/// Numeric references always yield `Some`; named references yield `None` when +/// the name is not known. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`, or if a numeric +/// `value` cannot be parsed in the given radix. +pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> { + match marker { + b'#' => Some(decode_numeric(value, 10)), + b'x' => Some(decode_numeric(value, 16)), + b'&' => decode_named(value, html5), + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Get the maximum size of a value for different kinds of references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. +pub fn value_max(marker: u8) -> usize { + match marker { + b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX, + b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Get a test to check if a byte is allowed as a value for different kinds of +/// references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. 
+pub fn value_test(marker: u8) -> fn(&u8) -> bool { + match marker { + b'&' => u8::is_ascii_alphanumeric, + b'x' => u8::is_ascii_hexdigit, + b'#' => u8::is_ascii_digit, + _ => unreachable!("Unexpected marker `{}`", marker), + } +} + +/// Decode character references in a string. +/// +/// References that are empty, not terminated by `;`, or not known are kept as +/// they are. +/// +/// Note: this currently only supports HTML 4 references, as it’s only used for +/// them. +/// +/// If it’s ever needed to support HTML 5 (which is what normal markdown uses), +/// a boolean parameter can be added here. +pub fn parse(value: &str) -> String { + let bytes = value.as_bytes(); + let mut index = 0; + let len = bytes.len(); + // Grows a bit smaller with each character reference. + let mut result = String::with_capacity(value.len()); + let mut start = 0; + + while index < len { + if bytes[index] == b'&' { + let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' { + if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') { + (b'x', index + 3) + } else { + (b'#', index + 2) + } + } else { + (b'&', index + 1) + }; + + let max = value_max(marker); + let test = value_test(marker); + let mut value_index = 0; + while value_index < max && (value_start + value_index) < len { + if !test(&bytes[value_start + value_index]) { + break; + } + value_index += 1; + } + + let value_end = value_start + value_index; + + // Non empty and terminated. + // `get` is used because the value can run up to the end of the input, + // in which case there is no byte at `value_end` to index. 
+ if value_index > 0 && bytes.get(value_end) == Some(&b';') { + if let Some(decoded) = decode( + str::from_utf8(&bytes[value_start..value_end]).unwrap(), + marker, + false, + ) { + result.push_str(&value[start..index]); + result.push_str(&decoded); + start = value_end + 1; + index = start; + continue; + } + } + } + + index += 1; + } + + result.push_str(&value[start..]); + + result +} diff --git a/src/util/constant.rs b/src/util/constant.rs index e9deac2..65704d0 100644 --- a/src/util/constant.rs +++ b/src/util/constant.rs @@ -2433,6 +2433,274 @@ pub const CHARACTER_REFERENCES: [(&str, &str); 2125] = [ ("zwnj", "‌"), ]; +// Important: please touch the below lists as few times as possible to keep Git small. + +/// List of names and values that form named character reference in HTML 4. +/// +/// This list is normally not used in markdown, but it is used in MDX, because +/// in JSX attribute values, only the old HTML 4 character references are +/// supported. +/// +/// This list is sensitive to casing. 
+/// +/// ## References +/// +/// * [*§ 1.5.2 HTML Character References* in `JSX`](https://facebook.github.io/jsx/#sec-HTMLCharacterReference) +pub const CHARACTER_REFERENCES_HTML_4: [(&str, &str); 252] = [ + ("AElig", "Æ"), + ("Aacute", "Á"), + ("Acirc", "Â"), + ("Agrave", "À"), + ("Alpha", "Α"), + ("Aring", "Å"), + ("Atilde", "Ã"), + ("Auml", "Ä"), + ("Beta", "Β"), + ("Ccedil", "Ç"), + ("Chi", "Χ"), + ("Dagger", "‡"), + ("Delta", "Δ"), + ("ETH", "Ð"), + ("Eacute", "É"), + ("Ecirc", "Ê"), + ("Egrave", "È"), + ("Epsilon", "Ε"), + ("Eta", "Η"), + ("Euml", "Ë"), + ("Gamma", "Γ"), + ("Iacute", "Í"), + ("Icirc", "Î"), + ("Igrave", "Ì"), + ("Iota", "Ι"), + ("Iuml", "Ï"), + ("Kappa", "Κ"), + ("Lambda", "Λ"), + ("Mu", "Μ"), + ("Ntilde", "Ñ"), + ("Nu", "Ν"), + ("OElig", "Œ"), + ("Oacute", "Ó"), + ("Ocirc", "Ô"), + ("Ograve", "Ò"), + ("Omega", "Ω"), + ("Omicron", "Ο"), + ("Oslash", "Ø"), + ("Otilde", "Õ"), + ("Ouml", "Ö"), + ("Phi", "Φ"), + ("Pi", "Π"), + ("Prime", "″"), + ("Psi", "Ψ"), + ("Rho", "Ρ"), + ("Scaron", "Š"), + ("Sigma", "Σ"), + ("THORN", "Þ"), + ("Tau", "Τ"), + ("Theta", "Θ"), + ("Uacute", "Ú"), + ("Ucirc", "Û"), + ("Ugrave", "Ù"), + ("Upsilon", "Υ"), + ("Uuml", "Ü"), + ("Xi", "Ξ"), + ("Yacute", "Ý"), + ("Yuml", "Ÿ"), + ("Zeta", "Ζ"), + ("aacute", "á"), + ("acirc", "â"), + ("acute", "´"), + ("aelig", "æ"), + ("agrave", "à"), + ("alefsym", "ℵ"), + ("alpha", "α"), + ("amp", "&"), + ("and", "∧"), + ("ang", "∠"), + ("aring", "å"), + ("asymp", "≈"), + ("atilde", "ã"), + ("auml", "ä"), + ("bdquo", "„"), + ("beta", "β"), + ("brvbar", "¦"), + ("bull", "•"), + ("cap", "∩"), + ("ccedil", "ç"), + ("cedil", "¸"), + ("cent", "¢"), + ("chi", "χ"), + ("circ", "ˆ"), + ("clubs", "♣"), + ("cong", "≅"), + ("copy", "©"), + ("crarr", "↵"), + ("cup", "∪"), + ("curren", "¤"), + ("dArr", "⇓"), + ("dagger", "†"), + ("darr", "↓"), + ("deg", "°"), + ("delta", "δ"), + ("diams", "♦"), + ("divide", "÷"), + ("eacute", "é"), + ("ecirc", "ê"), + ("egrave", "è"), + ("empty", "∅"), + ("emsp", " 
"), + ("ensp", " "), + ("epsilon", "ε"), + ("equiv", "≡"), + ("eta", "η"), + ("eth", "ð"), + ("euml", "ë"), + ("euro", "€"), + ("exist", "∃"), + ("fnof", "ƒ"), + ("forall", "∀"), + ("frac12", "½"), + ("frac14", "¼"), + ("frac34", "¾"), + ("frasl", "⁄"), + ("gamma", "γ"), + ("ge", "≥"), + ("gt", ">"), + ("hArr", "⇔"), + ("harr", "↔"), + ("hearts", "♥"), + ("hellip", "…"), + ("iacute", "í"), + ("icirc", "î"), + ("iexcl", "¡"), + ("igrave", "ì"), + ("image", "ℑ"), + ("infin", "∞"), + ("int", "∫"), + ("iota", "ι"), + ("iquest", "¿"), + ("isin", "∈"), + ("iuml", "ï"), + ("kappa", "κ"), + ("lArr", "⇐"), + ("lambda", "λ"), + ("lang", "〈"), + ("laquo", "«"), + ("larr", "←"), + ("lceil", "⌈"), + ("ldquo", "“"), + ("le", "≤"), + ("lfloor", "⌊"), + ("lowast", "∗"), + ("loz", "◊"), + ("lrm", ""), + ("lsaquo", "‹"), + ("lsquo", "‘"), + ("lt", "<"), + ("macr", "¯"), + ("mdash", "—"), + ("micro", "µ"), + ("middot", "·"), + ("minus", "−"), + ("mu", "μ"), + ("nabla", "∇"), + ("nbsp", " "), + ("ndash", "–"), + ("ne", "≠"), + ("ni", "∋"), + ("not", "¬"), + ("notin", "∉"), + ("nsub", "⊄"), + ("ntilde", "ñ"), + ("nu", "ν"), + ("oacute", "ó"), + ("ocirc", "ô"), + ("oelig", "œ"), + ("ograve", "ò"), + ("oline", "‾"), + ("omega", "ω"), + ("omicron", "ο"), + ("oplus", "⊕"), + ("or", "∨"), + ("ordf", "ª"), + ("ordm", "º"), + ("oslash", "ø"), + ("otilde", "õ"), + ("otimes", "⊗"), + ("ouml", "ö"), + ("para", "¶"), + ("part", "∂"), + ("permil", "‰"), + ("perp", "⊥"), + ("phi", "φ"), + ("pi", "π"), + ("piv", "ϖ"), + ("plusmn", "±"), + ("pound", "£"), + ("prime", "′"), + ("prod", "∏"), + ("prop", "∝"), + ("psi", "ψ"), + ("quot", "\""), + ("rArr", "⇒"), + ("radic", "√"), + ("rang", "〉"), + ("raquo", "»"), + ("rarr", "→"), + ("rceil", "⌉"), + ("rdquo", "”"), + ("real", "ℜ"), + ("reg", "®"), + ("rfloor", "⌋"), + ("rho", "ρ"), + ("rlm", ""), + ("rsaquo", "›"), + ("rsquo", "’"), + ("sbquo", "‚"), + ("scaron", "š"), + ("sdot", "⋅"), + ("sect", "§"), + ("shy", "\u{AD}"), + ("sigma", "σ"), + ("sigmaf", 
"ς"), + ("sim", "∼"), + ("spades", "♠"), + ("sub", "⊂"), + ("sube", "⊆"), + ("sum", "∑"), + ("sup", "⊃"), + ("sup1", "¹"), + ("sup2", "²"), + ("sup3", "³"), + ("supe", "⊇"), + ("szlig", "ß"), + ("tau", "τ"), + ("there4", "∴"), + ("theta", "θ"), + ("thetasym", "ϑ"), + ("thinsp", " "), + ("thorn", "þ"), + ("tilde", "˜"), + ("times", "×"), + ("trade", "™"), + ("uArr", "⇑"), + ("uacute", "ú"), + ("uarr", "↑"), + ("ucirc", "û"), + ("ugrave", "ù"), + ("uml", "¨"), + ("upsih", "ϒ"), + ("upsilon", "υ"), + ("uuml", "ü"), + ("weierp", "℘"), + ("xi", "ξ"), + ("yacute", "ý"), + ("yen", "¥"), + ("yuml", "ÿ"), + ("zeta", "ζ"), + ("zwj", ""), + ("zwnj", ""), +]; + #[cfg(test)] mod tests { use super::*; diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs deleted file mode 100644 index d4c983a..0000000 --- a/src/util/decode_character_reference.rs +++ /dev/null @@ -1,93 +0,0 @@ -//! Decode character references. - -use crate::util::constant::CHARACTER_REFERENCES; -use alloc::string::{String, ToString}; - -/// Decode named character references. -/// -/// Turn the name coming from a named character reference (without the `&` or -/// `;`) into a string. -/// This looks the given string up at `0` in the tuples of -/// [`CHARACTER_REFERENCES`][] and then takes the corresponding value from `1`. -/// -/// The result is `String` instead of `char` because named character references -/// can expand into multiple characters. -/// -/// ## Examples -/// -/// ```rust ignore -/// use micromark::util::decode_character_reference::decode_named; -/// -/// assert_eq!(decode_named("amp"), "&"); -/// assert_eq!(decode_named("AElig"), "Æ"); -/// assert_eq!(decode_named("aelig"), "æ"); -/// ``` -/// -/// ## Panics -/// -/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is -/// given. -/// It is expected that figuring out whether a name is allowed is handled in -/// the parser. -/// When `micromark` is used, this function never panics. 
-/// -/// ## References -/// -/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) -/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_named(value: &str) -> String { - let entry = CHARACTER_REFERENCES.iter().find(|d| d.0 == value); - let tuple = entry.expect("expected valid `name`"); - tuple.1.to_string() -} - -/// Decode numeric character references. -/// -/// Turn the number (in string form as either hexadecimal or decimal) coming -/// from a numeric character reference into a string. -/// The base of the string form must be passed as the `radix` parameter, as -/// `10` (decimal) or `16` (hexadecimal). -/// -/// This returns a `String` form of the associated character or a replacement -/// character for C0 control characters (except for ASCII whitespace), C1 -/// control characters, lone surrogates, noncharacters, and out of range -/// characters. -/// -/// ## Examples -/// -/// ```rust ignore -/// use micromark::util::decode_character_reference::decode_numeric; -/// -/// assert_eq!(decode_numeric("123", 10), "{"); -/// assert_eq!(decode_numeric("9", 16), "\t"); -/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. -/// ``` -/// -/// ## Panics -/// -/// This function panics if a invalid string or an out of bounds valid string -/// is given. -/// It is expected that figuring out whether a number is allowed is handled in -/// the parser. -/// When `micromark` is used, this function never panics. 
-/// -/// ## References -/// -/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) -/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_numeric(value: &str, radix: u32) -> String { - if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { - if !matches!(char, - // C0 except for HT, LF, FF, CR, space - '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | - // Control character (DEL) of c0, and C1 controls. - '\u{7F}'..='\u{9F}' - // Lone surrogates, noncharacters, and out of range are handled by - // Rust. - ) { - return char.to_string(); - } - } - - char::REPLACEMENT_CHARACTER.to_string() -} diff --git a/src/util/mod.rs b/src/util/mod.rs index dcbf1ae..edc7e14 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,8 +1,8 @@ //! Utilities used when processing markdown. pub mod char; +pub mod character_reference; pub mod constant; -pub mod decode_character_reference; pub mod edit_map; pub mod encode; pub mod gfm_tagfilter; |