aboutsummaryrefslogtreecommitdiffstats
path: root/src/util
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/util/character_reference.rs206
-rw-r--r--src/util/constant.rs268
-rw-r--r--src/util/decode_character_reference.rs93
-rw-r--r--src/util/mod.rs2
4 files changed, 475 insertions, 94 deletions
diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs
new file mode 100644
index 0000000..75db98b
--- /dev/null
+++ b/src/util/character_reference.rs
@@ -0,0 +1,206 @@
+//! Helpers for character references.
+
+use crate::util::constant::{
+ CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+ CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+};
+use alloc::string::{String, ToString};
+use core::str;
+
+/// Decode named character references.
+///
+/// Turn the name coming from a named character reference (without the `&` or
+/// `;`) into a string.
+/// This looks the given string up at `0` in the tuples of
+/// [`CHARACTER_REFERENCES`][] (when `html5` is `true`) or
+/// [`CHARACTER_REFERENCES_HTML_4`][] (when `html5` is `false`) and then takes
+/// the corresponding value from `1`.
+/// Names are compared exactly, so the lookup is sensitive to casing.
+///
+/// The result is `String` instead of `char` because named character references
+/// can expand into multiple characters.
+///
+/// Returns `None` when the name is not in the selected list; this function
+/// does not panic on unknown names.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_named;
+///
+/// assert_eq!(decode_named("amp", true), Some("&".into()));
+/// assert_eq!(decode_named("AElig", true), Some("Æ".into()));
+/// assert_eq!(decode_named("aelig", false), Some("æ".into()));
+/// assert_eq!(decode_named("nope", false), None);
+/// ```
+///
+/// ## References
+///
+/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_named(value: &str, html5: bool) -> Option<String> {
+ // Pick the list to search: HTML 5 names or the smaller HTML 4 set.
+ let mut iter = if html5 {
+ CHARACTER_REFERENCES.iter()
+ } else {
+ CHARACTER_REFERENCES_HTML_4.iter()
+ };
+ // First exact match on the name wins; its value is copied into a `String`.
+ iter.find(|d| d.0 == value).map(|d| d.1.into())
+}
+
+/// Decode numeric character references.
+///
+/// Turn the number (in string form as either hexadecimal or decimal) coming
+/// from a numeric character reference into a string.
+/// The base of the string form must be passed as the `radix` parameter, as
+/// `10` (decimal) or `16` (hexadecimal).
+///
+/// This returns a `String` form of the associated character or a replacement
+/// character for C0 control characters (except for HT, LF, FF, and CR), DEL,
+/// C1 control characters, lone surrogates, and out of range values.
+///
+/// NOTE(review): noncharacters (such as U+FFFE) are valid Rust `char`s and
+/// are not matched by the filter below, so they appear to be returned as-is
+/// rather than replaced; confirm whether that is intended.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::character_reference::decode_numeric;
+///
+/// assert_eq!(decode_numeric("123", 10), "{");
+/// assert_eq!(decode_numeric("9", 16), "\t");
+/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
+/// ```
+///
+/// ## Panics
+///
+/// This function panics if an invalid string or an out of bounds valid string
+/// is given (the result of `u32::from_str_radix` is unwrapped).
+/// It is expected that figuring out whether a number is allowed is handled in
+/// the parser.
+/// When `micromark` is used, this function never panics.
+///
+/// ## References
+///
+/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_numeric(value: &str, radix: u32) -> String {
+ if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
+ if !matches!(char,
+ // C0 except for HT, LF, FF, CR, space
+ '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
+ // Control character (DEL) of C0, and C1 controls.
+ '\u{7F}'..='\u{9F}'
+ // Lone surrogates and out of range values never get here:
+ // `char::from_u32` yields `None` for them.
+ // NOTE(review): noncharacters are valid `char`s and are not
+ // filtered by this pattern; confirm that is intended.
+ ) {
+ return char.to_string();
+ }
+ }
+
+ // Disallowed or invalid code point: use U+FFFD instead.
+ char::REPLACEMENT_CHARACTER.to_string()
+}
+
+/// Decode a character reference of any kind.
+///
+/// `value` is the stuff after the markers, before the `;`.
+/// `marker` selects how `value` is decoded:
+///
+/// * `b'#'`: decimal numeric reference, through [`decode_numeric`][]
+/// * `b'x'`: hexadecimal numeric reference, through [`decode_numeric`][]
+/// * `b'&'`: named reference, through [`decode_named`][], where `html5`
+///   picks the list of names
+///
+/// Numeric references always yield `Some`; named references yield `None`
+/// when the name is not known.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`, and whenever
+/// [`decode_numeric`][] panics on the given `value`.
+pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> {
+ match marker {
+ b'#' => Some(decode_numeric(value, 10)),
+ b'x' => Some(decode_numeric(value, 16)),
+ b'&' => decode_named(value, html5),
+ _ => unreachable!("Unexpected marker `{}`", marker),
+ }
+}
+
+/// Get the maximum size of a value for different kinds of references.
+///
+/// The value is the stuff after the markers, before the `;`.
+///
+/// * `b'&'` (named): `CHARACTER_REFERENCE_NAMED_SIZE_MAX`
+/// * `b'x'` (hexadecimal): `CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX`
+/// * `b'#'` (decimal): `CHARACTER_REFERENCE_DECIMAL_SIZE_MAX`
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_max(marker: u8) -> usize {
+ match marker {
+ b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
+ b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
+ b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
+ _ => unreachable!("Unexpected marker `{}`", marker),
+ }
+}
+
+/// Get a test to check if a byte is allowed as a value for different kinds of
+/// references.
+///
+/// The value is the stuff after the markers, before the `;`.
+///
+/// * `b'&'` (named): ASCII letters and digits
+/// * `b'x'` (hexadecimal): ASCII hexadecimal digits
+/// * `b'#'` (decimal): ASCII digits
+///
+/// The test is returned as a plain function pointer, so it can be called
+/// repeatedly without matching on `marker` again.
+///
+/// ## Panics
+///
+/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`.
+pub fn value_test(marker: u8) -> fn(&u8) -> bool {
+ match marker {
+ b'&' => u8::is_ascii_alphanumeric,
+ b'x' => u8::is_ascii_hexdigit,
+ b'#' => u8::is_ascii_digit,
+ _ => unreachable!("Unexpected marker `{}`", marker),
+ }
+}
+
+/// Decode character references in a string.
+///
+/// Scans `value` for `&name;`, `&#123;`, and `&#x7B;` (or `&#X7B;`) sequences
+/// and replaces each one that decodes with the resulting text.
+/// Everything else is copied through unchanged, including references that
+/// are empty, not terminated by `;`, or not known.
+///
+/// Note: this currently only supports HTML 4 references, as it’s only used for
+/// them.
+///
+/// If it’s ever needed to support HTML 5 (which is what normal markdown uses),
+/// a boolean parameter can be added here.
+pub fn parse(value: &str) -> String {
+    let bytes = value.as_bytes();
+    let mut index = 0;
+    let len = bytes.len();
+    // Grows a bit smaller with each character reference.
+    let mut result = String::with_capacity(value.len());
+    // Start of the run of plain text that is not yet copied into `result`.
+    let mut start = 0;
+
+    while index < len {
+        if bytes[index] == b'&' {
+            // Figure out the kind of reference and where its value starts.
+            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' {
+                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') {
+                    (b'x', index + 3)
+                } else {
+                    (b'#', index + 2)
+                }
+            } else {
+                (b'&', index + 1)
+            };
+
+            // Take value bytes allowed for this kind, up to its maximum size.
+            let max = value_max(marker);
+            let test = value_test(marker);
+            let mut value_index = 0;
+            while value_index < max && (value_start + value_index) < len {
+                if !test(&bytes[value_start + value_index]) {
+                    break;
+                }
+                value_index += 1;
+            }
+
+            let value_end = value_start + value_index;
+
+            // Non empty and terminated.
+            // `value_end` equals `len` when the input stops right after the
+            // value (such as `&amp`), so the terminator is looked up with
+            // `get`: indexing there would panic.
+            if value_index > 0 && bytes.get(value_end) == Some(&b';') {
+                if let Some(decoded) = decode(
+                    // Every byte passed `test`, so this slice is ASCII.
+                    str::from_utf8(&bytes[value_start..value_end]).unwrap(),
+                    marker,
+                    false,
+                ) {
+                    // Flush the plain text before the reference, then the
+                    // decoded text, and continue after the `;`.
+                    result.push_str(&value[start..index]);
+                    result.push_str(&decoded);
+                    start = value_end + 1;
+                    index = start;
+                    continue;
+                }
+            }
+        }
+
+        index += 1;
+    }
+
+    // Whatever is left after the last decoded reference.
+    result.push_str(&value[start..]);
+
+    result
+}
diff --git a/src/util/constant.rs b/src/util/constant.rs
index e9deac2..65704d0 100644
--- a/src/util/constant.rs
+++ b/src/util/constant.rs
@@ -2433,6 +2433,274 @@ pub const CHARACTER_REFERENCES: [(&str, &str); 2125] = [
("zwnj", "‌"),
];
+// Important: please touch the below lists as few times as possible to keep Git small.
+
+/// List of names and values that form named character references in HTML 4.
+///
+/// This list is normally not used in markdown, but it is used in MDX, because
+/// in JSX attribute values, only the old HTML 4 character references are
+/// supported.
+///
+/// This list is sensitive to casing.
+///
+/// ## References
+///
+/// * [*§ 1.5.2 HTML Character References* in `JSX`](https://facebook.github.io/jsx/#sec-HTMLCharacterReference)
+pub const CHARACTER_REFERENCES_HTML_4: [(&str, &str); 252] = [
+ ("AElig", "Æ"),
+ ("Aacute", "Á"),
+ ("Acirc", "Â"),
+ ("Agrave", "À"),
+ ("Alpha", "Α"),
+ ("Aring", "Å"),
+ ("Atilde", "Ã"),
+ ("Auml", "Ä"),
+ ("Beta", "Β"),
+ ("Ccedil", "Ç"),
+ ("Chi", "Χ"),
+ ("Dagger", "‡"),
+ ("Delta", "Δ"),
+ ("ETH", "Ð"),
+ ("Eacute", "É"),
+ ("Ecirc", "Ê"),
+ ("Egrave", "È"),
+ ("Epsilon", "Ε"),
+ ("Eta", "Η"),
+ ("Euml", "Ë"),
+ ("Gamma", "Γ"),
+ ("Iacute", "Í"),
+ ("Icirc", "Î"),
+ ("Igrave", "Ì"),
+ ("Iota", "Ι"),
+ ("Iuml", "Ï"),
+ ("Kappa", "Κ"),
+ ("Lambda", "Λ"),
+ ("Mu", "Μ"),
+ ("Ntilde", "Ñ"),
+ ("Nu", "Ν"),
+ ("OElig", "Œ"),
+ ("Oacute", "Ó"),
+ ("Ocirc", "Ô"),
+ ("Ograve", "Ò"),
+ ("Omega", "Ω"),
+ ("Omicron", "Ο"),
+ ("Oslash", "Ø"),
+ ("Otilde", "Õ"),
+ ("Ouml", "Ö"),
+ ("Phi", "Φ"),
+ ("Pi", "Π"),
+ ("Prime", "″"),
+ ("Psi", "Ψ"),
+ ("Rho", "Ρ"),
+ ("Scaron", "Š"),
+ ("Sigma", "Σ"),
+ ("THORN", "Þ"),
+ ("Tau", "Τ"),
+ ("Theta", "Θ"),
+ ("Uacute", "Ú"),
+ ("Ucirc", "Û"),
+ ("Ugrave", "Ù"),
+ ("Upsilon", "Υ"),
+ ("Uuml", "Ü"),
+ ("Xi", "Ξ"),
+ ("Yacute", "Ý"),
+ ("Yuml", "Ÿ"),
+ ("Zeta", "Ζ"),
+ ("aacute", "á"),
+ ("acirc", "â"),
+ ("acute", "´"),
+ ("aelig", "æ"),
+ ("agrave", "à"),
+ ("alefsym", "ℵ"),
+ ("alpha", "α"),
+ ("amp", "&"),
+ ("and", "∧"),
+ ("ang", "∠"),
+ ("aring", "å"),
+ ("asymp", "≈"),
+ ("atilde", "ã"),
+ ("auml", "ä"),
+ ("bdquo", "„"),
+ ("beta", "β"),
+ ("brvbar", "¦"),
+ ("bull", "•"),
+ ("cap", "∩"),
+ ("ccedil", "ç"),
+ ("cedil", "¸"),
+ ("cent", "¢"),
+ ("chi", "χ"),
+ ("circ", "ˆ"),
+ ("clubs", "♣"),
+ ("cong", "≅"),
+ ("copy", "©"),
+ ("crarr", "↵"),
+ ("cup", "∪"),
+ ("curren", "¤"),
+ ("dArr", "⇓"),
+ ("dagger", "†"),
+ ("darr", "↓"),
+ ("deg", "°"),
+ ("delta", "δ"),
+ ("diams", "♦"),
+ ("divide", "÷"),
+ ("eacute", "é"),
+ ("ecirc", "ê"),
+ ("egrave", "è"),
+ ("empty", "∅"),
+ ("emsp", " "),
+ ("ensp", " "),
+ ("epsilon", "ε"),
+ ("equiv", "≡"),
+ ("eta", "η"),
+ ("eth", "ð"),
+ ("euml", "ë"),
+ ("euro", "€"),
+ ("exist", "∃"),
+ ("fnof", "ƒ"),
+ ("forall", "∀"),
+ ("frac12", "½"),
+ ("frac14", "¼"),
+ ("frac34", "¾"),
+ ("frasl", "⁄"),
+ ("gamma", "γ"),
+ ("ge", "≥"),
+ ("gt", ">"),
+ ("hArr", "⇔"),
+ ("harr", "↔"),
+ ("hearts", "♥"),
+ ("hellip", "…"),
+ ("iacute", "í"),
+ ("icirc", "î"),
+ ("iexcl", "¡"),
+ ("igrave", "ì"),
+ ("image", "ℑ"),
+ ("infin", "∞"),
+ ("int", "∫"),
+ ("iota", "ι"),
+ ("iquest", "¿"),
+ ("isin", "∈"),
+ ("iuml", "ï"),
+ ("kappa", "κ"),
+ ("lArr", "⇐"),
+ ("lambda", "λ"),
+ ("lang", "〈"),
+ ("laquo", "«"),
+ ("larr", "←"),
+ ("lceil", "⌈"),
+ ("ldquo", "“"),
+ ("le", "≤"),
+ ("lfloor", "⌊"),
+ ("lowast", "∗"),
+ ("loz", "◊"),
+ ("lrm", "‎"),
+ ("lsaquo", "‹"),
+ ("lsquo", "‘"),
+ ("lt", "<"),
+ ("macr", "¯"),
+ ("mdash", "—"),
+ ("micro", "µ"),
+ ("middot", "·"),
+ ("minus", "−"),
+ ("mu", "μ"),
+ ("nabla", "∇"),
+ ("nbsp", " "),
+ ("ndash", "–"),
+ ("ne", "≠"),
+ ("ni", "∋"),
+ ("not", "¬"),
+ ("notin", "∉"),
+ ("nsub", "⊄"),
+ ("ntilde", "ñ"),
+ ("nu", "ν"),
+ ("oacute", "ó"),
+ ("ocirc", "ô"),
+ ("oelig", "œ"),
+ ("ograve", "ò"),
+ ("oline", "‾"),
+ ("omega", "ω"),
+ ("omicron", "ο"),
+ ("oplus", "⊕"),
+ ("or", "∨"),
+ ("ordf", "ª"),
+ ("ordm", "º"),
+ ("oslash", "ø"),
+ ("otilde", "õ"),
+ ("otimes", "⊗"),
+ ("ouml", "ö"),
+ ("para", "¶"),
+ ("part", "∂"),
+ ("permil", "‰"),
+ ("perp", "⊥"),
+ ("phi", "φ"),
+ ("pi", "π"),
+ ("piv", "ϖ"),
+ ("plusmn", "±"),
+ ("pound", "£"),
+ ("prime", "′"),
+ ("prod", "∏"),
+ ("prop", "∝"),
+ ("psi", "ψ"),
+ ("quot", "\""),
+ ("rArr", "⇒"),
+ ("radic", "√"),
+ ("rang", "〉"),
+ ("raquo", "»"),
+ ("rarr", "→"),
+ ("rceil", "⌉"),
+ ("rdquo", "”"),
+ ("real", "ℜ"),
+ ("reg", "®"),
+ ("rfloor", "⌋"),
+ ("rho", "ρ"),
+ ("rlm", "‏"),
+ ("rsaquo", "›"),
+ ("rsquo", "’"),
+ ("sbquo", "‚"),
+ ("scaron", "š"),
+ ("sdot", "⋅"),
+ ("sect", "§"),
+ ("shy", "\u{AD}"),
+ ("sigma", "σ"),
+ ("sigmaf", "ς"),
+ ("sim", "∼"),
+ ("spades", "♠"),
+ ("sub", "⊂"),
+ ("sube", "⊆"),
+ ("sum", "∑"),
+ ("sup", "⊃"),
+ ("sup1", "¹"),
+ ("sup2", "²"),
+ ("sup3", "³"),
+ ("supe", "⊇"),
+ ("szlig", "ß"),
+ ("tau", "τ"),
+ ("there4", "∴"),
+ ("theta", "θ"),
+ ("thetasym", "ϑ"),
+ ("thinsp", " "),
+ ("thorn", "þ"),
+ ("tilde", "˜"),
+ ("times", "×"),
+ ("trade", "™"),
+ ("uArr", "⇑"),
+ ("uacute", "ú"),
+ ("uarr", "↑"),
+ ("ucirc", "û"),
+ ("ugrave", "ù"),
+ ("uml", "¨"),
+ ("upsih", "ϒ"),
+ ("upsilon", "υ"),
+ ("uuml", "ü"),
+ ("weierp", "℘"),
+ ("xi", "ξ"),
+ ("yacute", "ý"),
+ ("yen", "¥"),
+ ("yuml", "ÿ"),
+ ("zeta", "ζ"),
+ ("zwj", "‍"),
+ ("zwnj", "‌"),
+];
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs
deleted file mode 100644
index d4c983a..0000000
--- a/src/util/decode_character_reference.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-//! Decode character references.
-
-use crate::util::constant::CHARACTER_REFERENCES;
-use alloc::string::{String, ToString};
-
-/// Decode named character references.
-///
-/// Turn the name coming from a named character reference (without the `&` or
-/// `;`) into a string.
-/// This looks the given string up at `0` in the tuples of
-/// [`CHARACTER_REFERENCES`][] and then takes the corresponding value from `1`.
-///
-/// The result is `String` instead of `char` because named character references
-/// can expand into multiple characters.
-///
-/// ## Examples
-///
-/// ```rust ignore
-/// use micromark::util::decode_character_reference::decode_named;
-///
-/// assert_eq!(decode_named("amp"), "&");
-/// assert_eq!(decode_named("AElig"), "Æ");
-/// assert_eq!(decode_named("aelig"), "æ");
-/// ```
-///
-/// ## Panics
-///
-/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is
-/// given.
-/// It is expected that figuring out whether a name is allowed is handled in
-/// the parser.
-/// When `micromark` is used, this function never panics.
-///
-/// ## References
-///
-/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
-/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_named(value: &str) -> String {
- let entry = CHARACTER_REFERENCES.iter().find(|d| d.0 == value);
- let tuple = entry.expect("expected valid `name`");
- tuple.1.to_string()
-}
-
-/// Decode numeric character references.
-///
-/// Turn the number (in string form as either hexadecimal or decimal) coming
-/// from a numeric character reference into a string.
-/// The base of the string form must be passed as the `radix` parameter, as
-/// `10` (decimal) or `16` (hexadecimal).
-///
-/// This returns a `String` form of the associated character or a replacement
-/// character for C0 control characters (except for ASCII whitespace), C1
-/// control characters, lone surrogates, noncharacters, and out of range
-/// characters.
-///
-/// ## Examples
-///
-/// ```rust ignore
-/// use micromark::util::decode_character_reference::decode_numeric;
-///
-/// assert_eq!(decode_numeric("123", 10), "{");
-/// assert_eq!(decode_numeric("9", 16), "\t");
-/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
-/// ```
-///
-/// ## Panics
-///
-/// This function panics if a invalid string or an out of bounds valid string
-/// is given.
-/// It is expected that figuring out whether a number is allowed is handled in
-/// the parser.
-/// When `micromark` is used, this function never panics.
-///
-/// ## References
-///
-/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
-/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_numeric(value: &str, radix: u32) -> String {
- if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
- if !matches!(char,
- // C0 except for HT, LF, FF, CR, space
- '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
- // Control character (DEL) of c0, and C1 controls.
- '\u{7F}'..='\u{9F}'
- // Lone surrogates, noncharacters, and out of range are handled by
- // Rust.
- ) {
- return char.to_string();
- }
- }
-
- char::REPLACEMENT_CHARACTER.to_string()
-}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index dcbf1ae..edc7e14 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,8 +1,8 @@
//! Utilities used when processing markdown.
pub mod char;
+pub mod character_reference;
pub mod constant;
-pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
pub mod gfm_tagfilter;