diff options
Diffstat (limited to '')
| -rw-r--r-- | src/construct/character_reference.rs | 27 | ||||
| -rw-r--r-- | src/to_html.rs | 12 | ||||
| -rw-r--r-- | src/to_mdast.rs | 20 | ||||
| -rw-r--r-- | src/util/character_reference.rs | 206 | ||||
| -rw-r--r-- | src/util/constant.rs | 268 | ||||
| -rw-r--r-- | src/util/decode_character_reference.rs | 93 | ||||
| -rw-r--r-- | src/util/mod.rs | 2 | 
7 files changed, 493 insertions, 135 deletions
| diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 927e3d9..d87050c 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -64,7 +64,7 @@  //! [string]: crate::construct::string  //! [text]: crate::construct::text  //! [character_escape]: crate::construct::character_reference -//! [decode_numeric]: crate::util::decode_character_reference::decode_numeric +//! [decode_numeric]: crate::util::character_reference::decode_numeric  //! [character_references]: crate::util::constant::CHARACTER_REFERENCES  //! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state @@ -72,10 +72,7 @@ use crate::event::Name;  use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer;  use crate::util::{ -    constant::{ -        CHARACTER_REFERENCES, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, -        CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, -    }, +    character_reference::{decode_named, value_max, value_test},      slice::Slice,  }; @@ -173,9 +170,8 @@ pub fn value(tokenizer: &mut Tokenizer) -> State {                  tokenizer.point.index - tokenizer.tokenize_state.size,                  tokenizer.point.index,              ); -            let name = slice.as_str(); -            if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) { +            if decode_named(slice.as_str(), true).is_none() {                  tokenizer.tokenize_state.marker = 0;                  tokenizer.tokenize_state.size = 0;                  return State::Nok; @@ -192,21 +188,10 @@ pub fn value(tokenizer: &mut Tokenizer) -> State {          return State::Ok;      } -    let max = match tokenizer.tokenize_state.marker { -        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX, -        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, -        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, -        _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker), -    }; -    let test = match tokenizer.tokenize_state.marker { -        b'&' => u8::is_ascii_alphanumeric, -        b'x' => u8::is_ascii_hexdigit, -        b'#' => u8::is_ascii_digit, -        _ => unreachable!("Unexpected marker `{}`", tokenizer.tokenize_state.marker), -    }; -      if let Some(byte) = tokenizer.current { -        if tokenizer.tokenize_state.size < max && test(&byte) { +        if tokenizer.tokenize_state.size < value_max(tokenizer.tokenize_state.marker) +            && value_test(tokenizer.tokenize_state.marker)(&byte) +        {              tokenizer.tokenize_state.size += 1;              tokenizer.consume();              return State::Next(StateName::CharacterReferenceValue); diff --git a/src/to_html.rs b/src/to_html.rs index d7d054d..814f7cf 100644 --- a/src/to_html.rs +++ b/src/to_html.rs @@ -2,8 +2,8 @@  use crate::event::{Event, Kind, Name};  use crate::mdast::AlignKind;  use crate::util::{ +    character_reference::decode as decode_character_reference,      constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}, -    decode_character_reference::{decode_named, decode_numeric},      encode::encode,      gfm_tagfilter::gfm_tagfilter,      infer::{gfm_table_align, list_loose}, @@ -783,14 +783,8 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {          context.bytes,          &Position::from_exit_event(context.events, context.index),      ); -    let value = slice.as_str(); - -    let value = match marker { -        b'#' => decode_numeric(value, 10), -        b'x' => decode_numeric(value, 16), -        b'&' => decode_named(value), -        _ => panic!("impossible"), -    }; +    let value = decode_character_reference(slice.as_str(), marker, true) +        .expect("expected to parse only valid named references");      context.push(&encode(&value, context.encode_html));  } diff --git a/src/to_mdast.rs b/src/to_mdast.rs index c47eb22..4db76e6 100644 --- a/src/to_mdast.rs +++ b/src/to_mdast.rs @@ -10,7 +10,9 @@ use crate::mdast::{  };  use crate::unist::{Point, Position};  use crate::util::{ -    decode_character_reference::{decode_named, decode_numeric}, +    character_reference::{ +        decode as decode_character_reference, parse as parse_character_reference, +    },      infer::{gfm_table_align, list_item_loose, list_loose},      normalize_identifier::normalize_identifier,      slice::{Position as SlicePosition, Slice}, @@ -892,14 +894,9 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {          context.bytes,          &SlicePosition::from_exit_event(context.events, context.index),      ); -    let value = slice.as_str(); - -    let value = match context.character_reference_marker { -        b'#' => decode_numeric(value, 10), -        b'x' => decode_numeric(value, 16), -        b'&' => decode_named(value), -        _ => panic!("impossible"), -    }; +    let value = +        decode_character_reference(slice.as_str(), context.character_reference_marker, true) +            .expect("expected to parse only valid named references");      if let Node::Text(node) = context.tail_mut() {          node.value.push_str(value.as_str()); @@ -1558,8 +1555,9 @@ fn on_exit_mdx_jsx_tag_attribute_value_literal(context: &mut CompileContext) {          .attributes          .last_mut()      { -        // To do: character references. -        node.value = Some(AttributeValue::Literal(value.to_string())); +        node.value = Some(AttributeValue::Literal(parse_character_reference( +            &value.to_string(), +        )));      } else {          unreachable!("expected property")      } diff --git a/src/util/character_reference.rs b/src/util/character_reference.rs new file mode 100644 index 0000000..75db98b --- /dev/null +++ b/src/util/character_reference.rs @@ -0,0 +1,206 @@ +//! Helpers for character references. + +use crate::util::constant::{ +    CHARACTER_REFERENCES, CHARACTER_REFERENCES_HTML_4, CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, +    CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, +}; +use alloc::string::{String, ToString}; +use core::str; + +/// Decode named character references. +/// +/// Turn the name coming from a named character reference (without the `&` or +/// `;`) into a string. +/// This looks the given string up at `0` in the tuples of +/// [`CHARACTER_REFERENCES`][] (or [`CHARACTER_REFERENCES_HTML_4`][]) and then +/// takes the corresponding value from `1`. +/// +/// The result is `String` instead of `char` because named character references +/// can expand into multiple characters. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::decode_character_reference::decode_named; +/// +/// assert_eq!(decode_named("amp", true), "&"); +/// assert_eq!(decode_named("AElig", true), "Æ"); +/// assert_eq!(decode_named("aelig", true), "æ"); +/// ``` +/// +/// ## Panics +/// +/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is +/// given. +/// It is expected that figuring out whether a name is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. +/// +/// ## References +/// +/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) +/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_named(value: &str, html5: bool) -> Option<String> { +    let mut iter = if html5 { +        CHARACTER_REFERENCES.iter() +    } else { +        CHARACTER_REFERENCES_HTML_4.iter() +    }; +    iter.find(|d| d.0 == value).map(|d| d.1.into()) +} + +/// Decode numeric character references. +/// +/// Turn the number (in string form as either hexadecimal or decimal) coming +/// from a numeric character reference into a string. +/// The base of the string form must be passed as the `radix` parameter, as +/// `10` (decimal) or `16` (hexadecimal). +/// +/// This returns a `String` form of the associated character or a replacement +/// character for C0 control characters (except for ASCII whitespace), C1 +/// control characters, lone surrogates, noncharacters, and out of range +/// characters. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::decode_character_reference::decode_numeric; +/// +/// assert_eq!(decode_numeric("123", 10), "{"); +/// assert_eq!(decode_numeric("9", 16), "\t"); +/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. +/// ``` +/// +/// ## Panics +/// +/// This function panics if a invalid string or an out of bounds valid string +/// is given. +/// It is expected that figuring out whether a number is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. +/// +/// ## References +/// +/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) +/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_numeric(value: &str, radix: u32) -> String { +    if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { +        if !matches!(char, +            // C0 except for HT, LF, FF, CR, space +            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | +            // Control character (DEL) of c0, and C1 controls. +            '\u{7F}'..='\u{9F}' +            // Lone surrogates, noncharacters, and out of range are handled by +            // Rust. +        ) { +            return char.to_string(); +        } +    } + +    char::REPLACEMENT_CHARACTER.to_string() +} + +pub fn decode(value: &str, marker: u8, html5: bool) -> Option<String> { +    match marker { +        b'#' => Some(decode_numeric(value, 10)), +        b'x' => Some(decode_numeric(value, 16)), +        b'&' => decode_named(value, html5), +        _ => unreachable!("Unexpected marker `{}`", marker), +    } +} + +/// Get the maximum size of a value for different kinds of references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. +pub fn value_max(marker: u8) -> usize { +    match marker { +        b'&' => CHARACTER_REFERENCE_NAMED_SIZE_MAX, +        b'x' => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, +        b'#' => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, +        _ => unreachable!("Unexpected marker `{}`", marker), +    } +} + +/// Get a test to check if a byte is allowed as a value for different kinds of +/// references. +/// +/// The value is the stuff after the markers, before the `;`. +/// +/// ## Panics +/// +/// Panics if `marker` is not `b'&'`, `b'x'`, or `b'#'`. +pub fn value_test(marker: u8) -> fn(&u8) -> bool { +    match marker { +        b'&' => u8::is_ascii_alphanumeric, +        b'x' => u8::is_ascii_hexdigit, +        b'#' => u8::is_ascii_digit, +        _ => unreachable!("Unexpected marker `{}`", marker), +    } +} + +/// Decode character references in a string. +/// +/// Note: this currently only supports HTML 4 references, as it’s only used for +/// them. +/// +/// If it’s ever needed to support HTML 5 (which is what normal markdown uses), +/// a boolean parameter can be added here. +pub fn parse(value: &str) -> String { +    let bytes = value.as_bytes(); +    let mut index = 0; +    let len = bytes.len(); +    // Grows a bit smaller with each character reference. +    let mut result = String::with_capacity(value.len()); +    let mut start = 0; + +    while index < len { +        if bytes[index] == b'&' { +            let (marker, value_start) = if index + 1 < len && bytes[index + 1] == b'#' { +                if index + 2 < len && matches!(bytes[index + 2], b'x' | b'X') { +                    (b'x', index + 3) +                } else { +                    (b'#', index + 2) +                } +            } else { +                (b'&', index + 1) +            }; + +            let max = value_max(marker); +            let test = value_test(marker); +            let mut value_index = 0; +            while value_index < max && (value_start + value_index) < len { +                if !test(&bytes[value_start + value_index]) { +                    break; +                } +                value_index += 1; +            } + +            let value_end = value_start + value_index; + +            // Non empty and terminated. +            if value_index > 0 && bytes[value_end] == b';' { +                if let Some(decoded) = decode( +                    str::from_utf8(&bytes[value_start..value_end]).unwrap(), +                    marker, +                    false, +                ) { +                    result.push_str(&value[start..index]); +                    result.push_str(&decoded); +                    start = value_end + 1; +                    index = start; +                    continue; +                } +            } +        } + +        index += 1; +    } + +    result.push_str(&value[start..]); + +    result +} diff --git a/src/util/constant.rs b/src/util/constant.rs index e9deac2..65704d0 100644 --- a/src/util/constant.rs +++ b/src/util/constant.rs @@ -2433,6 +2433,274 @@ pub const CHARACTER_REFERENCES: [(&str, &str); 2125] = [      ("zwnj", ""),  ]; +// Important: please touch the below lists as few times as possible to keep Git small. + +/// List of names and values that form named character reference in HTML 4. +/// +/// This list is normally not used in markdown, but it is used in MDX, because +/// in JSX attribute values, only the old HTML 4 character references are +/// supported. +/// +/// This list is sensitive to casing. +/// +/// ## References +/// +/// *   [*§ 1.5.2 HTML Character References* in `JSX`](https://facebook.github.io/jsx/#sec-HTMLCharacterReference) +pub const CHARACTER_REFERENCES_HTML_4: [(&str, &str); 252] = [ +    ("AElig", "Æ"), +    ("Aacute", "Á"), +    ("Acirc", "Â"), +    ("Agrave", "À"), +    ("Alpha", "Α"), +    ("Aring", "Å"), +    ("Atilde", "Ã"), +    ("Auml", "Ä"), +    ("Beta", "Β"), +    ("Ccedil", "Ç"), +    ("Chi", "Χ"), +    ("Dagger", "‡"), +    ("Delta", "Δ"), +    ("ETH", "Ð"), +    ("Eacute", "É"), +    ("Ecirc", "Ê"), +    ("Egrave", "È"), +    ("Epsilon", "Ε"), +    ("Eta", "Η"), +    ("Euml", "Ë"), +    ("Gamma", "Γ"), +    ("Iacute", "Í"), +    ("Icirc", "Î"), +    ("Igrave", "Ì"), +    ("Iota", "Ι"), +    ("Iuml", "Ï"), +    ("Kappa", "Κ"), +    ("Lambda", "Λ"), +    ("Mu", "Μ"), +    ("Ntilde", "Ñ"), +    ("Nu", "Ν"), +    ("OElig", "Œ"), +    ("Oacute", "Ó"), +    ("Ocirc", "Ô"), +    ("Ograve", "Ò"), +    ("Omega", "Ω"), +    ("Omicron", "Ο"), +    ("Oslash", "Ø"), +    ("Otilde", "Õ"), +    ("Ouml", "Ö"), +    ("Phi", "Φ"), +    ("Pi", "Π"), +    ("Prime", "″"), +    ("Psi", "Ψ"), +    ("Rho", "Ρ"), +    ("Scaron", "Š"), +    ("Sigma", "Σ"), +    ("THORN", "Þ"), +    ("Tau", "Τ"), +    ("Theta", "Θ"), +    ("Uacute", "Ú"), +    ("Ucirc", "Û"), +    ("Ugrave", "Ù"), +    ("Upsilon", "Υ"), +    ("Uuml", "Ü"), +    ("Xi", "Ξ"), +    ("Yacute", "Ý"), +    ("Yuml", "Ÿ"), +    ("Zeta", "Ζ"), +    ("aacute", "á"), +    ("acirc", "â"), +    ("acute", "´"), +    ("aelig", "æ"), +    ("agrave", "à"), +    ("alefsym", "ℵ"), +    ("alpha", "α"), +    ("amp", "&"), +    ("and", "∧"), +    ("ang", "∠"), +    ("aring", "å"), +    ("asymp", "≈"), +    ("atilde", "ã"), +    ("auml", "ä"), +    ("bdquo", "„"), +    ("beta", "β"), +    ("brvbar", "¦"), +    ("bull", "•"), +    ("cap", "∩"), +    ("ccedil", "ç"), +    ("cedil", "¸"), +    ("cent", "¢"), +    ("chi", "χ"), +    ("circ", "ˆ"), +    ("clubs", "♣"), +    ("cong", "≅"), +    ("copy", "©"), +    ("crarr", "↵"), +    ("cup", "∪"), +    ("curren", "¤"), +    ("dArr", "⇓"), +    ("dagger", "†"), +    ("darr", "↓"), +    ("deg", "°"), +    ("delta", "δ"), +    ("diams", "♦"), +    ("divide", "÷"), +    ("eacute", "é"), +    ("ecirc", "ê"), +    ("egrave", "è"), +    ("empty", "∅"), +    ("emsp", " "), +    ("ensp", " "), +    ("epsilon", "ε"), +    ("equiv", "≡"), +    ("eta", "η"), +    ("eth", "ð"), +    ("euml", "ë"), +    ("euro", "€"), +    ("exist", "∃"), +    ("fnof", "ƒ"), +    ("forall", "∀"), +    ("frac12", "½"), +    ("frac14", "¼"), +    ("frac34", "¾"), +    ("frasl", "⁄"), +    ("gamma", "γ"), +    ("ge", "≥"), +    ("gt", ">"), +    ("hArr", "⇔"), +    ("harr", "↔"), +    ("hearts", "♥"), +    ("hellip", "…"), +    ("iacute", "í"), +    ("icirc", "î"), +    ("iexcl", "¡"), +    ("igrave", "ì"), +    ("image", "ℑ"), +    ("infin", "∞"), +    ("int", "∫"), +    ("iota", "ι"), +    ("iquest", "¿"), +    ("isin", "∈"), +    ("iuml", "ï"), +    ("kappa", "κ"), +    ("lArr", "⇐"), +    ("lambda", "λ"), +    ("lang", "〈"), +    ("laquo", "«"), +    ("larr", "←"), +    ("lceil", "⌈"), +    ("ldquo", "“"), +    ("le", "≤"), +    ("lfloor", "⌊"), +    ("lowast", "∗"), +    ("loz", "◊"), +    ("lrm", ""), +    ("lsaquo", "‹"), +    ("lsquo", "‘"), +    ("lt", "<"), +    ("macr", "¯"), +    ("mdash", "—"), +    ("micro", "µ"), +    ("middot", "·"), +    ("minus", "−"), +    ("mu", "μ"), +    ("nabla", "∇"), +    ("nbsp", " "), +    ("ndash", "–"), +    ("ne", "≠"), +    ("ni", "∋"), +    ("not", "¬"), +    ("notin", "∉"), +    ("nsub", "⊄"), +    ("ntilde", "ñ"), +    ("nu", "ν"), +    ("oacute", "ó"), +    ("ocirc", "ô"), +    ("oelig", "œ"), +    ("ograve", "ò"), +    ("oline", "‾"), +    ("omega", "ω"), +    ("omicron", "ο"), +    ("oplus", "⊕"), +    ("or", "∨"), +    ("ordf", "ª"), +    ("ordm", "º"), +    ("oslash", "ø"), +    ("otilde", "õ"), +    ("otimes", "⊗"), +    ("ouml", "ö"), +    ("para", "¶"), +    ("part", "∂"), +    ("permil", "‰"), +    ("perp", "⊥"), +    ("phi", "φ"), +    ("pi", "π"), +    ("piv", "ϖ"), +    ("plusmn", "±"), +    ("pound", "£"), +    ("prime", "′"), +    ("prod", "∏"), +    ("prop", "∝"), +    ("psi", "ψ"), +    ("quot", "\""), +    ("rArr", "⇒"), +    ("radic", "√"), +    ("rang", "〉"), +    ("raquo", "»"), +    ("rarr", "→"), +    ("rceil", "⌉"), +    ("rdquo", "”"), +    ("real", "ℜ"), +    ("reg", "®"), +    ("rfloor", "⌋"), +    ("rho", "ρ"), +    ("rlm", ""), +    ("rsaquo", "›"), +    ("rsquo", "’"), +    ("sbquo", "‚"), +    ("scaron", "š"), +    ("sdot", "⋅"), +    ("sect", "§"), +    ("shy", "\u{AD}"), +    ("sigma", "σ"), +    ("sigmaf", "ς"), +    ("sim", "∼"), +    ("spades", "♠"), +    ("sub", "⊂"), +    ("sube", "⊆"), +    ("sum", "∑"), +    ("sup", "⊃"), +    ("sup1", "¹"), +    ("sup2", "²"), +    ("sup3", "³"), +    ("supe", "⊇"), +    ("szlig", "ß"), +    ("tau", "τ"), +    ("there4", "∴"), +    ("theta", "θ"), +    ("thetasym", "ϑ"), +    ("thinsp", " "), +    ("thorn", "þ"), +    ("tilde", "˜"), +    ("times", "×"), +    ("trade", "™"), +    ("uArr", "⇑"), +    ("uacute", "ú"), +    ("uarr", "↑"), +    ("ucirc", "û"), +    ("ugrave", "ù"), +    ("uml", "¨"), +    ("upsih", "ϒ"), +    ("upsilon", "υ"), +    ("uuml", "ü"), +    ("weierp", "℘"), +    ("xi", "ξ"), +    ("yacute", "ý"), +    ("yen", "¥"), +    ("yuml", "ÿ"), +    ("zeta", "ζ"), +    ("zwj", ""), +    ("zwnj", ""), +]; +  #[cfg(test)]  mod tests {      use super::*; diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs deleted file mode 100644 index d4c983a..0000000 --- a/src/util/decode_character_reference.rs +++ /dev/null @@ -1,93 +0,0 @@ -//! Decode character references. - -use crate::util::constant::CHARACTER_REFERENCES; -use alloc::string::{String, ToString}; - -/// Decode named character references. -/// -/// Turn the name coming from a named character reference (without the `&` or -/// `;`) into a string. -/// This looks the given string up at `0` in the tuples of -/// [`CHARACTER_REFERENCES`][] and then takes the corresponding value from `1`. -/// -/// The result is `String` instead of `char` because named character references -/// can expand into multiple characters. -/// -/// ## Examples -/// -/// ```rust ignore -/// use micromark::util::decode_character_reference::decode_named; -/// -/// assert_eq!(decode_named("amp"), "&"); -/// assert_eq!(decode_named("AElig"), "Æ"); -/// assert_eq!(decode_named("aelig"), "æ"); -/// ``` -/// -/// ## Panics -/// -/// This function panics if a name not in [`CHARACTER_REFERENCES`][] is -/// given. -/// It is expected that figuring out whether a name is allowed is handled in -/// the parser. -/// When `micromark` is used, this function never panics. -/// -/// ## References -/// -/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) -/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_named(value: &str) -> String { -    let entry = CHARACTER_REFERENCES.iter().find(|d| d.0 == value); -    let tuple = entry.expect("expected valid `name`"); -    tuple.1.to_string() -} - -/// Decode numeric character references. -/// -/// Turn the number (in string form as either hexadecimal or decimal) coming -/// from a numeric character reference into a string. -/// The base of the string form must be passed as the `radix` parameter, as -/// `10` (decimal) or `16` (hexadecimal). -/// -/// This returns a `String` form of the associated character or a replacement -/// character for C0 control characters (except for ASCII whitespace), C1 -/// control characters, lone surrogates, noncharacters, and out of range -/// characters. -/// -/// ## Examples -/// -/// ```rust ignore -/// use micromark::util::decode_character_reference::decode_numeric; -/// -/// assert_eq!(decode_numeric("123", 10), "{"); -/// assert_eq!(decode_numeric("9", 16), "\t"); -/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. -/// ``` -/// -/// ## Panics -/// -/// This function panics if a invalid string or an out of bounds valid string -/// is given. -/// It is expected that figuring out whether a number is allowed is handled in -/// the parser. -/// When `micromark` is used, this function never panics. -/// -/// ## References -/// -/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) -/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_numeric(value: &str, radix: u32) -> String { -    if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { -        if !matches!(char, -            // C0 except for HT, LF, FF, CR, space -            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | -            // Control character (DEL) of c0, and C1 controls. -            '\u{7F}'..='\u{9F}' -            // Lone surrogates, noncharacters, and out of range are handled by -            // Rust. -        ) { -            return char.to_string(); -        } -    } - -    char::REPLACEMENT_CHARACTER.to_string() -} diff --git a/src/util/mod.rs b/src/util/mod.rs index dcbf1ae..edc7e14 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,8 +1,8 @@  //! Utilities used when processing markdown.  pub mod char; +pub mod character_reference;  pub mod constant; -pub mod decode_character_reference;  pub mod edit_map;  pub mod encode;  pub mod gfm_tagfilter; | 
