diff options
Diffstat (limited to 'src/util')
-rw-r--r-- | src/util/classify_character.rs | 72 | ||||
-rw-r--r-- | src/util/mod.rs | 1 |
2 files changed, 73 insertions, 0 deletions
diff --git a/src/util/classify_character.rs b/src/util/classify_character.rs new file mode 100644 index 0000000..b938502 --- /dev/null +++ b/src/util/classify_character.rs @@ -0,0 +1,72 @@ +//! Utilities to classify characters as whitespace, punctuation, or rest. + +use crate::unicode::PUNCTUATION; + +/// Character kinds. +#[derive(Debug, PartialEq, Eq)] +pub enum Kind { + /// Whitespace. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Whitespace, + /// Punctuation. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^^ ^ ^ ^ + /// ``` + Punctuation, + /// Everything else. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Other, +} + +/// Classify whether a character code represents whitespace, punctuation, or +/// something else. +/// +/// Used for attention (emphasis, strong), whose sequences can open or close +/// based on the class of surrounding characters. +/// +/// > 👉 **Note** that eof (`None`) is seen as whitespace. +/// +/// ## References +/// +/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) +pub fn classify(char: char) -> Kind { + // Unicode whitespace. + if char.is_whitespace() { + Kind::Whitespace + } + // Unicode punctuation. + else if PUNCTUATION.contains(&char) { + Kind::Punctuation + } + // Everything else. + else { + Kind::Other + } +} + +/// Like [`classify`], but supports eof as whitespace. +pub fn classify_opt(char_opt: Option<char>) -> Kind { + if let Some(char) = char_opt { + classify(char) + } + // EOF. + else { + Kind::Whitespace + } +} diff --git a/src/util/mod.rs b/src/util/mod.rs index f51845c..022c7d6 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@ //! Utilities used when processing markdown. +pub mod classify_character; pub mod decode_character_reference; pub mod edit_map; pub mod encode; |