diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-09-09 10:54:13 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-09-09 10:54:13 +0200 |
commit | 13337d77954b4c92d1cf4592f43f01d94fce3c77 (patch) | |
tree | d5feef9a971c1af52e58b5c857d1dd9c9e7fedca /src/util/char.rs | |
parent | 71dbc8c0189d6b2032f3d8f21cbfffa3f8fe0f12 (diff) | |
download | markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.gz markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.bz2 markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.zip |
Refactor to move byte, char info to own file
Diffstat (limited to 'src/util/char.rs')
-rw-r--r-- | src/util/char.rs | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/src/util/char.rs b/src/util/char.rs new file mode 100644 index 0000000..cfaacd5 --- /dev/null +++ b/src/util/char.rs @@ -0,0 +1,165 @@ +//! Deal with byte and chars and kinds. + +use crate::util::unicode::PUNCTUATION; +use alloc::{ + format, + string::{String, ToString}, +}; +use core::str; + +/// Character kinds. +#[derive(Debug, PartialEq, Eq)] +pub enum Kind { + /// Whitespace. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Whitespace, + /// Punctuation. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^^ ^ ^ ^ + /// ``` + Punctuation, + /// Everything else. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Other, +} + +/// Get a [`char`][] right before `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn before_index(bytes: &[u8], index: usize) -> Option<char> { + let start = if index < 4 { 0 } else { index - 4 }; + String::from_utf8_lossy(&bytes[start..index]).chars().last() +} + +/// Get a [`char`][] right at `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn after_index(bytes: &[u8], index: usize) -> Option<char> { + let end = if index + 4 > bytes.len() { + bytes.len() + } else { + index + 4 + }; + String::from_utf8_lossy(&bytes[index..end]).chars().next() +} + +/// Classify a char at `index` in bytes (`&[u8]`). 
+pub fn kind_after_index(bytes: &[u8], index: usize) -> Kind { + if index == bytes.len() { + Kind::Whitespace + } else { + let byte = bytes[index]; + if byte.is_ascii_whitespace() { + Kind::Whitespace + } else if byte.is_ascii_punctuation() { + Kind::Punctuation + } else if byte.is_ascii_alphanumeric() { + Kind::Other + } else { + // Otherwise: seems to be an ASCII control, so it seems to be a + // non-ASCII `char`. + classify_opt(after_index(bytes, index)) + } + } +} + +/// Classify whether a `char` represents whitespace, punctuation, or something +/// else. +/// +/// Used for attention (emphasis, strong), whose sequences can open or close +/// based on the class of surrounding characters. +/// +/// ## References +/// +/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) +pub fn classify(char: char) -> Kind { + // Unicode whitespace. + if char.is_whitespace() { + Kind::Whitespace + } + // Unicode punctuation. + else if PUNCTUATION.contains(&char) { + Kind::Punctuation + } + // Everything else. + else { + Kind::Other + } +} + +/// Like [`classify`], but supports eof as whitespace. +pub fn classify_opt(char_opt: Option<char>) -> Kind { + if let Some(char) = char_opt { + classify(char) + } + // EOF. + else { + Kind::Whitespace + } +} + +/// Format an optional `char` (`none` means eof). +pub fn format_opt(char: Option<char>) -> String { + match char { + None => "end of file".to_string(), + Some(char) => format!("character {}", format(char)), + } +} + +/// Format an optional `byte` (`none` means eof). +pub fn format_byte_opt(byte: Option<u8>) -> String { + match byte { + None => "end of file".to_string(), + Some(byte) => format!("byte {}", format_byte(byte)), + } +} + +/// Format a `char`. 
///
/// The result always contains the code point, written as `U+` followed by at
/// least four uppercase hexadecimal digits.
/// Chars from `!` through `~` are additionally shown between backticks in
/// front of it, such as `` `a` (U+0061) ``.
pub fn format(char: char) -> String {
    let representation = format!("U+{:>04X}", char as u32);

    let printable = match char {
        // A backtick cannot sit between single backticks: use a wider fence.
        '`' => "`` ` ``".to_string(),
        '!'..='~' => format!("`{}`", char),
        // Nothing visible to show: the code point alone has to do.
        _ => return representation,
    };

    format!("{} ({})", printable, representation)
}

/// Format a byte (`u8`).
///
/// Works like [`format`]: the value is written as `U+` followed by at least
/// four uppercase hexadecimal digits, and bytes from `!` through `~` are
/// additionally shown between backticks in front of it.
pub fn format_byte(byte: u8) -> String {
    let representation = format!("U+{:>04X}", byte);

    let printable = if byte == b'`' {
        // A backtick cannot sit between single backticks: use a wider fence.
        "`` ` ``".to_string()
    } else if byte.is_ascii_graphic() {
        // `!` through `~`: a single ASCII byte is always valid UTF-8, so
        // this `unwrap` cannot fail.
        format!("`{}`", str::from_utf8(&[byte]).unwrap())
    } else {
        // Nothing visible to show: the value alone has to do.
        return representation;
    };

    format!("{} ({})", printable, representation)
}