Refactor to move byte, char info to own file

author: Titus Wormer <tituswormer@gmail.com> 2022-09-09 10:54:13 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-09-09 10:54:13 +0200
commit: 13337d77954b4c92d1cf4592f43f01d94fce3c77 (patch)
tree: d5feef9a971c1af52e58b5c857d1dd9c9e7fedca /src/util/char.rs
parent: 71dbc8c0189d6b2032f3d8f21cbfffa3f8fe0f12 (diff)
download: markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.gz
markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.bz2
markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.zip
1 files changed, 165 insertions, 0 deletions
diff --git a/src/util/char.rs b/src/util/char.rs
new file mode 100644
index 0000000..cfaacd5
--- /dev/null
+++ b/src/util/char.rs
@@ -0,0 +1,165 @@
+//! Deal with bytes, chars, and their kinds.
+
+use crate::util::unicode::PUNCTUATION;
+use alloc::{
+    format,
+    string::{String, ToString},
+};
+use core::str;
+
+/// Kind of a character: whitespace, punctuation, or anything else.
+#[derive(Debug, PartialEq, Eq)]
+pub enum Kind {
+    /// Whitespace (the end of the input counts as whitespace too).
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// > | **a_b_ c**.
+    ///    ^      ^    ^
+    /// ```
+    Whitespace,
+    /// Punctuation.
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// > | **a_b_ c**.
+    ///     ^^ ^ ^  ^^^
+    /// ```
+    Punctuation,
+    /// Everything else.
+    ///
+    /// ## Example
+    ///
+    /// ```markdown
+    /// > | **a_b_ c**.
+    ///       ^ ^  ^
+    /// ```
+    Other,
+}
+
+/// Decode the [`char`][] that ends right before `index` in `bytes`.
+///
+/// Markdown mostly deals in ASCII bytes, but is occasionally unicode aware.
+pub fn before_index(bytes: &[u8], index: usize) -> Option<char> {
+    // A `char` is at most 4 bytes in UTF-8, so a 4 byte window suffices.
+    let window = &bytes[index.saturating_sub(4)..index];
+    String::from_utf8_lossy(window).chars().next_back()
+}
+
+/// Decode the [`char`][] that starts at `index` in `bytes`.
+///
+/// Markdown mostly operates on ASCII bytes.
+/// Some constructs are unicode aware though, and those need an actual `char`.
+///
+/// Yields `None` when `index` is at the end of `bytes`.
+/// Bytes that are not valid UTF-8 decode to U+FFFD (lossy conversion).
+pub fn after_index(bytes: &[u8], index: usize) -> Option<char> {
+    // A `char` takes at most 4 bytes in UTF-8; never read past the end.
+    let end = bytes.len().min(index + 4);
+    String::from_utf8_lossy(&bytes[index..end]).chars().next()
+}
+
+/// Classify the character that starts at `index` in `bytes`.
+///
+/// ASCII bytes are classified directly; anything else is decoded to a `char`
+/// first. The end of `bytes` is treated as whitespace.
+pub fn kind_after_index(bytes: &[u8], index: usize) -> Kind {
+    // The end of the input counts as whitespace.
+    if index == bytes.len() {
+        return Kind::Whitespace;
+    }
+
+    match bytes[index] {
+        byte if byte.is_ascii_whitespace() => Kind::Whitespace,
+        byte if byte.is_ascii_punctuation() => Kind::Punctuation,
+        byte if byte.is_ascii_alphanumeric() => Kind::Other,
+        // What remains is an ASCII control or a byte of a non-ASCII `char`:
+        // decode the `char` at this position and classify that.
+        _ => classify_opt(after_index(bytes, index)),
+    }
+}
+
+/// Classify a `char` as whitespace, punctuation, or something else.
+///
+/// Attention (emphasis, strong) depends on this: whether a sequence can open
+/// or close is decided by the kinds of the characters around it.
+///
+/// The checks run in order.
+/// Unicode whitespace ([`char::is_whitespace`]) comes first, then membership
+/// of `PUNCTUATION`, and every `char` that remains is [`Kind::Other`].
+///
+/// ## References
+///
+/// *   [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
+pub fn classify(char: char) -> Kind {
+    // Arms are tried top to bottom, so whitespace wins over punctuation.
+    match char {
+        // Unicode whitespace.
+        char if char.is_whitespace() => Kind::Whitespace,
+        // Unicode punctuation.
+        char if PUNCTUATION.contains(&char) => Kind::Punctuation,
+        // Everything else.
+        _ => Kind::Other,
+    }
+}
+
+/// Classify an optional `char`, where `None` stands for the end of the file.
+///
+/// The end of the file counts as whitespace; an actual `char` is handed to
+/// [`classify`].
+pub fn classify_opt(char_opt: Option<char>) -> Kind {
+    match char_opt {
+        Some(char) => classify(char),
+        // EOF.
+        None => Kind::Whitespace,
+    }
+}
+
+/// Format an optional `char` as human-readable text (`None` means eof).
+pub fn format_opt(char: Option<char>) -> String {
+    char.map_or_else(
+        || "end of file".to_string(),
+        |char| format!("character {}", format(char)),
+    )
+}
+
+/// Format an optional byte as human-readable text (`None` means eof).
+pub fn format_byte_opt(byte: Option<u8>) -> String {
+    byte.map_or_else(
+        || "end of file".to_string(),
+        |byte| format!("byte {}", format_byte(byte)),
+    )
+}
+
+/// Format a `char` as its code point, preceded by the character itself when
+/// it is printable ASCII.
+///
+/// For example, `a` yields `` `a` (U+0061) `` and a space yields `U+0020`.
+pub fn format(char: char) -> String {
+    let code_point = format!("U+{:>04X}", char as u32);
+    match char {
+        // A backtick cannot sit directly between backticks, so pad it.
+        '`' => format!("{} ({})", "`` ` ``", code_point),
+        // Other visible ASCII: show the character in backticks.
+        '!'..='~' => format!("`{}` ({})", char, code_point),
+        // Not visible on its own: show only the code point.
+        _ => code_point,
+    }
+}
+
+/// Format a byte (`u8`) as its `U+` hexadecimal value, preceded by the
+/// character itself when the byte is printable ASCII.
+///
+/// For example, `b'a'` yields `` `a` (U+0061) `` and `b' '` yields `U+0020`.
+pub fn format_byte(byte: u8) -> String {
+    let code_point = format!("U+{:>04X}", byte);
+    match byte {
+        // A backtick cannot sit directly between backticks, so pad it.
+        b'`' => format!("{} ({})", "`` ` ``", code_point),
+        // Other visible ASCII: safe to turn into a `char` as is.
+        b'!'..=b'~' => format!("`{}` ({})", char::from(byte), code_point),
+        // Not visible on its own: show only the number.
+        _ => code_point,
+    }
+}
author	Titus Wormer <tituswormer@gmail.com>	2022-09-09 10:54:13 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-09-09 10:54:13 +0200
commit	13337d77954b4c92d1cf4592f43f01d94fce3c77 (patch)
tree	d5feef9a971c1af52e58b5c857d1dd9c9e7fedca /src/util/char.rs
parent	71dbc8c0189d6b2032f3d8f21cbfffa3f8fe0f12 (diff)
download	markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.gz markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.bz2 markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.zip