path: root/src/util/char.rs

                                      

//! Deal with bytes, chars, and kinds.

use crate::util::unicode::PUNCTUATION;
use alloc::{format, string::String};
use core::str;

/// Character kinds.
///
/// Tells whitespace, punctuation, and everything else apart.
/// The enum carries no data, so it is `Copy` and can be passed around and
/// compared by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Kind {
    /// Whitespace.
    ///
    /// The caret past the final `.` marks the end of the input, which is
    /// classified as whitespace as well.
    ///
    /// ## Example
    ///
    /// ```markdown
    /// > | **a_b_ c**.
    ///    ^      ^    ^
    /// ```
    Whitespace,
    /// Punctuation.
    ///
    /// ## Example
    ///
    /// ```markdown
    /// > | **a_b_ c**.
    ///     ^^ ^ ^  ^^^
    /// ```
    Punctuation,
    /// Everything else.
    ///
    /// ## Example
    ///
    /// ```markdown
    /// > | **a_b_ c**.
    ///       ^ ^  ^
    /// ```
    Other,
}

/// Get the [`char`][] that ends right before `index` in bytes (`&[u8]`).
///
/// Markdown mostly operates on ASCII bytes, but a few constructs are unicode
/// aware and need an actual `char`.
/// This looks at no more than the 4 bytes in front of `index`, decodes them
/// lossily, and yields the last decoded `char`.
///
/// Returns `None` when `index` is `0`, as there is nothing before it.
///
/// ## Panics
///
/// Panics when `index` is greater than `bytes.len()`.
pub fn before_index(bytes: &[u8], index: usize) -> Option<char> {
    // A window of at most 4 bytes that ends at `index`.
    let window = &bytes[index.saturating_sub(4)..index];
    String::from_utf8_lossy(window).chars().next_back()
}

/// Get the [`char`][] that starts right at `index` in bytes (`&[u8]`).
///
/// Markdown mostly operates on ASCII bytes, but a few constructs are unicode
/// aware and need an actual `char`.
/// This looks at no more than the 4 bytes starting at `index`, decodes them
/// lossily, and yields the first decoded `char`.
///
/// Returns `None` when `index` is `bytes.len()`, as there is nothing after
/// it.
///
/// ## Panics
///
/// Panics when `index` is greater than `bytes.len()`.
pub fn after_index(bytes: &[u8], index: usize) -> Option<char> {
    // A window of at most 4 bytes that starts at `index`, clamped to the
    // end of the input.
    let end = usize::min(index + 4, bytes.len());
    String::from_utf8_lossy(&bytes[index..end]).chars().next()
}

/// Classify the char that starts at `index` in bytes (`&[u8]`).
///
/// The end of the input (`index == bytes.len()`) counts as whitespace.
/// ASCII whitespace, punctuation, and alphanumerics are classified from the
/// byte alone; for any other byte the actual `char` is decoded with
/// [`after_index`] and classified with [`classify_opt`].
///
/// ## Panics
///
/// Panics when `index` is greater than `bytes.len()`.
pub fn kind_after_index(bytes: &[u8], index: usize) -> Kind {
    // Eof.
    if index == bytes.len() {
        return Kind::Whitespace;
    }

    match bytes[index] {
        byte if byte.is_ascii_whitespace() => Kind::Whitespace,
        byte if byte.is_ascii_punctuation() => Kind::Punctuation,
        byte if byte.is_ascii_alphanumeric() => Kind::Other,
        // Anything else is either an ASCII control or part of a non-ASCII
        // `char`: decode the actual `char` and classify that.
        _ => classify_opt(after_index(bytes, index)),
    }
}

/// Classify a `char` as whitespace, punctuation, or something else.
///
/// Attention (emphasis, strong) needs this: whether a sequence can open or
/// close depends on the class of the characters around it.
///
/// Whitespace is checked first (with [`char::is_whitespace`]), then
/// membership in `PUNCTUATION`; every other `char` is [`Kind::Other`].
///
/// ## References
///
/// *   [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
pub fn classify(char: char) -> Kind {
    match char {
        // Unicode whitespace.
        value if value.is_whitespace() => Kind::Whitespace,
        // Unicode punctuation.
        value if PUNCTUATION.contains(&value) => Kind::Punctuation,
        // Everything else.
        _ => Kind::Other,
    }
}

/// Like [`classify`], but supports eof as whitespace.
pub fn classify_opt(char_opt: Option<char>) -> Kind {
    char_opt.map_or(Kind::Whitespace, classify)
}

/// Format an optional `char` (`none` means eof).
///
/// Yields `end of file` for `None`, and `character ` followed by the result
/// of [`format`] for `Some`.
pub fn format_opt(char: Option<char>) -> String {
    // Build the eof message lazily, so that no string is allocated for it
    // when there is a char to format.
    char.map_or_else(
        || "end of file".into(),
        |char| format!("character {}", format(char)),
    )
}

/// Format an optional `byte` (`none` means eof).
///
/// Yields `end of file` for `None`, and `byte ` followed by the result of
/// [`format_byte`] for `Some`.
pub fn format_byte_opt(byte: Option<u8>) -> String {
    // Build the eof message lazily, so that no string is allocated for it
    // when there is a byte to format.
    byte.map_or_else(
        || "end of file".into(),
        |byte| format!("byte {}", format_byte(byte)),
    )
}

/// Format a `char`.
///
/// Every `char` is shown as its code point (`U+` followed by at least four
/// uppercase hexadecimal digits).
/// Chars from `!` through `~` are additionally shown literally, wrapped in
/// backticks, in front of the code point; a grave accent gets a wider
/// backtick fence so that it remains readable.
pub fn format(char: char) -> String {
    let code_point = format!("U+{:>04X}", char as u32);

    let literal = if char == '`' {
        // A grave accent cannot sit directly between single backticks.
        String::from("`` ` ``")
    } else if ('!'..='~').contains(&char) {
        format!("`{}`", char)
    } else {
        // Nothing printable to show: the code point alone has to do.
        return code_point;
    };

    format!("{} ({})", literal, code_point)
}

/// Format a byte (`u8`).
///
/// Every byte is shown as `U+` followed by at least four uppercase
/// hexadecimal digits.
/// Bytes from `!` through `~` are additionally shown literally, wrapped in
/// backticks, in front of that; a grave accent gets a wider backtick fence
/// so that it remains readable.
pub fn format_byte(byte: u8) -> String {
    let code_point = format!("U+{:>04X}", byte);

    let literal = if byte == b'`' {
        // A grave accent cannot sit directly between single backticks.
        String::from("`` ` ``")
    } else if (b'!'..=b'~').contains(&byte) {
        // The byte is in the printable ASCII range here, so it is valid
        // UTF-8 on its own and the `unwrap` cannot fail.
        format!("`{}`", str::from_utf8(&[byte]).unwrap())
    } else {
        // Nothing printable to show: the number alone has to do.
        return code_point;
    };

    format!("{} ({})", literal, code_point)
}

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::string::ToString;

    #[test]
    fn test_before_index() {
        assert_eq!(
            before_index(b"abc", 0),
            None,
            "should yield nothing before the start"
        );

        assert_eq!(
            before_index(b"abc", 1),
            Some('a'),
            "should yield an ASCII char before an index"
        );

        assert_eq!(
            before_index("a€b".as_bytes(), 4),
            Some('€'),
            "should yield a multibyte char before an index"
        );
    }

    #[test]
    fn test_after_index() {
        assert_eq!(
            after_index(b"abc", 3),
            None,
            "should yield nothing at the end"
        );

        assert_eq!(
            after_index(b"abc", 0),
            Some('a'),
            "should yield an ASCII char at an index"
        );

        assert_eq!(
            after_index("a€b".as_bytes(), 1),
            Some('€'),
            "should yield a multibyte char at an index"
        );
    }

    #[test]
    fn test_kind_after_index() {
        assert_eq!(
            kind_after_index(b"a", 1),
            Kind::Whitespace,
            "should classify eof as whitespace"
        );

        assert_eq!(
            kind_after_index(b"a b", 1),
            Kind::Whitespace,
            "should classify ASCII whitespace"
        );

        assert_eq!(
            kind_after_index(b"a*", 1),
            Kind::Punctuation,
            "should classify ASCII punctuation"
        );

        assert_eq!(
            kind_after_index(b"a", 0),
            Kind::Other,
            "should classify ASCII alphanumerics as other"
        );

        assert_eq!(
            kind_after_index("\u{A0}".as_bytes(), 0),
            Kind::Whitespace,
            "should classify non-ASCII whitespace"
        );
    }

    #[test]
    fn test_classify() {
        assert_eq!(
            classify(' '),
            Kind::Whitespace,
            "should classify whitespace"
        );

        assert_eq!(
            classify('.'),
            Kind::Punctuation,
            "should classify punctuation"
        );

        assert_eq!(classify('a'), Kind::Other, "should classify other");
    }

    #[test]
    fn test_classify_opt() {
        assert_eq!(
            classify_opt(None),
            Kind::Whitespace,
            "should classify eof as whitespace"
        );

        assert_eq!(
            classify_opt(Some('a')),
            Kind::Other,
            "should classify a char like `classify`"
        );
    }

    #[test]
    fn test_format_opt() {
        assert_eq!(
            format_opt(None),
            "end of file".to_string(),
            "should format an optional char: none -> eof"
        );

        assert_eq!(
            format_opt(Some('!')),
            "character `!` (U+0021)".to_string(),
            "should format an optional char: char -> pretty"
        );
    }

    #[test]
    fn test_format_byte_opt() {
        assert_eq!(
            format_byte_opt(None),
            "end of file".to_string(),
            "should format an optional byte: none -> eof"
        );

        assert_eq!(
            format_byte_opt(Some(b'!')),
            "byte `!` (U+0021)".to_string(),
            "should format an optional byte: char -> pretty"
        );
    }

    #[test]
    fn test_format() {
        assert_eq!(
            format('`'),
            "`` ` `` (U+0060)".to_string(),
            "should format a char: grave accent"
        );

        assert_eq!(
            format('!'),
            "`!` (U+0021)".to_string(),
            "should format a char: regular"
        );

        assert_eq!(
            format(' '),
            "U+0020".to_string(),
            "should format a char: unprintable"
        );
    }

    #[test]
    fn test_format_byte() {
        assert_eq!(
            format_byte(b'`'),
            "`` ` `` (U+0060)".to_string(),
            "should format a byte: grave accent"
        );

        assert_eq!(
            format_byte(b'!'),
            "`!` (U+0021)".to_string(),
            "should format a byte: regular"
        );

        assert_eq!(
            format_byte(b' '),
            "U+0020".to_string(),
            "should format a byte: unprintable"
        );
    }
}
//! Deal with bytes, chars, and kinds.

use crate::util::unicode::PUNCTUATION;
use alloc::{format, string::String};
use core::str;

/// Character kinds.
///
/// Tells whitespace, punctuation, and everything else apart.
/// The enum carries no data, so it is `Copy` and can be passed around and
/// compared by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Kind {
    /// Whitespace.
    ///
    /// The caret past the final `.` marks the end of the input, which is
    /// classified as whitespace as well.
    ///
    /// ## Example
    ///
    /// ```markdown
    /// > | **a_b_ c**.
    ///    ^      ^    ^
    /// ```
    Whitespace,
    /// Punctuation.
    ///
    /// ## Example
    ///
    /// ```markdown
    /// > | **a_b_ c**.
    ///     ^^ ^ ^  ^^^
    /// ```
    Punctuation,
    /// Everything else.
    ///
    /// ## Example
    ///
    /// ```markdown
    /// > | **a_b_ c**.
    ///       ^ ^  ^
    /// ```
    Other,
}

/// Get the [`char`][] that ends right before `index` in bytes (`&[u8]`).
///
/// Markdown mostly operates on ASCII bytes, but a few constructs are unicode
/// aware and need an actual `char`.
/// This looks at no more than the 4 bytes in front of `index`, decodes them
/// lossily, and yields the last decoded `char`.
///
/// Returns `None` when `index` is `0`, as there is nothing before it.
///
/// ## Panics
///
/// Panics when `index` is greater than `bytes.len()`.
pub fn before_index(bytes: &[u8], index: usize) -> Option<char> {
    // A window of at most 4 bytes that ends at `index`.
    let window = &bytes[index.saturating_sub(4)..index];
    String::from_utf8_lossy(window).chars().next_back()
}

/// Get the [`char`][] that starts right at `index` in bytes (`&[u8]`).
///
/// Markdown mostly operates on ASCII bytes, but a few constructs are unicode
/// aware and need an actual `char`.
/// This looks at no more than the 4 bytes starting at `index`, decodes them
/// lossily, and yields the first decoded `char`.
///
/// Returns `None` when `index` is `bytes.len()`, as there is nothing after
/// it.
///
/// ## Panics
///
/// Panics when `index` is greater than `bytes.len()`.
pub fn after_index(bytes: &[u8], index: usize) -> Option<char> {
    // A window of at most 4 bytes that starts at `index`, clamped to the
    // end of the input.
    let end = usize::min(index + 4, bytes.len());
    String::from_utf8_lossy(&bytes[index..end]).chars().next()
}

/// Classify the char that starts at `index` in bytes (`&[u8]`).
///
/// The end of the input (`index == bytes.len()`) counts as whitespace.
/// ASCII whitespace, punctuation, and alphanumerics are classified from the
/// byte alone; for any other byte the actual `char` is decoded with
/// [`after_index`] and classified with [`classify_opt`].
///
/// ## Panics
///
/// Panics when `index` is greater than `bytes.len()`.
pub fn kind_after_index(bytes: &[u8], index: usize) -> Kind {
    // Eof.
    if index == bytes.len() {
        return Kind::Whitespace;
    }

    match bytes[index] {
        byte if byte.is_ascii_whitespace() => Kind::Whitespace,
        byte if byte.is_ascii_punctuation() => Kind::Punctuation,
        byte if byte.is_ascii_alphanumeric() => Kind::Other,
        // Anything else is either an ASCII control or part of a non-ASCII
        // `char`: decode the actual `char` and classify that.
        _ => classify_opt(after_index(bytes, index)),
    }
}

/// Classify a `char` as whitespace, punctuation, or something else.
///
/// Attention (emphasis, strong) needs this: whether a sequence can open or
/// close depends on the class of the characters around it.
///
/// Whitespace is checked first (with [`char::is_whitespace`]), then
/// membership in `PUNCTUATION`; every other `char` is [`Kind::Other`].
///
/// ## References
///
/// *   [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
pub fn classify(char: char) -> Kind {
    match char {
        // Unicode whitespace.
        value if value.is_whitespace() => Kind::Whitespace,
        // Unicode punctuation.
        value if PUNCTUATION.contains(&value) => Kind::Punctuation,
        // Everything else.
        _ => Kind::Other,
    }
}

/// Like [`classify`], but supports eof as whitespace.
pub fn classify_opt(char_opt: Option<char>) -> Kind {
    char_opt.map_or(Kind::Whitespace, classify)
}

/// Format an optional `char` (`none` means eof).
///
/// Yields `end of file` for `None`, and `character ` followed by the result
/// of [`format`] for `Some`.
pub fn format_opt(char: Option<char>) -> String {
    // Build the eof message lazily, so that no string is allocated for it
    // when there is a char to format.
    char.map_or_else(
        || "end of file".into(),
        |char| format!("character {}", format(char)),
    )
}

/// Format an optional `byte` (`none` means eof).
///
/// Yields `end of file` for `None`, and `byte ` followed by the result of
/// [`format_byte`] for `Some`.
pub fn format_byte_opt(byte: Option<u8>) -> String {
    // Build the eof message lazily, so that no string is allocated for it
    // when there is a byte to format.
    byte.map_or_else(
        || "end of file".into(),
        |byte| format!("byte {}", format_byte(byte)),
    )
}

/// Format a `char`.
///
/// Every `char` is shown as its code point (`U+` followed by at least four
/// uppercase hexadecimal digits).
/// Chars from `!` through `~` are additionally shown literally, wrapped in
/// backticks, in front of the code point; a grave accent gets a wider
/// backtick fence so that it remains readable.
pub fn format(char: char) -> String {
    let code_point = format!("U+{:>04X}", char as u32);

    let literal = if char == '`' {
        // A grave accent cannot sit directly between single backticks.
        String::from("`` ` ``")
    } else if ('!'..='~').contains(&char) {
        format!("`{}`", char)
    } else {
        // Nothing printable to show: the code point alone has to do.
        return code_point;
    };

    format!("{} ({})", literal, code_point)
}

/// Format a byte (`u8`).
///
/// Every byte is shown as `U+` followed by at least four uppercase
/// hexadecimal digits.
/// Bytes from `!` through `~` are additionally shown literally, wrapped in
/// backticks, in front of that; a grave accent gets a wider backtick fence
/// so that it remains readable.
pub fn format_byte(byte: u8) -> String {
    let code_point = format!("U+{:>04X}", byte);

    let literal = if byte == b'`' {
        // A grave accent cannot sit directly between single backticks.
        String::from("`` ` ``")
    } else if (b'!'..=b'~').contains(&byte) {
        // The byte is in the printable ASCII range here, so it is valid
        // UTF-8 on its own and the `unwrap` cannot fail.
        format!("`{}`", str::from_utf8(&[byte]).unwrap())
    } else {
        // Nothing printable to show: the number alone has to do.
        return code_point;
    };

    format!("{} ({})", literal, code_point)
}

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::string::ToString;

    #[test]
    fn test_before_index() {
        assert_eq!(
            before_index(b"abc", 0),
            None,
            "should yield nothing before the start"
        );

        assert_eq!(
            before_index(b"abc", 1),
            Some('a'),
            "should yield an ASCII char before an index"
        );

        assert_eq!(
            before_index("a€b".as_bytes(), 4),
            Some('€'),
            "should yield a multibyte char before an index"
        );
    }

    #[test]
    fn test_after_index() {
        assert_eq!(
            after_index(b"abc", 3),
            None,
            "should yield nothing at the end"
        );

        assert_eq!(
            after_index(b"abc", 0),
            Some('a'),
            "should yield an ASCII char at an index"
        );

        assert_eq!(
            after_index("a€b".as_bytes(), 1),
            Some('€'),
            "should yield a multibyte char at an index"
        );
    }

    #[test]
    fn test_kind_after_index() {
        assert_eq!(
            kind_after_index(b"a", 1),
            Kind::Whitespace,
            "should classify eof as whitespace"
        );

        assert_eq!(
            kind_after_index(b"a b", 1),
            Kind::Whitespace,
            "should classify ASCII whitespace"
        );

        assert_eq!(
            kind_after_index(b"a*", 1),
            Kind::Punctuation,
            "should classify ASCII punctuation"
        );

        assert_eq!(
            kind_after_index(b"a", 0),
            Kind::Other,
            "should classify ASCII alphanumerics as other"
        );

        assert_eq!(
            kind_after_index("\u{A0}".as_bytes(), 0),
            Kind::Whitespace,
            "should classify non-ASCII whitespace"
        );
    }

    #[test]
    fn test_classify() {
        assert_eq!(
            classify(' '),
            Kind::Whitespace,
            "should classify whitespace"
        );

        assert_eq!(
            classify('.'),
            Kind::Punctuation,
            "should classify punctuation"
        );

        assert_eq!(classify('a'), Kind::Other, "should classify other");
    }

    #[test]
    fn test_classify_opt() {
        assert_eq!(
            classify_opt(None),
            Kind::Whitespace,
            "should classify eof as whitespace"
        );

        assert_eq!(
            classify_opt(Some('a')),
            Kind::Other,
            "should classify a char like `classify`"
        );
    }

    #[test]
    fn test_format_opt() {
        assert_eq!(
            format_opt(None),
            "end of file".to_string(),
            "should format an optional char: none -> eof"
        );

        assert_eq!(
            format_opt(Some('!')),
            "character `!` (U+0021)".to_string(),
            "should format an optional char: char -> pretty"
        );
    }

    #[test]
    fn test_format_byte_opt() {
        assert_eq!(
            format_byte_opt(None),
            "end of file".to_string(),
            "should format an optional byte: none -> eof"
        );

        assert_eq!(
            format_byte_opt(Some(b'!')),
            "byte `!` (U+0021)".to_string(),
            "should format an optional byte: char -> pretty"
        );
    }

    #[test]
    fn test_format() {
        assert_eq!(
            format('`'),
            "`` ` `` (U+0060)".to_string(),
            "should format a char: grave accent"
        );

        assert_eq!(
            format('!'),
            "`!` (U+0021)".to_string(),
            "should format a char: regular"
        );

        assert_eq!(
            format(' '),
            "U+0020".to_string(),
            "should format a char: unprintable"
        );
    }

    #[test]
    fn test_format_byte() {
        assert_eq!(
            format_byte(b'`'),
            "`` ` `` (U+0060)".to_string(),
            "should format a byte: grave accent"
        );

        assert_eq!(
            format_byte(b'!'),
            "`!` (U+0021)".to_string(),
            "should format a byte: regular"
        );

        assert_eq!(
            format_byte(b' '),
            "U+0020".to_string(),
            "should format a byte: unprintable"
        );
    }
}