aboutsummaryrefslogtreecommitdiffstats
path: root/src/util
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/util/char.rs 165
-rw-r--r-- src/util/classify_character.rs 72
-rw-r--r-- src/util/mod.rs 2
-rw-r--r-- src/util/slice.rs 47
4 files changed, 167 insertions, 119 deletions
diff --git a/src/util/char.rs b/src/util/char.rs
new file mode 100644
index 0000000..cfaacd5
--- /dev/null
+++ b/src/util/char.rs
@@ -0,0 +1,165 @@
+//! Deal with bytes, chars, and character kinds.
+
+use crate::util::unicode::PUNCTUATION;
+use alloc::{
+ format,
+ string::{String, ToString},
+};
+use core::str;
+
+/// Character kinds.
+#[derive(Debug, PartialEq, Eq)]
+pub enum Kind {
+ /// Whitespace.
+ ///
+ /// ## Example
+ ///
+ /// ```markdown
+ /// > | **a_b_ c**.
+ /// ^ ^ ^
+ /// ```
+ Whitespace,
+ /// Punctuation.
+ ///
+ /// ## Example
+ ///
+ /// ```markdown
+ /// > | **a_b_ c**.
+ /// ^^ ^ ^ ^
+ /// ```
+ Punctuation,
+ /// Everything else.
+ ///
+ /// ## Example
+ ///
+ /// ```markdown
+ /// > | **a_b_ c**.
+ /// ^ ^ ^
+ /// ```
+ Other,
+}
+
+/// Get a [`char`][] right before `index` in bytes (`&[u8]`).
+///
+/// In most cases, markdown operates on ASCII bytes.
+/// In a few cases, it is unicode aware, so we need to find an actual char.
+pub fn before_index(bytes: &[u8], index: usize) -> Option<char> {
+ let start = if index < 4 { 0 } else { index - 4 };
+ String::from_utf8_lossy(&bytes[start..index]).chars().last()
+}
+
+/// Get a [`char`][] right at `index` in bytes (`&[u8]`).
+///
+/// In most cases, markdown operates on ASCII bytes.
+/// In a few cases, it is unicode aware, so we need to find an actual char.
+pub fn after_index(bytes: &[u8], index: usize) -> Option<char> {
+ let end = if index + 4 > bytes.len() {
+ bytes.len()
+ } else {
+ index + 4
+ };
+ String::from_utf8_lossy(&bytes[index..end]).chars().next()
+}
+
+/// Classify a char at `index` in bytes (`&[u8]`).
+pub fn kind_after_index(bytes: &[u8], index: usize) -> Kind {
+ if index == bytes.len() {
+ Kind::Whitespace
+ } else {
+ let byte = bytes[index];
+ if byte.is_ascii_whitespace() {
+ Kind::Whitespace
+ } else if byte.is_ascii_punctuation() {
+ Kind::Punctuation
+ } else if byte.is_ascii_alphanumeric() {
+ Kind::Other
+ } else {
+ // Otherwise: an ASCII control, or a byte of a non-ASCII `char`, so
+ // decode the actual `char` at this index and classify that.
+ classify_opt(after_index(bytes, index))
+ }
+ }
+}
+
+/// Classify whether a `char` represents whitespace, punctuation, or something
+/// else.
+///
+/// Used for attention (emphasis, strong), whose sequences can open or close
+/// based on the class of surrounding characters.
+///
+/// ## References
+///
+/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
+pub fn classify(char: char) -> Kind {
+ // Unicode whitespace.
+ if char.is_whitespace() {
+ Kind::Whitespace
+ }
+ // Unicode punctuation.
+ else if PUNCTUATION.contains(&char) {
+ Kind::Punctuation
+ }
+ // Everything else.
+ else {
+ Kind::Other
+ }
+}
+
+/// Like [`classify`], but treats eof (`None`) as whitespace.
+pub fn classify_opt(char_opt: Option<char>) -> Kind {
+ if let Some(char) = char_opt {
+ classify(char)
+ }
+ // EOF.
+ else {
+ Kind::Whitespace
+ }
+}
+
+/// Format an optional `char` (`None` means eof).
+pub fn format_opt(char: Option<char>) -> String {
+ match char {
+ None => "end of file".to_string(),
+ Some(char) => format!("character {}", format(char)),
+ }
+}
+
+/// Format an optional byte (`None` means eof).
+pub fn format_byte_opt(byte: Option<u8>) -> String {
+ match byte {
+ None => "end of file".to_string(),
+ Some(byte) => format!("byte {}", format_byte(byte)),
+ }
+}
+
+/// Format a `char`.
+pub fn format(char: char) -> String {
+ let representation = format!("U+{:>04X}", char as u32);
+ let printable = match char {
+ '`' => Some("`` ` ``".to_string()),
+ '!'..='~' => Some(format!("`{}`", char)),
+ _ => None,
+ };
+
+ if let Some(char) = printable {
+ format!("{} ({})", char, representation)
+ } else {
+ representation
+ }
+}
+
+/// Format a byte (`u8`).
+pub fn format_byte(byte: u8) -> String {
+ let representation = format!("U+{:>04X}", byte);
+ let printable = match byte {
+ b'`' => Some("`` ` ``".to_string()),
+ b'!'..=b'~' => Some(format!("`{}`", str::from_utf8(&[byte]).unwrap())),
+ _ => None,
+ };
+
+ if let Some(char) = printable {
+ format!("{} ({})", char, representation)
+ } else {
+ representation
+ }
+}
diff --git a/src/util/classify_character.rs b/src/util/classify_character.rs
deleted file mode 100644
index 79ed46a..0000000
--- a/src/util/classify_character.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-//! Utilities to classify characters as whitespace, punctuation, or rest.
-
-use crate::util::unicode::PUNCTUATION;
-
-/// Character kinds.
-#[derive(Debug, PartialEq, Eq)]
-pub enum Kind {
- /// Whitespace.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// > | **a_b_ c**.
- /// ^ ^ ^
- /// ```
- Whitespace,
- /// Punctuation.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// > | **a_b_ c**.
- /// ^^ ^ ^ ^
- /// ```
- Punctuation,
- /// Everything else.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// > | **a_b_ c**.
- /// ^ ^ ^
- /// ```
- Other,
-}
-
-/// Classify whether a character code represents whitespace, punctuation, or
-/// something else.
-///
-/// Used for attention (emphasis, strong), whose sequences can open or close
-/// based on the class of surrounding characters.
-///
-/// > 👉 **Note** that eof (`None`) is seen as whitespace.
-///
-/// ## References
-///
-/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
-pub fn classify(char: char) -> Kind {
- // Unicode whitespace.
- if char.is_whitespace() {
- Kind::Whitespace
- }
- // Unicode punctuation.
- else if PUNCTUATION.contains(&char) {
- Kind::Punctuation
- }
- // Everything else.
- else {
- Kind::Other
- }
-}
-
-/// Like [`classify`], but supports eof as whitespace.
-pub fn classify_opt(char_opt: Option<char>) -> Kind {
- if let Some(char) = char_opt {
- classify(char)
- }
- // EOF.
- else {
- Kind::Whitespace
- }
-}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index e5823cf..2ea372c 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,6 +1,6 @@
//! Utilities used when processing markdown.
-pub mod classify_character;
+pub mod char;
pub mod constant;
pub mod decode_character_reference;
pub mod edit_map;
diff --git a/src/util/slice.rs b/src/util/slice.rs
index 54524c3..0734d78 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -1,55 +1,10 @@
//! Deal with bytes.
use crate::event::{Event, Kind, Point};
-use crate::util::{
- classify_character::{classify_opt, Kind as CharacterKind},
- constant::TAB_SIZE,
-};
+use crate::util::constant::TAB_SIZE;
use alloc::string::String;
use core::str;
-/// Get a [`char`][] right before `index` in bytes (`&[u8]`).
-///
-/// In most cases, markdown operates on ASCII bytes.
-/// In a few cases, it is unicode aware, so we need to find an actual char.
-pub fn char_before_index(bytes: &[u8], index: usize) -> Option<char> {
- let start = if index < 4 { 0 } else { index - 4 };
- String::from_utf8_lossy(&bytes[start..index]).chars().last()
-}
-
-/// Get a [`char`][] right at `index` in bytes (`&[u8]`).
-///
-/// In most cases, markdown operates on ASCII bytes.
-/// In a few cases, it is unicode aware, so we need to find an actual char.
-pub fn char_after_index(bytes: &[u8], index: usize) -> Option<char> {
- let end = if index + 4 > bytes.len() {
- bytes.len()
- } else {
- index + 4
- };
- String::from_utf8_lossy(&bytes[index..end]).chars().next()
-}
-
-/// Classify a byte (or `char`).
-pub fn byte_to_kind(bytes: &[u8], index: usize) -> CharacterKind {
- if index == bytes.len() {
- CharacterKind::Whitespace
- } else {
- let byte = bytes[index];
- if byte.is_ascii_whitespace() {
- CharacterKind::Whitespace
- } else if byte.is_ascii_punctuation() {
- CharacterKind::Punctuation
- } else if byte.is_ascii_alphanumeric() {
- CharacterKind::Other
- } else {
- // Otherwise: seems to be an ASCII control, so it seems to be a
- // non-ASCII `char`.
- classify_opt(char_after_index(bytes, index))
- }
- }
-}
-
/// A range between two points.
#[derive(Debug)]
pub struct Position<'a> {