From 16de10fe2395002644d685fdfcf76823346d1cc4 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 5 Sep 2022 12:00:33 +0200 Subject: Add support for getting `char`s from bytes --- src/construct/attention.rs | 31 ++++++++++--------------------- src/util/slice.rs | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 526f58c..947a79b 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -81,8 +81,8 @@ use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use crate::util::classify_character::{classify_opt, Kind as CharacterKind}; -use crate::util::slice::Slice; -use alloc::{string::String, vec, vec::Vec}; +use crate::util::slice::{char_after_index, char_before_index, Slice}; +use alloc::{vec, vec::Vec}; /// Attentention sequence that we can take markers from. #[derive(Debug)] @@ -234,28 +234,17 @@ fn get_sequences(tokenizer: &mut Tokenizer) -> Vec { let end = index + 1; let exit = &tokenizer.events[end]; - let before_end = enter.point.index; - let before_start = if before_end < 4 { 0 } else { before_end - 4 }; - let after_start = exit.point.index; - let after_end = if after_start + 4 > tokenizer.parse_state.bytes.len() { - tokenizer.parse_state.bytes.len() - } else { - after_start + 4 - }; - let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) .head() .unwrap(); - let before = classify_opt( - String::from_utf8_lossy(&tokenizer.parse_state.bytes[before_start..before_end]) - .chars() - .last(), - ); - let after = classify_opt( - String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]) - .chars() - .next(), - ); + let before = classify_opt(char_before_index( + tokenizer.parse_state.bytes, + enter.point.index, + )); + let after = classify_opt(char_after_index( + tokenizer.parse_state.bytes, + exit.point.index, + )); let open = after == 
CharacterKind::Other || (after == CharacterKind::Punctuation && before != CharacterKind::Other); let close = before == CharacterKind::Other diff --git a/src/util/slice.rs b/src/util/slice.rs index 0734d78..d02a526 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -5,6 +5,28 @@ use crate::util::constant::TAB_SIZE; use alloc::string::String; use core::str; +/// Get a [`char`][] right before `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn char_before_index(bytes: &[u8], index: usize) -> Option<char> { + let start = if index < 4 { 0 } else { index - 4 }; + String::from_utf8_lossy(&bytes[start..index]).chars().last() +} + +/// Get a [`char`][] right at `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn char_after_index(bytes: &[u8], index: usize) -> Option<char> { + let end = if index + 4 > bytes.len() { + bytes.len() + } else { + index + 4 + }; + String::from_utf8_lossy(&bytes[index..end]).chars().next() +} + /// A range between two points. #[derive(Debug)] pub struct Position<'a> { -- cgit