From 13337d77954b4c92d1cf4592f43f01d94fce3c77 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 9 Sep 2022 10:54:13 +0200 Subject: Refactor to move byte, char info to own file --- src/construct/attention.rs | 9 +- src/construct/gfm_autolink_literal.rs | 10 +- src/construct/partial_mdx_jsx.rs | 79 +++++--------- src/construct/partial_space_or_tab_eol.rs | 36 ++++--- src/tokenizer.rs | 20 +--- src/util/char.rs | 165 ++++++++++++++++++++++++++++++ src/util/classify_character.rs | 72 ------------- src/util/mod.rs | 2 +- src/util/slice.rs | 47 +-------- 9 files changed, 225 insertions(+), 215 deletions(-) create mode 100644 src/util/char.rs delete mode 100644 src/util/classify_character.rs (limited to 'src') diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 947a79b..4a208df 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -80,8 +80,13 @@ use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::util::classify_character::{classify_opt, Kind as CharacterKind}; -use crate::util::slice::{char_after_index, char_before_index, Slice}; +use crate::util::{ + char::{ + after_index as char_after_index, before_index as char_before_index, classify_opt, + Kind as CharacterKind, + }, + slice::Slice, +}; use alloc::{vec, vec::Vec}; /// Attentention sequence that we can take markers from. 
diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index ae483a7..c25f04c 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -148,8 +148,8 @@ use crate::event::{Event, Kind, Name}; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use crate::util::{ - classify_character::Kind as CharacterKind, - slice::{byte_to_kind, Position, Slice}, + char::{kind_after_index, Kind as CharacterKind}, + slice::{Position, Slice}, }; use alloc::vec::Vec; @@ -366,7 +366,7 @@ pub fn domain_inside(tokenizer: &mut Tokenizer) -> State { } _ => { // Source: . - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Other { tokenizer.tokenize_state.seen = true; @@ -470,7 +470,7 @@ pub fn path_inside(tokenizer: &mut Tokenizer) -> State { } _ => { // Source: . - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace { State::Retry(StateName::GfmAutolinkLiteralPathAfter) @@ -543,7 +543,7 @@ pub fn trail(tokenizer: &mut Tokenizer) -> State { } _ => { // Whitespace is the end of the URL, anything else is continuation. 
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace { State::Ok diff --git a/src/construct/partial_mdx_jsx.rs b/src/construct/partial_mdx_jsx.rs index 1a51608..2daa448 100644 --- a/src/construct/partial_mdx_jsx.rs +++ b/src/construct/partial_mdx_jsx.rs @@ -164,14 +164,11 @@ use crate::event::Name; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::util::{ - classify_character::Kind as CharacterKind, - slice::{byte_to_kind, char_after_index}, -}; -use alloc::{ - format, - string::{String, ToString}, +use crate::util::char::{ + after_index as char_after_index, format_byte, format_opt as format_char_opt, kind_after_index, + Kind as CharacterKind, }; +use alloc::format; use core::str; use unicode_id::UnicodeID; @@ -305,7 +302,8 @@ pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn primary_name(tokenizer: &mut Tokenizer) -> State { // End of name. - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace || matches!(tokenizer.current, Some(b'.' | b'/' | b':' | b'>' | b'{')) { tokenizer.exit(Name::MdxJsxTagNamePrimary); @@ -418,7 +416,8 @@ pub fn member_name_before(tokenizer: &mut Tokenizer) -> State { pub fn member_name(tokenizer: &mut Tokenizer) -> State { // End of name. // Note: no `:` allowed here. - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace || matches!(tokenizer.current, Some(b'.' 
| b'/' | b'>' | b'{')) { tokenizer.exit(Name::MdxJsxTagNameMember); @@ -529,7 +528,8 @@ pub fn local_name_before(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn local_name(tokenizer: &mut Tokenizer) -> State { // End of local name (note that we don’t expect another colon, or a member). - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace || matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) { tokenizer.exit(Name::MdxJsxTagNameLocal); @@ -645,7 +645,8 @@ pub fn attribute_before(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State { // End of attribute name or tag. - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace || matches!(tokenizer.current, Some(b'/' | b':' | b'=' | b'>' | b'{')) { tokenizer.exit(Name::MdxJsxTagAttributePrimaryName); @@ -711,7 +712,7 @@ pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State { } _ => { // End of tag / new attribute. - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace || matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(char_after_index( @@ -768,7 +769,8 @@ pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State { // End of local name (note that we don’t expect another colon). 
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace || matches!(tokenizer.current, Some(b'/' | b'=' | b'>' | b'{')) { tokenizer.exit(Name::MdxJsxTagAttributeNameLocal); @@ -986,7 +988,7 @@ pub fn es_whitespace_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => State::Retry(StateName::MdxJsxEsWhitespaceEol), _ => { - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace { tokenizer.enter(Name::MdxJsxEsWhitespace); @@ -1016,7 +1018,7 @@ pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State { State::Next(StateName::MdxJsxEsWhitespaceInside) } _ => { - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace { tokenizer.consume(); @@ -1044,7 +1046,7 @@ pub fn es_whitespace_eol(tokenizer: &mut Tokenizer) -> State { pub fn es_whitespace_eol_after(tokenizer: &mut Tokenizer) -> State { if tokenizer.tokenize_state.token_1 == Name::MdxJsxFlowTag && tokenizer.lazy { crash_lazy(tokenizer) - } else if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + } else if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace { tokenizer.enter(Name::MdxJsxEsWhitespace); @@ -1064,7 +1066,7 @@ pub fn es_whitespace_eol_after_inside(tokenizer: &mut Tokenizer) -> State { State::Next(StateName::MdxJsxEsWhitespaceEolAfterInside) } _ => { - if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace { tokenizer.consume(); @@ -1107,45 +1109,12 @@ fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> State { 
char_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) }; - // To do: externalize this, and the print mechanism in the tokenizer, - // to one proper formatter. - let actual = match char { - None => "end of file".to_string(), - Some(char) => format!("character {}", format_char(char)), - }; - State::Error(format!( "{}:{}: Unexpected {} {}, expected {}", - tokenizer.point.line, tokenizer.point.column, actual, at, expect + tokenizer.point.line, + tokenizer.point.column, + format_char_opt(char), + at, + expect )) } - -fn format_char(char: char) -> String { - let unicode = format!("U+{:>04X}", char as u32); - let printable = match char { - '`' => Some("`` ` ``".to_string()), - ' '..='~' => Some(format!("`{}`", char)), - _ => None, - }; - - if let Some(char) = printable { - format!("{} ({})", char, unicode) - } else { - unicode - } -} - -fn format_byte(byte: u8) -> String { - let unicode = format!("U+{:>04X}", byte); - let printable = match byte { - b'`' => Some("`` ` ``".to_string()), - b' '..=b'~' => Some(format!("`{}`", str::from_utf8(&[byte]).unwrap())), - _ => None, - }; - - if let Some(char) = printable { - format!("{} ({})", char, unicode) - } else { - unicode - } -} diff --git a/src/construct/partial_space_or_tab_eol.rs b/src/construct/partial_space_or_tab_eol.rs index 01f440e..1247639 100644 --- a/src/construct/partial_space_or_tab_eol.rs +++ b/src/construct/partial_space_or_tab_eol.rs @@ -64,24 +64,26 @@ pub fn space_or_tab_eol_with_options(tokenizer: &mut Tokenizer, options: Options /// | ␠␠b /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if matches!(tokenizer.current, Some(b'\t' | b'\n' | b' ')) { - tokenizer.attempt( - State::Next(StateName::SpaceOrTabEolAfterFirst), - State::Next(StateName::SpaceOrTabEolAtEol), - ); + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.attempt( + State::Next(StateName::SpaceOrTabEolAfterFirst), + State::Next(StateName::SpaceOrTabEolAtEol), + ); - State::Retry(space_or_tab_with_options( 
- tokenizer, - SpaceOrTabOptions { - kind: Name::SpaceOrTab, - min: 1, - max: usize::MAX, - content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(), - connect: tokenizer.tokenize_state.space_or_tab_eol_connect, - }, - )) - } else { - State::Nok + State::Retry(space_or_tab_with_options( + tokenizer, + SpaceOrTabOptions { + kind: Name::SpaceOrTab, + min: 1, + max: usize::MAX, + content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(), + connect: tokenizer.tokenize_state.space_or_tab_eol_connect, + }, + )) + } + Some(b'\n') => State::Retry(StateName::SpaceOrTabEolAtEol), + _ => State::Nok, } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 04523b3..aca8ec2 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -12,15 +12,8 @@ use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS}; use crate::parser::ParseState; use crate::resolve::{call as call_resolve, Name as ResolveName}; use crate::state::{call, State}; -use crate::util::{constant::TAB_SIZE, edit_map::EditMap}; -use alloc::{ - boxed::Box, - format, - string::{String, ToString}, - vec, - vec::Vec, -}; -use core::str; +use crate::util::{char::format_byte_opt, constant::TAB_SIZE, edit_map::EditMap}; +use alloc::{boxed::Box, string::String, vec, vec::Vec}; /// Containers. /// @@ -725,14 +718,7 @@ fn push_impl( None }; - let visible = byte.map(|d| { - if (b' '..=b'~').contains(&d) { - str::from_utf8(&[d]).unwrap().to_string() - } else { - format!("0x{:x}", d) - } - }); - log::debug!("feed: `{:?}` to {:?}", visible, name); + log::debug!("feed: {} to {:?}", format_byte_opt(byte), name); tokenizer.expect(byte); state = call(tokenizer, name); }; diff --git a/src/util/char.rs b/src/util/char.rs new file mode 100644 index 0000000..cfaacd5 --- /dev/null +++ b/src/util/char.rs @@ -0,0 +1,165 @@ +//! Deal with byte and chars and kinds. + +use crate::util::unicode::PUNCTUATION; +use alloc::{ + format, + string::{String, ToString}, +}; +use core::str; + +/// Character kinds. 
+#[derive(Debug, PartialEq, Eq)] +pub enum Kind { + /// Whitespace. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Whitespace, + /// Punctuation. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^^ ^ ^ ^ + /// ``` + Punctuation, + /// Everything else. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Other, +} + +/// Get a [`char`][] right before `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn before_index(bytes: &[u8], index: usize) -> Option<char> { + let start = if index < 4 { 0 } else { index - 4 }; + String::from_utf8_lossy(&bytes[start..index]).chars().last() +} + +/// Get a [`char`][] right at `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn after_index(bytes: &[u8], index: usize) -> Option<char> { + let end = if index + 4 > bytes.len() { + bytes.len() + } else { + index + 4 + }; + String::from_utf8_lossy(&bytes[index..end]).chars().next() +} + +/// Classify a char at `index` in bytes (`&[u8]`). +pub fn kind_after_index(bytes: &[u8], index: usize) -> Kind { + if index == bytes.len() { + Kind::Whitespace + } else { + let byte = bytes[index]; + if byte.is_ascii_whitespace() { + Kind::Whitespace + } else if byte.is_ascii_punctuation() { + Kind::Punctuation + } else if byte.is_ascii_alphanumeric() { + Kind::Other + } else { + // Otherwise: seems to be an ASCII control, so it seems to be a + // non-ASCII `char`. + classify_opt(after_index(bytes, index)) + } + } +} + +/// Classify whether a `char` represents whitespace, punctuation, or something +/// else. +/// +/// Used for attention (emphasis, strong), whose sequences can open or close +/// based on the class of surrounding characters. 
+/// +/// ## References +/// +/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) +pub fn classify(char: char) -> Kind { + // Unicode whitespace. + if char.is_whitespace() { + Kind::Whitespace + } + // Unicode punctuation. + else if PUNCTUATION.contains(&char) { + Kind::Punctuation + } + // Everything else. + else { + Kind::Other + } +} + +/// Like [`classify`], but supports eof as whitespace. +pub fn classify_opt(char_opt: Option<char>) -> Kind { + if let Some(char) = char_opt { + classify(char) + } + // EOF. + else { + Kind::Whitespace + } +} + +/// Format an optional `char` (`none` means eof). +pub fn format_opt(char: Option<char>) -> String { + match char { + None => "end of file".to_string(), + Some(char) => format!("character {}", format(char)), + } +} + +/// Format an optional `byte` (`none` means eof). +pub fn format_byte_opt(byte: Option<u8>) -> String { + match byte { + None => "end of file".to_string(), + Some(byte) => format!("byte {}", format_byte(byte)), + } +} + +/// Format a `char`. +pub fn format(char: char) -> String { + let representation = format!("U+{:>04X}", char as u32); + let printable = match char { + '`' => Some("`` ` ``".to_string()), + '!'..='~' => Some(format!("`{}`", char)), + _ => None, + }; + + if let Some(char) = printable { + format!("{} ({})", char, representation) + } else { + representation + } +} + +/// Format a byte (`u8`). 
+pub fn format_byte(byte: u8) -> String { + let representation = format!("U+{:>04X}", byte); + let printable = match byte { + b'`' => Some("`` ` ``".to_string()), + b'!'..=b'~' => Some(format!("`{}`", str::from_utf8(&[byte]).unwrap())), + _ => None, + }; + + if let Some(char) = printable { + format!("{} ({})", char, representation) + } else { + representation + } +} diff --git a/src/util/classify_character.rs b/src/util/classify_character.rs deleted file mode 100644 index 79ed46a..0000000 --- a/src/util/classify_character.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! Utilities to classify characters as whitespace, punctuation, or rest. - -use crate::util::unicode::PUNCTUATION; - -/// Character kinds. -#[derive(Debug, PartialEq, Eq)] -pub enum Kind { - /// Whitespace. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Whitespace, - /// Punctuation. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^^ ^ ^ ^ - /// ``` - Punctuation, - /// Everything else. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Other, -} - -/// Classify whether a character code represents whitespace, punctuation, or -/// something else. -/// -/// Used for attention (emphasis, strong), whose sequences can open or close -/// based on the class of surrounding characters. -/// -/// > 👉 **Note** that eof (`None`) is seen as whitespace. -/// -/// ## References -/// -/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) -pub fn classify(char: char) -> Kind { - // Unicode whitespace. - if char.is_whitespace() { - Kind::Whitespace - } - // Unicode punctuation. - else if PUNCTUATION.contains(&char) { - Kind::Punctuation - } - // Everything else. - else { - Kind::Other - } -} - -/// Like [`classify`], but supports eof as whitespace. 
-pub fn classify_opt(char_opt: Option<char>) -> Kind { - if let Some(char) = char_opt { - classify(char) - } - // EOF. - else { - Kind::Whitespace - } -} diff --git a/src/util/mod.rs b/src/util/mod.rs index e5823cf..2ea372c 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,6 +1,6 @@ //! Utilities used when processing markdown. -pub mod classify_character; +pub mod char; pub mod constant; pub mod decode_character_reference; pub mod edit_map; diff --git a/src/util/slice.rs b/src/util/slice.rs index 54524c3..0734d78 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -1,55 +1,10 @@ //! Deal with bytes. use crate::event::{Event, Kind, Point}; -use crate::util::{ - classify_character::{classify_opt, Kind as CharacterKind}, - constant::TAB_SIZE, -}; +use crate::util::constant::TAB_SIZE; use alloc::string::String; use core::str; -/// Get a [`char`][] right before `index` in bytes (`&[u8]`). -/// -/// In most cases, markdown operates on ASCII bytes. -/// In a few cases, it is unicode aware, so we need to find an actual char. -pub fn char_before_index(bytes: &[u8], index: usize) -> Option<char> { - let start = if index < 4 { 0 } else { index - 4 }; - String::from_utf8_lossy(&bytes[start..index]).chars().last() -} - -/// Get a [`char`][] right at `index` in bytes (`&[u8]`). -/// -/// In most cases, markdown operates on ASCII bytes. -/// In a few cases, it is unicode aware, so we need to find an actual char. -pub fn char_after_index(bytes: &[u8], index: usize) -> Option<char> { - let end = if index + 4 > bytes.len() { - bytes.len() - } else { - index + 4 - }; - String::from_utf8_lossy(&bytes[index..end]).chars().next() -} - -/// Classify a byte (or `char`). 
-pub fn byte_to_kind(bytes: &[u8], index: usize) -> CharacterKind { - if index == bytes.len() { - CharacterKind::Whitespace - } else { - let byte = bytes[index]; - if byte.is_ascii_whitespace() { - CharacterKind::Whitespace - } else if byte.is_ascii_punctuation() { - CharacterKind::Punctuation - } else if byte.is_ascii_alphanumeric() { - CharacterKind::Other - } else { - // Otherwise: seems to be an ASCII control, so it seems to be a - // non-ASCII `char`. - classify_opt(char_after_index(bytes, index)) - } - } -} - /// A range between two points. #[derive(Debug)] pub struct Position<'a> { -- cgit