diff options
Diffstat (limited to '')
| -rw-r--r-- | src/construct/attention.rs | 9 | ||||
| -rw-r--r-- | src/construct/gfm_autolink_literal.rs | 10 | ||||
| -rw-r--r-- | src/construct/partial_mdx_jsx.rs | 79 | ||||
| -rw-r--r-- | src/construct/partial_space_or_tab_eol.rs | 36 | ||||
| -rw-r--r-- | src/tokenizer.rs | 20 | ||||
| -rw-r--r-- | src/util/char.rs | 165 | ||||
| -rw-r--r-- | src/util/classify_character.rs | 72 | ||||
| -rw-r--r-- | src/util/mod.rs | 2 | ||||
| -rw-r--r-- | src/util/slice.rs | 47 | 
9 files changed, 225 insertions, 215 deletions
| diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 947a79b..4a208df 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -80,8 +80,13 @@ use crate::event::{Event, Kind, Name, Point};  use crate::resolve::Name as ResolveName;  use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer; -use crate::util::classify_character::{classify_opt, Kind as CharacterKind}; -use crate::util::slice::{char_after_index, char_before_index, Slice}; +use crate::util::{ +    char::{ +        after_index as char_after_index, before_index as char_before_index, classify_opt, +        Kind as CharacterKind, +    }, +    slice::Slice, +};  use alloc::{vec, vec::Vec};  /// Attentention sequence that we can take markers from. diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index ae483a7..c25f04c 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -148,8 +148,8 @@ use crate::event::{Event, Kind, Name};  use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer;  use crate::util::{ -    classify_character::Kind as CharacterKind, -    slice::{byte_to_kind, Position, Slice}, +    char::{kind_after_index, Kind as CharacterKind}, +    slice::{Position, Slice},  };  use alloc::vec::Vec; @@ -366,7 +366,7 @@ pub fn domain_inside(tokenizer: &mut Tokenizer) -> State {          }          _ => {              // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. -            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +            if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)                  == CharacterKind::Other              {                  tokenizer.tokenize_state.seen = true; @@ -470,7 +470,7 @@ pub fn path_inside(tokenizer: &mut Tokenizer) -> State {          }          _ => {              // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. -            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +            if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)                  == CharacterKind::Whitespace              {                  State::Retry(StateName::GfmAutolinkLiteralPathAfter) @@ -543,7 +543,7 @@ pub fn trail(tokenizer: &mut Tokenizer) -> State {          }          _ => {              // Whitespace is the end of the URL, anything else is continuation. -            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +            if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)                  == CharacterKind::Whitespace              {                  State::Ok diff --git a/src/construct/partial_mdx_jsx.rs b/src/construct/partial_mdx_jsx.rs index 1a51608..2daa448 100644 --- a/src/construct/partial_mdx_jsx.rs +++ b/src/construct/partial_mdx_jsx.rs @@ -164,14 +164,11 @@  use crate::event::Name;  use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer; -use crate::util::{ -    classify_character::Kind as CharacterKind, -    slice::{byte_to_kind, char_after_index}, -}; -use alloc::{ -    format, -    string::{String, ToString}, +use crate::util::char::{ +    after_index as char_after_index, format_byte, format_opt as format_char_opt, kind_after_index, +    Kind as CharacterKind,  }; +use alloc::format;  use core::str;  use unicode_id::UnicodeID; @@ -305,7 +302,8 @@ pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State {  /// ```  pub fn primary_name(tokenizer: &mut Tokenizer) -> State {      // End of name. -    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +    if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) +        == CharacterKind::Whitespace          || matches!(tokenizer.current, Some(b'.' | b'/' | b':' | b'>' | b'{'))      {          tokenizer.exit(Name::MdxJsxTagNamePrimary); @@ -418,7 +416,8 @@ pub fn member_name_before(tokenizer: &mut Tokenizer) -> State {  pub fn member_name(tokenizer: &mut Tokenizer) -> State {      // End of name.      // Note: no `:` allowed here. -    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +    if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) +        == CharacterKind::Whitespace          || matches!(tokenizer.current, Some(b'.' | b'/' | b'>' | b'{'))      {          tokenizer.exit(Name::MdxJsxTagNameMember); @@ -529,7 +528,8 @@ pub fn local_name_before(tokenizer: &mut Tokenizer) -> State {  /// ```  pub fn local_name(tokenizer: &mut Tokenizer) -> State {      // End of local name (note that we don’t expect another colon, or a member). -    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +    if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) +        == CharacterKind::Whitespace          || matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))      {          tokenizer.exit(Name::MdxJsxTagNameLocal); @@ -645,7 +645,8 @@ pub fn attribute_before(tokenizer: &mut Tokenizer) -> State {  /// ```  pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State {      // End of attribute name or tag. -    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +    if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) +        == CharacterKind::Whitespace          || matches!(tokenizer.current, Some(b'/' | b':' | b'=' | b'>' | b'{'))      {          tokenizer.exit(Name::MdxJsxTagAttributePrimaryName); @@ -711,7 +712,7 @@ pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State {          }          _ => {              // End of tag / new attribute. -            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +            if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)                  == CharacterKind::Whitespace                  || matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))                  || id_start(char_after_index( @@ -768,7 +769,8 @@ pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State {  /// ```  pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State {      // End of local name (note that we don’t expect another colon). -    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +    if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index) +        == CharacterKind::Whitespace          || matches!(tokenizer.current, Some(b'/' | b'=' | b'>' | b'{'))      {          tokenizer.exit(Name::MdxJsxTagAttributeNameLocal); @@ -986,7 +988,7 @@ pub fn es_whitespace_start(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current {          Some(b'\n') => State::Retry(StateName::MdxJsxEsWhitespaceEol),          _ => { -            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +            if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)                  == CharacterKind::Whitespace              {                  tokenizer.enter(Name::MdxJsxEsWhitespace); @@ -1016,7 +1018,7 @@ pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State {              State::Next(StateName::MdxJsxEsWhitespaceInside)          }          _ => { -            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +            if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)                  == CharacterKind::Whitespace              {                  tokenizer.consume(); @@ -1044,7 +1046,7 @@ pub fn es_whitespace_eol(tokenizer: &mut Tokenizer) -> State {  pub fn es_whitespace_eol_after(tokenizer: &mut Tokenizer) -> State {      if tokenizer.tokenize_state.token_1 == Name::MdxJsxFlowTag && tokenizer.lazy {          crash_lazy(tokenizer) -    } else if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +    } else if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)          == CharacterKind::Whitespace      {          tokenizer.enter(Name::MdxJsxEsWhitespace); @@ -1064,7 +1066,7 @@ pub fn es_whitespace_eol_after_inside(tokenizer: &mut Tokenizer) -> State {              State::Next(StateName::MdxJsxEsWhitespaceEolAfterInside)          }          _ => { -            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +            if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)                  == CharacterKind::Whitespace              {                  tokenizer.consume(); @@ -1107,45 +1109,12 @@ fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> State {          char_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)      }; -    // To do: externalize this, and the print mechanism in the tokenizer, -    // to one proper formatter. -    let actual = match char { -        None => "end of file".to_string(), -        Some(char) => format!("character {}", format_char(char)), -    }; -      State::Error(format!(          "{}:{}: Unexpected {} {}, expected {}", -        tokenizer.point.line, tokenizer.point.column, actual, at, expect +        tokenizer.point.line, +        tokenizer.point.column, +        format_char_opt(char), +        at, +        expect      ))  } - -fn format_char(char: char) -> String { -    let unicode = format!("U+{:>04X}", char as u32); -    let printable = match char { -        '`' => Some("`` ` ``".to_string()), -        ' '..='~' => Some(format!("`{}`", char)), -        _ => None, -    }; - -    if let Some(char) = printable { -        format!("{} ({})", char, unicode) -    } else { -        unicode -    } -} - -fn format_byte(byte: u8) -> String { -    let unicode = format!("U+{:>04X}", byte); -    let printable = match byte { -        b'`' => Some("`` ` ``".to_string()), -        b' '..=b'~' => Some(format!("`{}`", str::from_utf8(&[byte]).unwrap())), -        _ => None, -    }; - -    if let Some(char) = printable { -        format!("{} ({})", char, unicode) -    } else { -        unicode -    } -} diff --git a/src/construct/partial_space_or_tab_eol.rs b/src/construct/partial_space_or_tab_eol.rs index 01f440e..1247639 100644 --- a/src/construct/partial_space_or_tab_eol.rs +++ b/src/construct/partial_space_or_tab_eol.rs @@ -64,24 +64,26 @@ pub fn space_or_tab_eol_with_options(tokenizer: &mut Tokenizer, options: Options  ///   | ␠␠b  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    if matches!(tokenizer.current, Some(b'\t' | b'\n' | b' ')) { -        tokenizer.attempt( -            State::Next(StateName::SpaceOrTabEolAfterFirst), -            State::Next(StateName::SpaceOrTabEolAtEol), -        ); +    match tokenizer.current { +        Some(b'\t' | b' ') => { +            tokenizer.attempt( +                State::Next(StateName::SpaceOrTabEolAfterFirst), +                State::Next(StateName::SpaceOrTabEolAtEol), +            ); -        State::Retry(space_or_tab_with_options( -            tokenizer, -            SpaceOrTabOptions { -                kind: Name::SpaceOrTab, -                min: 1, -                max: usize::MAX, -                content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(), -                connect: tokenizer.tokenize_state.space_or_tab_eol_connect, -            }, -        )) -    } else { -        State::Nok +            State::Retry(space_or_tab_with_options( +                tokenizer, +                SpaceOrTabOptions { +                    kind: Name::SpaceOrTab, +                    min: 1, +                    max: usize::MAX, +                    content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(), +                    connect: tokenizer.tokenize_state.space_or_tab_eol_connect, +                }, +            )) +        } +        Some(b'\n') => State::Retry(StateName::SpaceOrTabEolAtEol), +        _ => State::Nok,      }  } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 04523b3..aca8ec2 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -12,15 +12,8 @@ use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS};  use crate::parser::ParseState;  use crate::resolve::{call as call_resolve, Name as ResolveName};  use crate::state::{call, State}; -use crate::util::{constant::TAB_SIZE, edit_map::EditMap}; -use alloc::{ -    boxed::Box, -    format, -    string::{String, ToString}, -    vec, -    vec::Vec, -}; -use core::str; +use crate::util::{char::format_byte_opt, constant::TAB_SIZE, edit_map::EditMap}; +use alloc::{boxed::Box, string::String, vec, vec::Vec};  /// Containers.  /// @@ -725,14 +718,7 @@ fn push_impl(                              None                          }; -                    let visible = byte.map(|d| { -                        if (b' '..=b'~').contains(&d) { -                            str::from_utf8(&[d]).unwrap().to_string() -                        } else { -                            format!("0x{:x}", d) -                        } -                    }); -                    log::debug!("feed:    `{:?}` to {:?}", visible, name); +                    log::debug!("feed:    {} to {:?}", format_byte_opt(byte), name);                      tokenizer.expect(byte);                      state = call(tokenizer, name);                  }; diff --git a/src/util/char.rs b/src/util/char.rs new file mode 100644 index 0000000..cfaacd5 --- /dev/null +++ b/src/util/char.rs @@ -0,0 +1,165 @@ +//! Deal with byte and chars and kinds. + +use crate::util::unicode::PUNCTUATION; +use alloc::{ +    format, +    string::{String, ToString}, +}; +use core::str; + +/// Character kinds. +#[derive(Debug, PartialEq, Eq)] +pub enum Kind { +    /// Whitespace. +    /// +    /// ## Example +    /// +    /// ```markdown +    /// > | **a_b_ c**. +    ///    ^      ^    ^ +    /// ``` +    Whitespace, +    /// Punctuation. +    /// +    /// ## Example +    /// +    /// ```markdown +    /// > | **a_b_ c**. +    ///     ^^ ^ ^    ^ +    /// ``` +    Punctuation, +    /// Everything else. +    /// +    /// ## Example +    /// +    /// ```markdown +    /// > | **a_b_ c**. +    ///       ^ ^  ^ +    /// ``` +    Other, +} + +/// Get a [`char`][] right before `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn before_index(bytes: &[u8], index: usize) -> Option<char> { +    let start = if index < 4 { 0 } else { index - 4 }; +    String::from_utf8_lossy(&bytes[start..index]).chars().last() +} + +/// Get a [`char`][] right at `index` in bytes (`&[u8]`). +/// +/// In most cases, markdown operates on ASCII bytes. +/// In a few cases, it is unicode aware, so we need to find an actual char. +pub fn after_index(bytes: &[u8], index: usize) -> Option<char> { +    let end = if index + 4 > bytes.len() { +        bytes.len() +    } else { +        index + 4 +    }; +    String::from_utf8_lossy(&bytes[index..end]).chars().next() +} + +/// Classify a char at `index` in bytes (`&[u8]`). +pub fn kind_after_index(bytes: &[u8], index: usize) -> Kind { +    if index == bytes.len() { +        Kind::Whitespace +    } else { +        let byte = bytes[index]; +        if byte.is_ascii_whitespace() { +            Kind::Whitespace +        } else if byte.is_ascii_punctuation() { +            Kind::Punctuation +        } else if byte.is_ascii_alphanumeric() { +            Kind::Other +        } else { +            // Otherwise: seems to be an ASCII control, so it seems to be a +            // non-ASCII `char`. +            classify_opt(after_index(bytes, index)) +        } +    } +} + +/// Classify whether a `char` represents whitespace, punctuation, or something +/// else. +/// +/// Used for attention (emphasis, strong), whose sequences can open or close +/// based on the class of surrounding characters. +/// +/// ## References +/// +/// *   [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) +pub fn classify(char: char) -> Kind { +    // Unicode whitespace. +    if char.is_whitespace() { +        Kind::Whitespace +    } +    // Unicode punctuation. +    else if PUNCTUATION.contains(&char) { +        Kind::Punctuation +    } +    // Everything else. +    else { +        Kind::Other +    } +} + +/// Like [`classify`], but supports eof as whitespace. +pub fn classify_opt(char_opt: Option<char>) -> Kind { +    if let Some(char) = char_opt { +        classify(char) +    } +    // EOF. +    else { +        Kind::Whitespace +    } +} + +/// Format an optional `char` (`none` means eof). +pub fn format_opt(char: Option<char>) -> String { +    match char { +        None => "end of file".to_string(), +        Some(char) => format!("character {}", format(char)), +    } +} + +/// Format an optional `byte` (`none` means eof). +pub fn format_byte_opt(byte: Option<u8>) -> String { +    match byte { +        None => "end of file".to_string(), +        Some(byte) => format!("byte {}", format_byte(byte)), +    } +} + +/// Format a `char`. +pub fn format(char: char) -> String { +    let representation = format!("U+{:>04X}", char as u32); +    let printable = match char { +        '`' => Some("`` ` ``".to_string()), +        '!'..='~' => Some(format!("`{}`", char)), +        _ => None, +    }; + +    if let Some(char) = printable { +        format!("{} ({})", char, representation) +    } else { +        representation +    } +} + +/// Format a byte (`u8`). +pub fn format_byte(byte: u8) -> String { +    let representation = format!("U+{:>04X}", byte); +    let printable = match byte { +        b'`' => Some("`` ` ``".to_string()), +        b'!'..=b'~' => Some(format!("`{}`", str::from_utf8(&[byte]).unwrap())), +        _ => None, +    }; + +    if let Some(char) = printable { +        format!("{} ({})", char, representation) +    } else { +        representation +    } +} diff --git a/src/util/classify_character.rs b/src/util/classify_character.rs deleted file mode 100644 index 79ed46a..0000000 --- a/src/util/classify_character.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! Utilities to classify characters as whitespace, punctuation, or rest. - -use crate::util::unicode::PUNCTUATION; - -/// Character kinds. -#[derive(Debug, PartialEq, Eq)] -pub enum Kind { -    /// Whitespace. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// > | **a_b_ c**. -    ///    ^      ^    ^ -    /// ``` -    Whitespace, -    /// Punctuation. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// > | **a_b_ c**. -    ///     ^^ ^ ^    ^ -    /// ``` -    Punctuation, -    /// Everything else. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// > | **a_b_ c**. -    ///       ^ ^  ^ -    /// ``` -    Other, -} - -/// Classify whether a character code represents whitespace, punctuation, or -/// something else. -/// -/// Used for attention (emphasis, strong), whose sequences can open or close -/// based on the class of surrounding characters. -/// -/// > 👉 **Note** that eof (`None`) is seen as whitespace. -/// -/// ## References -/// -/// *   [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) -pub fn classify(char: char) -> Kind { -    // Unicode whitespace. -    if char.is_whitespace() { -        Kind::Whitespace -    } -    // Unicode punctuation. -    else if PUNCTUATION.contains(&char) { -        Kind::Punctuation -    } -    // Everything else. -    else { -        Kind::Other -    } -} - -/// Like [`classify`], but supports eof as whitespace. -pub fn classify_opt(char_opt: Option<char>) -> Kind { -    if let Some(char) = char_opt { -        classify(char) -    } -    // EOF. -    else { -        Kind::Whitespace -    } -} diff --git a/src/util/mod.rs b/src/util/mod.rs index e5823cf..2ea372c 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,6 +1,6 @@  //! Utilities used when processing markdown. -pub mod classify_character; +pub mod char;  pub mod constant;  pub mod decode_character_reference;  pub mod edit_map; diff --git a/src/util/slice.rs b/src/util/slice.rs index 54524c3..0734d78 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -1,55 +1,10 @@  //! Deal with bytes.  use crate::event::{Event, Kind, Point}; -use crate::util::{ -    classify_character::{classify_opt, Kind as CharacterKind}, -    constant::TAB_SIZE, -}; +use crate::util::constant::TAB_SIZE;  use alloc::string::String;  use core::str; -/// Get a [`char`][] right before `index` in bytes (`&[u8]`). -/// -/// In most cases, markdown operates on ASCII bytes. -/// In a few cases, it is unicode aware, so we need to find an actual char. -pub fn char_before_index(bytes: &[u8], index: usize) -> Option<char> { -    let start = if index < 4 { 0 } else { index - 4 }; -    String::from_utf8_lossy(&bytes[start..index]).chars().last() -} - -/// Get a [`char`][] right at `index` in bytes (`&[u8]`). -/// -/// In most cases, markdown operates on ASCII bytes. -/// In a few cases, it is unicode aware, so we need to find an actual char. -pub fn char_after_index(bytes: &[u8], index: usize) -> Option<char> { -    let end = if index + 4 > bytes.len() { -        bytes.len() -    } else { -        index + 4 -    }; -    String::from_utf8_lossy(&bytes[index..end]).chars().next() -} - -/// Classify a byte (or `char`). -pub fn byte_to_kind(bytes: &[u8], index: usize) -> CharacterKind { -    if index == bytes.len() { -        CharacterKind::Whitespace -    } else { -        let byte = bytes[index]; -        if byte.is_ascii_whitespace() { -            CharacterKind::Whitespace -        } else if byte.is_ascii_punctuation() { -            CharacterKind::Punctuation -        } else if byte.is_ascii_alphanumeric() { -            CharacterKind::Other -        } else { -            // Otherwise: seems to be an ASCII control, so it seems to be a -            // non-ASCII `char`. -            classify_opt(char_after_index(bytes, index)) -        } -    } -} -  /// A range between two points.  #[derive(Debug)]  pub struct Position<'a> { | 
