aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-09-09 10:54:13 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-09-09 10:54:13 +0200
commit13337d77954b4c92d1cf4592f43f01d94fce3c77 (patch)
treed5feef9a971c1af52e58b5c857d1dd9c9e7fedca /src/construct
parent71dbc8c0189d6b2032f3d8f21cbfffa3f8fe0f12 (diff)
downloadmarkdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.gz
markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.tar.bz2
markdown-rs-13337d77954b4c92d1cf4592f43f01d94fce3c77.zip
Refactor to move byte, char info to own file
Diffstat (limited to '')
-rw-r--r--src/construct/attention.rs9
-rw-r--r--src/construct/gfm_autolink_literal.rs10
-rw-r--r--src/construct/partial_mdx_jsx.rs79
-rw-r--r--src/construct/partial_space_or_tab_eol.rs36
4 files changed, 55 insertions, 79 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index 947a79b..4a208df 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -80,8 +80,13 @@ use crate::event::{Event, Kind, Name, Point};
use crate::resolve::Name as ResolveName;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
-use crate::util::classify_character::{classify_opt, Kind as CharacterKind};
-use crate::util::slice::{char_after_index, char_before_index, Slice};
+use crate::util::{
+ char::{
+ after_index as char_after_index, before_index as char_before_index, classify_opt,
+ Kind as CharacterKind,
+ },
+ slice::Slice,
+};
use alloc::{vec, vec::Vec};
/// Attention sequence that we can take markers from.
diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs
index ae483a7..c25f04c 100644
--- a/src/construct/gfm_autolink_literal.rs
+++ b/src/construct/gfm_autolink_literal.rs
@@ -148,8 +148,8 @@ use crate::event::{Event, Kind, Name};
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use crate::util::{
- classify_character::Kind as CharacterKind,
- slice::{byte_to_kind, Position, Slice},
+ char::{kind_after_index, Kind as CharacterKind},
+ slice::{Position, Slice},
};
use alloc::vec::Vec;
@@ -366,7 +366,7 @@ pub fn domain_inside(tokenizer: &mut Tokenizer) -> State {
}
_ => {
// Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Other
{
tokenizer.tokenize_state.seen = true;
@@ -470,7 +470,7 @@ pub fn path_inside(tokenizer: &mut Tokenizer) -> State {
}
_ => {
// Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Whitespace
{
State::Retry(StateName::GfmAutolinkLiteralPathAfter)
@@ -543,7 +543,7 @@ pub fn trail(tokenizer: &mut Tokenizer) -> State {
}
_ => {
// Whitespace is the end of the URL, anything else is continuation.
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Whitespace
{
State::Ok
diff --git a/src/construct/partial_mdx_jsx.rs b/src/construct/partial_mdx_jsx.rs
index 1a51608..2daa448 100644
--- a/src/construct/partial_mdx_jsx.rs
+++ b/src/construct/partial_mdx_jsx.rs
@@ -164,14 +164,11 @@
use crate::event::Name;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
-use crate::util::{
- classify_character::Kind as CharacterKind,
- slice::{byte_to_kind, char_after_index},
-};
-use alloc::{
- format,
- string::{String, ToString},
+use crate::util::char::{
+ after_index as char_after_index, format_byte, format_opt as format_char_opt, kind_after_index,
+ Kind as CharacterKind,
};
+use alloc::format;
use core::str;
use unicode_id::UnicodeID;
@@ -305,7 +302,8 @@ pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn primary_name(tokenizer: &mut Tokenizer) -> State {
// End of name.
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
|| matches!(tokenizer.current, Some(b'.' | b'/' | b':' | b'>' | b'{'))
{
tokenizer.exit(Name::MdxJsxTagNamePrimary);
@@ -418,7 +416,8 @@ pub fn member_name_before(tokenizer: &mut Tokenizer) -> State {
pub fn member_name(tokenizer: &mut Tokenizer) -> State {
// End of name.
// Note: no `:` allowed here.
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
|| matches!(tokenizer.current, Some(b'.' | b'/' | b'>' | b'{'))
{
tokenizer.exit(Name::MdxJsxTagNameMember);
@@ -529,7 +528,8 @@ pub fn local_name_before(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn local_name(tokenizer: &mut Tokenizer) -> State {
// End of local name (note that we don’t expect another colon, or a member).
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
|| matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
{
tokenizer.exit(Name::MdxJsxTagNameLocal);
@@ -645,7 +645,8 @@ pub fn attribute_before(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State {
// End of attribute name or tag.
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
|| matches!(tokenizer.current, Some(b'/' | b':' | b'=' | b'>' | b'{'))
{
tokenizer.exit(Name::MdxJsxTagAttributePrimaryName);
@@ -711,7 +712,7 @@ pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State {
}
_ => {
// End of tag / new attribute.
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Whitespace
|| matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
|| id_start(char_after_index(
@@ -768,7 +769,8 @@ pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State {
// End of local name (note that we don’t expect another colon).
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
|| matches!(tokenizer.current, Some(b'/' | b'=' | b'>' | b'{'))
{
tokenizer.exit(Name::MdxJsxTagAttributeNameLocal);
@@ -986,7 +988,7 @@ pub fn es_whitespace_start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
Some(b'\n') => State::Retry(StateName::MdxJsxEsWhitespaceEol),
_ => {
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Whitespace
{
tokenizer.enter(Name::MdxJsxEsWhitespace);
@@ -1016,7 +1018,7 @@ pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State {
State::Next(StateName::MdxJsxEsWhitespaceInside)
}
_ => {
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Whitespace
{
tokenizer.consume();
@@ -1044,7 +1046,7 @@ pub fn es_whitespace_eol(tokenizer: &mut Tokenizer) -> State {
pub fn es_whitespace_eol_after(tokenizer: &mut Tokenizer) -> State {
if tokenizer.tokenize_state.token_1 == Name::MdxJsxFlowTag && tokenizer.lazy {
crash_lazy(tokenizer)
- } else if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ } else if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Whitespace
{
tokenizer.enter(Name::MdxJsxEsWhitespace);
@@ -1064,7 +1066,7 @@ pub fn es_whitespace_eol_after_inside(tokenizer: &mut Tokenizer) -> State {
State::Next(StateName::MdxJsxEsWhitespaceEolAfterInside)
}
_ => {
- if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ if kind_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
== CharacterKind::Whitespace
{
tokenizer.consume();
@@ -1107,45 +1109,12 @@ fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> State {
char_after_index(tokenizer.parse_state.bytes, tokenizer.point.index)
};
- // To do: externalize this, and the print mechanism in the tokenizer,
- // to one proper formatter.
- let actual = match char {
- None => "end of file".to_string(),
- Some(char) => format!("character {}", format_char(char)),
- };
-
State::Error(format!(
"{}:{}: Unexpected {} {}, expected {}",
- tokenizer.point.line, tokenizer.point.column, actual, at, expect
+ tokenizer.point.line,
+ tokenizer.point.column,
+ format_char_opt(char),
+ at,
+ expect
))
}
-
-fn format_char(char: char) -> String {
- let unicode = format!("U+{:>04X}", char as u32);
- let printable = match char {
- '`' => Some("`` ` ``".to_string()),
- ' '..='~' => Some(format!("`{}`", char)),
- _ => None,
- };
-
- if let Some(char) = printable {
- format!("{} ({})", char, unicode)
- } else {
- unicode
- }
-}
-
-fn format_byte(byte: u8) -> String {
- let unicode = format!("U+{:>04X}", byte);
- let printable = match byte {
- b'`' => Some("`` ` ``".to_string()),
- b' '..=b'~' => Some(format!("`{}`", str::from_utf8(&[byte]).unwrap())),
- _ => None,
- };
-
- if let Some(char) = printable {
- format!("{} ({})", char, unicode)
- } else {
- unicode
- }
-}
diff --git a/src/construct/partial_space_or_tab_eol.rs b/src/construct/partial_space_or_tab_eol.rs
index 01f440e..1247639 100644
--- a/src/construct/partial_space_or_tab_eol.rs
+++ b/src/construct/partial_space_or_tab_eol.rs
@@ -64,24 +64,26 @@ pub fn space_or_tab_eol_with_options(tokenizer: &mut Tokenizer, options: Options
/// | ␠␠b
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- if matches!(tokenizer.current, Some(b'\t' | b'\n' | b' ')) {
- tokenizer.attempt(
- State::Next(StateName::SpaceOrTabEolAfterFirst),
- State::Next(StateName::SpaceOrTabEolAtEol),
- );
+ match tokenizer.current {
+ Some(b'\t' | b' ') => {
+ tokenizer.attempt(
+ State::Next(StateName::SpaceOrTabEolAfterFirst),
+ State::Next(StateName::SpaceOrTabEolAtEol),
+ );
- State::Retry(space_or_tab_with_options(
- tokenizer,
- SpaceOrTabOptions {
- kind: Name::SpaceOrTab,
- min: 1,
- max: usize::MAX,
- content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(),
- connect: tokenizer.tokenize_state.space_or_tab_eol_connect,
- },
- ))
- } else {
- State::Nok
+ State::Retry(space_or_tab_with_options(
+ tokenizer,
+ SpaceOrTabOptions {
+ kind: Name::SpaceOrTab,
+ min: 1,
+ max: usize::MAX,
+ content: tokenizer.tokenize_state.space_or_tab_eol_content.clone(),
+ connect: tokenizer.tokenize_state.space_or_tab_eol_connect,
+ },
+ ))
+ }
+ Some(b'\n') => State::Retry(StateName::SpaceOrTabEolAtEol),
+ _ => State::Nok,
}
}