about summary refs log tree commit diff stats
path: root/src/construct
diff options
context:
space:
mode:
author Titus Wormer <tituswormer@gmail.com> 2022-07-29 18:22:59 +0200
committer Titus Wormer <tituswormer@gmail.com> 2022-07-29 18:22:59 +0200
commit 0eeff9148e327183e532752f46421a75506dd7a6 (patch)
tree 4f0aed04f90aa759ce96a2e87aa719e7fa95c450 /src/construct
parent 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff)
download markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.gz
markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.bz2
markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.zip
Refactor to improve states
* Remove custom kind wrappers, use plain bytes instead
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
Diffstat (limited to 'src/construct')
-rw-r--r-- src/construct/attention.rs 88
-rw-r--r-- src/construct/autolink.rs 57
-rw-r--r-- src/construct/character_escape.rs 3
-rw-r--r-- src/construct/character_reference.rs 132
-rw-r--r-- src/construct/code_fenced.rs 123
-rw-r--r-- src/construct/code_indented.rs 37
-rw-r--r-- src/construct/code_text.rs 7
-rw-r--r-- src/construct/definition.rs 21
-rw-r--r-- src/construct/hard_break_escape.rs 4
-rw-r--r-- src/construct/heading_atx.rs 28
-rw-r--r-- src/construct/heading_setext.rs 96
-rw-r--r-- src/construct/html_flow.rs 212
-rw-r--r-- src/construct/html_text.rs 46
-rw-r--r-- src/construct/label_end.rs 47
-rw-r--r-- src/construct/label_start_image.rs 3
-rw-r--r-- src/construct/list.rs 135
-rw-r--r-- src/construct/paragraph.rs 3
-rw-r--r-- src/construct/partial_bom.rs 37
-rw-r--r-- src/construct/partial_destination.rs 53
-rw-r--r-- src/construct/partial_label.rs 101
-rw-r--r-- src/construct/partial_title.rs 93
-rw-r--r-- src/construct/partial_whitespace.rs 18
-rw-r--r-- src/construct/thematic_break.rs 85
23 files changed, 522 insertions, 907 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index b042645..583fde2 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -88,54 +88,11 @@ enum GroupKind {
Other,
}
-/// Type of sequence.
-#[derive(Debug, PartialEq)]
-enum MarkerKind {
- /// In a run with asterisks.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// *a*
- /// ```
- Asterisk,
- /// In a run with underscores.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// _a_
- /// ```
- Underscore,
-}
-
-impl MarkerKind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- MarkerKind::Asterisk => b'*',
- MarkerKind::Underscore => b'_',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `*` or `_`.
- fn from_byte(byte: u8) -> MarkerKind {
- match byte {
- b'*' => MarkerKind::Asterisk,
- b'_' => MarkerKind::Underscore,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// Attention sequence that we can take markers from.
#[derive(Debug)]
struct Sequence {
- /// Marker used in this sequence.
- marker: MarkerKind,
+ /// Marker as a byte (`u8`) used in this sequence.
+ marker: u8,
/// The depth in events where this sequence resides.
balance: usize,
/// The index into events where this sequence’s `Enter` currently resides.
@@ -160,9 +117,9 @@ struct Sequence {
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if tokenizer.parse_state.constructs.attention && matches!(byte, b'*' | b'_') => {
+ Some(b'*' | b'_') if tokenizer.parse_state.constructs.attention => {
tokenizer.enter(Token::AttentionSequence);
- inside(tokenizer, MarkerKind::from_byte(byte))
+ inside(tokenizer, tokenizer.current.unwrap())
}
_ => State::Nok,
}
@@ -174,14 +131,17 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// > | **
/// ^^
/// ```
-fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State {
- if tokenizer.current == Some(marker.as_byte()) {
- tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, marker)))
- } else {
- tokenizer.exit(Token::AttentionSequence);
- tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
- State::Ok
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
+ match tokenizer.current {
+ Some(b'*' | b'_') if tokenizer.current.unwrap() == marker => {
+ tokenizer.consume();
+ State::Fn(Box::new(move |t| inside(t, marker)))
+ }
+ _ => {
+ tokenizer.exit(Token::AttentionSequence);
+ tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
+ State::Ok
+ }
}
}
@@ -219,16 +179,10 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]);
let char_after = string_after.chars().next();
- let marker = MarkerKind::from_byte(
- Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
- .head()
- .unwrap(),
- );
- let before = classify_character(if enter.point.index > 0 {
- char_before
- } else {
- None
- });
+ let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
+ .head()
+ .unwrap();
+ let before = classify_character(char_before);
let after = classify_character(char_after);
let open = after == GroupKind::Other
|| (after == GroupKind::Punctuation && before != GroupKind::Other);
@@ -245,12 +199,12 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
start_point: enter.point.clone(),
end_point: exit.point.clone(),
size: exit.point.index - enter.point.index,
- open: if marker == MarkerKind::Asterisk {
+ open: if marker == b'*' {
open
} else {
open && (before != GroupKind::Other || !close)
},
- close: if marker == MarkerKind::Asterisk {
+ close: if marker == b'*' {
close
} else {
close && (after != GroupKind::Other || !open)
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
index b843af8..c0514ae 100644
--- a/src/construct/autolink.rs
+++ b/src/construct/autolink.rs
@@ -137,12 +137,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_alphabetic() => {
+ // ASCII alphabetic.
+ Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(scheme_or_email_atext))
}
- Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer),
- _ => State::Nok,
+ _ => email_atext(tokenizer),
}
}
@@ -199,8 +199,8 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State {
tokenizer.exit(Token::AutolinkProtocol);
end(tokenizer)
}
- Some(byte) if byte.is_ascii_control() => State::Nok,
- None | Some(b' ') => State::Nok,
+ // ASCII control or space.
+ None | Some(b'\0'..=0x1F | b' ' | 0x7F) => State::Nok,
Some(_) => {
tokenizer.consume();
State::Fn(Box::new(url_inside))
@@ -220,7 +220,26 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
}
- Some(byte) if is_ascii_atext(byte) => {
+ // ASCII atext.
+ //
+ // atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or
+ // a byte in the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027
+ // APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`),
+ // U+002D DASH (`-`) to U+002F SLASH (`/`), U+003D EQUALS TO (`=`),
+ // U+003F QUESTION MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE
+ // ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE
+ // (`~`).
+ //
+ // See:
+ // **\[RFC5322]**:
+ // [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+ // P. Resnick.
+ // IETF.
+ //
+ // [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+ Some(
+ b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~',
+ ) => {
tokenizer.consume();
State::Fn(Box::new(email_atext))
}
@@ -236,7 +255,8 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
/// ```
fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size),
+ // ASCII alphanumeric.
+ Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size),
_ => State::Nok,
}
}
@@ -279,7 +299,8 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {
tokenizer.consume();
State::Fn(Box::new(move |t| email_value(t, size + 1)))
}
- Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
+ // ASCII alphanumeric.
+ Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |t| email_label(t, size + 1)))
}
@@ -307,23 +328,3 @@ fn end(tokenizer: &mut Tokenizer) -> State {
_ => unreachable!("expected `>`"),
}
}
-
-/// Check whether the character code represents an ASCII atext.
-///
-/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in
-/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
-/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F
-/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E
-/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE
-/// (`{`) to U+007E TILDE (`~`).
-///
-/// See:
-/// **\[RFC5322]**:
-/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
-/// P. Resnick.
-/// IETF.
-///
-/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
-fn is_ascii_atext(byte: u8) -> bool {
- matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~')
-}
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 02e8b62..4419d7a 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -63,7 +63,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_punctuation() => {
+ // ASCII punctuation.
+ Some(b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~') => {
tokenizer.enter(Token::CharacterEscapeValue);
tokenizer.consume();
tokenizer.exit(Token::CharacterEscapeValue);
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index 90763c1..cd489a4 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -66,67 +66,18 @@ use crate::constant::{
CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
};
use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-/// Kind of a character reference.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
- /// Numeric decimal character reference.
- ///
- /// ```markdown
- /// > | a&#x9;b
- /// ^^^^^
- /// ```
- Decimal,
- /// Numeric hexadecimal character reference.
- ///
- /// ```markdown
- /// > | a&#123;b
- /// ^^^^^^
- /// ```
- Hexadecimal,
- /// Named character reference.
- ///
- /// ```markdown
- /// > | a&amp;b
- /// ^^^^^
- /// ```
- Named,
-}
-
-impl Kind {
- /// Get the maximum size of characters allowed in the value of a character
- /// reference.
- fn max(&self) -> usize {
- match self {
- Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
- Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
- Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
- }
- }
-
- /// Check if a byte ([`u8`]) is allowed.
- fn allowed(&self, byte: u8) -> bool {
- let check = match self {
- Kind::Hexadecimal => u8::is_ascii_hexdigit,
- Kind::Decimal => u8::is_ascii_digit,
- Kind::Named => u8::is_ascii_alphanumeric,
- };
-
- check(&byte)
- }
-}
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;
/// State needed to parse character references.
#[derive(Debug, Clone)]
struct Info {
- /// Place of value start.
- start: Point,
- /// Size of value.
- size: usize,
- /// Kind of character reference.
- kind: Kind,
+ /// Index of where value starts.
+ start: usize,
+ /// Marker of character reference.
+ marker: u8,
+ /// Maximum number of characters in the value for this kind.
+ max: usize,
}
/// Start of a character reference.
@@ -174,9 +125,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {
value(
tokenizer,
Info {
- start: tokenizer.point.clone(),
- size: 0,
- kind: Kind::Named,
+ start: tokenizer.point.index,
+ marker: b'&',
+ max: CHARACTER_REFERENCE_NAMED_SIZE_MAX,
},
)
}
@@ -198,17 +149,17 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal);
tokenizer.enter(Token::CharacterReferenceValue);
let info = Info {
- start: tokenizer.point.clone(),
- size: 0,
- kind: Kind::Hexadecimal,
+ start: tokenizer.point.index,
+ marker: b'x',
+ max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
};
State::Fn(Box::new(|t| value(t, info)))
} else {
tokenizer.enter(Token::CharacterReferenceValue);
let info = Info {
- start: tokenizer.point.clone(),
- size: 0,
- kind: Kind::Decimal,
+ start: tokenizer.point.index,
+ marker: b'#',
+ max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
};
value(tokenizer, info)
}
@@ -227,21 +178,22 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
/// > | a&#x9;b
/// ^
/// ```
-fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn value(tokenizer: &mut Tokenizer, info: Info) -> State {
+ let size = tokenizer.point.index - info.start;
+
match tokenizer.current {
- Some(b';') if info.size > 0 => {
- if Kind::Named == info.kind {
- // To do: fix slice.
- let value = Slice::from_position(
+ Some(b';') if size > 0 => {
+ // Named.
+ if info.marker == b'&' {
+ // Guaranteed to be valid ASCII bytes.
+ let slice = Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &info.start,
- end: &tokenizer.point,
- },
- )
- .serialize();
+ info.start,
+ tokenizer.point.index,
+ );
+ let name = slice.as_str();
- if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) {
+ if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) {
return State::Nok;
}
}
@@ -253,14 +205,22 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.exit(Token::CharacterReference);
State::Ok
}
- Some(byte) => {
- if info.size < info.kind.max() && info.kind.allowed(byte) {
- info.size += 1;
- tokenizer.consume();
- State::Fn(Box::new(|t| value(t, info)))
- } else {
- State::Nok
- }
+ // ASCII digit, for named, decimal, and hexadecimal references.
+ Some(b'0'..=b'9') if size < info.max => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| value(t, info)))
+ }
+ // ASCII hex letters, for named and hexadecimal references.
+ Some(b'A'..=b'F' | b'a'..=b'f')
+ if matches!(info.marker, b'&' | b'x') && size < info.max =>
+ {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| value(t, info)))
+ }
+ // Non-hex ASCII alphabeticals, for named references.
+ Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| value(t, info)))
}
_ => State::Nok,
}
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 21e9259..c4c3e86 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -110,53 +110,6 @@ use crate::token::Token;
use crate::tokenizer::{ContentType, State, Tokenizer};
use crate::util::slice::{Position, Slice};
-/// Kind of fences.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
- /// Grave accent (tick) code.
- ///
- /// ## Example
- ///
- /// ````markdown
- /// ```rust
- /// println!("I <3 🦀");
- /// ```
- /// ````
- GraveAccent,
- /// Tilde code.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ~~~rust
- /// println!("I <3 🦀");
- /// ~~~
- /// ```
- Tilde,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- Kind::GraveAccent => b'`',
- Kind::Tilde => b'~',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `~` or `` ` ``.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'`' => Kind::GraveAccent,
- b'~' => Kind::Tilde,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse code (fenced).
#[derive(Debug, Clone)]
struct Info {
@@ -165,8 +118,8 @@ struct Info {
/// Number of tabs or spaces of indentation before the opening fence
/// sequence.
prefix: usize,
- /// Kind of fences.
- kind: Kind,
+ /// Marker of fences (`u8`).
+ marker: u8,
}
/// Start of fenced code.
@@ -178,15 +131,20 @@ struct Info {
/// | ~~~
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
if tokenizer.parse_state.constructs.code_fenced {
tokenizer.enter(Token::CodeFenced);
tokenizer.enter(Token::CodeFencedFence);
- tokenizer.go(space_or_tab_min_max(0, max), before_sequence_open)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before_sequence_open,
+ )(tokenizer)
} else {
State::Nok
}
@@ -210,23 +168,22 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
tokenizer.parse_state.bytes,
&Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),
)
- .size();
+ .len();
}
}
- match tokenizer.current {
- Some(byte) if matches!(byte, b'`' | b'~') => {
- tokenizer.enter(Token::CodeFencedFenceSequence);
- sequence_open(
- tokenizer,
- Info {
- prefix,
- size: 0,
- kind: Kind::from_byte(byte),
- },
- )
- }
- _ => State::Nok,
+ if let Some(b'`' | b'~') = tokenizer.current {
+ tokenizer.enter(Token::CodeFencedFenceSequence);
+ sequence_open(
+ tokenizer,
+ Info {
+ prefix,
+ size: 0,
+ marker: tokenizer.current.unwrap(),
+ },
+ )
+ } else {
+ State::Nok
}
}
@@ -240,7 +197,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
tokenizer.consume();
State::Fn(Box::new(|t| {
info.size += 1;
@@ -302,7 +259,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
tokenizer.exit(Token::CodeFencedFenceInfo);
tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)
}
- Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+ Some(b'`') if info.marker == b'`' => State::Nok,
Some(_) => {
tokenizer.consume();
State::Fn(Box::new(|t| info_inside(t, info)))
@@ -352,7 +309,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
tokenizer.concrete = true;
at_break(tokenizer, info)
}
- Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+ Some(b'`') if info.marker == b'`' => State::Nok,
_ => {
tokenizer.consume();
State::Fn(Box::new(|t| meta(t, info)))
@@ -432,14 +389,18 @@ fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ^
/// ```
fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
tokenizer.enter(Token::CodeFencedFence);
- tokenizer.go(space_or_tab_min_max(0, max), |t| close_before(t, info))(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ |t| close_before(t, info),
+ )(tokenizer)
}
/// In a closing fence, after optional whitespace, before sequence.
@@ -452,7 +413,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
tokenizer.enter(Token::CodeFencedFenceSequence);
close_sequence(tokenizer, info, 0)
}
@@ -470,7 +431,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
tokenizer.consume();
State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))
}
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 4a3a9f6..81a3080 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -62,11 +62,11 @@ use crate::tokenizer::{State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
// Do not interrupt paragraphs.
- if tokenizer.interrupt || !tokenizer.parse_state.constructs.code_indented {
- State::Nok
- } else {
+ if !tokenizer.interrupt && tokenizer.parse_state.constructs.code_indented {
tokenizer.enter(Token::CodeIndented);
tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer)
+ } else {
+ State::Nok
}
}
@@ -129,29 +129,26 @@ fn after(tokenizer: &mut Tokenizer) -> State {
/// | bbb
/// ```
fn further_start(tokenizer: &mut Tokenizer) -> State {
- if tokenizer.lazy {
- State::Nok
- } else {
- match tokenizer.current {
- Some(b'\n') => {
- tokenizer.enter(Token::LineEnding);
- tokenizer.consume();
- tokenizer.exit(Token::LineEnding);
- State::Fn(Box::new(further_start))
- }
- _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
- Box::new(if ok { further_end } else { further_begin })
- })(tokenizer),
+ match tokenizer.current {
+ Some(b'\n') if !tokenizer.lazy => {
+ tokenizer.enter(Token::LineEnding);
+ tokenizer.consume();
+ tokenizer.exit(Token::LineEnding);
+ State::Fn(Box::new(further_start))
}
+ _ if !tokenizer.lazy => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
+ Box::new(if ok { further_end } else { further_begin })
+ })(tokenizer),
+ _ => State::Nok,
}
}
-/// After a proper indent.
+/// At an eol, which is followed by an indented line.
///
/// ```markdown
-/// | aaa
-/// > | bbb
-/// ^
+/// > | aaa
+/// ^
+/// | bbb
/// ```
fn further_end(_tokenizer: &mut Tokenizer) -> State {
State::Ok
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index b36a208..d70fbc2 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -95,14 +95,13 @@ use crate::tokenizer::{State, Tokenizer};
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let len = tokenizer.events.len();
-
match tokenizer.current {
Some(b'`')
if tokenizer.parse_state.constructs.code_text
&& (tokenizer.previous != Some(b'`')
- || (len > 0
- && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) =>
+ || (!tokenizer.events.is_empty()
+ && tokenizer.events[tokenizer.events.len() - 1].token_type
+ == Token::CharacterEscape)) =>
{
tokenizer.enter(Token::CodeText);
tokenizer.enter(Token::CodeTextSequence);
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 14755c9..bd7df82 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -110,17 +110,18 @@ use crate::util::skip::opt_back as skip_opt_back;
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let definition_before = !tokenizer.events.is_empty()
- && tokenizer.events[skip_opt_back(
- &tokenizer.events,
- tokenizer.events.len() - 1,
- &[Token::LineEnding, Token::SpaceOrTab],
- )]
- .token_type
- == Token::Definition;
-
// Do not interrupt paragraphs (but do follow definitions).
- if (!tokenizer.interrupt || definition_before) && tokenizer.parse_state.constructs.definition {
+ let possible = !tokenizer.interrupt
+ || (!tokenizer.events.is_empty()
+ && tokenizer.events[skip_opt_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Token::LineEnding, Token::SpaceOrTab],
+ )]
+ .token_type
+ == Token::Definition);
+
+ if possible && tokenizer.parse_state.constructs.definition {
tokenizer.enter(Token::Definition);
// Note: arbitrary whitespace allowed even if code (indented) is on.
tokenizer.attempt_opt(space_or_tab(), before)(tokenizer)
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index cdbc192..d09bf54 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -54,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => {
tokenizer.enter(Token::HardBreakEscape);
tokenizer.consume();
- State::Fn(Box::new(inside))
+ State::Fn(Box::new(after))
}
_ => State::Nok,
}
@@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// | b
/// ```
-fn inside(tokenizer: &mut Tokenizer) -> State {
+fn after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
Some(b'\n') => {
tokenizer.exit(Token::HardBreakEscape);
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 9a73b77..aa388ee 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -66,15 +66,19 @@ use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer};
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.heading_atx {
tokenizer.enter(Token::HeadingAtx);
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -101,19 +105,19 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// > | ## aa
/// ^
/// ```
-fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
+fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- None | Some(b'\n') if rank > 0 => {
+ None | Some(b'\n') if size > 0 => {
tokenizer.exit(Token::HeadingAtxSequence);
at_break(tokenizer)
}
- Some(b'#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+ Some(b'#') if size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |tokenizer| {
- sequence_open(tokenizer, rank + 1)
+ sequence_open(tokenizer, size + 1)
}))
}
- _ if rank > 0 => {
+ _ if size > 0 => {
tokenizer.exit(Token::HeadingAtxSequence);
tokenizer.go(space_or_tab(), at_break)(tokenizer)
}
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index 2a4adbf..98d7843 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -63,52 +63,6 @@ use crate::token::Token;
use crate::tokenizer::{EventType, State, Tokenizer};
use crate::util::skip::opt_back as skip_opt_back;
-/// Kind of underline.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
- /// Dash (rank 2) heading.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// alpha
- /// -----
- /// ```
- Dash,
-
- /// Equals to (rank 1) heading.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// alpha
- /// =====
- /// ```
- EqualsTo,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- Kind::Dash => b'-',
- Kind::EqualsTo => b'=',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `-` or `=`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'-' => Kind::Dash,
- b'=' => Kind::EqualsTo,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// At a line ending, presumably an underline.
///
/// ```markdown
@@ -117,23 +71,29 @@ impl Kind {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
- let paragraph_before = !tokenizer.events.is_empty()
- && tokenizer.events[skip_opt_back(
- &tokenizer.events,
- tokenizer.events.len() - 1,
- &[Token::LineEnding, Token::SpaceOrTab],
- )]
- .token_type
- == Token::Paragraph;
-
- // Require a paragraph before and do not allow on a lazy line.
- if paragraph_before && !tokenizer.lazy && tokenizer.parse_state.constructs.heading_setext {
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ if tokenizer.parse_state.constructs.heading_setext
+ && !tokenizer.lazy
+ // Require a paragraph before.
+ && (!tokenizer.events.is_empty()
+ && tokenizer.events[skip_opt_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Token::LineEnding, Token::SpaceOrTab],
+ )]
+ .token_type
+ == Token::Paragraph)
+ {
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -148,9 +108,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if matches!(byte, b'-' | b'=') => {
+ Some(b'-' | b'=') => {
tokenizer.enter(Token::HeadingSetextUnderline);
- inside(tokenizer, Kind::from_byte(byte))
+ inside(tokenizer, tokenizer.current.unwrap())
}
_ => State::Nok,
}
@@ -163,11 +123,11 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// > | ==
/// ^
/// ```
-fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
match tokenizer.current {
- Some(byte) if byte == kind.as_byte() => {
+ Some(b'-' | b'=') if tokenizer.current.unwrap() == marker => {
tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, kind)))
+ State::Fn(Box::new(move |t| inside(t, marker)))
}
_ => {
tokenizer.exit(Token::HeadingSetextUnderline);
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 5860c5d..064da35 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -98,17 +98,17 @@
//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
-use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE};
+use crate::constant::{
+ HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE,
+};
use crate::construct::{
blank_line::start as blank_line,
partial_non_lazy_continuation::start as partial_non_lazy_continuation,
partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
};
use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;
/// Kind of HTML (flow).
#[derive(Debug, PartialEq)]
@@ -129,49 +129,6 @@ enum Kind {
Complete,
}
-/// Type of quote, if we’re in a quoted attribute, in complete (condition 7).
-#[derive(Debug, PartialEq)]
-enum QuoteKind {
- /// In a double quoted (`"`) attribute value.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// <a b="c" />
- /// ```
- Double,
- /// In a single quoted (`'`) attribute value.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// <a b='c' />
- /// ```
- Single,
-}
-
-impl QuoteKind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- QuoteKind::Double => b'"',
- QuoteKind::Single => b'\'',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `"` or `'`.
- fn from_byte(byte: u8) -> QuoteKind {
- match byte {
- b'"' => QuoteKind::Double,
- b'\'' => QuoteKind::Single,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse HTML (flow).
#[derive(Debug)]
struct Info {
@@ -179,12 +136,10 @@ struct Info {
kind: Kind,
/// Whether this is a start tag (`<` not followed by `/`).
start_tag: bool,
- /// Used depending on `kind` to collect all parsed bytes.
- start: Option<Point>,
- /// Collected index, for various reasons.
- size: usize,
+ /// Start index of a tag name or cdata prefix.
+ start: usize,
/// Current quote, when in a double or single quoted attribute value.
- quote: Option<QuoteKind>,
+ quote: u8,
}
/// Start of HTML (flow), before optional whitespace.
@@ -194,19 +149,17 @@ struct Info {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.html_flow {
tokenizer.enter(Token::HtmlFlow);
tokenizer.go(
space_or_tab_with_options(SpaceOrTabOptions {
kind: Token::HtmlFlowData,
min: 0,
- max,
+ max: if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
connect: false,
content_type: None,
}),
@@ -249,9 +202,8 @@ fn open(tokenizer: &mut Tokenizer) -> State {
kind: Kind::Basic,
// Assume closing tag (or no tag).
start_tag: false,
- start: None,
- size: 0,
- quote: None,
+ start: 0,
+ quote: 0,
};
match tokenizer.current {
@@ -261,7 +213,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
}
Some(b'/') => {
tokenizer.consume();
- info.start = Some(tokenizer.point.clone());
+ info.start = tokenizer.point.index;
State::Fn(Box::new(|t| tag_close_start(t, info)))
}
Some(b'?') => {
@@ -273,9 +225,10 @@ fn open(tokenizer: &mut Tokenizer) -> State {
// right now, so we do need to search for `>`, similar to declarations.
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
info.start_tag = true;
- info.start = Some(tokenizer.point.clone());
+ info.start = tokenizer.point.index;
tag_name(tokenizer, info)
}
_ => State::Nok,
@@ -299,12 +252,6 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
info.kind = Kind::Comment;
State::Fn(Box::new(|t| comment_open_inside(t, info)))
}
- Some(b'[') => {
- tokenizer.consume();
- info.kind = Kind::Cdata;
- info.size = 0;
- State::Fn(Box::new(|t| cdata_open_inside(t, info)))
- }
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
info.kind = Kind::Declaration;
@@ -312,6 +259,12 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.concrete = true;
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
+ Some(b'[') => {
+ tokenizer.consume();
+ info.kind = Kind::Cdata;
+ info.start = tokenizer.point.index;
+ State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+ }
_ => State::Nok,
}
}
@@ -342,12 +295,11 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == CDATA_SEARCH[info.size] => {
- info.size += 1;
+ Some(byte) if byte == HTML_CDATA_PREFIX[tokenizer.point.index - info.start] => {
tokenizer.consume();
- if info.size == CDATA_SEARCH.len() {
- info.size = 0;
+ if tokenizer.point.index - info.start == HTML_CDATA_PREFIX.len() {
+ info.start = 0;
// Do not form containers.
tokenizer.concrete = true;
State::Fn(Box::new(|t| continuation(t, info)))
@@ -367,6 +319,7 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| tag_name(t, info)))
@@ -387,17 +340,18 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => {
let slash = matches!(tokenizer.current, Some(b'/'));
- let start = info.start.take().unwrap();
- let name = Slice::from_position(
+ // Guaranteed to be valid ASCII bytes.
+ let slice = Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &start,
- end: &tokenizer.point,
- },
- )
- .serialize()
- .trim()
- .to_lowercase();
+ info.start,
+ tokenizer.point.index,
+ );
+ let name = slice
+ .as_str()
+ // The line ending case might result in a `\r` that is already accounted for.
+ .trim()
+ .to_ascii_lowercase();
+ info.start = 0;
if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
info.kind = Kind::Raw;
@@ -427,6 +381,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
}
}
}
+ // ASCII alphanumerical and `-`.
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| tag_name(t, info)))
@@ -490,18 +445,19 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ Some(b'\t' | b' ') => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
+ }
Some(b'/') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_end(t, info)))
}
+ // ASCII alphanumerical and `:` and `_`.
Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
}
- Some(b'\t' | b' ') => {
- tokenizer.consume();
- State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
- }
_ => complete_end(tokenizer, info),
}
}
@@ -518,6 +474,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat
/// ```
fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ // ASCII alphanumerical and `-`, `.`, `:`, and `_`.
Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
@@ -537,14 +494,14 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(b'=') => {
- tokenizer.consume();
- State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
- }
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))
}
+ Some(b'=') => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
+ }
_ => complete_attribute_name_before(tokenizer, info),
}
}
@@ -561,15 +518,15 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State
fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
- Some(byte) if matches!(byte, b'"' | b'\'') => {
- info.quote = Some(QuoteKind::from_byte(byte));
- tokenizer.consume();
- State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
- }
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
}
+ Some(b'"' | b'\'') => {
+ info.quote = tokenizer.current.unwrap();
+ tokenizer.consume();
+ State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
+ }
_ => complete_attribute_value_unquoted(tokenizer, info),
}
}
@@ -585,7 +542,7 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) ->
fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
None | Some(b'\n') => State::Nok,
- Some(byte) if byte == info.quote.as_ref().unwrap().as_byte() => {
+ Some(b'"' | b'\'') if tokenizer.current.unwrap() == info.quote => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))
}
@@ -673,6 +630,21 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
+ tokenizer.exit(Token::HtmlFlowData);
+ tokenizer.check(blank_line_before, |ok| {
+ if ok {
+ Box::new(continuation_after)
+ } else {
+ Box::new(move |t| continuation_start(t, info))
+ }
+ })(tokenizer)
+ }
+ // Note: important that this is after the basic/complete case.
+ None | Some(b'\n') => {
+ tokenizer.exit(Token::HtmlFlowData);
+ continuation_start(tokenizer, info)
+ }
Some(b'-') if info.kind == Kind::Comment => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_comment_inside(t, info)))
@@ -693,20 +665,6 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))
}
- Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
- tokenizer.exit(Token::HtmlFlowData);
- tokenizer.check(blank_line_before, |ok| {
- if ok {
- Box::new(continuation_after)
- } else {
- Box::new(move |t| continuation_start(t, info))
- }
- })(tokenizer)
- }
- None | Some(b'\n') => {
- tokenizer.exit(Token::HtmlFlowData);
- continuation_start(tokenizer, info)
- }
_ => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation(t, info)))
@@ -793,7 +751,7 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
match tokenizer.current {
Some(b'/') => {
tokenizer.consume();
- info.start = Some(tokenizer.point.clone());
+ info.start = tokenizer.point.index;
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => continuation(tokenizer, info),
@@ -809,18 +767,15 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
Some(b'>') => {
- info.size = 0;
-
- let start = info.start.take().unwrap();
- let name = Slice::from_position(
+ // Guaranteed to be valid ASCII bytes.
+ let slice = Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &start,
- end: &tokenizer.point,
- },
- )
- .serialize()
- .to_lowercase();
+ info.start,
+ tokenizer.point.index,
+ );
+ let name = slice.as_str().to_ascii_lowercase();
+
+ info.start = 0;
if HTML_RAW_NAMES.contains(&name.as_str()) {
tokenizer.consume();
@@ -829,13 +784,14 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State
continuation(tokenizer, info)
}
}
- Some(b'A'..=b'Z' | b'a'..=b'z') if info.size < HTML_RAW_SIZE_MAX => {
+ Some(b'A'..=b'Z' | b'a'..=b'z')
+ if tokenizer.point.index - info.start < HTML_RAW_SIZE_MAX =>
+ {
tokenizer.consume();
- info.size += 1;
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => {
- info.size = 0;
+ info.start = 0;
continuation(tokenizer, info)
}
}
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index f10a476..51beda5 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -54,12 +54,11 @@
//! [html_flow]: crate::construct::html_flow
//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
+use crate::constant::HTML_CDATA_PREFIX;
use crate::construct::partial_space_or_tab::space_or_tab;
use crate::token::Token;
use crate::tokenizer::{State, StateFn, Tokenizer};
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
-
/// Start of HTML (text)
///
/// ```markdown
@@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(instruction))
}
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open))
@@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(comment_open_inside))
}
- Some(b'[') => {
- tokenizer.consume();
- State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
- }
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(declaration))
}
+ Some(b'[') => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
+ }
_ => State::Nok,
}
}
@@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
/// > | a <![CDATA[>&<]]> b
/// ^^^^^^
/// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
- match tokenizer.current {
- Some(byte) if byte == CDATA_SEARCH[index] => {
- tokenizer.consume();
+fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State {
+ if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) {
+ tokenizer.consume();
- if index + 1 == CDATA_SEARCH.len() {
- State::Fn(Box::new(cdata))
- } else {
- State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1)))
- }
+ if size + 1 == HTML_CDATA_PREFIX.len() {
+ State::Fn(Box::new(cdata))
+ } else {
+ State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1)))
}
- _ => State::Nok,
+ } else {
+ State::Nok
}
}
@@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_close))
@@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphanumerical and `-`.
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_close))
@@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphanumerical and `-`.
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open))
@@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(end))
}
+ // ASCII alphabetical and `:` and `_`.
Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_name))
@@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphanumerical and `-`, `.`, `:`, and `_`.
Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_name))
@@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_before))
}
- Some(byte) if byte == b'"' || byte == b'\'' => {
+ Some(b'"' | b'\'') => {
+ let marker = tokenizer.current.unwrap();
tokenizer.consume();
- State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte)))
+ State::Fn(Box::new(move |t| {
+ tag_open_attribute_value_quoted(t, marker)
+ }))
}
Some(_) => {
tokenizer.consume();
@@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta
tokenizer,
Box::new(move |t| tag_open_attribute_value_quoted(t, marker)),
),
- Some(byte) if byte == marker => {
+ Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_quoted_after))
}
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 6399f81..a1ec8d9 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -214,16 +214,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
media: Media {
start: label_start.start,
end: (label_end_start, label_end_start + 3),
- // To do: virtual spaces not needed, create a `to_str`?
id: normalize_identifier(
- &Slice::from_position(
+ // We don’t care about virtual spaces, so `indices` and `as_str` are fine.
+ Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &tokenizer.events[label_start.start.1].point,
- end: &tokenizer.events[label_end_start - 1].point,
- },
+ tokenizer.events[label_start.start.1].point.index,
+ tokenizer.events[label_end_start - 1].point.index,
)
- .serialize(),
+ .as_str(),
),
},
};
@@ -366,11 +364,11 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ^
/// ```
fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State {
- let label_start = tokenizer
+ tokenizer
.label_start_stack
.get_mut(label_start_index)
- .unwrap();
- label_start.balanced = true;
+ .unwrap()
+ .balanced = true;
State::Nok
}
@@ -529,23 +527,24 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
- let end = skip::to_back(
- &tokenizer.events,
- tokenizer.events.len() - 1,
- &[Token::ReferenceString],
- );
-
- // To do: virtual spaces not needed, create a `to_str`?
- let id = Slice::from_position(
- tokenizer.parse_state.bytes,
- &Position::from_exit_event(&tokenizer.events, end),
- )
- .serialize();
-
if tokenizer
.parse_state
.definitions
- .contains(&normalize_identifier(&id))
+ // We don’t care about virtual spaces, so `as_str` is fine.
+ .contains(&normalize_identifier(
+ Slice::from_position(
+ tokenizer.parse_state.bytes,
+ &Position::from_exit_event(
+ &tokenizer.events,
+ skip::to_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Token::ReferenceString],
+ ),
+ ),
+ )
+ .as_str(),
+ ))
{
State::Ok
} else {
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index d30b8dd..4a3508e 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -64,9 +64,8 @@ pub fn open(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
tokenizer.exit(Token::LabelMarker);
tokenizer.exit(Token::LabelImage);
- let end = tokenizer.events.len() - 1;
tokenizer.label_start_stack.push(LabelStart {
- start: (end - 5, end),
+ start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1),
balanced: false,
inactive: false,
});
diff --git a/src/construct/list.rs b/src/construct/list.rs
index 9b59130..d5a9899 100644
--- a/src/construct/list.rs
+++ b/src/construct/list.rs
@@ -56,69 +56,6 @@ use crate::util::{
slice::{Position, Slice},
};
-/// Type of list.
-#[derive(Debug, PartialEq)]
-enum Kind {
- /// In a dot (`.`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// 1. a
- /// ```
- Dot,
- /// In a paren (`)`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// 1) a
- /// ```
- Paren,
- /// In an asterisk (`*`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// * a
- /// ```
- Asterisk,
- /// In a plus (`+`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// + a
- /// ```
- Plus,
- /// In a dash (`-`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// - a
- /// ```
- Dash,
-}
-
-impl Kind {
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `.`, `)`, `*`, `+`, or `-`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'.' => Kind::Dot,
- b')' => Kind::Paren,
- b'*' => Kind::Asterisk,
- b'+' => Kind::Plus,
- b'-' => Kind::Dash,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// Start of list item.
///
/// ```markdown
@@ -126,15 +63,19 @@ impl Kind {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.list {
tokenizer.enter(Token::ListItem);
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -149,15 +90,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
// Unordered.
- Some(b'*' | b'+' | b'-') => tokenizer.check(thematic_break, |ok| {
+ Some(b'*' | b'-') => tokenizer.check(thematic_break, |ok| {
Box::new(if ok { nok } else { before_unordered })
})(tokenizer),
+ Some(b'+') => before_unordered(tokenizer),
// Ordered.
- Some(byte) if byte.is_ascii_digit() && (!tokenizer.interrupt || byte == b'1') => {
- tokenizer.enter(Token::ListItemPrefix);
- tokenizer.enter(Token::ListItemValue);
- inside(tokenizer, 0)
- }
+ Some(b'0'..=b'9') if !tokenizer.interrupt => before_ordered(tokenizer),
+ Some(b'1') => before_ordered(tokenizer),
_ => State::Nok,
}
}
@@ -175,6 +114,18 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
marker(tokenizer)
}
+/// Start of an ordered list item.
+///
+/// ```markdown
+/// > | 1. a
+/// ^
+/// ```
+fn before_ordered(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.enter(Token::ListItemPrefix);
+ tokenizer.enter(Token::ListItemValue);
+ inside(tokenizer, 0)
+}
+
/// In an ordered list item value.
///
/// ```markdown
@@ -183,14 +134,14 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
- tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, size + 1)))
- }
Some(b'.' | b')') if !tokenizer.interrupt || size < 2 => {
tokenizer.exit(Token::ListItemValue);
marker(tokenizer)
}
+ Some(b'0'..=b'9') if size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
+ tokenizer.consume();
+ State::Fn(Box::new(move |t| inside(t, size + 1)))
+ }
_ => State::Nok,
}
}
@@ -262,7 +213,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
- if matches!(tokenizer.current, Some(b'\t' | b' ')) {
+ if let Some(b'\t' | b' ') = tokenizer.current {
State::Nok
} else {
State::Ok
@@ -309,7 +260,7 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State {
end: &tokenizer.point,
},
)
- .size();
+ .len();
if blank {
prefix += 1;
@@ -389,8 +340,8 @@ fn nok(_tokenizer: &mut Tokenizer) -> State {
pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
let mut index = 0;
let mut balance = 0;
- let mut lists_wip: Vec<(Kind, usize, usize, usize)> = vec![];
- let mut lists: Vec<(Kind, usize, usize, usize)> = vec![];
+ let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![];
+ let mut lists: Vec<(u8, usize, usize, usize)> = vec![];
// Merge list items.
while index < tokenizer.events.len() {
@@ -400,12 +351,14 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
if event.event_type == EventType::Enter {
let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1;
let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]);
- let kind = Kind::from_byte(
- Slice::from_point(tokenizer.parse_state.bytes, &tokenizer.events[marker].point)
- .head()
- .unwrap(),
- );
- let current = (kind, balance, index, end);
+ // Guaranteed to be a valid ASCII byte.
+ let marker = Slice::from_index(
+ tokenizer.parse_state.bytes,
+ tokenizer.events[marker].point.index,
+ )
+ .head()
+ .unwrap();
+ let current = (marker, balance, index, end);
let mut list_index = lists_wip.len();
let mut matched = false;
@@ -475,7 +428,7 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
let mut list_start = tokenizer.events[list_item.2].clone();
let mut list_end = tokenizer.events[list_item.3].clone();
let token_type = match list_item.0 {
- Kind::Paren | Kind::Dot => Token::ListOrdered,
+ b'.' | b')' => Token::ListOrdered,
_ => Token::ListUnordered,
};
list_start.token_type = token_type.clone();
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index 146dc40..ec5669c 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -81,10 +81,9 @@ fn inside(tokenizer: &mut Tokenizer) -> State {
/// Merge “`Paragraph`”s, which currently span a single line, into actual
/// `Paragraph`s that span multiple lines.
pub fn resolve(tokenizer: &mut Tokenizer) {
- let len = tokenizer.events.len();
let mut index = 0;
- while index < len {
+ while index < tokenizer.events.len() {
let event = &tokenizer.events[index];
if event.event_type == EventType::Enter && event.token_type == Token::Paragraph {
diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs
index be8d6c8..155a1a3 100644
--- a/src/construct/partial_bom.rs
+++ b/src/construct/partial_bom.rs
@@ -10,13 +10,12 @@ use crate::tokenizer::{State, Tokenizer};
/// ^^^^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Some(0xEF) => {
- tokenizer.enter(Token::ByteOrderMark);
- tokenizer.consume();
- State::Fn(Box::new(cont))
- }
- _ => State::Nok,
+ if tokenizer.current == Some(0xEF) {
+ tokenizer.enter(Token::ByteOrderMark);
+ tokenizer.consume();
+ State::Fn(Box::new(cont))
+ } else {
+ State::Nok
}
}
@@ -27,12 +26,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^^^^
/// ```
fn cont(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Some(0xBB) => {
- tokenizer.consume();
- State::Fn(Box::new(end))
- }
- _ => State::Nok,
+ if tokenizer.current == Some(0xBB) {
+ tokenizer.consume();
+ State::Fn(Box::new(end))
+ } else {
+ State::Nok
}
}
@@ -43,12 +41,11 @@ fn cont(tokenizer: &mut Tokenizer) -> State {
/// ^^^^
/// ```
fn end(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Some(0xBF) => {
- tokenizer.consume();
- tokenizer.exit(Token::ByteOrderMark);
- State::Ok
- }
- _ => State::Nok,
+ if tokenizer.current == Some(0xBF) {
+ tokenizer.consume();
+ tokenizer.exit(Token::ByteOrderMark);
+ State::Ok
+ } else {
+ State::Nok
}
}
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 0a3721c..809aa27 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -125,8 +125,8 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
tokenizer.exit(info.options.marker.clone());
State::Fn(Box::new(|t| enclosed_before(t, info)))
}
- None | Some(b' ' | b')') => State::Nok,
- Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok,
+ // ASCII control, space, closing paren, but *not* `\0`.
+ None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok,
Some(_) => {
tokenizer.enter(info.options.destination.clone());
tokenizer.enter(info.options.raw.clone());
@@ -166,12 +166,12 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ None | Some(b'\n' | b'<') => State::Nok,
Some(b'>') => {
tokenizer.exit(Token::Data);
tokenizer.exit(info.options.string.clone());
enclosed_before(tokenizer, info)
}
- None | Some(b'\n' | b'<') => State::Nok,
Some(b'\\') => {
tokenizer.consume();
State::Fn(Box::new(|t| enclosed_escape(t, info)))
@@ -207,40 +207,25 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(b'(') => {
- if info.balance >= info.options.limit {
- State::Nok
- } else {
- tokenizer.consume();
- info.balance += 1;
- State::Fn(Box::new(move |t| raw(t, info)))
- }
+ None | Some(b'\t' | b'\n' | b' ' | b')') if info.balance == 0 => {
+ tokenizer.exit(Token::Data);
+ tokenizer.exit(info.options.string.clone());
+ tokenizer.exit(info.options.raw.clone());
+ tokenizer.exit(info.options.destination);
+ State::Ok
}
- Some(b')') => {
- if info.balance == 0 {
- tokenizer.exit(Token::Data);
- tokenizer.exit(info.options.string.clone());
- tokenizer.exit(info.options.raw.clone());
- tokenizer.exit(info.options.destination);
- State::Ok
- } else {
- tokenizer.consume();
- info.balance -= 1;
- State::Fn(Box::new(move |t| raw(t, info)))
- }
+ Some(b'(') if info.balance < info.options.limit => {
+ tokenizer.consume();
+ info.balance += 1;
+ State::Fn(Box::new(move |t| raw(t, info)))
}
- None | Some(b'\t' | b'\n' | b' ') => {
- if info.balance > 0 {
- State::Nok
- } else {
- tokenizer.exit(Token::Data);
- tokenizer.exit(info.options.string.clone());
- tokenizer.exit(info.options.raw.clone());
- tokenizer.exit(info.options.destination);
- State::Ok
- }
+ // ASCII control (but *not* `\0`) and space and `(`.
+ None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => State::Nok,
+ Some(b')') => {
+ tokenizer.consume();
+ info.balance -= 1;
+ State::Fn(Box::new(move |t| raw(t, info)))
}
- Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok,
Some(b'\\') => {
tokenizer.consume();
State::Fn(Box::new(move |t| raw_escape(t, info)))
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index 7e40a2d..6fdb70d 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -123,39 +123,43 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
/// ^
/// ```
fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
- match tokenizer.current {
- None | Some(b'[') => State::Nok,
- Some(b']') if !info.data => State::Nok,
- _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok,
- Some(b']') => {
- tokenizer.exit(info.options.string.clone());
- tokenizer.enter(info.options.marker.clone());
- tokenizer.consume();
- tokenizer.exit(info.options.marker.clone());
- tokenizer.exit(info.options.label);
- State::Ok
- }
- Some(b'\n') => tokenizer.go(
- space_or_tab_eol_with_options(EolOptions {
- content_type: Some(ContentType::String),
- connect: info.connect,
- }),
- |t| {
- info.connect = true;
- at_break(t, info)
- },
- )(tokenizer),
- _ => {
- tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
-
- if info.connect {
- let index = tokenizer.events.len() - 1;
- link(&mut tokenizer.events, index);
- } else {
- info.connect = true;
+ if info.size > LINK_REFERENCE_SIZE_MAX
+ || matches!(tokenizer.current, None | Some(b'['))
+ || (matches!(tokenizer.current, Some(b']')) && !info.data)
+ {
+ State::Nok
+ } else {
+ match tokenizer.current {
+ Some(b'\n') => tokenizer.go(
+ space_or_tab_eol_with_options(EolOptions {
+ content_type: Some(ContentType::String),
+ connect: info.connect,
+ }),
+ |t| {
+ info.connect = true;
+ at_break(t, info)
+ },
+ )(tokenizer),
+ Some(b']') => {
+ tokenizer.exit(info.options.string.clone());
+ tokenizer.enter(info.options.marker.clone());
+ tokenizer.consume();
+ tokenizer.exit(info.options.marker.clone());
+ tokenizer.exit(info.options.label);
+ State::Ok
}
+ _ => {
+ tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
+
+ if info.connect {
+ let index = tokenizer.events.len() - 1;
+ link(&mut tokenizer.events, index);
+ } else {
+ info.connect = true;
+ }
- label(tokenizer, info)
+ label(tokenizer, info)
+ }
}
}
}
@@ -172,30 +176,19 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- _ if info.size > LINK_REFERENCE_SIZE_MAX => {
- tokenizer.exit(Token::Data);
- at_break(tokenizer, info)
- }
- Some(b'\t' | b' ') => {
- tokenizer.consume();
- info.size += 1;
- State::Fn(Box::new(|t| label(t, info)))
- }
- Some(b'\\') => {
- tokenizer.consume();
- info.size += 1;
- if !info.data {
- info.data = true;
- }
- State::Fn(Box::new(|t| escape(t, info)))
- }
- Some(_) => {
- tokenizer.consume();
- info.size += 1;
- if !info.data {
- info.data = true;
+ Some(byte) => {
+ if info.size > LINK_REFERENCE_SIZE_MAX {
+ tokenizer.exit(Token::Data);
+ at_break(tokenizer, info)
+ } else {
+ let func = if matches!(byte, b'\\') { escape } else { label };
+ tokenizer.consume();
+ info.size += 1;
+ if !info.data && !matches!(byte, b'\t' | b' ') {
+ info.data = true;
+ }
+ State::Fn(Box::new(move |t| func(t, info)))
}
- State::Fn(Box::new(|t| label(t, info)))
}
}
}
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index 80861af..9cf2f14 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -48,70 +48,13 @@ pub struct Options {
pub string: Token,
}
-/// Type of title.
-#[derive(Debug, PartialEq)]
-enum Kind {
- /// In a parenthesized (`(` and `)`) title.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// (a)
- /// ```
- Paren,
- /// In a double quoted (`"`) title.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// "a"
- /// ```
- Double,
- /// In a single quoted (`'`) title.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// 'a'
- /// ```
- Single,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- ///
- /// > 👉 **Note**: a closing paren is used for `Kind::Paren`.
- fn as_byte(&self) -> u8 {
- match self {
- Kind::Paren => b')',
- Kind::Double => b'"',
- Kind::Single => b'\'',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `(`, `"`, or `'`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'(' => Kind::Paren,
- b'"' => Kind::Double,
- b'\'' => Kind::Single,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse titles.
#[derive(Debug)]
struct Info {
/// Whether we’ve seen data.
connect: bool,
- /// Kind of title.
- kind: Kind,
+ /// Closing marker.
+ marker: u8,
/// Configuration.
options: Options,
}
@@ -124,10 +67,11 @@ struct Info {
/// ```
pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
match tokenizer.current {
- Some(byte) if matches!(byte, b'"' | b'\'' | b'(') => {
+ Some(b'"' | b'\'' | b'(') => {
+ let marker = tokenizer.current.unwrap();
let info = Info {
connect: false,
- kind: Kind::from_byte(byte),
+ marker: if marker == b'(' { b')' } else { marker },
options,
};
tokenizer.enter(info.options.title.clone());
@@ -150,7 +94,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
/// ```
fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {
tokenizer.enter(info.options.marker.clone());
tokenizer.consume();
tokenizer.exit(info.options.marker.clone());
@@ -172,10 +116,6 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
- tokenizer.exit(info.options.string.clone());
- begin(tokenizer, info)
- }
None => State::Nok,
Some(b'\n') => tokenizer.go(
space_or_tab_eol_with_options(EolOptions {
@@ -187,7 +127,11 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
at_break(t, info)
},
)(tokenizer),
- _ => {
+ Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {
+ tokenizer.exit(info.options.string.clone());
+ begin(tokenizer, info)
+ }
+ Some(_) => {
tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
if info.connect {
@@ -210,21 +154,18 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn title(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ None | Some(b'\n') => {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- None | Some(b'\n') => {
+ Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- Some(b'\\') => {
+ Some(byte) => {
+ let func = if matches!(byte, b'\\') { escape } else { title };
tokenizer.consume();
- State::Fn(Box::new(|t| escape(t, info)))
- }
- _ => {
- tokenizer.consume();
- State::Fn(Box::new(|t| title(t, info)))
+ State::Fn(Box::new(move |t| func(t, info)))
}
}
}
@@ -237,7 +178,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn escape(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'"' | b'\'' | b')') => {
tokenizer.consume();
State::Fn(Box::new(|t| title(t, info)))
}
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index 13815cb..4f872ba 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -92,8 +92,7 @@ fn trim_data(
if trim_end {
let mut index = slice.bytes.len();
- let vs = slice.after;
- let mut spaces_only = vs == 0;
+ let mut spaces_only = slice.after == 0;
while index > 0 {
match slice.bytes[index - 1] {
b' ' => {}
@@ -105,10 +104,10 @@ fn trim_data(
}
let diff = slice.bytes.len() - index;
- let token_type = if spaces_only
- && hard_break
- && exit_index + 1 < tokenizer.events.len()
+ let token_type = if hard_break
+ && spaces_only
&& diff >= HARD_BREAK_PREFIX_SIZE_MIN
+ && exit_index + 1 < tokenizer.events.len()
{
Token::HardBreakTrailing
} else {
@@ -123,7 +122,7 @@ fn trim_data(
return;
}
- if diff > 0 || vs > 0 {
+ if diff > 0 || slice.after > 0 {
let exit_point = tokenizer.events[exit_index].point.clone();
let mut enter_point = exit_point.clone();
enter_point.index -= diff;
@@ -156,14 +155,11 @@ fn trim_data(
if trim_start {
let mut index = 0;
- let vs = slice.before;
while index < slice.bytes.len() {
match slice.bytes[index] {
- b' ' | b'\t' => {}
+ b' ' | b'\t' => index += 1,
_ => break,
}
-
- index += 1;
}
// The whole data is whitespace.
@@ -174,7 +170,7 @@ fn trim_data(
return;
}
- if index > 0 || vs > 0 {
+ if index > 0 || slice.before > 0 {
let enter_point = tokenizer.events[exit_index - 1].point.clone();
let mut exit_point = enter_point.clone();
exit_point.index += index;
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 4fc4dc4..785d132 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -53,64 +53,11 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN};
use crate::token::Token;
use crate::tokenizer::{State, Tokenizer};
-/// Type of thematic break.
-#[derive(Debug, PartialEq)]
-enum Kind {
- /// In a thematic break using asterisks (`*`).
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ***
- /// ```
- Asterisk,
- /// In a thematic break using dashes (`-`).
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ---
- /// ```
- Dash,
- /// In a thematic break using underscores (`_`).
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ___
- /// ```
- Underscore,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- Kind::Asterisk => b'*',
- Kind::Dash => b'-',
- Kind::Underscore => b'_',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `*`, `-`, or `_`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'*' => Kind::Asterisk,
- b'-' => Kind::Dash,
- b'_' => Kind::Underscore,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse thematic breaks.
#[derive(Debug)]
struct Info {
- /// Kind of marker.
- kind: Kind,
+ /// Marker.
+ marker: u8,
/// Number of markers.
size: usize,
}
@@ -122,15 +69,19 @@ struct Info {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.thematic_break {
tokenizer.enter(Token::ThematicBreak);
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -144,10 +95,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if matches!(byte, b'*' | b'-' | b'_') => at_break(
+ Some(b'*' | b'-' | b'_') => at_break(
tokenizer,
Info {
- kind: Kind::from_byte(byte),
+ marker: tokenizer.current.unwrap(),
size: 0,
},
),
@@ -163,13 +114,13 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- None | Some(b'\n' | b'\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
+ None | Some(b'\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
tokenizer.exit(Token::ThematicBreak);
// Feel free to interrupt.
tokenizer.interrupt = false;
State::Ok
}
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {
tokenizer.enter(Token::ThematicBreakSequence);
sequence(tokenizer, info)
}
@@ -185,7 +136,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {
tokenizer.consume();
info.size += 1;
State::Fn(Box::new(|t| sequence(t, info)))