From 0eeff9148e327183e532752f46421a75506dd7a6 Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Fri, 29 Jul 2022 18:22:59 +0200
Subject: Refactor to improve states

* Remove custom kind wrappers, use plain bytes instead
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
---
 src/construct/attention.rs           |  88 ++++-----------
 src/construct/autolink.rs            |  57 +++++-----
 src/construct/character_escape.rs    |   3 +-
 src/construct/character_reference.rs | 132 ++++++++--------------
 src/construct/code_fenced.rs         | 123 +++++++-------------
 src/construct/code_indented.rs       |  37 +++---
 src/construct/code_text.rs           |   7 +-
 src/construct/definition.rs          |  21 ++--
 src/construct/hard_break_escape.rs   |   4 +-
 src/construct/heading_atx.rs         |  28 +++--
 src/construct/heading_setext.rs      |  96 +++++-----------
 src/construct/html_flow.rs           | 212 ++++++++++++++---------------------
 src/construct/html_text.rs           |  46 ++++----
 src/construct/label_end.rs           |  47 ++++----
 src/construct/label_start_image.rs   |   3 +-
 src/construct/list.rs                | 135 ++++++++--------------
 src/construct/paragraph.rs           |   3 +-
 src/construct/partial_bom.rs         |  37 +++---
 src/construct/partial_destination.rs |  53 ++++-----
 src/construct/partial_label.rs       | 101 ++++++++---------
 src/construct/partial_title.rs       |  93 +++------------
 src/construct/partial_whitespace.rs  |  18 ++-
 src/construct/thematic_break.rs      |  85 +++-----------
 23 files changed, 522 insertions(+), 907 deletions(-)

diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index b042645..583fde2 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -88,54 +88,11 @@ enum GroupKind {
     Other,
 }
 
-/// Type of sequence.
-#[derive(Debug, PartialEq)]
-enum MarkerKind {
-    /// In a run with asterisks.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// *a*
-    /// ```
-    Asterisk,
-    /// In a run with underscores.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// _a_
-    /// ```
-    Underscore,
-}
-
-impl MarkerKind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            MarkerKind::Asterisk => b'*',
-            MarkerKind::Underscore => b'_',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `*` or `_`.
-    fn from_byte(byte: u8) -> MarkerKind {
-        match byte {
-            b'*' => MarkerKind::Asterisk,
-            b'_' => MarkerKind::Underscore,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// Attention sequence that we can take markers from.
 #[derive(Debug)]
 struct Sequence {
-    /// Marker used in this sequence.
-    marker: MarkerKind,
+    /// Marker as a byte (`u8`) used in this sequence.
+    marker: u8,
     /// The depth in events where this sequence resides.
     balance: usize,
     /// The index into events where this sequence’s `Enter` currently resides.
@@ -160,9 +117,9 @@ struct Sequence {
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if tokenizer.parse_state.constructs.attention && matches!(byte, b'*' | b'_') => {
+        Some(b'*' | b'_') if tokenizer.parse_state.constructs.attention => {
             tokenizer.enter(Token::AttentionSequence);
-            inside(tokenizer, MarkerKind::from_byte(byte))
+            inside(tokenizer, tokenizer.current.unwrap())
         }
         _ => State::Nok,
     }
@@ -174,14 +131,17 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | **
 ///     ^^
 /// ```
-fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State {
-    if tokenizer.current == Some(marker.as_byte()) {
-        tokenizer.consume();
-        State::Fn(Box::new(move |t| inside(t, marker)))
-    } else {
-        tokenizer.exit(Token::AttentionSequence);
-        tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
-        State::Ok
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
+    match tokenizer.current {
+        Some(b'*' | b'_') if tokenizer.current.unwrap() == marker => {
+            tokenizer.consume();
+            State::Fn(Box::new(move |t| inside(t, marker)))
+        }
+        _ => {
+            tokenizer.exit(Token::AttentionSequence);
+            tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
+            State::Ok
+        }
     }
 }
 
@@ -219,16 +179,10 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
                 String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]);
             let char_after = string_after.chars().next();
 
-            let marker = MarkerKind::from_byte(
-                Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
-                    .head()
-                    .unwrap(),
-            );
-            let before = classify_character(if enter.point.index > 0 {
-                char_before
-            } else {
-                None
-            });
+            let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
+                .head()
+                .unwrap();
+            let before = classify_character(char_before);
             let after = classify_character(char_after);
             let open = after == GroupKind::Other
                 || (after == GroupKind::Punctuation && before != GroupKind::Other);
@@ -245,12 +199,12 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
                     start_point: enter.point.clone(),
                     end_point: exit.point.clone(),
                     size: exit.point.index - enter.point.index,
-                    open: if marker == MarkerKind::Asterisk {
+                    open: if marker == b'*' {
                         open
                     } else {
                         open && (before != GroupKind::Other || !close)
                     },
-                    close: if marker == MarkerKind::Asterisk {
+                    close: if marker == b'*' {
                         close
                     } else {
                         close && (after != GroupKind::Other || !open)
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
index b843af8..c0514ae 100644
--- a/src/construct/autolink.rs
+++ b/src/construct/autolink.rs
@@ -137,12 +137,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if byte.is_ascii_alphabetic() => {
+        // ASCII alphabetic.
+        Some(b'A'..=b'Z' | b'a'..=b'z') => {
            tokenizer.consume();
            State::Fn(Box::new(scheme_or_email_atext))
        }
-        Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer),
-        _ => State::Nok,
+        _ => email_atext(tokenizer),
    }
 }
 
@@ -199,8 +199,8 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State {
             tokenizer.exit(Token::AutolinkProtocol);
             end(tokenizer)
         }
-        Some(byte) if byte.is_ascii_control() => State::Nok,
-        None | Some(b' ') => State::Nok,
+        // ASCII control or space.
+        None | Some(b'\0'..=0x1F | b' ' | 0x7F) => State::Nok,
         Some(_) => {
             tokenizer.consume();
             State::Fn(Box::new(url_inside))
@@ -220,7 +220,26 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
         }
-        Some(byte) if is_ascii_atext(byte) => {
+        // ASCII atext.
+        //
+        // atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or
+        // a byte in the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027
+        // APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`),
+        // U+002D DASH (`-`), U+002F SLASH (`/`), U+003D EQUALS TO (`=`),
+        // U+003F QUESTION MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE
+        // ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE
+        // (`~`).
+        //
+        // See:
+        // **\[RFC5322]**:
+        // [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+        // P. Resnick.
+        // IETF.
+        //
+        // [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+        Some(
+            b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~',
+        ) => {
             tokenizer.consume();
             State::Fn(Box::new(email_atext))
         }
@@ -236,7 +255,8 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size),
+        // ASCII alphanumeric.
+        Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size),
         _ => State::Nok,
     }
 }
@@ -279,7 +299,8 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {
             tokenizer.consume();
             State::Fn(Box::new(move |t| email_value(t, size + 1)))
         }
-        Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
+        // ASCII alphanumeric.
+        Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| email_label(t, size + 1)))
         }
@@ -307,23 +328,3 @@ fn end(tokenizer: &mut Tokenizer) -> State {
         _ => unreachable!("expected `>`"),
     }
 }
-
-/// Check whether the character code represents an ASCII atext.
-///
-/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in
-/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
-/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F
-/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E
-/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE
-/// (`{`) to U+007E TILDE (`~`).
-///
-/// See:
-/// **\[RFC5322]**:
-/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
-/// P. Resnick.
-/// IETF.
-///
-/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
-fn is_ascii_atext(byte: u8) -> bool {
-    matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~')
-}
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 02e8b62..4419d7a 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -63,7 +63,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if byte.is_ascii_punctuation() => {
+        // ASCII punctuation.
+        Some(b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~') => {
             tokenizer.enter(Token::CharacterEscapeValue);
             tokenizer.consume();
             tokenizer.exit(Token::CharacterEscapeValue);
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index 90763c1..cd489a4 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -66,67 +66,18 @@ use crate::constant::{
     CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
 };
 use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-/// Kind of a character reference.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
-    /// Numeric decimal character reference.
-    ///
-    /// ```markdown
-    /// > | a&#32;b
-    ///      ^^^^^
-    /// ```
-    Decimal,
-    /// Numeric hexadecimal character reference.
-    ///
-    /// ```markdown
-    /// > | a&#x7b;b
-    ///      ^^^^^^
-    /// ```
-    Hexadecimal,
-    /// Named character reference.
-    ///
-    /// ```markdown
-    /// > | a&amp;b
-    ///      ^^^^^
-    /// ```
-    Named,
-}
-
-impl Kind {
-    /// Get the maximum size of characters allowed in the value of a character
-    /// reference.
-    fn max(&self) -> usize {
-        match self {
-            Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
-            Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
-            Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
-        }
-    }
-
-    /// Check if a byte ([`u8`]) is allowed.
-    fn allowed(&self, byte: u8) -> bool {
-        let check = match self {
-            Kind::Hexadecimal => u8::is_ascii_hexdigit,
-            Kind::Decimal => u8::is_ascii_digit,
-            Kind::Named => u8::is_ascii_alphanumeric,
-        };
-
-        check(&byte)
-    }
-}
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;
 
 /// State needed to parse character references.
 #[derive(Debug, Clone)]
 struct Info {
-    /// Place of value start.
-    start: Point,
-    /// Size of value.
-    size: usize,
-    /// Kind of character reference.
-    kind: Kind,
+    /// Index of where value starts.
+    start: usize,
+    /// Marker of character reference.
+    marker: u8,
+    /// Maximum number of characters in the value for this kind.
+    max: usize,
 }
 
 /// Start of a character reference.
@@ -174,9 +125,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         value(
             tokenizer,
             Info {
-                start: tokenizer.point.clone(),
-                size: 0,
-                kind: Kind::Named,
+                start: tokenizer.point.index,
+                marker: b'&',
+                max: CHARACTER_REFERENCE_NAMED_SIZE_MAX,
             },
         )
     }
@@ -198,17 +149,17 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
         tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal);
         tokenizer.enter(Token::CharacterReferenceValue);
         let info = Info {
-            start: tokenizer.point.clone(),
-            size: 0,
-            kind: Kind::Hexadecimal,
+            start: tokenizer.point.index,
+            marker: b'x',
+            max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
         };
         State::Fn(Box::new(|t| value(t, info)))
     } else {
         tokenizer.enter(Token::CharacterReferenceValue);
         let info = Info {
-            start: tokenizer.point.clone(),
-            size: 0,
-            kind: Kind::Decimal,
+            start: tokenizer.point.index,
+            marker: b'#',
+            max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
        };
        value(tokenizer, info)
    }
@@ -227,21 +178,22 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
 /// > | a&#32;b
 ///          ^
 /// ```
-fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn value(tokenizer: &mut Tokenizer, info: Info) -> State {
+    let size = tokenizer.point.index - info.start;
+
     match tokenizer.current {
-        Some(b';') if info.size > 0 => {
-            if Kind::Named == info.kind {
-                // To do: fix slice.
-                let value = Slice::from_position(
+        Some(b';') if size > 0 => {
+            // Named.
+            if info.marker == b'&' {
+                // Guaranteed to be valid ASCII bytes.
+                let slice = Slice::from_indices(
                     tokenizer.parse_state.bytes,
-                    &Position {
-                        start: &info.start,
-                        end: &tokenizer.point,
-                    },
-                )
-                .serialize();
+                    info.start,
+                    tokenizer.point.index,
+                );
+                let name = slice.as_str();
 
-                if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) {
+                if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) {
                     return State::Nok;
                 }
             }
@@ -253,14 +205,22 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
             tokenizer.exit(Token::CharacterReference);
             State::Ok
         }
-        Some(byte) => {
-            if info.size < info.kind.max() && info.kind.allowed(byte) {
-                info.size += 1;
-                tokenizer.consume();
-                State::Fn(Box::new(|t| value(t, info)))
-            } else {
-                State::Nok
-            }
+        // ASCII digit, for named, decimal, and hexadecimal references.
+        Some(b'0'..=b'9') if size < info.max => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| value(t, info)))
+        }
+        // ASCII hex letters, for named and hexadecimal references.
+        Some(b'A'..=b'F' | b'a'..=b'f')
+            if matches!(info.marker, b'&' | b'x') && size < info.max =>
+        {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| value(t, info)))
+        }
+        // Non-hex ASCII alphabeticals, for named references.
+        Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| value(t, info)))
         }
         _ => State::Nok,
     }
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 21e9259..c4c3e86 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -110,53 +110,6 @@ use crate::token::Token;
 use crate::tokenizer::{ContentType, State, Tokenizer};
 use crate::util::slice::{Position, Slice};
 
-/// Kind of fences.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
-    /// Grave accent (tick) code.
-    ///
-    /// ## Example
-    ///
-    /// ````markdown
-    /// ```rust
-    /// println!("I <3 🦀");
-    /// ```
-    /// ````
-    GraveAccent,
-    /// Tilde code.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// ~~~rust
-    /// println!("I <3 🦀");
-    /// ~~~
-    /// ```
-    Tilde,
-}
-
-impl Kind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            Kind::GraveAccent => b'`',
-            Kind::Tilde => b'~',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `~` or `` ` ``.
-    fn from_byte(byte: u8) -> Kind {
-        match byte {
-            b'`' => Kind::GraveAccent,
-            b'~' => Kind::Tilde,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// State needed to parse code (fenced).
 #[derive(Debug, Clone)]
 struct Info {
@@ -165,8 +118,8 @@ struct Info {
     /// Number of tabs or spaces of indentation before the opening fence
     /// sequence.
     prefix: usize,
-    /// Kind of fences.
-    kind: Kind,
+    /// Marker of fences (`u8`).
+    marker: u8,
 }
 
 /// Start of fenced code.
@@ -178,15 +131,20 @@ struct Info {
 /// | ~~~
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
     if tokenizer.parse_state.constructs.code_fenced {
         tokenizer.enter(Token::CodeFenced);
         tokenizer.enter(Token::CodeFencedFence);
-        tokenizer.go(space_or_tab_min_max(0, max), before_sequence_open)(tokenizer)
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before_sequence_open,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -210,23 +168,22 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
             tokenizer.parse_state.bytes,
             &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),
         )
-        .size();
+        .len();
     }
 
-    match tokenizer.current {
-        Some(byte) if matches!(byte, b'`' | b'~') => {
-            tokenizer.enter(Token::CodeFencedFenceSequence);
-            sequence_open(
-                tokenizer,
-                Info {
-                    prefix,
-                    size: 0,
-                    kind: Kind::from_byte(byte),
-                },
-            )
-        }
-        _ => State::Nok,
+    if let Some(b'`' | b'~') = tokenizer.current {
+        tokenizer.enter(Token::CodeFencedFenceSequence);
+        sequence_open(
+            tokenizer,
+            Info {
+                prefix,
+                size: 0,
+                marker: tokenizer.current.unwrap(),
+            },
+        )
+    } else {
+        State::Nok
     }
 }
 
@@ -240,7 +197,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.consume();
             State::Fn(Box::new(|t| {
                 info.size += 1;
@@ -302,7 +259,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
             tokenizer.exit(Token::CodeFencedFenceInfo);
             tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)
         }
-        Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some(b'`') if info.marker == b'`' => State::Nok,
         Some(_) => {
             tokenizer.consume();
             State::Fn(Box::new(|t| info_inside(t, info)))
@@ -352,7 +309,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
-        Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some(b'`') if info.marker == b'`' => State::Nok,
         _ => {
             tokenizer.consume();
             State::Fn(Box::new(|t| meta(t, info)))
@@ -432,14 +389,18 @@ fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
 ///     ^
 /// ```
 fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     tokenizer.enter(Token::CodeFencedFence);
-    tokenizer.go(space_or_tab_min_max(0, max), |t| close_before(t, info))(tokenizer)
+    tokenizer.go(
+        space_or_tab_min_max(
+            0,
+            if tokenizer.parse_state.constructs.code_indented {
+                TAB_SIZE - 1
+            } else {
+                usize::MAX
+            },
+        ),
+        |t| close_before(t, info),
+    )(tokenizer)
 }
 
 /// In a closing fence, after optional whitespace, before sequence.
@@ -452,7 +413,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.enter(Token::CodeFencedFenceSequence);
             close_sequence(tokenizer, info, 0)
         }
@@ -470,7 +431,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
     match tokenizer.current {
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))
         }
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 4a3a9f6..81a3080 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -62,11 +62,11 @@ use crate::tokenizer::{State, Tokenizer};
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     // Do not interrupt paragraphs.
-    if tokenizer.interrupt || !tokenizer.parse_state.constructs.code_indented {
-        State::Nok
-    } else {
+    if !tokenizer.interrupt && tokenizer.parse_state.constructs.code_indented {
         tokenizer.enter(Token::CodeIndented);
         tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer)
+    } else {
+        State::Nok
     }
 }
 
@@ -129,29 +129,26 @@ fn after(tokenizer: &mut Tokenizer) -> State {
 ///   | bbb
 /// ```
 fn further_start(tokenizer: &mut Tokenizer) -> State {
-    if tokenizer.lazy {
-        State::Nok
-    } else {
-        match tokenizer.current {
-            Some(b'\n') => {
-                tokenizer.enter(Token::LineEnding);
-                tokenizer.consume();
-                tokenizer.exit(Token::LineEnding);
-                State::Fn(Box::new(further_start))
-            }
-            _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
-                Box::new(if ok { further_end } else { further_begin })
-            })(tokenizer),
+    match tokenizer.current {
+        Some(b'\n') if !tokenizer.lazy => {
+            tokenizer.enter(Token::LineEnding);
+            tokenizer.consume();
+            tokenizer.exit(Token::LineEnding);
+            State::Fn(Box::new(further_start))
         }
+        _ if !tokenizer.lazy => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
+            Box::new(if ok { further_end } else { further_begin })
+        })(tokenizer),
+        _ => State::Nok,
     }
 }
 
-/// After a proper indent.
+/// At an eol, which is followed by an indented line.
 ///
 /// ```markdown
-///   | aaa
-/// > | bbb
-///     ^
+/// > | aaa
+///        ^
+///   | bbb
 /// ```
 fn further_end(_tokenizer: &mut Tokenizer) -> State {
     State::Ok
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index b36a208..d70fbc2 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -95,14 +95,13 @@ use crate::tokenizer::{State, Tokenizer};
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let len = tokenizer.events.len();
-
     match tokenizer.current {
         Some(b'`')
             if tokenizer.parse_state.constructs.code_text
                 && (tokenizer.previous != Some(b'`')
-                    || (len > 0
-                        && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) =>
+                    || (!tokenizer.events.is_empty()
+                        && tokenizer.events[tokenizer.events.len() - 1].token_type
+                            == Token::CharacterEscape)) =>
         {
             tokenizer.enter(Token::CodeText);
             tokenizer.enter(Token::CodeTextSequence);
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 14755c9..bd7df82 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -110,17 +110,18 @@ use crate::util::skip::opt_back as skip_opt_back;
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let definition_before = !tokenizer.events.is_empty()
-        && tokenizer.events[skip_opt_back(
-            &tokenizer.events,
-            tokenizer.events.len() - 1,
-            &[Token::LineEnding, Token::SpaceOrTab],
-        )]
-        .token_type
-            == Token::Definition;
-
-    // Do not interrupt paragraphs (but do follow definitions).
-    if (!tokenizer.interrupt || definition_before) && tokenizer.parse_state.constructs.definition {
+    let possible = !tokenizer.interrupt
+        || (!tokenizer.events.is_empty()
+            && tokenizer.events[skip_opt_back(
+                &tokenizer.events,
+                tokenizer.events.len() - 1,
+                &[Token::LineEnding, Token::SpaceOrTab],
+            )]
+            .token_type
+                == Token::Definition);
+
+    if possible && tokenizer.parse_state.constructs.definition {
         tokenizer.enter(Token::Definition);
         // Note: arbitrary whitespace allowed even if code (indented) is on.
         tokenizer.attempt_opt(space_or_tab(), before)(tokenizer)
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index cdbc192..d09bf54 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -54,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
         Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => {
             tokenizer.enter(Token::HardBreakEscape);
             tokenizer.consume();
-            State::Fn(Box::new(inside))
+            State::Fn(Box::new(after))
         }
         _ => State::Nok,
     }
@@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 ///       ^
 ///   | b
 /// ```
-fn inside(tokenizer: &mut Tokenizer) -> State {
+fn after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n') => {
             tokenizer.exit(Token::HardBreakEscape);
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 9a73b77..aa388ee 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -66,15 +66,19 @@ use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer};
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     if tokenizer.parse_state.constructs.heading_atx {
         tokenizer.enter(Token::HeadingAtx);
-        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -101,19 +105,19 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// > | ## aa
 ///     ^
 /// ```
-fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
+fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        None | Some(b'\n') if rank > 0 => {
+        None | Some(b'\n') if size > 0 => {
             tokenizer.exit(Token::HeadingAtxSequence);
             at_break(tokenizer)
         }
-        Some(b'#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+        Some(b'#') if size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
             tokenizer.consume();
             State::Fn(Box::new(move |tokenizer| {
-                sequence_open(tokenizer, rank + 1)
+                sequence_open(tokenizer, size + 1)
             }))
         }
-        _ if rank > 0 => {
+        _ if size > 0 => {
             tokenizer.exit(Token::HeadingAtxSequence);
             tokenizer.go(space_or_tab(), at_break)(tokenizer)
         }
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index 2a4adbf..98d7843 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -63,52 +63,6 @@ use crate::token::Token;
 use crate::tokenizer::{EventType, State, Tokenizer};
 use crate::util::skip::opt_back as skip_opt_back;
 
-/// Kind of underline.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
-    /// Dash (rank 2) heading.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// alpha
-    /// -----
-    /// ```
-    Dash,
-
-    /// Equals to (rank 1) heading.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// alpha
-    /// =====
-    /// ```
-    EqualsTo,
-}
-
-impl Kind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            Kind::Dash => b'-',
-            Kind::EqualsTo => b'=',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `-` or `=`.
-    fn from_byte(byte: u8) -> Kind {
-        match byte {
-            b'-' => Kind::Dash,
-            b'=' => Kind::EqualsTo,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// At a line ending, presumably an underline.
 ///
 /// ```markdown
 ///   | aa
 /// > | ==
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-    let paragraph_before = !tokenizer.events.is_empty()
-        && tokenizer.events[skip_opt_back(
-            &tokenizer.events,
-            tokenizer.events.len() - 1,
-            &[Token::LineEnding, Token::SpaceOrTab],
-        )]
-        .token_type
-            == Token::Paragraph;
-
-    // Require a paragraph before and do not allow on a lazy line.
-    if paragraph_before && !tokenizer.lazy && tokenizer.parse_state.constructs.heading_setext {
-        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+    if tokenizer.parse_state.constructs.heading_setext
+        && !tokenizer.lazy
+        // Require a paragraph before.
+        && (!tokenizer.events.is_empty()
+            && tokenizer.events[skip_opt_back(
+                &tokenizer.events,
+                tokenizer.events.len() - 1,
+                &[Token::LineEnding, Token::SpaceOrTab],
+            )]
+            .token_type
+                == Token::Paragraph)
+    {
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -148,9 +108,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if matches!(byte, b'-' | b'=') => {
+        Some(b'-' | b'=') => {
             tokenizer.enter(Token::HeadingSetextUnderline);
-            inside(tokenizer, Kind::from_byte(byte))
+            inside(tokenizer, tokenizer.current.unwrap())
         }
         _ => State::Nok,
     }
@@ -163,11 +123,11 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// > | ==
 ///     ^
 /// ```
-fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
     match tokenizer.current {
-        Some(byte) if byte == kind.as_byte() => {
+        Some(b'-' | b'=') if tokenizer.current.unwrap() == marker => {
             tokenizer.consume();
-            State::Fn(Box::new(move |t| inside(t, kind)))
+            State::Fn(Box::new(move |t| inside(t, marker)))
         }
         _ => {
             tokenizer.exit(Token::HeadingSetextUnderline);
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 5860c5d..064da35 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -98,17 +98,17 @@
 //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
 //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
 
-use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE};
+use crate::constant::{
+    HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE,
+};
 use crate::construct::{
     blank_line::start as blank_line,
     partial_non_lazy_continuation::start as partial_non_lazy_continuation,
     partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
 };
 use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;
 
 /// Kind of HTML (flow).
 #[derive(Debug, PartialEq)]
@@ -129,49 +129,6 @@ enum Kind {
     Complete,
 }
 
-/// Type of quote, if we’re in a quoted attribute, in complete (condition 7).
-#[derive(Debug, PartialEq)]
-enum QuoteKind {
-    /// In a double quoted (`"`) attribute value.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// <a b="c" />
-    /// ```
-    Double,
-    /// In a single quoted (`'`) attribute value.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// <a b='c' />
-    /// ```
-    Single,
-}
-
-impl QuoteKind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            QuoteKind::Double => b'"',
-            QuoteKind::Single => b'\'',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `"` or `'`.
-    fn from_byte(byte: u8) -> QuoteKind {
-        match byte {
-            b'"' => QuoteKind::Double,
-            b'\'' => QuoteKind::Single,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// State needed to parse HTML (flow).
 #[derive(Debug)]
 struct Info {
@@ -179,12 +136,10 @@ struct Info {
     kind: Kind,
     /// Whether this is a start tag (`<` not followed by `/`).
     start_tag: bool,
-    /// Used depending on `kind` to collect all parsed bytes.
-    start: Option<Point>,
-    /// Collected index, for various reasons.
-    size: usize,
+    /// Start index of a tag name or cdata prefix.
+    start: usize,
     /// Current quote, when in a double or single quoted attribute value.
-    quote: Option<QuoteKind>,
+    quote: u8,
 }
 
 /// Start of HTML (flow), before optional whitespace.
 ///
 /// ```markdown
 /// > | <x />
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     if tokenizer.parse_state.constructs.html_flow {
         tokenizer.enter(Token::HtmlFlow);
         tokenizer.go(
             space_or_tab_with_options(SpaceOrTabOptions {
                 kind: Token::HtmlFlowData,
                 min: 0,
-                max,
+                max: if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
                 connect: false,
                 content_type: None,
             }),
@@ -249,9 +202,8 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         kind: Kind::Basic,
         // Assume closing tag (or no tag).
         start_tag: false,
-        start: None,
-        size: 0,
-        quote: None,
+        start: 0,
+        quote: 0,
     };
 
     match tokenizer.current {
@@ -261,7 +213,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         }
         Some(b'/') => {
             tokenizer.consume();
-            info.start = Some(tokenizer.point.clone());
+            info.start = tokenizer.point.index;
             State::Fn(Box::new(|t| tag_close_start(t, info)))
         }
         Some(b'?') => {
@@ -273,9 +225,10 @@ fn open(tokenizer: &mut Tokenizer) -> State {
             // right now, so we do need to search for `>`, similar to declarations.
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             info.start_tag = true;
-            info.start = Some(tokenizer.point.clone());
+            info.start = tokenizer.point.index;
             tag_name(tokenizer, info)
         }
         _ => State::Nok,
     }
 }
 
@@ -299,12 +252,6 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
             info.kind = Kind::Comment;
             State::Fn(Box::new(|t| comment_open_inside(t, info)))
         }
-        Some(b'[') => {
-            tokenizer.consume();
-            info.kind = Kind::Cdata;
-            info.size = 0;
-            State::Fn(Box::new(|t| cdata_open_inside(t, info)))
-        }
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             info.kind = Kind::Declaration;
@@ -312,6 +259,12 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
             tokenizer.concrete = true;
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
+        Some(b'[') => {
+            tokenizer.consume();
+            info.kind = Kind::Cdata;
+            info.start = tokenizer.point.index;
+            State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+        }
         _ => State::Nok,
     }
 }
 
@@ -342,12 +295,11 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Some(byte) if byte == CDATA_SEARCH[info.size] => {
-            info.size += 1;
+        Some(byte) if byte == HTML_CDATA_PREFIX[tokenizer.point.index - info.start] => {
             tokenizer.consume();
 
-            if info.size == CDATA_SEARCH.len() {
-                info.size = 0;
+            if tokenizer.point.index - info.start == HTML_CDATA_PREFIX.len() {
+                info.start = 0;
                 // Do not form containers.
                 tokenizer.concrete = true;
                 State::Fn(Box::new(|t| continuation(t, info)))
@@ -367,6 +319,7 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| tag_name(t, info)))
@@ -387,17 +340,18 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
         None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => {
             let slash = matches!(tokenizer.current, Some(b'/'));
-            let start = info.start.take().unwrap();
-            let name = Slice::from_position(
+            // Guaranteed to be valid ASCII bytes.
+            let slice = Slice::from_indices(
                 tokenizer.parse_state.bytes,
-                &Position {
-                    start: &start,
-                    end: &tokenizer.point,
-                },
-            )
-            .serialize()
-            .trim()
-            .to_lowercase();
+                info.start,
+                tokenizer.point.index,
+            );
+            let name = slice
+                .as_str()
+                // The line ending case might result in a `\r` that is already accounted for.
+                .trim()
+                .to_ascii_lowercase();
+            info.start = 0;
 
             if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
                 info.kind = Kind::Raw;
@@ -427,6 +381,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
                 }
             }
         }
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| tag_name(t, info)))
@@ -490,18 +445,19 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        Some(b'\t' | b' ') => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
+        }
         Some(b'/') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_end(t, info)))
         }
+        // ASCII alphanumerical and `:` and `_`.
         Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name(t, info)))
         }
-        Some(b'\t' | b' ') => {
-            tokenizer.consume();
-            State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
-        }
         _ => complete_end(tokenizer, info),
     }
 }
 
@@ -518,6 +474,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat
 /// ```
 fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`, `.`, `:`, and `_`.
         Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name(t, info)))
@@ -537,14 +494,14 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Some(b'=') => {
-            tokenizer.consume();
-            State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
-        }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))
         }
+        Some(b'=') => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
+        }
         _ => complete_attribute_name_before(tokenizer, info),
     }
 }
 
@@ -561,15 +518,15 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State
 fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
         None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
-        Some(byte) if matches!(byte, b'"' | b'\'') => {
-            info.quote = Some(QuoteKind::from_byte(byte));
-            tokenizer.consume();
-            State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
-        }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
         }
+        Some(b'"' | b'\'') => {
+            info.quote = tokenizer.current.unwrap();
+            tokenizer.consume();
+            State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
+        }
         _ => complete_attribute_value_unquoted(tokenizer, info),
     }
 }
 
@@ -585,7 +542,7 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) ->
 fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
         None | Some(b'\n') => State::Nok,
-        Some(byte) if byte == info.quote.as_ref().unwrap().as_byte() => {
+        Some(b'"' | b'\'') if tokenizer.current.unwrap() == info.quote => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))
         }
@@ -673,6 +630,21 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
+            tokenizer.exit(Token::HtmlFlowData);
+            tokenizer.check(blank_line_before, |ok| {
+                if ok {
+                    Box::new(continuation_after)
+                } else {
+                    Box::new(move |t| continuation_start(t, info))
+                }
+            })(tokenizer)
+        }
+        // Note: important that this is after the basic/complete case.
+        None | Some(b'\n') => {
+            tokenizer.exit(Token::HtmlFlowData);
+            continuation_start(tokenizer, info)
+        }
         Some(b'-') if info.kind == Kind::Comment => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_comment_inside(t, info)))
@@ -693,20 +665,6 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))
         }
-        Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
-            tokenizer.exit(Token::HtmlFlowData);
-            tokenizer.check(blank_line_before, |ok| {
-                if ok {
-                    Box::new(continuation_after)
-                } else {
-                    Box::new(move |t| continuation_start(t, info))
-                }
-            })(tokenizer)
-        }
-        None | Some(b'\n') => {
-            tokenizer.exit(Token::HtmlFlowData);
-            continuation_start(tokenizer, info)
-        }
         _ => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation(t, info)))
@@ -793,7 +751,7 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
     match tokenizer.current {
         Some(b'/') => {
             tokenizer.consume();
-            info.start = Some(tokenizer.point.clone());
+            info.start = tokenizer.point.index;
             State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
         }
         _ => continuation(tokenizer, info),
@@ -809,18 +767,15 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
 fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
         Some(b'>') => {
-            info.size = 0;
-
-            let start = info.start.take().unwrap();
-            let name = Slice::from_position(
+            // Guaranteed to be valid ASCII bytes.
+            let slice = Slice::from_indices(
                 tokenizer.parse_state.bytes,
-                &Position {
-                    start: &start,
-                    end: &tokenizer.point,
-                },
-            )
-            .serialize()
-            .to_lowercase();
+                info.start,
+                tokenizer.point.index,
+            );
+            let name = slice.as_str().to_ascii_lowercase();
+
+            info.start = 0;
 
             if HTML_RAW_NAMES.contains(&name.as_str()) {
                 tokenizer.consume();
@@ -829,13 +784,14 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State
                 continuation(tokenizer, info)
             }
         }
-        Some(b'A'..=b'Z' | b'a'..=b'z') if info.size < HTML_RAW_SIZE_MAX => {
+        Some(b'A'..=b'Z' | b'a'..=b'z')
+            if tokenizer.point.index - info.start < HTML_RAW_SIZE_MAX =>
+        {
             tokenizer.consume();
-            info.size += 1;
             State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
         }
         _ => {
-            info.size = 0;
+            info.start = 0;
             continuation(tokenizer, info)
         }
     }
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index f10a476..51beda5 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -54,12 +54,11 @@
 //! [html_flow]: crate::construct::html_flow
 //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
 
+use crate::constant::HTML_CDATA_PREFIX;
 use crate::construct::partial_space_or_tab::space_or_tab;
 use crate::token::Token;
 use crate::tokenizer::{State, StateFn, Tokenizer};
 
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
-
 /// Start of HTML (text)
 ///
 /// ```markdown
@@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(instruction))
         }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open))
@@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(comment_open_inside))
         }
-        Some(b'[') => {
-            tokenizer.consume();
-            State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
-        }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(declaration))
         }
+        Some(b'[') => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
+        }
         _ => State::Nok,
     }
 }
 
@@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> b
 ///          ^^^^^^
 /// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
-    match tokenizer.current {
-        Some(byte) if byte == CDATA_SEARCH[index] => {
-            tokenizer.consume();
+fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State {
+    if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) {
+        tokenizer.consume();
 
-            if index + 1 == CDATA_SEARCH.len() {
-                State::Fn(Box::new(cdata))
-            } else {
-                State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1)))
-            }
+        if size + 1 == HTML_CDATA_PREFIX.len() {
+            State::Fn(Box::new(cdata))
+        } else {
+            State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1)))
         }
-        _ => State::Nok,
+    } else {
+        State::Nok
     }
 }
 
@@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_close))
@@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_close))
@@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open))
@@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(end))
         }
+        // ASCII alphabetical and `:` and `_`.
         Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_name))
@@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphabetical and `-`, `.`, `:`, and `_`.
         Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_name))
         }
@@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_value_before))
         }
-        Some(byte) if byte == b'"' || byte == b'\'' => {
+        Some(b'"' | b'\'') => {
+            let marker = tokenizer.current.unwrap();
             tokenizer.consume();
-            State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte)))
+            State::Fn(Box::new(move |t| {
+                tag_open_attribute_value_quoted(t, marker)
+            }))
         }
         Some(_) => {
             tokenizer.consume();
@@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta
             tokenizer,
             Box::new(move |t| tag_open_attribute_value_quoted(t, marker)),
         ),
-        Some(byte) if byte == marker => {
+        Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_value_quoted_after))
         }
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 6399f81..a1ec8d9 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -214,16 +214,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
             media: Media {
                 start: label_start.start,
                 end: (label_end_start, label_end_start + 3),
-                // To do: virtual spaces not needed, create a `to_str`?
                 id: normalize_identifier(
-                    &Slice::from_position(
+                    // We don’t care about virtual spaces, so `indices` and `as_str` are fine.
+                    Slice::from_indices(
                         tokenizer.parse_state.bytes,
-                        &Position {
-                            start: &tokenizer.events[label_start.start.1].point,
-                            end: &tokenizer.events[label_end_start - 1].point,
-                        },
+                        tokenizer.events[label_start.start.1].point.index,
+                        tokenizer.events[label_end_start - 1].point.index,
                     )
-                    .serialize(),
+                    .as_str(),
                 ),
             },
         };
@@ -366,11 +364,11 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 ///     ^
 /// ```
 fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State {
-    let label_start = tokenizer
+    tokenizer
         .label_start_stack
         .get_mut(label_start_index)
-        .unwrap();
-    label_start.balanced = true;
+        .unwrap()
+        .balanced = true;
     State::Nok
 }
 
@@ -529,23 +527,24 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {
 ///     ^
 /// ```
 fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
-    let end = skip::to_back(
-        &tokenizer.events,
-        tokenizer.events.len() - 1,
-        &[Token::ReferenceString],
-    );
-
-    // To do: virtual spaces not needed, create a `to_str`?
-    let id = Slice::from_position(
-        tokenizer.parse_state.bytes,
-        &Position::from_exit_event(&tokenizer.events, end),
-    )
-    .serialize();
-
     if tokenizer
         .parse_state
         .definitions
-        .contains(&normalize_identifier(&id))
+        // We don’t care about virtual spaces, so `as_str` is fine.
+        .contains(&normalize_identifier(
+            Slice::from_position(
+                tokenizer.parse_state.bytes,
+                &Position::from_exit_event(
+                    &tokenizer.events,
+                    skip::to_back(
+                        &tokenizer.events,
+                        tokenizer.events.len() - 1,
+                        &[Token::ReferenceString],
+                    ),
+                ),
+            )
+            .as_str(),
+        ))
     {
         State::Ok
     } else {
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index d30b8dd..4a3508e 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -64,9 +64,8 @@ pub fn open(tokenizer: &mut Tokenizer) -> State {
     tokenizer.consume();
     tokenizer.exit(Token::LabelMarker);
     tokenizer.exit(Token::LabelImage);
-    let end = tokenizer.events.len() - 1;
     tokenizer.label_start_stack.push(LabelStart {
-        start: (end - 5, end),
+        start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1),
         balanced: false,
         inactive: false,
     });
diff --git a/src/construct/list.rs b/src/construct/list.rs
index 9b59130..d5a9899 100644
--- a/src/construct/list.rs
+++ b/src/construct/list.rs
@@ -56,69 +56,6 @@ use crate::util::{
     slice::{Position, Slice},
 };
 
-/// Type of list.
-#[derive(Debug, PartialEq)]
-enum Kind {
-    /// In a dot (`.`) list item.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// 1. a
-    /// ```
-    Dot,
-    /// In a paren (`)`) list item.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// 1) a
-    /// ```
-    Paren,
-    /// In an asterisk (`*`) list item.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// * a
-    /// ```
-    Asterisk,
-    /// In a plus (`+`) list item.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// + a
-    /// ```
-    Plus,
-    /// In a dash (`-`) list item.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// - a
-    /// ```
-    Dash,
-}
-
-impl Kind {
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `.`, `)`, `*`, `+`, or `-`.
-    fn from_byte(byte: u8) -> Kind {
-        match byte {
-            b'.' => Kind::Dot,
-            b')' => Kind::Paren,
-            b'*' => Kind::Asterisk,
-            b'+' => Kind::Plus,
-            b'-' => Kind::Dash,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// Start of list item.
 ///
 /// ```markdown
@@ -126,15 +63,19 @@ impl Kind {
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     if tokenizer.parse_state.constructs.list {
         tokenizer.enter(Token::ListItem);
-        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -149,15 +90,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // Unordered.
-        Some(b'*' | b'+' | b'-') => tokenizer.check(thematic_break, |ok| {
+        Some(b'*' | b'-') => tokenizer.check(thematic_break, |ok| {
             Box::new(if ok { nok } else { before_unordered })
         })(tokenizer),
+        Some(b'+') => before_unordered(tokenizer),
         // Ordered.
-        Some(byte) if byte.is_ascii_digit() && (!tokenizer.interrupt || byte == b'1') => {
-            tokenizer.enter(Token::ListItemPrefix);
-            tokenizer.enter(Token::ListItemValue);
-            inside(tokenizer, 0)
-        }
+        Some(b'0'..=b'9') if !tokenizer.interrupt => before_ordered(tokenizer),
+        Some(b'1') => before_ordered(tokenizer),
         _ => State::Nok,
     }
 }
@@ -175,6 +114,18 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
     marker(tokenizer)
 }
 
+/// Start of an ordered list item.
+///
+/// ```markdown
+/// > | * a
+///     ^
+/// ```
+fn before_ordered(tokenizer: &mut Tokenizer) -> State {
+    tokenizer.enter(Token::ListItemPrefix);
+    tokenizer.enter(Token::ListItemValue);
+    inside(tokenizer, 0)
+}
+
 /// In an ordered list item value.
 ///
 /// ```markdown
@@ -183,14 +134,14 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn inside(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        Some(byte) if byte.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
-            tokenizer.consume();
-            State::Fn(Box::new(move |t| inside(t, size + 1)))
-        }
         Some(b'.' | b')') if !tokenizer.interrupt || size < 2 => {
             tokenizer.exit(Token::ListItemValue);
             marker(tokenizer)
         }
+        Some(b'0'..=b'9') if size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
+            tokenizer.consume();
+            State::Fn(Box::new(move |t| inside(t, size + 1)))
+        }
         _ => State::Nok,
     }
 }
@@ -262,7 +213,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State {
 ///       ^
 /// ```
 fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
-    if matches!(tokenizer.current, Some(b'\t' | b' ')) {
+    if let Some(b'\t' | b' ') = tokenizer.current {
         State::Nok
     } else {
         State::Ok
@@ -309,7 +260,7 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State {
                 end: &tokenizer.point,
             },
         )
-        .size();
+        .len();
 
         if blank {
             prefix += 1;
@@ -389,8 +340,8 @@ fn nok(_tokenizer: &mut Tokenizer) -> State {
 pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
     let mut index = 0;
     let mut balance = 0;
-    let mut lists_wip: Vec<(Kind, usize, usize, usize)> = vec![];
-    let mut lists: Vec<(Kind, usize, usize, usize)> = vec![];
+    let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![];
+    let mut lists: Vec<(u8, usize, usize, usize)> = vec![];
 
     // Merge list items.
     while index < tokenizer.events.len() {
@@ -400,12 +351,14 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
         if event.event_type == EventType::Enter {
             let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1;
             let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]);
-            let kind = Kind::from_byte(
-                Slice::from_point(tokenizer.parse_state.bytes, &tokenizer.events[marker].point)
-                    .head()
-                    .unwrap(),
-            );
-            let current = (kind, balance, index, end);
+            // Guaranteed to be a valid ASCII byte.
+            let marker = Slice::from_index(
+                tokenizer.parse_state.bytes,
+                tokenizer.events[marker].point.index,
+            )
+            .head()
+            .unwrap();
+            let current = (marker, balance, index, end);
 
             let mut list_index = lists_wip.len();
             let mut matched = false;
@@ -475,7 +428,7 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
         let mut list_start = tokenizer.events[list_item.2].clone();
         let mut list_end = tokenizer.events[list_item.3].clone();
         let token_type = match list_item.0 {
-            Kind::Paren | Kind::Dot => Token::ListOrdered,
+            b'.' | b')' => Token::ListOrdered,
             _ => Token::ListUnordered,
         };
         list_start.token_type = token_type.clone();
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index 146dc40..ec5669c 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -81,10 +81,9 @@ fn inside(tokenizer: &mut Tokenizer) -> State {
 
 /// Merge “`Paragraph`”s, which currently span a single line, into actual
 /// `Paragraph`s that span multiple lines.
 pub fn resolve(tokenizer: &mut Tokenizer) {
-    let len = tokenizer.events.len();
     let mut index = 0;
 
-    while index < len {
+    while index < tokenizer.events.len() {
         let event = &tokenizer.events[index];
 
         if event.event_type == EventType::Enter && event.token_type == Token::Paragraph {
diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs
index be8d6c8..155a1a3 100644
--- a/src/construct/partial_bom.rs
+++ b/src/construct/partial_bom.rs
@@ -10,13 +10,12 @@ use crate::tokenizer::{State, Tokenizer};
 ///     ^^^^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    match tokenizer.current {
-        Some(0xEF) => {
-            tokenizer.enter(Token::ByteOrderMark);
-            tokenizer.consume();
-            State::Fn(Box::new(cont))
-        }
-        _ => State::Nok,
+    if tokenizer.current == Some(0xEF) {
+        tokenizer.enter(Token::ByteOrderMark);
+        tokenizer.consume();
+        State::Fn(Box::new(cont))
+    } else {
+        State::Nok
     }
 }
 
@@ -27,12 +26,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 ///     ^^^^
 /// ```
 fn cont(tokenizer: &mut Tokenizer) -> State {
-    match tokenizer.current {
-        Some(0xBB) => {
-            tokenizer.consume();
-            State::Fn(Box::new(end))
-        }
-        _ => State::Nok,
+    if tokenizer.current == Some(0xBB) {
+        tokenizer.consume();
+        State::Fn(Box::new(end))
+    } else {
+        State::Nok
     }
 }
 
@@ -43,12 +41,11 @@ fn cont(tokenizer: &mut Tokenizer) -> State {
 ///     ^^^^
 /// ```
 fn end(tokenizer: &mut Tokenizer) -> State {
-    match tokenizer.current {
-        Some(0xBF) => {
-            tokenizer.consume();
-            tokenizer.exit(Token::ByteOrderMark);
-            State::Ok
-        }
-        _ => State::Nok,
+    if tokenizer.current == Some(0xBF) {
+        tokenizer.consume();
+        tokenizer.exit(Token::ByteOrderMark);
+        State::Ok
+    } else {
+        State::Nok
     }
 }
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 0a3721c..809aa27 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -125,8 +125,8 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
             tokenizer.exit(info.options.marker.clone());
             State::Fn(Box::new(|t| enclosed_before(t, info)))
         }
-        None | Some(b' ' | b')') => State::Nok,
-        Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok,
+        // ASCII control, space, closing paren, but *not* `\0`.
+ None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok, Some(_) => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.raw.clone()); @@ -166,12 +166,12 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { + None | Some(b'\n' | b'<') => State::Nok, Some(b'>') => { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); enclosed_before(tokenizer, info) } - None | Some(b'\n' | b'<') => State::Nok, Some(b'\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed_escape(t, info))) @@ -207,40 +207,25 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(b'(') => { - if info.balance >= info.options.limit { - State::Nok - } else { - tokenizer.consume(); - info.balance += 1; - State::Fn(Box::new(move |t| raw(t, info))) - } + None | Some(b'\t' | b'\n' | b' ' | b')') if info.balance == 0 => { + tokenizer.exit(Token::Data); + tokenizer.exit(info.options.string.clone()); + tokenizer.exit(info.options.raw.clone()); + tokenizer.exit(info.options.destination); + State::Ok } - Some(b')') => { - if info.balance == 0 { - tokenizer.exit(Token::Data); - tokenizer.exit(info.options.string.clone()); - tokenizer.exit(info.options.raw.clone()); - tokenizer.exit(info.options.destination); - State::Ok - } else { - tokenizer.consume(); - info.balance -= 1; - State::Fn(Box::new(move |t| raw(t, info))) - } + Some(b'(') if info.balance < info.options.limit => { + tokenizer.consume(); + info.balance += 1; + State::Fn(Box::new(move |t| raw(t, info))) } - None | Some(b'\t' | b'\n' | b' ') => { - if info.balance > 0 { - State::Nok - } else { - tokenizer.exit(Token::Data); - tokenizer.exit(info.options.string.clone()); - tokenizer.exit(info.options.raw.clone()); - tokenizer.exit(info.options.destination); - State::Ok - } + // ASCII control (but *not* `\0`) and space and `(`. 
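
The open-coded ranges such as `0x01..=0x1F | b' ' | b'(' | 0x7F` replace the earlier `byte != b'\0' && byte.is_ascii_control()` guards plus separate space and paren arms. The control-character part of the pattern should agree with the standard library for every byte; a quick standalone check (illustrative, not part of the patch):

```rust
fn main() {
    for byte in u8::MIN..=u8::MAX {
        let by_range = matches!(byte, 0x01..=0x1F | 0x7F);
        let by_std = byte != b'\0' && byte.is_ascii_control();
        // `u8::is_ascii_control` is true for 0x00..=0x1F and 0x7F, so the
        // range pattern is exactly "ASCII control minus NUL".
        assert_eq!(by_range, by_std, "disagreement at {byte:#04x}");
    }
    println!("0x01..=0x1F | 0x7F matches ASCII control without NUL");
}
```
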
+ None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => State::Nok, + Some(b')') => { + tokenizer.consume(); + info.balance -= 1; + State::Fn(Box::new(move |t| raw(t, info))) } - Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok, Some(b'\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw_escape(t, info))) diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 7e40a2d..6fdb70d 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -123,39 +123,43 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ^ /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { - match tokenizer.current { - None | Some(b'[') => State::Nok, - Some(b']') if !info.data => State::Nok, - _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok, - Some(b']') => { - tokenizer.exit(info.options.string.clone()); - tokenizer.enter(info.options.marker.clone()); - tokenizer.consume(); - tokenizer.exit(info.options.marker.clone()); - tokenizer.exit(info.options.label); - State::Ok - } - Some(b'\n') => tokenizer.go( - space_or_tab_eol_with_options(EolOptions { - content_type: Some(ContentType::String), - connect: info.connect, - }), - |t| { - info.connect = true; - at_break(t, info) - }, - )(tokenizer), - _ => { - tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - - if info.connect { - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - } else { - info.connect = true; + if info.size > LINK_REFERENCE_SIZE_MAX + || matches!(tokenizer.current, None | Some(b'[')) + || (matches!(tokenizer.current, Some(b']')) && !info.data) + { + State::Nok + } else { + match tokenizer.current { + Some(b'\n') => tokenizer.go( + space_or_tab_eol_with_options(EolOptions { + content_type: Some(ContentType::String), + connect: info.connect, + }), + |t| { + info.connect = true; + at_break(t, info) + }, + )(tokenizer), + Some(b']') => { + tokenizer.exit(info.options.string.clone()); + tokenizer.enter(info.options.marker.clone()); + tokenizer.consume(); + tokenizer.exit(info.options.marker.clone()); + tokenizer.exit(info.options.label); + State::Ok } + _ => { + tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); + + if info.connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else { + info.connect = true; + } - label(tokenizer, info) + label(tokenizer, info) + } } } } @@ -172,30 +176,19 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - _ if info.size > LINK_REFERENCE_SIZE_MAX => { - tokenizer.exit(Token::Data); - at_break(tokenizer, info) - } - Some(b'\t' | b' ') => { - tokenizer.consume(); - info.size += 1; - State::Fn(Box::new(|t| label(t, info))) - } - Some(b'\\') => { - tokenizer.consume(); - info.size += 1; - if !info.data { - info.data = true; - } - State::Fn(Box::new(|t| escape(t, info))) - } - Some(_) => { - tokenizer.consume(); - info.size += 1; - if !info.data { - info.data = true; + Some(byte) => { + if info.size > LINK_REFERENCE_SIZE_MAX { + tokenizer.exit(Token::Data); + at_break(tokenizer, info) + } else { + let func = if matches!(byte, b'\\') { escape } else { label }; + tokenizer.consume(); + info.size += 1; + if !info.data && !matches!(byte, b'\t' | b' ') { + info.data = true; + } + State::Fn(Box::new(move |t| func(t, info))) } - State::Fn(Box::new(|t| label(t, info))) } } } diff --git a/src/construct/partial_title.rs 
b/src/construct/partial_title.rs index 80861af..9cf2f14 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -48,70 +48,13 @@ pub struct Options { pub string: Token, } -/// Type of title. -#[derive(Debug, PartialEq)] -enum Kind { - /// In a parenthesized (`(` and `)`) title. - /// - /// ## Example - /// - /// ```markdown - /// (a) - /// ``` - Paren, - /// In a double quoted (`"`) title. - /// - /// ## Example - /// - /// ```markdown - /// "a" - /// ``` - Double, - /// In a single quoted (`'`) title. - /// - /// ## Example - /// - /// ```markdown - /// 'a' - /// ``` - Single, -} - -impl Kind { - /// Turn the kind into a byte ([u8]). - /// - /// > 👉 **Note**: a closing paren is used for `Kind::Paren`. - fn as_byte(&self) -> u8 { - match self { - Kind::Paren => b')', - Kind::Double => b'"', - Kind::Single => b'\'', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. - /// - /// ## Panics - /// - /// Panics if `byte` is not `(`, `"`, or `'`. - fn from_byte(byte: u8) -> Kind { - match byte { - b'(' => Kind::Paren, - b'"' => Kind::Double, - b'\'' => Kind::Single, - _ => unreachable!("invalid byte"), - } - } -} - /// State needed to parse titles. #[derive(Debug)] struct Info { /// Whether we’ve seen data. connect: bool, - /// Kind of title. - kind: Kind, + /// Closing marker. + marker: u8, /// Configuration. options: Options, } @@ -124,10 +67,11 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Some(byte) if matches!(byte, b'"' | b'\'' | b'(') => { + Some(b'"' | b'\'' | b'(') => { + let marker = tokenizer.current.unwrap(); let info = Info { connect: false, - kind: Kind::from_byte(byte), + marker: if marker == b'(' { b')' } else { marker }, options, }; tokenizer.enter(info.options.title.clone()); @@ -150,7 +94,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -172,10 +116,6 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { - tokenizer.exit(info.options.string.clone()); - begin(tokenizer, info) - } None => State::Nok, Some(b'\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { @@ -187,7 +127,11 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { at_break(t, info) }, )(tokenizer), - _ => { + Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { + tokenizer.exit(info.options.string.clone()); + begin(tokenizer, info) + } + Some(_) => { tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); if info.connect { @@ -210,21 +154,18 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn title(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + None | Some(b'\n') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - None | Some(b'\n') => { + Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { tokenizer.exit(Token::Data); 
at_break(tokenizer, info) } - Some(b'\\') => { + Some(byte) => { + let func = if matches!(byte, b'\\') { escape } else { title }; tokenizer.consume(); - State::Fn(Box::new(|t| escape(t, info))) - } - _ => { - tokenizer.consume(); - State::Fn(Box::new(|t| title(t, info))) + State::Fn(Box::new(move |t| func(t, info))) } } } @@ -237,7 +178,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'"' | b'\'' | b')') => { tokenizer.consume(); State::Fn(Box::new(|t| title(t, info))) } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 13815cb..4f872ba 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -92,8 +92,7 @@ fn trim_data( if trim_end { let mut index = slice.bytes.len(); - let vs = slice.after; - let mut spaces_only = vs == 0; + let mut spaces_only = slice.after == 0; while index > 0 { match slice.bytes[index - 1] { b' ' => {} @@ -105,10 +104,10 @@ fn trim_data( } let diff = slice.bytes.len() - index; - let token_type = if spaces_only - && hard_break - && exit_index + 1 < tokenizer.events.len() + let token_type = if hard_break + && spaces_only && diff >= HARD_BREAK_PREFIX_SIZE_MIN + && exit_index + 1 < tokenizer.events.len() { Token::HardBreakTrailing } else { @@ -123,7 +122,7 @@ fn trim_data( return; } - if diff > 0 || vs > 0 { + if diff > 0 || slice.after > 0 { let exit_point = tokenizer.events[exit_index].point.clone(); let mut enter_point = exit_point.clone(); enter_point.index -= diff; @@ -156,14 +155,11 @@ fn trim_data( if trim_start { let mut index = 0; - let vs = slice.before; while index < slice.bytes.len() { match slice.bytes[index] { - b' ' | b'\t' => {} + b' ' | b'\t' => index += 1, _ => break, } - - index += 1; } // The whole data is whitespace. @@ -174,7 +170,7 @@ fn trim_data( return; } - if index > 0 || vs > 0 { + if index > 0 || slice.before > 0 { let enter_point = tokenizer.events[exit_index - 1].point.clone(); let mut exit_point = enter_point.clone(); exit_point.index += index; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 4fc4dc4..785d132 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -53,64 +53,11 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::token::Token; use crate::tokenizer::{State, Tokenizer}; -/// Type of thematic break. -#[derive(Debug, PartialEq)] -enum Kind { - /// In a thematic break using asterisks (`*`). - /// - /// ## Example - /// - /// ```markdown - /// *** - /// ``` - Asterisk, - /// In a thematic break using dashes (`-`). - /// - /// ## Example - /// - /// ```markdown - /// --- - /// ``` - Dash, - /// In a thematic break using underscores (`_`). - /// - /// ## Example - /// - /// ```markdown - /// ___ - /// ``` - Underscore, -} - -impl Kind { - /// Turn the kind into a byte ([u8]). - fn as_byte(&self) -> u8 { - match self { - Kind::Asterisk => b'*', - Kind::Dash => b'-', - Kind::Underscore => b'_', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// ## Panics - /// - /// Panics if `byte` is not `*`, `-`, or `_`. - fn from_byte(byte: u8) -> Kind { - match byte { - b'*' => Kind::Asterisk, - b'-' => Kind::Dash, - b'_' => Kind::Underscore, - _ => unreachable!("invalid byte"), - } - } -} - /// State needed to parse thematic breaks. #[derive(Debug)] struct Info { - /// Kind of marker. 
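
Both `partial_label.rs` and `partial_title.rs` above now choose the next state function as a value, `let func = if matches!(byte, b'\\') { escape } else { title };`, so the consume-and-count bookkeeping is written once instead of being duplicated per match arm. A reduced sketch of the pattern with two hypothetical stand-in states:

```rust
/// Toy stand-in for a "stay here" state function.
fn title(byte: u8) -> String {
    format!("stay in `title` after {:?}", byte as char)
}

/// Toy stand-in for an escape state function.
fn escape(byte: u8) -> String {
    format!("switch to `escape` after {:?}", byte as char)
}

/// One shared step: pick the successor, then do the common bookkeeping.
fn step(byte: u8, size: &mut usize) -> String {
    let func: fn(u8) -> String = if byte == b'\\' { escape } else { title };
    *size += 1; // shared bookkeeping, written once
    func(byte)
}

fn main() {
    let mut size = 0;
    println!("{}", step(b'a', &mut size));
    println!("{}", step(b'\\', &mut size));
    println!("size: {size}");
}
```
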
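
The `partial_whitespace.rs` hunk above reorders the same conditions rather than changing them: a trailing run becomes `Token::HardBreakTrailing` only when hard breaks apply, the run is spaces-only, and it reaches `HARD_BREAK_PREFIX_SIZE_MIN`. A standalone sketch of that scan, ignoring the virtual spaces (`slice.after`) the real code also tracks; the constant value 2 is an assumption matching CommonMark's two-space rule:

```rust
/// Assumed to mirror the crate's `HARD_BREAK_PREFIX_SIZE_MIN`.
const HARD_BREAK_PREFIX_SIZE_MIN: usize = 2;

/// Classify trailing whitespace on one line, like `trim_data` above.
fn classify_trailing(line: &[u8]) -> &'static str {
    let mut index = line.len();
    let mut spaces_only = true;
    while index > 0 {
        match line[index - 1] {
            b' ' => {}
            b'\t' => spaces_only = false,
            _ => break,
        }
        index -= 1;
    }
    let diff = line.len() - index;
    if spaces_only && diff >= HARD_BREAK_PREFIX_SIZE_MIN {
        "hard break trailing"
    } else if diff > 0 {
        "plain trailing whitespace"
    } else {
        "no trailing whitespace"
    }
}

fn main() {
    assert_eq!(classify_trailing(b"alpha  "), "hard break trailing");
    assert_eq!(classify_trailing(b"alpha \t"), "plain trailing whitespace");
    assert_eq!(classify_trailing(b"alpha"), "no trailing whitespace");
    println!("ok");
}
```
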
- kind: Kind, + /// Marker. + marker: u8, /// Number of markers. size: usize, } @@ -122,15 +69,19 @@ struct Info { /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - if tokenizer.parse_state.constructs.thematic_break { tokenizer.enter(Token::ThematicBreak); - tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) + tokenizer.go( + space_or_tab_min_max( + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ), + before, + )(tokenizer) } else { State::Nok } @@ -144,10 +95,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if matches!(byte, b'*' | b'-' | b'_') => at_break( + Some(b'*' | b'-' | b'_') => at_break( tokenizer, Info { - kind: Kind::from_byte(byte), + marker: tokenizer.current.unwrap(), size: 0, }, ), @@ -163,13 +114,13 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some(b'\n' | b'\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { + None | Some(b'\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(Token::ThematicBreak); // Feel free to interrupt. tokenizer.interrupt = false; State::Ok } - Some(byte) if byte == info.kind.as_byte() => { + Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => { tokenizer.enter(Token::ThematicBreakSequence); sequence(tokenizer, info) } @@ -185,7 +136,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| sequence(t, info))) -- cgit
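
Taken together, the `thematic_break.rs` states above accept a run of one marker byte, `*`, `-`, or `_`, at least `THEMATIC_BREAK_MARKER_COUNT_MIN` markers long, and store only that byte plus a count in `Info`. Stripped of the whitespace handling the real states interleave, the accepted shape reduces to this sketch (the constant value 3 is an assumption following CommonMark):

```rust
/// Assumed to mirror the crate's `THEMATIC_BREAK_MARKER_COUNT_MIN`.
const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3;

/// Is `line` a long-enough run of a single thematic-break marker?
/// (The real parser also allows spaces and tabs between markers.)
fn is_thematic_break(line: &[u8]) -> bool {
    match line.first().copied() {
        Some(marker) if matches!(marker, b'*' | b'-' | b'_') => {
            line.len() >= THEMATIC_BREAK_MARKER_COUNT_MIN
                && line.iter().all(|&byte| byte == marker)
        }
        _ => false,
    }
}

fn main() {
    assert!(is_thematic_break(b"***"));
    assert!(is_thematic_break(b"-----"));
    assert!(!is_thematic_break(b"**"));
    assert!(!is_thematic_break(b"__*"));
    println!("ok");
}
```
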