Diffstat
-rw-r--r--  src/construct/attention.rs            |  88
-rw-r--r--  src/construct/autolink.rs             |  57
-rw-r--r--  src/construct/character_escape.rs     |   3
-rw-r--r--  src/construct/character_reference.rs  | 132
-rw-r--r--  src/construct/code_fenced.rs          | 123
-rw-r--r--  src/construct/code_indented.rs        |  37
-rw-r--r--  src/construct/code_text.rs            |   7
-rw-r--r--  src/construct/definition.rs           |  21
-rw-r--r--  src/construct/hard_break_escape.rs    |   4
-rw-r--r--  src/construct/heading_atx.rs          |  28
-rw-r--r--  src/construct/heading_setext.rs       |  96
-rw-r--r--  src/construct/html_flow.rs            | 212
-rw-r--r--  src/construct/html_text.rs            |  46
-rw-r--r--  src/construct/label_end.rs            |  47
-rw-r--r--  src/construct/label_start_image.rs    |   3
-rw-r--r--  src/construct/list.rs                 | 135
-rw-r--r--  src/construct/paragraph.rs            |   3
-rw-r--r--  src/construct/partial_bom.rs          |  37
-rw-r--r--  src/construct/partial_destination.rs  |  53
-rw-r--r--  src/construct/partial_label.rs        | 101
-rw-r--r--  src/construct/partial_title.rs        |  93
-rw-r--r--  src/construct/partial_whitespace.rs   |  18
-rw-r--r--  src/construct/thematic_break.rs       |  85
23 files changed, 522 insertions(+), 907 deletions(-)
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index b042645..583fde2 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -88,54 +88,11 @@ enum GroupKind {
     Other,
 }

-/// Type of sequence.
-#[derive(Debug, PartialEq)]
-enum MarkerKind {
-    /// In a run with asterisks.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// *a*
-    /// ```
-    Asterisk,
-    /// In a run with underscores.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// _a_
-    /// ```
-    Underscore,
-}
-
-impl MarkerKind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            MarkerKind::Asterisk => b'*',
-            MarkerKind::Underscore => b'_',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `*` or `_`.
-    fn from_byte(byte: u8) -> MarkerKind {
-        match byte {
-            b'*' => MarkerKind::Asterisk,
-            b'_' => MarkerKind::Underscore,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// Attentention sequence that we can take markers from.
 #[derive(Debug)]
 struct Sequence {
-    /// Marker used in this sequence.
-    marker: MarkerKind,
+    /// Marker as a byte (`u8`) used in this sequence.
+    marker: u8,
     /// The depth in events where this sequence resides.
     balance: usize,
     /// The index into events where this sequence’s `Enter` currently resides.
@@ -160,9 +117,9 @@ struct Sequence {
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if tokenizer.parse_state.constructs.attention && matches!(byte, b'*' | b'_') => {
+        Some(b'*' | b'_') if tokenizer.parse_state.constructs.attention => {
             tokenizer.enter(Token::AttentionSequence);
-            inside(tokenizer, MarkerKind::from_byte(byte))
+            inside(tokenizer, tokenizer.current.unwrap())
         }
         _ => State::Nok,
     }
@@ -174,14 +131,17 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// > | **
 ///     ^^
 /// ```
-fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State {
-    if tokenizer.current == Some(marker.as_byte()) {
-        tokenizer.consume();
-        State::Fn(Box::new(move |t| inside(t, marker)))
-    } else {
-        tokenizer.exit(Token::AttentionSequence);
-        tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
-        State::Ok
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
+    match tokenizer.current {
+        Some(b'*' | b'_') if tokenizer.current.unwrap() == marker => {
+            tokenizer.consume();
+            State::Fn(Box::new(move |t| inside(t, marker)))
+        }
+        _ => {
+            tokenizer.exit(Token::AttentionSequence);
+            tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
+            State::Ok
+        }
     }
 }

@@ -219,16 +179,10 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
                     String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]);
                 let char_after = string_after.chars().next();

-                let marker = MarkerKind::from_byte(
-                    Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
-                        .head()
-                        .unwrap(),
-                );
-                let before = classify_character(if enter.point.index > 0 {
-                    char_before
-                } else {
-                    None
-                });
+                let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
+                    .head()
+                    .unwrap();
+                let before = classify_character(char_before);
                 let after = classify_character(char_after);
                 let open = after == GroupKind::Other
                     || (after == GroupKind::Punctuation && before != GroupKind::Other);
@@ -245,12 +199,12 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
                     start_point: enter.point.clone(),
                     end_point: exit.point.clone(),
                     size: exit.point.index - enter.point.index,
-                    open: if marker == MarkerKind::Asterisk {
+                    open: if marker == b'*' {
                         open
                     } else {
                         open && (before != GroupKind::Other || !close)
                     },
-                    close: if marker == MarkerKind::Asterisk {
+                    close: if marker == b'*' {
                         close
                     } else {
                         close && (after != GroupKind::Other || !open)
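The pattern in the attention diff recurs throughout this changeset: a two-variant `Kind`-style enum with `as_byte`/`from_byte` conversions is replaced by storing the marker byte itself. A minimal standalone sketch of that shape (not the crate’s actual API):

```rust
// Sketch: keep the marker as a raw `u8` and let an or-pattern do the
// validation that `MarkerKind::from_byte` used to do.
struct Sequence {
    marker: u8, // `b'*'` or `b'_'`, taken straight from the input
}

fn open(byte: u8) -> Option<Sequence> {
    match byte {
        // The `b'*' | b'_'` or-pattern replaces `MarkerKind::from_byte`.
        b'*' | b'_' => Some(Sequence { marker: byte }),
        _ => None,
    }
}

fn continues(seq: &Sequence, byte: u8) -> bool {
    // A direct byte comparison replaces `byte == marker.as_byte()`.
    seq.marker == byte
}

fn main() {
    let seq = open(b'*').unwrap();
    assert!(continues(&seq, b'*'));
    assert!(!continues(&seq, b'_'));
}
```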
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
index b843af8..c0514ae 100644
--- a/src/construct/autolink.rs
+++ b/src/construct/autolink.rs
@@ -137,12 +137,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if byte.is_ascii_alphabetic() => {
+        // ASCII alphabetic.
+        Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(scheme_or_email_atext))
         }
-        Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer),
-        _ => State::Nok,
+        _ => email_atext(tokenizer),
     }
 }

@@ -199,8 +199,8 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State {
             tokenizer.exit(Token::AutolinkProtocol);
             end(tokenizer)
         }
-        Some(byte) if byte.is_ascii_control() => State::Nok,
-        None | Some(b' ') => State::Nok,
+        // ASCII control or space.
+        None | Some(b'\0'..=0x1F | b' ' | 0x7F) => State::Nok,
         Some(_) => {
             tokenizer.consume();
             State::Fn(Box::new(url_inside))
@@ -220,7 +220,26 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
         }
-        Some(byte) if is_ascii_atext(byte) => {
+        // ASCII atext.
+        //
+        // atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or
+        // a byte in the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027
+        // APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`),
+        // U+002D DASH (`-`), U+002F SLASH (`/`), U+003D EQUALS TO (`=`),
+        // U+003F QUESTION MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE
+        // ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE
+        // (`~`).
+        //
+        // See:
+        // **\[RFC5322]**:
+        // [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+        // P. Resnick.
+        // IETF.
+        //
+        // [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+        Some(
+            b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~',
+        ) => {
             tokenizer.consume();
             State::Fn(Box::new(email_atext))
         }
@@ -236,7 +255,8 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size),
+        // ASCII alphanumeric.
+        Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size),
         _ => State::Nok,
     }
 }
@@ -279,7 +299,8 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {
             tokenizer.consume();
             State::Fn(Box::new(move |t| email_value(t, size + 1)))
         }
-        Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
+        // ASCII alphanumeric.
+        Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| email_label(t, size + 1)))
         }
@@ -307,23 +328,3 @@ fn end(tokenizer: &mut Tokenizer) -> State {
         _ => unreachable!("expected `>`"),
     }
 }
-
-/// Check whether the character code represents an ASCII atext.
-///
-/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in
-/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
-/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F
-/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E
-/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE
-/// (`{`) to U+007E TILDE (`~`).
-///
-/// See:
-/// **\[RFC5322]**:
-/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
-/// P. Resnick.
-/// IETF.
-///
-/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
-fn is_ascii_atext(byte: u8) -> bool {
-    matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~')
-}
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 02e8b62..4419d7a 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -63,7 +63,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if byte.is_ascii_punctuation() => {
+        // ASCII punctuation.
+        Some(b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~') => {
             tokenizer.enter(Token::CharacterEscapeValue);
             tokenizer.consume();
             tokenizer.exit(Token::CharacterEscapeValue);
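Both diffs above inline character classes as byte-range patterns: `is_ascii_atext` disappears into an or-pattern in autolink, and `is_ascii_punctuation` becomes four explicit ranges in character escape. A standalone sketch verifying the equivalence for the punctuation case:

```rust
// Sketch: ASCII punctuation written as the four byte ranges used in the
// character_escape diff; equivalent to `u8::is_ascii_punctuation`.
fn is_punctuation(byte: u8) -> bool {
    matches!(byte, b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~')
}

fn main() {
    // Every byte agrees with the standard library predicate.
    for byte in 0..=u8::MAX {
        assert_eq!(is_punctuation(byte), byte.is_ascii_punctuation());
    }
}
```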
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index 90763c1..cd489a4 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -66,67 +66,18 @@ use crate::constant::{
     CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
 };
 use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-/// Kind of a character reference.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
-    /// Numeric decimal character reference.
-    ///
-    /// ```markdown
-    /// > | a&#9;b
-    ///      ^^^^^
-    /// ```
-    Decimal,
-    /// Numeric hexadecimal character reference.
-    ///
-    /// ```markdown
-    /// > | a&#x7b;b
-    ///      ^^^^^^
-    /// ```
-    Hexadecimal,
-    /// Named character reference.
-    ///
-    /// ```markdown
-    /// > | a&amp;b
-    ///      ^^^^^
-    /// ```
-    Named,
-}
-
-impl Kind {
-    /// Get the maximum size of characters allowed in the value of a character
-    /// reference.
-    fn max(&self) -> usize {
-        match self {
-            Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
-            Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
-            Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
-        }
-    }
-
-    /// Check if a byte ([`u8`]) is allowed.
-    fn allowed(&self, byte: u8) -> bool {
-        let check = match self {
-            Kind::Hexadecimal => u8::is_ascii_hexdigit,
-            Kind::Decimal => u8::is_ascii_digit,
-            Kind::Named => u8::is_ascii_alphanumeric,
-        };
-
-        check(&byte)
-    }
-}
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;

 /// State needed to parse character references.
 #[derive(Debug, Clone)]
 struct Info {
-    /// Place of value start.
-    start: Point,
-    /// Size of value.
-    size: usize,
-    /// Kind of character reference.
-    kind: Kind,
+    /// Index of where value starts.
+    start: usize,
+    /// Marker of character reference.
+    marker: u8,
+    /// Maximum number of characters in the value for this kind.
+    max: usize,
 }

 /// Start of a character reference.
@@ -174,9 +125,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         value(
             tokenizer,
             Info {
-                start: tokenizer.point.clone(),
-                size: 0,
-                kind: Kind::Named,
+                start: tokenizer.point.index,
+                marker: b'&',
+                max: CHARACTER_REFERENCE_NAMED_SIZE_MAX,
             },
         )
     }
@@ -198,17 +149,17 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
         tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal);
         tokenizer.enter(Token::CharacterReferenceValue);
         let info = Info {
-            start: tokenizer.point.clone(),
-            size: 0,
-            kind: Kind::Hexadecimal,
+            start: tokenizer.point.index,
+            marker: b'x',
+            max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
         };
         State::Fn(Box::new(|t| value(t, info)))
     } else {
         tokenizer.enter(Token::CharacterReferenceValue);
         let info = Info {
-            start: tokenizer.point.clone(),
-            size: 0,
-            kind: Kind::Decimal,
+            start: tokenizer.point.index,
+            marker: b'#',
+            max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
         };
         value(tokenizer, info)
     }
@@ -227,21 +178,22 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
 /// > | a&#x9;b
 ///         ^
 /// ```
-fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn value(tokenizer: &mut Tokenizer, info: Info) -> State {
+    let size = tokenizer.point.index - info.start;
+
     match tokenizer.current {
-        Some(b';') if info.size > 0 => {
-            if Kind::Named == info.kind {
-                // To do: fix slice.
-                let value = Slice::from_position(
+        Some(b';') if size > 0 => {
+            // Named.
+            if info.marker == b'&' {
+                // Guaranteed to be valid ASCII bytes.
+                let slice = Slice::from_indices(
                     tokenizer.parse_state.bytes,
-                    &Position {
-                        start: &info.start,
-                        end: &tokenizer.point,
-                    },
-                )
-                .serialize();
+                    info.start,
+                    tokenizer.point.index,
+                );
+                let name = slice.as_str();

-                if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) {
+                if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) {
                     return State::Nok;
                 }
             }
@@ -253,14 +205,22 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
             tokenizer.exit(Token::CharacterReference);
             State::Ok
         }
-        Some(byte) => {
-            if info.size < info.kind.max() && info.kind.allowed(byte) {
-                info.size += 1;
-                tokenizer.consume();
-                State::Fn(Box::new(|t| value(t, info)))
-            } else {
-                State::Nok
-            }
+        // ASCII digit, for named, decimal, and hexadecimal references.
+        Some(b'0'..=b'9') if size < info.max => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| value(t, info)))
+        }
+        // ASCII hex letters, for named and hexadecimal references.
+        Some(b'A'..=b'F' | b'a'..=b'f')
+            if matches!(info.marker, b'&' | b'x') && size < info.max =>
+        {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| value(t, info)))
+        }
+        // Non-hex ASCII alphabeticals, for named references.
+        Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| value(t, info)))
         }
         _ => State::Nok,
     }
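A reduced sketch of the rewritten `value` state above: the size is derived from indices instead of a mutable counter, and the allowed alphabet is keyed on the stored marker (`b'&'` named, `b'#'` decimal, `b'x'` hexadecimal). This is a standalone model, not the crate’s API:

```rust
// Sketch of which bytes the rewritten `value` state accepts per marker.
fn allowed(marker: u8, byte: u8) -> bool {
    match byte {
        // ASCII digits are valid in all three kinds.
        b'0'..=b'9' => true,
        // Hex letters are valid for named and hexadecimal references.
        b'A'..=b'F' | b'a'..=b'f' => matches!(marker, b'&' | b'x'),
        // The remaining letters only occur in named references.
        b'G'..=b'Z' | b'g'..=b'z' => marker == b'&',
        _ => false,
    }
}

fn main() {
    assert!(allowed(b'#', b'7')); // decimal: digits only
    assert!(!allowed(b'#', b'f'));
    assert!(allowed(b'x', b'f')); // hexadecimal: digits and hex letters
    assert!(!allowed(b'x', b'g'));
    assert!(allowed(b'&', b'q')); // named: any alphanumeric
}
```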
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 21e9259..c4c3e86 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -110,53 +110,6 @@ use crate::token::Token;
 use crate::tokenizer::{ContentType, State, Tokenizer};
 use crate::util::slice::{Position, Slice};

-/// Kind of fences.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
-    /// Grave accent (tick) code.
-    ///
-    /// ## Example
-    ///
-    /// ````markdown
-    /// ```rust
-    /// println!("I <3 🦀");
-    /// ```
-    /// ````
-    GraveAccent,
-    /// Tilde code.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// ~~~rust
-    /// println!("I <3 🦀");
-    /// ~~~
-    /// ```
-    Tilde,
-}
-
-impl Kind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            Kind::GraveAccent => b'`',
-            Kind::Tilde => b'~',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `~` or `` ` ``.
-    fn from_byte(byte: u8) -> Kind {
-        match byte {
-            b'`' => Kind::GraveAccent,
-            b'~' => Kind::Tilde,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// State needed to parse code (fenced).
 #[derive(Debug, Clone)]
 struct Info {
@@ -165,8 +118,8 @@ struct Info {
     /// Number of tabs or spaces of indentation before the opening fence
     /// sequence.
     prefix: usize,
-    /// Kind of fences.
-    kind: Kind,
+    /// Marker of fences (`u8`).
+    marker: u8,
 }

 /// Start of fenced code.
@@ -178,15 +131,20 @@ struct Info {
 ///   | ~~~
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
     if tokenizer.parse_state.constructs.code_fenced {
         tokenizer.enter(Token::CodeFenced);
         tokenizer.enter(Token::CodeFencedFence);
-        tokenizer.go(space_or_tab_min_max(0, max), before_sequence_open)(tokenizer)
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before_sequence_open,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -210,23 +168,22 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
                 tokenizer.parse_state.bytes,
                 &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),
             )
-            .size();
+            .len();
         }
     }

-    match tokenizer.current {
-        Some(byte) if matches!(byte, b'`' | b'~') => {
-            tokenizer.enter(Token::CodeFencedFenceSequence);
-            sequence_open(
-                tokenizer,
-                Info {
-                    prefix,
-                    size: 0,
-                    kind: Kind::from_byte(byte),
-                },
-            )
-        }
-        _ => State::Nok,
+    if let Some(b'`' | b'~') = tokenizer.current {
+        tokenizer.enter(Token::CodeFencedFenceSequence);
+        sequence_open(
+            tokenizer,
+            Info {
+                prefix,
+                size: 0,
+                marker: tokenizer.current.unwrap(),
+            },
+        )
+    } else {
+        State::Nok
     }
 }

@@ -240,7 +197,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.consume();
             State::Fn(Box::new(|t| {
                 info.size += 1;
@@ -302,7 +259,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
             tokenizer.exit(Token::CodeFencedFenceInfo);
             tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)
         }
-        Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some(b'`') if info.marker == b'`' => State::Nok,
         Some(_) => {
             tokenizer.consume();
             State::Fn(Box::new(|t| info_inside(t, info)))
@@ -352,7 +309,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
-        Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some(b'`') if info.marker == b'`' => State::Nok,
         _ => {
             tokenizer.consume();
             State::Fn(Box::new(|t| meta(t, info)))
@@ -432,14 +389,18 @@ fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
 ///     ^
 /// ```
 fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     tokenizer.enter(Token::CodeFencedFence);
-    tokenizer.go(space_or_tab_min_max(0, max), |t| close_before(t, info))(tokenizer)
+    tokenizer.go(
+        space_or_tab_min_max(
+            0,
+            if tokenizer.parse_state.constructs.code_indented {
+                TAB_SIZE - 1
+            } else {
+                usize::MAX
+            },
+        ),
+        |t| close_before(t, info),
+    )(tokenizer)
 }

 /// In a closing fence, after optional whitespace, before sequence.
@@ -452,7 +413,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.enter(Token::CodeFencedFenceSequence);
             close_sequence(tokenizer, info, 0)
         }
@@ -470,7 +431,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
     match tokenizer.current {
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))
         }
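The fenced-code diff also shows the indentation-limit expression that this commit inlines at every `space_or_tab_min_max` call site. As a plain function (a sketch, assuming the crate’s `TAB_SIZE` of 4) it reads:

```rust
// Sketch of the inlined limit: when indented code is enabled, a construct
// may be preceded by at most `TAB_SIZE - 1` spaces; otherwise by any amount.
const TAB_SIZE: usize = 4;

fn max_indent(code_indented_enabled: bool) -> usize {
    if code_indented_enabled {
        TAB_SIZE - 1
    } else {
        usize::MAX
    }
}

fn main() {
    assert_eq!(max_indent(true), 3);
    assert_eq!(max_indent(false), usize::MAX);
}
```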
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 4a3a9f6..81a3080 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -62,11 +62,11 @@ use crate::tokenizer::{State, Tokenizer};
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     // Do not interrupt paragraphs.
-    if tokenizer.interrupt || !tokenizer.parse_state.constructs.code_indented {
-        State::Nok
-    } else {
+    if !tokenizer.interrupt && tokenizer.parse_state.constructs.code_indented {
         tokenizer.enter(Token::CodeIndented);
         tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer)
+    } else {
+        State::Nok
     }
 }

@@ -129,29 +129,26 @@ fn after(tokenizer: &mut Tokenizer) -> State {
 ///   |     bbb
 /// ```
 fn further_start(tokenizer: &mut Tokenizer) -> State {
-    if tokenizer.lazy {
-        State::Nok
-    } else {
-        match tokenizer.current {
-            Some(b'\n') => {
-                tokenizer.enter(Token::LineEnding);
-                tokenizer.consume();
-                tokenizer.exit(Token::LineEnding);
-                State::Fn(Box::new(further_start))
-            }
-            _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
-                Box::new(if ok { further_end } else { further_begin })
-            })(tokenizer),
+    match tokenizer.current {
+        Some(b'\n') if !tokenizer.lazy => {
+            tokenizer.enter(Token::LineEnding);
+            tokenizer.consume();
+            tokenizer.exit(Token::LineEnding);
+            State::Fn(Box::new(further_start))
         }
+        _ if !tokenizer.lazy => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
+            Box::new(if ok { further_end } else { further_begin })
+        })(tokenizer),
+        _ => State::Nok,
     }
 }

-/// After a proper indent.
+/// At an eol, which is followed by an indented line.
 ///
 /// ```markdown
-///   |     aaa
-/// > |     bbb
-///         ^
+/// >  |     aaa
+///             ^
+///    |     bbb
 /// ```
 fn further_end(_tokenizer: &mut Tokenizer) -> State {
     State::Ok
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index b36a208..d70fbc2 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -95,14 +95,13 @@ use crate::tokenizer::{State, Tokenizer};
 ///      ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let len = tokenizer.events.len();
-
     match tokenizer.current {
         Some(b'`')
             if tokenizer.parse_state.constructs.code_text
                 && (tokenizer.previous != Some(b'`')
                     || (len > 0
-                        && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) =>
+                    || (!tokenizer.events.is_empty()
+                        && tokenizer.events[tokenizer.events.len() - 1].token_type
+                            == Token::CharacterEscape)) =>
         {
             tokenizer.enter(Token::CodeText);
             tokenizer.enter(Token::CodeTextSequence);
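The `further_start` rewrite in the code (indented) diff folds an outer `if tokenizer.lazy` into match guards, so every arm spells out its own precondition. A toy model of that transformation (hypothetical names, not the crate’s API):

```rust
// Sketch: the outer `if lazy { Nok } else { match ... }` becomes guards,
// with a final `_ => ...` arm catching the lazy case.
fn further_start(current: Option<u8>, lazy: bool) -> &'static str {
    match current {
        Some(b'\n') if !lazy => "consume line ending",
        _ if !lazy => "attempt indent",
        _ => "nok",
    }
}

fn main() {
    assert_eq!(further_start(Some(b'\n'), false), "consume line ending");
    assert_eq!(further_start(Some(b'a'), false), "attempt indent");
    assert_eq!(further_start(Some(b'\n'), true), "nok");
}
```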
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 14755c9..bd7df82 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -110,17 +110,18 @@ use crate::util::skip::opt_back as skip_opt_back;
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let definition_before = !tokenizer.events.is_empty()
-        && tokenizer.events[skip_opt_back(
-            &tokenizer.events,
-            tokenizer.events.len() - 1,
-            &[Token::LineEnding, Token::SpaceOrTab],
-        )]
-        .token_type
-            == Token::Definition;
-
     // Do not interrupt paragraphs (but do follow definitions).
-    if (!tokenizer.interrupt || definition_before) && tokenizer.parse_state.constructs.definition {
+    let possible = !tokenizer.interrupt
+        || (!tokenizer.events.is_empty()
+            && tokenizer.events[skip_opt_back(
+                &tokenizer.events,
+                tokenizer.events.len() - 1,
+                &[Token::LineEnding, Token::SpaceOrTab],
+            )]
+            .token_type
+                == Token::Definition);
+
+    if possible && tokenizer.parse_state.constructs.definition {
         tokenizer.enter(Token::Definition);
         // Note: arbitrary whitespace allowed even if code (indented) is on.
         tokenizer.attempt_opt(space_or_tab(), before)(tokenizer)
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index cdbc192..d09bf54 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -54,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
         Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => {
             tokenizer.enter(Token::HardBreakEscape);
             tokenizer.consume();
-            State::Fn(Box::new(inside))
+            State::Fn(Box::new(after))
         }
         _ => State::Nok,
     }
@@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 ///       ^
 ///   | b
 /// ```
-fn inside(tokenizer: &mut Tokenizer) -> State {
+fn after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         Some(b'\n') => {
             tokenizer.exit(Token::HardBreakEscape);
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 9a73b77..aa388ee 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -66,15 +66,19 @@ use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer};
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     if tokenizer.parse_state.constructs.heading_atx {
         tokenizer.enter(Token::HeadingAtx);
-        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -101,19 +105,19 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// > | ## aa
 ///     ^
 /// ```
-fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
+fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        None | Some(b'\n') if rank > 0 => {
+        None | Some(b'\n') if size > 0 => {
             tokenizer.exit(Token::HeadingAtxSequence);
             at_break(tokenizer)
         }
-        Some(b'#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+        Some(b'#') if size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
             tokenizer.consume();
             State::Fn(Box::new(move |tokenizer| {
-                sequence_open(tokenizer, rank + 1)
+                sequence_open(tokenizer, size + 1)
             }))
         }
-        _ if rank > 0 => {
+        _ if size > 0 => {
             tokenizer.exit(Token::HeadingAtxSequence);
             tokenizer.go(space_or_tab(), at_break)(tokenizer)
         }
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index 2a4adbf..98d7843 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -63,52 +63,6 @@ use crate::token::Token;
 use crate::tokenizer::{EventType, State, Tokenizer};
 use crate::util::skip::opt_back as skip_opt_back;

-/// Kind of underline.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
-    /// Dash (rank 2) heading.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// alpha
-    /// -----
-    /// ```
-    Dash,
-
-    /// Equals to (rank 1) heading.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// alpha
-    /// =====
-    /// ```
-    EqualsTo,
-}
-
-impl Kind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            Kind::Dash => b'-',
-            Kind::EqualsTo => b'=',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `-` or `=`.
-    fn from_byte(byte: u8) -> Kind {
-        match byte {
-            b'-' => Kind::Dash,
-            b'=' => Kind::EqualsTo,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// At a line ending, presumably an underline.
 ///
 /// ```markdown
@@ -117,23 +71,29 @@ impl Kind {
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-    let paragraph_before = !tokenizer.events.is_empty()
-        && tokenizer.events[skip_opt_back(
-            &tokenizer.events,
-            tokenizer.events.len() - 1,
-            &[Token::LineEnding, Token::SpaceOrTab],
-        )]
-        .token_type
-            == Token::Paragraph;
-
-    // Require a paragraph before and do not allow on a lazy line.
-    if paragraph_before && !tokenizer.lazy && tokenizer.parse_state.constructs.heading_setext {
-        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+    if tokenizer.parse_state.constructs.heading_setext
+        && !tokenizer.lazy
+        // Require a paragraph before.
+        && (!tokenizer.events.is_empty()
+            && tokenizer.events[skip_opt_back(
+                &tokenizer.events,
+                tokenizer.events.len() - 1,
+                &[Token::LineEnding, Token::SpaceOrTab],
+            )]
+            .token_type
+                == Token::Paragraph)
+    {
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -148,9 +108,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if matches!(byte, b'-' | b'=') => {
+        Some(b'-' | b'=') => {
             tokenizer.enter(Token::HeadingSetextUnderline);
-            inside(tokenizer, Kind::from_byte(byte))
+            inside(tokenizer, tokenizer.current.unwrap())
         }
         _ => State::Nok,
     }
@@ -163,11 +123,11 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// > | ==
 ///     ^
 /// ```
-fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
     match tokenizer.current {
-        Some(byte) if byte == kind.as_byte() => {
+        Some(b'-' | b'=') if tokenizer.current.unwrap() == marker => {
             tokenizer.consume();
-            State::Fn(Box::new(move |t| inside(t, kind)))
+            State::Fn(Box::new(move |t| inside(t, marker)))
         }
         _ => {
             tokenizer.exit(Token::HeadingSetextUnderline);
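Both the definition and heading (setext) gates above are reordered so the cheap checks (construct enabled, interrupt/lazy flags) run before the event-scanning check, and the scan short-circuits away entirely when it cannot matter. A sketch of that short-circuit, with the scan stubbed as a closure:

```rust
// Sketch: `!interrupt || expensive_scan()` only runs the scan when
// actually interrupting, mirroring the rewritten `definition::start`.
fn possible(interrupt: bool, scan: impl Fn() -> bool) -> bool {
    !interrupt || scan()
}

fn main() {
    // Not interrupting: the scan is never called.
    assert!(possible(false, || unreachable!("skipped by short-circuit")));
    // Interrupting: the scan decides.
    assert!(possible(true, || true));
    assert!(!possible(true, || false));
}
```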
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 5860c5d..064da35 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -98,17 +98,17 @@
 //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
 //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing

-use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE};
+use crate::constant::{
+    HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE,
+};
 use crate::construct::{
     blank_line::start as blank_line,
     partial_non_lazy_continuation::start as partial_non_lazy_continuation,
     partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
 };
 use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;

 /// Kind of HTML (flow).
 #[derive(Debug, PartialEq)]
@@ -129,49 +129,6 @@ enum Kind {
     Complete,
 }

-/// Type of quote, if we’re in a quoted attribute, in complete (condition 7).
-#[derive(Debug, PartialEq)]
-enum QuoteKind {
-    /// In a double quoted (`"`) attribute value.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// <a b="c" />
-    /// ```
-    Double,
-    /// In a single quoted (`'`) attribute value.
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// <a b='c' />
-    /// ```
-    Single,
-}
-
-impl QuoteKind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            QuoteKind::Double => b'"',
-            QuoteKind::Single => b'\'',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `"` or `'`.
-    fn from_byte(byte: u8) -> QuoteKind {
-        match byte {
-            b'"' => QuoteKind::Double,
-            b'\'' => QuoteKind::Single,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// State needed to parse HTML (flow).
 #[derive(Debug)]
 struct Info {
@@ -179,12 +136,10 @@ struct Info {
     kind: Kind,
     /// Whether this is a start tag (`<` not followed by `/`).
     start_tag: bool,
-    /// Used depending on `kind` to collect all parsed bytes.
-    start: Option<Point>,
-    /// Collected index, for various reasons.
-    size: usize,
+    /// Start index of a tag name or cdata prefix.
+    start: usize,
     /// Current quote, when in a double or single quoted attribute value.
-    quote: Option<QuoteKind>,
+    quote: u8,
 }

 /// Start of HTML (flow), before optional whitespace.
@@ -194,19 +149,17 @@ struct Info {
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     if tokenizer.parse_state.constructs.html_flow {
         tokenizer.enter(Token::HtmlFlow);
         tokenizer.go(
             space_or_tab_with_options(SpaceOrTabOptions {
                 kind: Token::HtmlFlowData,
                 min: 0,
-                max,
+                max: if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
                 connect: false,
                 content_type: None,
             }),
@@ -249,9 +202,8 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         kind: Kind::Basic,
         // Assume closing tag (or no tag).
         start_tag: false,
-        start: None,
-        size: 0,
-        quote: None,
+        start: 0,
+        quote: 0,
     };

     match tokenizer.current {
@@ -261,7 +213,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         }
         Some(b'/') => {
             tokenizer.consume();
-            info.start = Some(tokenizer.point.clone());
+            info.start = tokenizer.point.index;
             State::Fn(Box::new(|t| tag_close_start(t, info)))
         }
         Some(b'?') => {
@@ -273,9 +225,10 @@ fn open(tokenizer: &mut Tokenizer) -> State {
             // right now, so we do need to search for `>`, similar to declarations.
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             info.start_tag = true;
-            info.start = Some(tokenizer.point.clone());
+            info.start = tokenizer.point.index;
             tag_name(tokenizer, info)
         }
         _ => State::Nok,
@@ -299,12 +252,6 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
             info.kind = Kind::Comment;
             State::Fn(Box::new(|t| comment_open_inside(t, info)))
         }
-        Some(b'[') => {
-            tokenizer.consume();
-            info.kind = Kind::Cdata;
-            info.size = 0;
-            State::Fn(Box::new(|t| cdata_open_inside(t, info)))
-        }
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             info.kind = Kind::Declaration;
@@ -312,6 +259,12 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
             tokenizer.concrete = true;
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
+        Some(b'[') => {
+            tokenizer.consume();
+            info.kind = Kind::Cdata;
+            info.start = tokenizer.point.index;
+            State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+        }
         _ => State::Nok,
     }
 }
@@ -342,12 +295,11 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Some(byte) if byte == CDATA_SEARCH[info.size] => {
-            info.size += 1;
+        Some(byte) if byte == HTML_CDATA_PREFIX[tokenizer.point.index - info.start] => {
             tokenizer.consume();

-            if info.size == CDATA_SEARCH.len() {
-                info.size = 0;
+            if tokenizer.point.index - info.start == HTML_CDATA_PREFIX.len() {
+                info.start = 0;
                 // Do not form containers.
                 tokenizer.concrete = true;
                 State::Fn(Box::new(|t| continuation(t, info)))
@@ -367,6 +319,7 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| tag_name(t, info)))
@@ -387,17 +340,18 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
         None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => {
             let slash = matches!(tokenizer.current, Some(b'/'));
-            let start = info.start.take().unwrap();
-            let name = Slice::from_position(
+            // Guaranteed to be valid ASCII bytes.
+            let slice = Slice::from_indices(
                 tokenizer.parse_state.bytes,
-                &Position {
-                    start: &start,
-                    end: &tokenizer.point,
-                },
-            )
-            .serialize()
-            .trim()
-            .to_lowercase();
+                info.start,
+                tokenizer.point.index,
+            );
+            let name = slice
+                .as_str()
+                // The line ending case might result in a `\r` that is already accounted for.
+                .trim()
+                .to_ascii_lowercase();
+            info.start = 0;

             if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
                 info.kind = Kind::Raw;
@@ -427,6 +381,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
                 }
             }
         }
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| tag_name(t, info)))
@@ -490,18 +445,19 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        Some(b'\t' | b' ') => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
+        }
         Some(b'/') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_end(t, info)))
         }
+        // ASCII alphanumerical and `:` and `_`.
         Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name(t, info)))
         }
-        Some(b'\t' | b' ') => {
-            tokenizer.consume();
-            State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
-        }
         _ => complete_end(tokenizer, info),
     }
 }
@@ -518,6 +474,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat
 /// ```
 fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`, `.`, `:`, and `_`.
         Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name(t, info)))
@@ -537,14 +494,14 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Some(b'=') => {
-            tokenizer.consume();
-            State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
-        }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))
         }
+        Some(b'=') => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
+        }
         _ => complete_attribute_name_before(tokenizer, info),
     }
 }
@@ -561,15 +518,15 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State
 fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
         None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
-        Some(byte) if matches!(byte, b'"' | b'\'') => {
-            info.quote = Some(QuoteKind::from_byte(byte));
-            tokenizer.consume();
-            State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
-        }
         Some(b'\t' | b' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
         }
+        Some(b'"' | b'\'') => {
+            info.quote = tokenizer.current.unwrap();
+            tokenizer.consume();
+            State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
+        }
         _ => complete_attribute_value_unquoted(tokenizer, info),
     }
 }
@@ -585,7 +542,7 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) ->
 fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
         None | Some(b'\n') => State::Nok,
-        Some(byte) if byte == info.quote.as_ref().unwrap().as_byte() => {
+        Some(b'"' | b'\'') if tokenizer.current.unwrap() == info.quote => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))
         }
@@ -673,6 +630,21 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
+        Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
+            tokenizer.exit(Token::HtmlFlowData);
+            tokenizer.check(blank_line_before, |ok| {
+                if ok {
+                    Box::new(continuation_after)
+                } else {
+                    Box::new(move |t| continuation_start(t, info))
+                }
+            })(tokenizer)
+        }
+        // Note: important that this is after the basic/complete case.
+        None | Some(b'\n') => {
+            tokenizer.exit(Token::HtmlFlowData);
+            continuation_start(tokenizer, info)
+        }
         Some(b'-') if info.kind == Kind::Comment => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_comment_inside(t, info)))
@@ -693,20 +665,6 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))
         }
-        Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
-            tokenizer.exit(Token::HtmlFlowData);
-            tokenizer.check(blank_line_before, |ok| {
-                if ok {
-                    Box::new(continuation_after)
-                } else {
-                    Box::new(move |t| continuation_start(t, info))
-                }
-            })(tokenizer)
-        }
-        None | Some(b'\n') => {
-            tokenizer.exit(Token::HtmlFlowData);
-            continuation_start(tokenizer, info)
-        }
         _ => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation(t, info)))
@@ -793,7 +751,7 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
     match tokenizer.current {
         Some(b'/') => {
             tokenizer.consume();
-            info.start = Some(tokenizer.point.clone());
+            info.start = tokenizer.point.index;
             State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
         }
         _ => continuation(tokenizer, info),
@@ -809,18 +767,15 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
 fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
         Some(b'>') => {
-            info.size = 0;
-
-            let start = info.start.take().unwrap();
-            let name = Slice::from_position(
+            // Guaranteed to be valid ASCII bytes.
+            let slice = Slice::from_indices(
                 tokenizer.parse_state.bytes,
-                &Position {
-                    start: &start,
-                    end: &tokenizer.point,
-                },
-            )
-            .serialize()
-            .to_lowercase();
+                info.start,
+                tokenizer.point.index,
+            );
+            let name = slice.as_str().to_ascii_lowercase();
+
+            info.start = 0;

             if HTML_RAW_NAMES.contains(&name.as_str()) {
                 tokenizer.consume();
@@ -829,13 +784,14 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State
                 continuation(tokenizer, info)
             }
         }
-        Some(b'A'..=b'Z' | b'a'..=b'z') if info.size < HTML_RAW_SIZE_MAX => {
+        Some(b'A'..=b'Z' | b'a'..=b'z')
+            if tokenizer.point.index - info.start < HTML_RAW_SIZE_MAX =>
+        {
             tokenizer.consume();
-            info.size += 1;
             State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
         }
         _ => {
-            info.size = 0;
+            info.start = 0;
             continuation(tokenizer, info)
         }
     }
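Running through the html (flow) diff is the other bookkeeping change of this commit: the mutable `size` counter in `Info` is gone, and sizes are derived as `tokenizer.point.index - info.start`. A minimal standalone sketch of that idea (hypothetical names):

```rust
// Sketch: derive a length from two indices instead of incrementing a
// counter on every consumed byte.
struct Info {
    start: usize, // index where the name started
}

fn within_raw_max(info: &Info, current_index: usize, max: usize) -> bool {
    current_index - info.start < max
}

fn main() {
    let info = Info { start: 10 };
    assert!(within_raw_max(&info, 17, 8)); // 7 bytes so far, still allowed
    assert!(!within_raw_max(&info, 18, 8)); // 8 bytes, at the limit
}
```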
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index f10a476..51beda5 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -54,12 +54,11 @@
 //! [html_flow]: crate::construct::html_flow
 //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing

+use crate::constant::HTML_CDATA_PREFIX;
 use crate::construct::partial_space_or_tab::space_or_tab;
 use crate::token::Token;
 use crate::tokenizer::{State, StateFn, Tokenizer};

-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
-
 /// Start of HTML (text)
 ///
 /// ```markdown
@@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(instruction))
         }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open))
@@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(comment_open_inside))
         }
-        Some(b'[') => {
-            tokenizer.consume();
-            State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
-        }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(declaration))
         }
+        Some(b'[') => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
+        }
         _ => State::Nok,
     }
 }
@@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> b
 ///          ^^^^^^
 /// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
-    match tokenizer.current {
-        Some(byte) if byte == CDATA_SEARCH[index] => {
-            tokenizer.consume();
+fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State {
+    if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) {
+        tokenizer.consume();

-            if index + 1 == CDATA_SEARCH.len() {
-                State::Fn(Box::new(cdata))
-            } else {
-                State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1)))
-            }
+        if size + 1 == HTML_CDATA_PREFIX.len() {
+            State::Fn(Box::new(cdata))
+        } else {
+            State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1)))
         }
-        _ => State::Nok,
+    } else {
+        State::Nok
     }
 }

@@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_close))
@@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_close))
@@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open))
@@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(end))
         }
+        // ASCII alphabetical and `:` and `_`.
         Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_name))
@@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphabetical and `-`, `.`, `:`, and `_`.
         Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_name))
@@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_value_before))
         }
-        Some(byte) if byte == b'"' || byte == b'\'' => {
+        Some(b'"' | b'\'') => {
+            let marker = tokenizer.current.unwrap();
             tokenizer.consume();
-            State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte)))
+            State::Fn(Box::new(move |t| {
+                tag_open_attribute_value_quoted(t, marker)
+            }))
         }
         Some(_) => {
             tokenizer.consume();
@@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta
             tokenizer,
             Box::new(move |t| tag_open_attribute_value_quoted(t, marker)),
         ),
-        Some(byte) if byte == marker => {
+        Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_value_quoted_after))
         }
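With the local `CDATA_SEARCH` arrays in html (flow) and html (text) both replaced by a shared `HTML_CDATA_PREFIX` constant, the prefix walk looks like this standalone sketch (the constant is copied here for illustration):

```rust
// Sketch of the byte-at-a-time walk over the shared CDATA prefix.
const HTML_CDATA_PREFIX: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];

fn walk(input: &[u8]) -> bool {
    let mut size = 0;
    for &byte in input {
        if size == HTML_CDATA_PREFIX.len() {
            return true; // prefix fully matched
        }
        if byte != HTML_CDATA_PREFIX[size] {
            return false;
        }
        size += 1;
    }
    size == HTML_CDATA_PREFIX.len()
}

fn main() {
    assert!(walk(b"CDATA[>&<]]"));
    assert!(walk(b"CDATA["));
    assert!(!walk(b"CDAT"));
    assert!(!walk(b"cdata["));
}
```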
+                        Slice::from_indices(                              tokenizer.parse_state.bytes, -                            &Position { -                                start: &tokenizer.events[label_start.start.1].point, -                                end: &tokenizer.events[label_end_start - 1].point, -                            }, +                            tokenizer.events[label_start.start.1].point.index, +                            tokenizer.events[label_end_start - 1].point.index,                          ) -                        .serialize(), +                        .as_str(),                      ),                  },              }; @@ -366,11 +364,11 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State {  ///        ^  /// ```  fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State { -    let label_start = tokenizer +    tokenizer          .label_start_stack          .get_mut(label_start_index) -        .unwrap(); -    label_start.balanced = true; +        .unwrap() +        .balanced = true;      State::Nok  } @@ -529,23 +527,24 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {  ///          ^  /// ```  fn full_reference_after(tokenizer: &mut Tokenizer) -> State { -    let end = skip::to_back( -        &tokenizer.events, -        tokenizer.events.len() - 1, -        &[Token::ReferenceString], -    ); - -    // To do: virtual spaces not needed, create a `to_str`? -    let id = Slice::from_position( -        tokenizer.parse_state.bytes, -        &Position::from_exit_event(&tokenizer.events, end), -    ) -    .serialize(); -      if tokenizer          .parse_state          .definitions -        .contains(&normalize_identifier(&id)) +        // We don’t care about virtual spaces, so `as_str` is fine. +        .contains(&normalize_identifier( +            Slice::from_position( +                tokenizer.parse_state.bytes, +                &Position::from_exit_event( +                    &tokenizer.events, +                    skip::to_back( +                        &tokenizer.events, +                        tokenizer.events.len() - 1, +                        &[Token::ReferenceString], +                    ), +                ), +            ) +            .as_str(), +        ))      {          State::Ok      } else { diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index d30b8dd..4a3508e 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -64,9 +64,8 @@ pub fn open(tokenizer: &mut Tokenizer) -> State {              tokenizer.consume();              tokenizer.exit(Token::LabelMarker);              tokenizer.exit(Token::LabelImage); -            let end = tokenizer.events.len() - 1;              tokenizer.label_start_stack.push(LabelStart { -                start: (end - 5, end), +                start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1),                  balanced: false,                  inactive: false,              }); diff --git a/src/construct/list.rs b/src/construct/list.rs index 9b59130..d5a9899 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -56,69 +56,6 @@ use crate::util::{      slice::{Position, Slice},  }; -/// Type of list. -#[derive(Debug, PartialEq)] -enum Kind { -    /// In a dot (`.`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// 1. a -    /// ``` -    Dot, -    /// In a paren (`)`) list item. 
-    /// -    /// ## Example -    /// -    /// ```markdown -    /// 1) a -    /// ``` -    Paren, -    /// In an asterisk (`*`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// * a -    /// ``` -    Asterisk, -    /// In a plus (`+`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// + a -    /// ``` -    Plus, -    /// In a dash (`-`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// - a -    /// ``` -    Dash, -} - -impl Kind { -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `.`, `)`, `*`, `+`, or `-`. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'.' => Kind::Dot, -            b')' => Kind::Paren, -            b'*' => Kind::Asterisk, -            b'+' => Kind::Plus, -            b'-' => Kind::Dash, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// Start of list item.  ///  /// ```markdown @@ -126,15 +63,19 @@ impl Kind {  ///     ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    }; -      if tokenizer.parse_state.constructs.list {          tokenizer.enter(Token::ListItem); -        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) +        tokenizer.go( +            space_or_tab_min_max( +                0, +                if tokenizer.parse_state.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                }, +            ), +            before, +        )(tokenizer)      } else {          State::Nok      } @@ -149,15 +90,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  fn before(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current {          // Unordered. -        Some(b'*' | b'+' | b'-') => tokenizer.check(thematic_break, |ok| { +        Some(b'*' | b'-') => tokenizer.check(thematic_break, |ok| {              Box::new(if ok { nok } else { before_unordered })          })(tokenizer), +        Some(b'+') => before_unordered(tokenizer),          // Ordered. -        Some(byte) if byte.is_ascii_digit() && (!tokenizer.interrupt || byte == b'1') => { -            tokenizer.enter(Token::ListItemPrefix); -            tokenizer.enter(Token::ListItemValue); -            inside(tokenizer, 0) -        } +        Some(b'0'..=b'9') if !tokenizer.interrupt => before_ordered(tokenizer), +        Some(b'1') => before_ordered(tokenizer),          _ => State::Nok,      }  } @@ -175,6 +114,18 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {      marker(tokenizer)  } +/// Start of an ordered list item. +/// +/// ```markdown +/// > | 1. a +///     ^ +/// ``` +fn before_ordered(tokenizer: &mut Tokenizer) -> State { +    tokenizer.enter(Token::ListItemPrefix); +    tokenizer.enter(Token::ListItemValue); +    inside(tokenizer, 0) +} +  /// In an ordered list item value.  ///  /// ```markdown @@ -183,14 +134,14 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {  /// ```  fn inside(tokenizer: &mut Tokenizer, size: usize) -> State {      match tokenizer.current { -        Some(byte) if byte.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { -            tokenizer.consume(); -            State::Fn(Box::new(move |t| inside(t, size + 1))) -        }          Some(b'.' 
| b')') if !tokenizer.interrupt || size < 2 => {              tokenizer.exit(Token::ListItemValue);              marker(tokenizer)          } +        Some(b'0'..=b'9') if size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { +            tokenizer.consume(); +            State::Fn(Box::new(move |t| inside(t, size + 1))) +        }          _ => State::Nok,      }  } @@ -262,7 +213,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State {  ///      ^  /// ```  fn whitespace_after(tokenizer: &mut Tokenizer) -> State { -    if matches!(tokenizer.current, Some(b'\t' | b' ')) { +    if let Some(b'\t' | b' ') = tokenizer.current {          State::Nok      } else {          State::Ok @@ -309,7 +260,7 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State {                  end: &tokenizer.point,              },          ) -        .size(); +        .len();          if blank {              prefix += 1; @@ -389,8 +340,8 @@ fn nok(_tokenizer: &mut Tokenizer) -> State {  pub fn resolve_list_item(tokenizer: &mut Tokenizer) {      let mut index = 0;      let mut balance = 0; -    let mut lists_wip: Vec<(Kind, usize, usize, usize)> = vec![]; -    let mut lists: Vec<(Kind, usize, usize, usize)> = vec![]; +    let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; +    let mut lists: Vec<(u8, usize, usize, usize)> = vec![];      // Merge list items.      while index < tokenizer.events.len() { @@ -400,12 +351,14 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {              if event.event_type == EventType::Enter {                  let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1;                  let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]); -                let kind = Kind::from_byte( -                    Slice::from_point(tokenizer.parse_state.bytes, &tokenizer.events[marker].point) -                        .head() -                        .unwrap(), -                ); -                let current = (kind, balance, index, end); +                // Guaranteed to be a valid ASCII byte. +                let marker = Slice::from_index( +                    tokenizer.parse_state.bytes, +                    tokenizer.events[marker].point.index, +                ) +                .head() +                .unwrap(); +                let current = (marker, balance, index, end);                  let mut list_index = lists_wip.len();                  let mut matched = false; @@ -475,7 +428,7 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {          let mut list_start = tokenizer.events[list_item.2].clone();          let mut list_end = tokenizer.events[list_item.3].clone();          let token_type = match list_item.0 { -            Kind::Paren | Kind::Dot => Token::ListOrdered, +            b'.' | b')' => Token::ListOrdered,              _ => Token::ListUnordered,          };          list_start.token_type = token_type.clone(); diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 146dc40..ec5669c 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -81,10 +81,9 @@ fn inside(tokenizer: &mut Tokenizer) -> State {  /// Merge “`Paragraph`”s, which currently span a single line, into actual  /// `Paragraph`s that span multiple lines.  
pub fn resolve(tokenizer: &mut Tokenizer) { -    let len = tokenizer.events.len();      let mut index = 0; -    while index < len { +    while index < tokenizer.events.len() {          let event = &tokenizer.events[index];          if event.event_type == EventType::Enter && event.token_type == Token::Paragraph { diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs index be8d6c8..155a1a3 100644 --- a/src/construct/partial_bom.rs +++ b/src/construct/partial_bom.rs @@ -10,13 +10,12 @@ use crate::tokenizer::{State, Tokenizer};  ///     ^^^^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    match tokenizer.current { -        Some(0xEF) => { -            tokenizer.enter(Token::ByteOrderMark); -            tokenizer.consume(); -            State::Fn(Box::new(cont)) -        } -        _ => State::Nok, +    if tokenizer.current == Some(0xEF) { +        tokenizer.enter(Token::ByteOrderMark); +        tokenizer.consume(); +        State::Fn(Box::new(cont)) +    } else { +        State::Nok      }  } @@ -27,12 +26,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  ///          ^^^^  /// ```  fn cont(tokenizer: &mut Tokenizer) -> State { -    match tokenizer.current { -        Some(0xBB) => { -            tokenizer.consume(); -            State::Fn(Box::new(end)) -        } -        _ => State::Nok, +    if tokenizer.current == Some(0xBB) { +        tokenizer.consume(); +        State::Fn(Box::new(end)) +    } else { +        State::Nok      }  } @@ -43,12 +41,11 @@ fn cont(tokenizer: &mut Tokenizer) -> State {  ///               ^^^^  /// ```  fn end(tokenizer: &mut Tokenizer) -> State { -    match tokenizer.current { -        Some(0xBF) => { -            tokenizer.consume(); -            tokenizer.exit(Token::ByteOrderMark); -            State::Ok -        } -        _ => State::Nok, +    if tokenizer.current == Some(0xBF) { +        tokenizer.consume(); +        tokenizer.exit(Token::ByteOrderMark); +        State::Ok +    } else { +        State::Nok      }  } diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 0a3721c..809aa27 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -125,8 +125,8 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {              tokenizer.exit(info.options.marker.clone());              State::Fn(Box::new(|t| enclosed_before(t, info)))          } -        None | Some(b' ' | b')') => State::Nok, -        Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok, +        // ASCII control, space, closing paren, but *not* `\0`. 
+        None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok,          Some(_) => {              tokenizer.enter(info.options.destination.clone());              tokenizer.enter(info.options.raw.clone()); @@ -166,12 +166,12 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { +        None | Some(b'\n' | b'<') => State::Nok,          Some(b'>') => {              tokenizer.exit(Token::Data);              tokenizer.exit(info.options.string.clone());              enclosed_before(tokenizer, info)          } -        None | Some(b'\n' | b'<') => State::Nok,          Some(b'\\') => {              tokenizer.consume();              State::Fn(Box::new(|t| enclosed_escape(t, info))) @@ -207,40 +207,25 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current { -        Some(b'(') => { -            if info.balance >= info.options.limit { -                State::Nok -            } else { -                tokenizer.consume(); -                info.balance += 1; -                State::Fn(Box::new(move |t| raw(t, info))) -            } +        None | Some(b'\t' | b'\n' | b' ' | b')') if info.balance == 0 => { +            tokenizer.exit(Token::Data); +            tokenizer.exit(info.options.string.clone()); +            tokenizer.exit(info.options.raw.clone()); +            tokenizer.exit(info.options.destination); +            State::Ok          } -        Some(b')') => { -            if info.balance == 0 { -                tokenizer.exit(Token::Data); -                tokenizer.exit(info.options.string.clone()); -                tokenizer.exit(info.options.raw.clone()); -                tokenizer.exit(info.options.destination); -                State::Ok -            } else { -                tokenizer.consume(); -                info.balance -= 1; -                State::Fn(Box::new(move |t| raw(t, info))) -            } +        Some(b'(') if info.balance < info.options.limit => { +            tokenizer.consume(); +            info.balance += 1; +            State::Fn(Box::new(move |t| raw(t, info)))          } -        None | Some(b'\t' | b'\n' | b' ') => { -            if info.balance > 0 { -                State::Nok -            } else { -                tokenizer.exit(Token::Data); -                tokenizer.exit(info.options.string.clone()); -                tokenizer.exit(info.options.raw.clone()); -                tokenizer.exit(info.options.destination); -                State::Ok -            } +        // ASCII control (but *not* `\0`) and space and `(`. 
+        None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => State::Nok, +        Some(b')') => { +            tokenizer.consume(); +            info.balance -= 1; +            State::Fn(Box::new(move |t| raw(t, info)))          } -        Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok,          Some(b'\\') => {              tokenizer.consume();              State::Fn(Box::new(move |t| raw_escape(t, info))) diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 7e40a2d..6fdb70d 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -123,39 +123,43 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {  ///      ^  /// ```  fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { -    match tokenizer.current { -        None | Some(b'[') => State::Nok, -        Some(b']') if !info.data => State::Nok, -        _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok, -        Some(b']') => { -            tokenizer.exit(info.options.string.clone()); -            tokenizer.enter(info.options.marker.clone()); -            tokenizer.consume(); -            tokenizer.exit(info.options.marker.clone()); -            tokenizer.exit(info.options.label); -            State::Ok -        } -        Some(b'\n') => tokenizer.go( -            space_or_tab_eol_with_options(EolOptions { -                content_type: Some(ContentType::String), -                connect: info.connect, -            }), -            |t| { -                info.connect = true; -                at_break(t, info) -            }, -        )(tokenizer), -        _ => { -            tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - -            if info.connect { -                let index = tokenizer.events.len() - 1; -                link(&mut tokenizer.events, index); -            } else { -                info.connect = true; +    if info.size > LINK_REFERENCE_SIZE_MAX +        || matches!(tokenizer.current, None | Some(b'[')) +        || (matches!(tokenizer.current, Some(b']')) && !info.data) +    { +        State::Nok +    } else { +        match tokenizer.current { +            Some(b'\n') => tokenizer.go( +                space_or_tab_eol_with_options(EolOptions { +                    content_type: Some(ContentType::String), +                    connect: info.connect, +                }), +                |t| { +                    info.connect = true; +                    at_break(t, info) +                }, +            )(tokenizer), +            Some(b']') => { +                tokenizer.exit(info.options.string.clone()); +                tokenizer.enter(info.options.marker.clone()); +                tokenizer.consume(); +                tokenizer.exit(info.options.marker.clone()); +                tokenizer.exit(info.options.label); +                State::Ok              } +            _ => { +                tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); + +                if info.connect { +                    let index = tokenizer.events.len() - 1; +                    link(&mut tokenizer.events, index); +                } else { +                    info.connect = true; +                } -            label(tokenizer, info) +                label(tokenizer, info) +            }          }      }  } @@ -172,30 +176,19 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {              tokenizer.exit(Token::Data);              at_break(tokenizer, info)          } -    
    _ if info.size > LINK_REFERENCE_SIZE_MAX => { -            tokenizer.exit(Token::Data); -            at_break(tokenizer, info) -        } -        Some(b'\t' | b' ') => { -            tokenizer.consume(); -            info.size += 1; -            State::Fn(Box::new(|t| label(t, info))) -        } -        Some(b'\\') => { -            tokenizer.consume(); -            info.size += 1; -            if !info.data { -                info.data = true; -            } -            State::Fn(Box::new(|t| escape(t, info))) -        } -        Some(_) => { -            tokenizer.consume(); -            info.size += 1; -            if !info.data { -                info.data = true; +        Some(byte) => { +            if info.size > LINK_REFERENCE_SIZE_MAX { +                tokenizer.exit(Token::Data); +                at_break(tokenizer, info) +            } else { +                let func = if matches!(byte, b'\\') { escape } else { label }; +                tokenizer.consume(); +                info.size += 1; +                if !info.data && !matches!(byte, b'\t' | b' ') { +                    info.data = true; +                } +                State::Fn(Box::new(move |t| func(t, info)))              } -            State::Fn(Box::new(|t| label(t, info)))          }      }  } diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 80861af..9cf2f14 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -48,70 +48,13 @@ pub struct Options {      pub string: Token,  } -/// Type of title. -#[derive(Debug, PartialEq)] -enum Kind { -    /// In a parenthesized (`(` and `)`) title. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// (a) -    /// ``` -    Paren, -    /// In a double quoted (`"`) title. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// "a" -    /// ``` -    Double, -    /// In a single quoted (`'`) title. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// 'a' -    /// ``` -    Single, -} - -impl Kind { -    /// Turn the kind into a byte ([u8]). -    /// -    /// > 👉 **Note**: a closing paren is used for `Kind::Paren`. -    fn as_byte(&self) -> u8 { -        match self { -            Kind::Paren => b')', -            Kind::Double => b'"', -            Kind::Single => b'\'', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `(`, `"`, or `'`. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'(' => Kind::Paren, -            b'"' => Kind::Double, -            b'\'' => Kind::Single, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// State needed to parse titles.  #[derive(Debug)]  struct Info {      /// Whether we’ve seen data.      connect: bool, -    /// Kind of title. -    kind: Kind, +    /// Closing marker. +    marker: u8,      /// Configuration.      
options: Options,  } @@ -124,10 +67,11 @@ struct Info {  /// ```  pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {      match tokenizer.current { -        Some(byte) if matches!(byte, b'"' | b'\'' | b'(') => { +        Some(b'"' | b'\'' | b'(') => { +            let marker = tokenizer.current.unwrap();              let info = Info {                  connect: false, -                kind: Kind::from_byte(byte), +                marker: if marker == b'(' { b')' } else { marker },                  options,              };              tokenizer.enter(info.options.title.clone()); @@ -150,7 +94,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {  /// ```  fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {              tokenizer.enter(info.options.marker.clone());              tokenizer.consume();              tokenizer.exit(info.options.marker.clone()); @@ -172,10 +116,6 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { -            tokenizer.exit(info.options.string.clone()); -            begin(tokenizer, info) -        }          None => State::Nok,          Some(b'\n') => tokenizer.go(              space_or_tab_eol_with_options(EolOptions { @@ -187,7 +127,11 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {                  at_break(t, info)              },          )(tokenizer), -        _ => { +        Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { +            tokenizer.exit(info.options.string.clone()); +            begin(tokenizer, info) +        } +        Some(_) => {              tokenizer.enter_with_content(Token::Data, Some(ContentType::String));              if info.connect { @@ -210,21 +154,18 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {  /// ```  fn title(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        None | Some(b'\n') => {              tokenizer.exit(Token::Data);              at_break(tokenizer, info)          } -        None | Some(b'\n') => { +        Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {              tokenizer.exit(Token::Data);              at_break(tokenizer, info)          } -        Some(b'\\') => { +        Some(byte) => { +            let func = if matches!(byte, b'\\') { escape } else { title };              tokenizer.consume(); -            State::Fn(Box::new(|t| escape(t, info))) -        } -        _ => { -            tokenizer.consume(); -            State::Fn(Box::new(|t| title(t, info))) +            State::Fn(Box::new(move |t| func(t, info)))          }      }  } @@ -237,7 +178,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn escape(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'"' | b'\'' | b')') => {              tokenizer.consume();              State::Fn(Box::new(|t| title(t, info)))          } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 13815cb..4f872ba 100644 --- 
a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -92,8 +92,7 @@ fn trim_data(      if trim_end {          let mut index = slice.bytes.len(); -        let vs = slice.after; -        let mut spaces_only = vs == 0; +        let mut spaces_only = slice.after == 0;          while index > 0 {              match slice.bytes[index - 1] {                  b' ' => {} @@ -105,10 +104,10 @@ fn trim_data(          }          let diff = slice.bytes.len() - index; -        let token_type = if spaces_only -            && hard_break -            && exit_index + 1 < tokenizer.events.len() +        let token_type = if hard_break +            && spaces_only              && diff >= HARD_BREAK_PREFIX_SIZE_MIN +            && exit_index + 1 < tokenizer.events.len()          {              Token::HardBreakTrailing          } else { @@ -123,7 +122,7 @@ fn trim_data(              return;          } -        if diff > 0 || vs > 0 { +        if diff > 0 || slice.after > 0 {              let exit_point = tokenizer.events[exit_index].point.clone();              let mut enter_point = exit_point.clone();              enter_point.index -= diff; @@ -156,14 +155,11 @@ fn trim_data(      if trim_start {          let mut index = 0; -        let vs = slice.before;          while index < slice.bytes.len() {              match slice.bytes[index] { -                b' ' | b'\t' => {} +                b' ' | b'\t' => index += 1,                  _ => break,              } - -            index += 1;          }          // The whole data is whitespace. @@ -174,7 +170,7 @@ fn trim_data(              return;          } -        if index > 0 || vs > 0 { +        if index > 0 || slice.before > 0 {              let enter_point = tokenizer.events[exit_index - 1].point.clone();              let mut exit_point = enter_point.clone();              exit_point.index += index; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 4fc4dc4..785d132 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -53,64 +53,11 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN};  use crate::token::Token;  use crate::tokenizer::{State, Tokenizer}; -/// Type of thematic break. -#[derive(Debug, PartialEq)] -enum Kind { -    /// In a thematic break using asterisks (`*`). -    /// -    /// ## Example -    /// -    /// ```markdown -    /// *** -    /// ``` -    Asterisk, -    /// In a thematic break using dashes (`-`). -    /// -    /// ## Example -    /// -    /// ```markdown -    /// --- -    /// ``` -    Dash, -    /// In a thematic break using underscores (`_`). -    /// -    /// ## Example -    /// -    /// ```markdown -    /// ___ -    /// ``` -    Underscore, -} - -impl Kind { -    /// Turn the kind into a byte ([u8]). -    fn as_byte(&self) -> u8 { -        match self { -            Kind::Asterisk => b'*', -            Kind::Dash => b'-', -            Kind::Underscore => b'_', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `*`, `-`, or `_`. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'*' => Kind::Asterisk, -            b'-' => Kind::Dash, -            b'_' => Kind::Underscore, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// State needed to parse thematic breaks.  #[derive(Debug)]  struct Info { -    /// Kind of marker. -    kind: Kind, +    /// Marker. +    marker: u8,      /// Number of markers.      
size: usize,  } @@ -122,15 +69,19 @@ struct Info {  ///     ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    }; -      if tokenizer.parse_state.constructs.thematic_break {          tokenizer.enter(Token::ThematicBreak); -        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) +        tokenizer.go( +            space_or_tab_min_max( +                0, +                if tokenizer.parse_state.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                }, +            ), +            before, +        )(tokenizer)      } else {          State::Nok      } @@ -144,10 +95,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  /// ```  fn before(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { -        Some(byte) if matches!(byte, b'*' | b'-' | b'_') => at_break( +        Some(b'*' | b'-' | b'_') => at_break(              tokenizer,              Info { -                kind: Kind::from_byte(byte), +                marker: tokenizer.current.unwrap(),                  size: 0,              },          ), @@ -163,13 +114,13 @@ fn before(tokenizer: &mut Tokenizer) -> State {  /// ```  fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current {  -        None | Some(b'\n' | b'\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { +        None | Some(b'\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {              tokenizer.exit(Token::ThematicBreak);              // Feel free to interrupt.              tokenizer.interrupt = false;              State::Ok          } -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {              tokenizer.enter(Token::ThematicBreakSequence);              sequence(tokenizer, info)          } @@ -185,7 +136,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {              tokenizer.consume();              info.size += 1;              State::Fn(Box::new(|t| sequence(t, info)))
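
The `html_text.rs` hunks swap the local `CDATA_SEARCH` array for the shared `HTML_CDATA_PREFIX` constant and walk it one byte per state. A minimal standalone sketch of that prefix walk follows; the constant's value mirrors the crate's, but the free function and its name are illustrative only, not the crate's `Tokenizer` API:

```rust
const HTML_CDATA_PREFIX: [u8; 6] = *b"CDATA[";

/// Advance through the constant, failing on the first mismatch,
/// the way `cdata_open_inside` checks one byte per state.
fn starts_with_cdata_prefix(bytes: &[u8]) -> bool {
    let mut size = 0;
    while size < HTML_CDATA_PREFIX.len() {
        // Equivalent of `tokenizer.current == Some(HTML_CDATA_PREFIX[size])`.
        if bytes.get(size) != Some(&HTML_CDATA_PREFIX[size]) {
            return false;
        }
        size += 1;
    }
    true
}

fn main() {
    assert!(starts_with_cdata_prefix(b"CDATA[>&<]]>"));
    assert!(!starts_with_cdata_prefix(b"CDAT"));
}
```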
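In `label_end.rs`, `Slice::from_position(…).serialize()` becomes `Slice::from_indices(…).as_str()`, resolving the old "create a `to_str`?" to-do: when virtual spaces don't matter, a borrowed `&str` over the byte range is enough and no owned `String` is allocated. A standalone sketch of that shape (the helper below is ours, not the crate's `Slice` type):

```rust
/// Borrow the text between two byte indices instead of serializing
/// an owned copy; markdown input is assumed to be valid UTF-8.
fn as_str(bytes: &[u8], start: usize, end: usize) -> &str {
    std::str::from_utf8(&bytes[start..end]).expect("valid UTF-8")
}

fn main() {
    let bytes = b"[example]: https://example.com \"title\"";
    // The identifier between the brackets, borrowed rather than allocated.
    assert_eq!(as_str(bytes, 1, 8), "example");
}
```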
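The `list.rs` ordered-value loop now matches `Some(b'0'..=b'9')` under a size guard instead of `byte.is_ascii_digit()`. A sketch of that bounded digit run, standalone; the constant's value of 10 is an assumption consistent with CommonMark's nine-digit cap, since the guard consumes only while `size + 1` stays below it:

```rust
// Assumed cap: with `size + 1 < 10`, at most nine digits are consumed.
const LIST_ITEM_VALUE_SIZE_MAX: usize = 10;

/// Scan an ordered list item value, mirroring `inside` in `list.rs`.
fn scan_ordered_value(bytes: &[u8]) -> Option<usize> {
    let mut size = 0;
    while size + 1 < LIST_ITEM_VALUE_SIZE_MAX
        && matches!(bytes.get(size).copied(), Some(b'0'..=b'9'))
    {
        size += 1;
    }
    // The value only counts when a `.` or `)` marker follows.
    match bytes.get(size).copied() {
        Some(b'.' | b')') if size > 0 => Some(size),
        _ => None,
    }
}

fn main() {
    assert_eq!(scan_ordered_value(b"1. a"), Some(1));
    assert_eq!(scan_ordered_value(b"123456789) a"), Some(9));
    assert_eq!(scan_ordered_value(b"1234567890. a"), None); // ten digits
}
```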
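`partial_bom.rs` now uses plain `if` comparisons for the three bytes of the UTF-8 byte order mark. Standalone, the same check is simply a prefix test:

```rust
/// The UTF-8 byte order mark that `partial_bom.rs` skips byte by byte.
const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

fn main() {
    let input: &[u8] = &[0xEF, 0xBB, 0xBF, b'#', b' ', b'h', b'i'];
    assert!(input.starts_with(&BOM));
}
```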
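The reshuffled `raw` in `partial_destination.rs` orders its arms so that the balanced-end case comes first, then the limited `(`, then a single Nok arm for controls, space, and over-deep `(`, and finally the decrementing `)`. A standalone sketch of that balance-tracking loop; the function name and `limit` parameter are ours, standing in for `info.options.limit`:

```rust
/// Scan a raw (unenclosed) destination: track paren balance up to a
/// limit, end on whitespace or `)` only when balanced, skip escaped
/// parens, and reject controls (other than `\0` handling elsewhere).
fn scan_raw_destination(bytes: &[u8], limit: usize) -> Option<usize> {
    let mut balance = 0usize;
    let mut index = 0;
    loop {
        match bytes.get(index).copied() {
            // Balanced end of the destination.
            None | Some(b'\t' | b'\n' | b' ' | b')') if balance == 0 => return Some(index),
            Some(b'(') if balance < limit => balance += 1,
            // ASCII control, space, or a too-deeply nested `(`.
            None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => return None,
            Some(b')') => balance -= 1,
            Some(b'\\') if matches!(bytes.get(index + 1).copied(), Some(b'(' | b')')) => {
                index += 1; // skip the escaped paren
            }
            Some(_) => {}
        }
        index += 1;
    }
}

fn main() {
    assert_eq!(scan_raw_destination(b"a(b)c d", 32), Some(5));
    assert_eq!(scan_raw_destination(b"a(b c", 32), None); // unbalanced
}
```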
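`partial_title.rs` drops its `Kind` enum by storing the closing marker directly, via `marker: if marker == b'(' { b')' } else { marker }`. The same mapping as a tiny standalone function (the name is ours):

```rust
/// Map a title opener to the byte that closes it, as the `Info`
/// struct in `partial_title.rs` now stores it.
fn closing_marker(opening: u8) -> Option<u8> {
    match opening {
        b'"' | b'\'' => Some(opening), // quotes close themselves
        b'(' => Some(b')'),            // parens close with the counterpart
        _ => None,                     // not a title opener
    }
}

fn main() {
    assert_eq!(closing_marker(b'('), Some(b')'));
    assert_eq!(closing_marker(b'\''), Some(b'\''));
    assert_eq!(closing_marker(b'-'), None);
}
```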
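Finally, `attention.rs` and `thematic_break.rs` both now carry the marker as a raw `u8` and re-match it with guards like `Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker`. A standalone sketch of that run-scanning pattern, not the crate's state machine:

```rust
/// Count how often the marker byte at `start` repeats, the way the
/// `inside`/`sequence` states re-match the captured marker.
fn scan_run(bytes: &[u8], start: usize, openers: &[u8]) -> Option<(u8, usize)> {
    let marker = *bytes.get(start)?;
    if !openers.contains(&marker) {
        return None;
    }
    let mut size = 0;
    // Equivalent of `Some(…) if tokenizer.current.unwrap() == marker`.
    while bytes.get(start + size) == Some(&marker) {
        size += 1;
    }
    Some((marker, size))
}

fn main() {
    // Attention sequences open with `*` or `_` …
    assert_eq!(scan_run(b"**a**", 0, b"*_"), Some((b'*', 2)));
    // … thematic breaks with `*`, `-`, or `_`.
    assert_eq!(scan_run(b"---", 0, b"*-_"), Some((b'-', 3)));
    assert_eq!(scan_run(b"a", 0, b"*_"), None);
}
```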
