diff options
Diffstat (limited to '')
31 files changed, 396 insertions, 372 deletions
| @@ -154,7 +154,6 @@ cargo doc --document-private-items  - [ ] (3) Check subtokenizer unraveling is ok  - [ ] (3) Remove splicing and cloning in subtokenizer  - [ ] (3) Pass more references around -- [ ] (1) Remove todos in `span.rs` if not needed  - [ ] (1) Get markers from constructs (`string`, `text`)  - [ ] (3) Read through rust docs to figure out what useful functions there are,        and fix stuff I’m doing manually now @@ -276,3 +275,4 @@ important.  - [x] (3) Unicode punctuation  - [x] (1) Use rust to crawl unicode  - [x] (1) Document attention +- [x] (1) Remove todos in `span.rs` if not needed diff --git a/src/construct/attention.rs b/src/construct/attention.rs index dff8633..2144864 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -109,6 +109,13 @@ enum MarkerKind {  }  impl MarkerKind { +    /// Turn the kind into a [char]. +    fn as_char(&self) -> char { +        match self { +            MarkerKind::Asterisk => '*', +            MarkerKind::Underscore => '_', +        } +    }      /// Turn [char] into a kind.      ///      /// ## Panics @@ -137,14 +144,23 @@ impl MarkerKind {  /// Attentention sequence that we can take markers from.  #[derive(Debug)]  struct Sequence { +    /// Marker used in this sequence.      marker: MarkerKind, +    /// The index into events where this sequence’s `Enter` currently resides.      event_index: usize, +    /// The (shifted) point where this sequence starts.      start_point: Point, +    /// The (shifted) index where this sequence starts.      start_index: usize, +    /// The (shifted) point where this sequence ends.      end_point: Point, +    /// The (shifted) index where this sequence ends.      end_index: usize, +    /// The number of markers we can still use.      size: usize, +    /// Whether this sequence can open attention.      open: bool, +    /// Whether this sequence can close attention.      
close: bool,  } @@ -155,9 +171,9 @@ struct Sequence {  /// ```  pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::Char(char) if char == '*' || char == '_' => { +        Code::Char('*' | '_') => {              tokenizer.enter(TokenType::AttentionSequence); -            inside(tokenizer, code, char) +            inside(tokenizer, code, MarkerKind::from_code(code))          }          _ => (State::Nok, None),      } @@ -168,9 +184,9 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```markdown  /// *|*  /// ``` -fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult { +fn inside(tokenizer: &mut Tokenizer, code: Code, marker: MarkerKind) -> StateFnResult {      match code { -        Code::Char(char) if char == marker => { +        Code::Char(char) if char == marker.as_char() => {              tokenizer.consume(code);              (State::Fn(Box::new(move |t, c| inside(t, c, marker))), None)          } diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 6486a2d..e29bf8b 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -148,17 +148,11 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// a<u|ser@example.com>b  /// ```  fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { -    // Whether this character can be both a protocol and email atext. -    let unknown = match code { -        Code::Char('+' | '-' | '.') => true, -        Code::Char(char) if char.is_ascii_alphanumeric() => true, -        _ => false, -    }; - -    if unknown { -        scheme_inside_or_email_atext(tokenizer, code, 1) -    } else { -        email_atext(tokenizer, code) +    match code { +        Code::Char('+' | '-' | '.' 
| '0'..='9' | 'A'..='Z' | 'a'..='z') => { +            scheme_inside_or_email_atext(tokenizer, code, 1) +        } +        _ => email_atext(tokenizer, code),      }  } @@ -173,20 +167,14 @@ fn scheme_inside_or_email_atext(      code: Code,      size: usize,  ) -> StateFnResult { -    if let Code::Char(':') = code { -        tokenizer.consume(code); -        (State::Fn(Box::new(url_inside)), None) -    } else { -        // Whether this character can be both a protocol and email atext. -        let unknown = match code { -            Code::Char('+' | '-' | '.') if size < AUTOLINK_SCHEME_SIZE_MAX => true, -            Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_SCHEME_SIZE_MAX => { -                true -            } -            _ => false, -        }; - -        if unknown { +    match code { +        Code::Char(':') => { +            tokenizer.consume(code); +            (State::Fn(Box::new(url_inside)), None) +        } +        Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') +            if size < AUTOLINK_SCHEME_SIZE_MAX => +        {              tokenizer.consume(code);              (                  State::Fn(Box::new(move |t, c| { @@ -194,9 +182,8 @@ fn scheme_inside_or_email_atext(                  })),                  None,              ) -        } else { -            email_atext(tokenizer, code)          } +        _ => email_atext(tokenizer, code),      }  } @@ -291,22 +278,22 @@ fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnRes  /// a<user.name@ex-|ample.com>b  /// ```  fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { -    let ok = match code { -        Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => true, -        Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => true, -        _ => false, -    }; - -    if ok { -        tokenizer.consume(code); -        let func = if let Code::Char('-') = code { -  
          email_value -        } else { -            email_label -        }; -        (State::Fn(Box::new(move |t, c| func(t, c, size + 1))), None) -    } else { -        (State::Nok, None) +    match code { +        Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |t, c| email_value(t, c, size + 1))), +                None, +            ) +        } +        Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |t, c| email_label(t, c, size + 1))), +                None, +            ) +        } +        _ => (State::Nok, None),      }  } @@ -325,7 +312,7 @@ fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {              tokenizer.exit(TokenType::Autolink);              (State::Ok, None)          } -        _ => unreachable!("expected `>` at `end`"), +        _ => unreachable!("expected `>`"),      }  } diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index bc42d21..65e49ca 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -138,21 +138,18 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// a&|#x9;b  /// ```  fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    let info = Info { +        buffer: vec![], +        kind: Kind::Named, +    };      if let Code::Char('#') = code {          tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric);          tokenizer.consume(code);          tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric); -        (State::Fn(Box::new(numeric)), None) +        (State::Fn(Box::new(|t, c| numeric(t, c, info))), None)      } else {          tokenizer.enter(TokenType::CharacterReferenceValue); -        value( -            tokenizer, -            code, -          
  Info { -                buffer: vec![], -                kind: Kind::Named, -            }, -        ) +        value(tokenizer, code, info)      }  } @@ -163,37 +160,18 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// a&#|123;b  /// a&#|x9;b  /// ``` -fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn numeric(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {      if let Code::Char('x' | 'X') = code {          tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal);          tokenizer.consume(code);          tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal);          tokenizer.enter(TokenType::CharacterReferenceValue); - -        ( -            State::Fn(Box::new(|t, c| { -                value( -                    t, -                    c, -                    Info { -                        buffer: vec![], -                        kind: Kind::Hexadecimal, -                    }, -                ) -            })), -            None, -        ) +        info.kind = Kind::Hexadecimal; +        (State::Fn(Box::new(|t, c| value(t, c, info))), None)      } else {          tokenizer.enter(TokenType::CharacterReferenceValue); - -        value( -            tokenizer, -            code, -            Info { -                buffer: vec![], -                kind: Kind::Decimal, -            }, -        ) +        info.kind = Kind::Decimal; +        value(tokenizer, code, info)      }  } @@ -210,20 +188,19 @@ fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  fn value(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {      match code {          Code::Char(';') if !info.buffer.is_empty() => { -            tokenizer.exit(TokenType::CharacterReferenceValue); -            let value = info.buffer.iter().collect::<String>(); - -            if let Kind::Named = info.kind { -                if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) 
{ -                    return (State::Nok, None); -                } +            if Kind::Named == info.kind +                && !CHARACTER_REFERENCE_NAMES +                    .contains(&info.buffer.iter().collect::<String>().as_str()) +            { +                (State::Nok, None) +            } else { +                tokenizer.exit(TokenType::CharacterReferenceValue); +                tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); +                tokenizer.consume(code); +                tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); +                tokenizer.exit(TokenType::CharacterReference); +                (State::Ok, None)              } - -            tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); -            tokenizer.consume(code); -            tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); -            tokenizer.exit(TokenType::CharacterReference); -            (State::Ok, None)          }          Code::Char(char) => {              if info.buffer.len() < info.kind.max() && info.kind.allowed(char) { diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index f2d243a..05266ba 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -151,6 +151,17 @@ impl Kind {              _ => unreachable!("invalid char"),          }      } +    /// Turn [Code] into a kind. +    /// +    /// ## Panics +    /// +    /// Panics if `code` is not ``Code::Char('~' | '`')``. +    fn from_code(code: Code) -> Kind { +        match code { +            Code::Char(char) => Kind::from_char(char), +            _ => unreachable!("invalid code"), +        } +    }  }  /// State needed to parse code (fenced). @@ -172,10 +183,6 @@ struct Info {  ///  console.log(1);  ///  ~~~  /// ``` -/// -/// Parsing note: normally, the prefix is already stripped. -/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need -/// it.  
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      tokenizer.enter(TokenType::CodeFenced);      tokenizer.enter(TokenType::CodeFencedFence); @@ -202,7 +209,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult      }      match code { -        Code::Char(char) if char == '`' || char == '~' => { +        Code::Char('`' | '~') => {              tokenizer.enter(TokenType::CodeFencedFenceSequence);              sequence_open(                  tokenizer, @@ -210,7 +217,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult                  Info {                      prefix,                      size: 0, -                    kind: Kind::from_char(char), +                    kind: Kind::from_code(code),                  },              )          } @@ -237,11 +244,11 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State                  None,              )          } -        _ if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN => (State::Nok, None), -        _ => { +        _ if info.size >= CODE_FENCED_SEQUENCE_SIZE_MIN => {              tokenizer.exit(TokenType::CodeFencedFenceSequence);              tokenizer.attempt_opt(space_or_tab(), |t, c| info_before(t, c, info))(tokenizer, code)          } +        _ => (State::Nok, None),      }  } @@ -291,7 +298,7 @@ fn info_inside(              tokenizer.exit(TokenType::CodeFencedFenceInfo);              tokenizer.attempt_opt(space_or_tab(), |t, c| meta_before(t, c, info))(tokenizer, code)          } -        Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), +        Code::Char('`') if info.kind == Kind::GraveAccent => (State::Nok, None),          Code::Char(_) => {              codes.push(code);              tokenizer.consume(code); @@ -339,7 +346,7 @@ fn meta(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {              
tokenizer.exit(TokenType::CodeFencedFence);              at_break(tokenizer, code, info)          } -        Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), +        Code::Char('`') if info.kind == Kind::GraveAccent => (State::Nok, None),          _ => {              tokenizer.consume(code);              (State::Fn(Box::new(|t, c| meta(t, c, info))), None) @@ -369,7 +376,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult                  }              },          )(tokenizer, code), -        _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), +        _ => unreachable!("expected eof/eol"),      }  } diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index c595c75..a6dc7eb 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -138,7 +138,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnR  fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {      match code {          Code::None => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              tokenizer.enter(TokenType::CodeTextLineEnding);              tokenizer.consume(code);              tokenizer.exit(TokenType::CodeTextLineEnding); @@ -165,7 +165,7 @@ fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnRe  /// ```  fn data(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {      match code { -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '`') => { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '`') => {              tokenizer.exit(TokenType::CodeTextData);              between(tokenizer, code, size_open)          } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 
e1afd03..db4a009 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -227,7 +227,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              tokenizer.exit(TokenType::Definition);              // You’d be interrupting.              tokenizer.interrupt = true; @@ -293,7 +293,7 @@ fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn title_after_after_optional_whitespace(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              (State::Ok, Some(vec![code]))          }          _ => (State::Nok, None), diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index 1e755a3..212d276 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              tokenizer.exit(TokenType::HardBreakEscape);              (State::Ok, Some(vec![code]))          } diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs index 6709e51..35a7cab 100644 --- a/src/construct/hard_break_trailing.rs +++ b/src/construct/hard_break_trailing.rs @@ -76,7 +76,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {            
      None,              )          } -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')              if size >= HARD_BREAK_PREFIX_SIZE_MIN =>          {              tokenizer.exit(TokenType::HardBreakTrailingSpace); diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 3ce7052..9fa2ace 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -181,7 +181,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      }  } -/// To do. +/// Resolve heading (atx).  pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {      let mut edit_map = EditMap::new();      let mut index = 0; diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index df20aa7..211434f 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -179,7 +179,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      }  } -/// To do. +/// Resolve heading (setext).  pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {      let mut edit_map = EditMap::new();      let mut index = 0; @@ -207,7 +207,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {              tokenizer.events[enter].token_type = TokenType::HeadingSetextText;              tokenizer.events[exit].token_type = TokenType::HeadingSetextText; -            // Add of Enter:HeadingSetext, Exit:HeadingSetext. +            // Add Enter:HeadingSetext, Exit:HeadingSetext.              
let mut heading_enter = tokenizer.events[enter].clone();              heading_enter.token_type = TokenType::HeadingSetext;              let mut heading_exit = tokenizer.events[index].clone(); diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index a1bddad..229b0ef 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -103,6 +103,7 @@ use crate::construct::{      blank_line::start as blank_line, partial_space_or_tab::space_or_tab_min_max,  };  use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::codes::{parse, serialize};  /// Kind of HTML (flow).  #[derive(Debug, PartialEq)] @@ -164,6 +165,17 @@ impl QuoteKind {              _ => unreachable!("invalid char"),          }      } +    /// Turn [Code] into a kind. +    /// +    /// ## Panics +    /// +    /// Panics if `code` is not `Code::Char('"' | '\'')`. +    fn from_code(code: Code) -> QuoteKind { +        match code { +            Code::Char(char) => QuoteKind::from_char(char), +            _ => unreachable!("invalid code"), +        } +    }  }  /// State needed to parse HTML (flow). @@ -175,7 +187,7 @@ struct Info {      start_tag: bool,      /// Used depending on `kind` to either collect all parsed characters, or to      /// store expected characters. -    buffer: Vec<char>, +    buffer: Vec<Code>,      /// `index` into `buffer` when expecting certain characters.      index: usize,      /// Current quote, when in a double or single quoted attribute value. 
@@ -254,7 +266,7 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {                  None,              )          } -        Code::Char(char) if char.is_ascii_alphabetic() => { +        Code::Char('A'..='Z' | 'a'..='z') => {              info.start_tag = true;              tag_name(tokenizer, code, info)          } @@ -282,14 +294,14 @@ fn declaration_open(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> St          Code::Char('[') => {              tokenizer.consume(code);              info.kind = Kind::Cdata; -            info.buffer = vec!['C', 'D', 'A', 'T', 'A', '[']; +            info.buffer = parse("CDATA[");              info.index = 0;              (                  State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))),                  None,              )          } -        Code::Char(char) if char.is_ascii_alphabetic() => { +        Code::Char('A'..='Z' | 'a'..='z') => {              tokenizer.consume(code);              info.kind = Kind::Declaration;              ( @@ -329,22 +341,21 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Sta  /// <![CDATA|[>&<]]>  /// ```  fn cdata_open_inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { -    match code { -        Code::Char(char) if char == info.buffer[info.index] => { -            info.index += 1; -            tokenizer.consume(code); +    if code == info.buffer[info.index] { +        info.index += 1; +        tokenizer.consume(code); -            if info.index == info.buffer.len() { -                info.buffer.clear(); -                (State::Fn(Box::new(|t, c| continuation(t, c, info))), None) -            } else { -                ( -                    State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))), -                    None, -                ) -            } +        if info.index == info.buffer.len() { +            info.buffer.clear(); +            (State::Fn(Box::new(|t, c| continuation(t, c, 
info))), None) +        } else { +            ( +                State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))), +                None, +            )          } -        _ => (State::Nok, None), +    } else { +        (State::Nok, None)      }  } @@ -355,9 +366,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> S  /// ```  fn tag_close_start(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {      match code { -        Code::Char(char) if char.is_ascii_alphabetic() => { +        Code::Char('A'..='Z' | 'a'..='z') => {              tokenizer.consume(code); -            info.buffer.push(char); +            info.buffer.push(code);              (State::Fn(Box::new(|t, c| tag_name(t, c, info))), None)          }          _ => (State::Nok, None), @@ -376,13 +387,9 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes          | Code::CarriageReturnLineFeed          | Code::VirtualSpace          | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => { -            let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); +            let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();              let name = tag_name_buffer.as_str(); -            let slash = if let Code::Char(char) = code { -                char == '/' -            } else { -                false -            }; +            let slash = matches!(code, Code::Char('/'));              info.buffer.clear(); @@ -413,9 +420,9 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes                  }              }          } -        Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { +        Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {              tokenizer.consume(code); -            info.buffer.push(char); +            info.buffer.push(code);              (State::Fn(Box::new(|t, c| tag_name(t, c, info))), None)          }  
        Code::Char(_) => (State::Nok, None), @@ -481,7 +488,7 @@ fn complete_attribute_name_before(              tokenizer.consume(code);              (State::Fn(Box::new(|t, c| complete_end(t, c, info))), None)          } -        Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => { +        Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => {              tokenizer.consume(code);              (                  State::Fn(Box::new(|t, c| complete_attribute_name(t, c, info))), @@ -508,13 +515,7 @@ fn complete_attribute_name_before(  /// ```  fn complete_attribute_name(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {      match code { -        Code::Char(char) -            if char == '-' -                || char == '.' -                || char == ':' -                || char == '_' -                || char.is_ascii_alphanumeric() => -        { +        Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {              tokenizer.consume(code);              (                  State::Fn(Box::new(|t, c| complete_attribute_name(t, c, info))), @@ -571,9 +572,9 @@ fn complete_attribute_value_before(  ) -> StateFnResult {      match code {          Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None), -        Code::Char(char) if char == '"' || char == '\'' => { +        Code::Char('"' | '\'') => {              tokenizer.consume(code); -            info.quote = Some(QuoteKind::from_char(char)); +            info.quote = Some(QuoteKind::from_code(code));              (                  State::Fn(Box::new(|t, c| complete_attribute_value_quoted(t, c, info))),                  None, @@ -602,7 +603,7 @@ fn complete_attribute_value_quoted(      info: Info,  ) -> StateFnResult {      match code { -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, 
None),          Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => {              tokenizer.consume(code);              ( @@ -860,7 +861,7 @@ fn continuation_raw_end_tag(  ) -> StateFnResult {      match code {          Code::Char('>') => { -            let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); +            let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();              info.buffer.clear();              if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) { @@ -873,9 +874,9 @@ fn continuation_raw_end_tag(                  continuation(tokenizer, code, info)              }          } -        Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => { +        Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => {              tokenizer.consume(code); -            info.buffer.push(char); +            info.buffer.push(code);              (                  State::Fn(Box::new(|t, c| continuation_raw_end_tag(t, c, info))),                  None, diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 2ac0ccd..0926f48 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -56,6 +56,7 @@  use crate::construct::partial_space_or_tab::space_or_tab;  use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer}; +use crate::util::codes::parse;  /// Start of HTML (text)  /// @@ -94,7 +95,7 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {              tokenizer.consume(code);              (State::Fn(Box::new(instruction)), None)          } -        Code::Char(char) if char.is_ascii_alphabetic() => { +        Code::Char('A'..='Z' | 'a'..='z') => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_open)), None)          } @@ -117,13 +118,13 @@ fn declaration_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {          }          
Code::Char('[') => {              tokenizer.consume(code); -            let buffer = vec!['C', 'D', 'A', 'T', 'A', '[']; +            let buffer = parse("CDATA[");              (                  State::Fn(Box::new(|t, c| cdata_open_inside(t, c, buffer, 0))),                  None,              )          } -        Code::Char(char) if char.is_ascii_alphabetic() => { +        Code::Char('A'..='Z' | 'a'..='z') => {              tokenizer.consume(code);              (State::Fn(Box::new(declaration)), None)          } @@ -197,7 +198,7 @@ fn comment_start_dash(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  fn comment(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code {          Code::None => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              at_line_ending(tokenizer, code, Box::new(comment))          }          Code::Char('-') => { @@ -239,25 +240,24 @@ fn comment_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  fn cdata_open_inside(      tokenizer: &mut Tokenizer,      code: Code, -    buffer: Vec<char>, +    buffer: Vec<Code>,      index: usize,  ) -> StateFnResult { -    match code { -        Code::Char(char) if char == buffer[index] => { -            tokenizer.consume(code); +    if code == buffer[index] { +        tokenizer.consume(code); -            if index + 1 == buffer.len() { -                (State::Fn(Box::new(cdata)), None) -            } else { -                ( -                    State::Fn(Box::new(move |t, c| { -                        cdata_open_inside(t, c, buffer, index + 1) -                    })), -                    None, -                ) -            } +        if index + 1 == buffer.len() { +            (State::Fn(Box::new(cdata)), None) +        } else { +            ( +                State::Fn(Box::new(move |t, c| { +                    cdata_open_inside(t, c, 
buffer, index + 1) +                })), +                None, +            )          } -        _ => (State::Nok, None), +    } else { +        (State::Nok, None)      }  } @@ -269,7 +269,7 @@ fn cdata_open_inside(  fn cdata(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code {          Code::None => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              at_line_ending(tokenizer, code, Box::new(cdata))          }          Code::Char(']') => { @@ -319,7 +319,7 @@ fn cdata_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code {          Code::None | Code::Char('>') => end(tokenizer, code), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              at_line_ending(tokenizer, code, Box::new(declaration))          }          _ => { @@ -338,7 +338,7 @@ fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  fn instruction(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code {          Code::None => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              at_line_ending(tokenizer, code, Box::new(instruction))          }          Code::Char('?') => { @@ -372,7 +372,7 @@ fn instruction_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::Char(char) if char.is_ascii_alphabetic() => { +        Code::Char('A'..='Z' | 'a'..='z') => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_close)), None)          } @@ -388,7 +388,7 @@ fn 
tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { +        Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_close)), None)          } @@ -404,7 +404,7 @@ fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              at_line_ending(tokenizer, code, Box::new(tag_close_between))          }          Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -422,13 +422,13 @@ fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { +        Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_open)), None)          }          Code::CarriageReturnLineFeed          | Code::VirtualSpace -        | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code), +        | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),          _ => (State::Nok, None),      }  } @@ -442,7 +442,7 @@ fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              
at_line_ending(tokenizer, code, Box::new(tag_open_between))          }          Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -453,7 +453,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {              tokenizer.consume(code);              (State::Fn(Box::new(end)), None)          } -        Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => { +        Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_open_attribute_name)), None)          } @@ -470,13 +470,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::Char(char) -            if char == '-' -                || char == '.' -                || char == ':' -                || char == '_' -                || char.is_ascii_alphanumeric() => -        { +        Code::Char('-' | '.' 
| '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_open_attribute_name)), None)          } @@ -494,7 +488,7 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResu  /// ```  fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              at_line_ending(tokenizer, code, Box::new(tag_open_attribute_name_after))          }          Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -519,7 +513,7 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> State  fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code {          Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              at_line_ending(tokenizer, code, Box::new(tag_open_attribute_value_before))          }          Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -555,7 +549,7 @@ fn tag_open_attribute_value_quoted(  ) -> StateFnResult {      match code {          Code::None => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => at_line_ending( +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => at_line_ending(              tokenizer,              code,              Box::new(move |t, c| tag_open_attribute_value_quoted(t, c, marker)), @@ -589,7 +583,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> S          Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),          Code::CarriageReturnLineFeed          | Code::VirtualSpace -        | Code::Char('\r' | '\n' | '\t' | ' 
' | '/' | '>') => tag_open_between(tokenizer, code), +        | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),          Code::Char(_) => {              tokenizer.consume(code);              (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None) @@ -607,7 +601,7 @@ fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer, code: Code)      match code {          Code::CarriageReturnLineFeed          | Code::VirtualSpace -        | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code), +        | Code::Char('\t' | '\n' | '\r' | ' ' | '>' | '/') => tag_open_between(tokenizer, code),          _ => (State::Nok, None),      }  } @@ -646,7 +640,7 @@ fn at_line_ending(      return_state: Box<StateFn>,  ) -> StateFnResult {      match code { -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              tokenizer.exit(TokenType::HtmlTextData);              tokenizer.enter(TokenType::LineEnding);              tokenizer.consume(code); @@ -656,7 +650,7 @@ fn at_line_ending(                  None,              )          } -        _ => unreachable!("expected line ending"), +        _ => unreachable!("expected eol"),      }  } diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index ae2f4de..5ec278e 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -45,7 +45,7 @@ use crate::util::edit_map::EditMap;  pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code {          Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { -            unreachable!("unexpected eol/eof at start of paragraph") +            unreachable!("unexpected eol/eof")          }          _ => {              tokenizer.enter(TokenType::Paragraph); @@ -99,7 +99,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {              {              
    // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding, Enter:Paragraph.                  edit_map.add(exit_index, 4, vec![]); -                println!("rm {:?} {:?}", exit_index, exit_index + 4);                  // Add Exit:LineEnding position info to Exit:Data.                  let line_ending_exit = &tokenizer.events[enter_next_index - 1]; diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 9f99570..555ccaf 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -6,8 +6,6 @@  //! [string]: crate::content::string  //! [text]: crate::content::text -// To do: pass token types in? -  use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer};  use crate::util::edit_map::EditMap; @@ -34,7 +32,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnR  fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult {      match code {          Code::None => (State::Ok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              tokenizer.enter(TokenType::LineEnding);              tokenizer.consume(code);              tokenizer.exit(TokenType::LineEnding); @@ -58,7 +56,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnRe  /// ```  fn data(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult {      let done = match code { -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => true, +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => true,          _ if stop.contains(&code) => true,          _ => false,      }; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 8b281c7..31c13ec 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -171,7 
+171,7 @@ fn enclosed(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult              tokenizer.exit(info.options.string.clone());              enclosed_before(tokenizer, code, info)          } -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '<') => { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '<') => {              (State::Nok, None)          }          Code::Char('\\') => { @@ -235,7 +235,7 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {          Code::None          | Code::CarriageReturnLineFeed          | Code::VirtualSpace -        | Code::Char('\t' | '\r' | '\n' | ' ') => { +        | Code::Char('\t' | '\n' | '\r' | ' ') => {              if info.balance > 0 {                  (State::Nok, None)              } else { diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 32182d6..f201f60 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -133,7 +133,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes              tokenizer.exit(info.options.label);              (State::Ok, None)          } -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go(              space_or_tab_eol_with_options(EolOptions {                  content_type: Some(ContentType::String),                  connect: info.connect, @@ -165,7 +165,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes  /// ```  fn label(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {      match code { -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '[' | ']') => { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '[' | ']') => {              tokenizer.exit(TokenType::Data);              
at_break(tokenizer, code, info)          } diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index d2934b3..5b1ec5e 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -195,7 +195,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResul  /// ```  fn after_space_or_tab(tokenizer: &mut Tokenizer, code: Code, mut info: EolInfo) -> StateFnResult {      match code { -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              tokenizer.enter_with_content(TokenType::LineEnding, info.options.content_type);              if info.connect { @@ -254,7 +254,7 @@ fn after_more_space_or_tab(_tokenizer: &mut Tokenizer, code: Code) -> StateFnRes      // Blank line not allowed.      if matches!(          code, -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')      ) {          (State::Nok, None)      } else { diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index caacb0d..010f554 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -102,6 +102,19 @@ impl Kind {              _ => unreachable!("invalid char"),          }      } +    /// Turn [Code] into a kind. +    /// +    /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. +    /// +    /// ## Panics +    /// +    /// Panics if `code` is not `Code::Char('(' | '"' | '\'')`. +    fn from_code(code: Code) -> Kind { +        match code { +            Code::Char(char) => Kind::from_char(char), +            _ => unreachable!("invalid code"), +        } +    }  }  /// State needed to parse titles. 
@@ -124,10 +137,10 @@ struct Info {  /// ```  pub fn start(tokenizer: &mut Tokenizer, code: Code, options: Options) -> StateFnResult {      match code { -        Code::Char(char) if char == '"' || char == '\'' || char == '(' => { +        Code::Char('"' | '\'' | '(') => {              let info = Info {                  connect: false, -                kind: Kind::from_char(char), +                kind: Kind::from_code(code),                  options,              };              tokenizer.enter(info.options.title.clone()); @@ -180,7 +193,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes              begin(tokenizer, code, info)          }          Code::None => (State::Nok, None), -        Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go(              space_or_tab_eol_with_options(EolOptions {                  content_type: Some(ContentType::String),                  connect: info.connect, @@ -216,7 +229,7 @@ fn title(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {              tokenizer.exit(TokenType::Data);              at_break(tokenizer, code, info)          } -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {              tokenizer.exit(TokenType::Data);              at_break(tokenizer, code, info)          } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 62b1205..c9ec564 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -33,7 +33,7 @@ pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {          space_or_tab(),          if matches!(              tokenizer.previous, -            Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') +            Code::None | 
Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')          ) {              // If there’s whitespace, and we were at an eol/eof, `ok`              ok @@ -48,7 +48,7 @@ pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  fn at_eol(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      if matches!(          code, -        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')      ) {          ok(tokenizer, code)      } else { diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 8d29157..28aca34 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -95,7 +95,7 @@ impl Kind {      ///      /// ## Panics      /// -    /// Panics if `char` is not `*`, `_`, or `_`. +    /// Panics if `char` is not `*`, `-`, or `_`.      fn from_char(char: char) -> Kind {          match char {              '*' => Kind::Asterisk, @@ -104,6 +104,19 @@ impl Kind {              _ => unreachable!("invalid char"),          }      } +    /// Turn [Code] into a kind. +    /// +    /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. +    /// +    /// ## Panics +    /// +    /// Panics if `code` is not `Code::Char('*' | '-' | '_')`. +    fn from_code(code: Code) -> Kind { +        match code { +            Code::Char(char) => Kind::from_char(char), +            _ => unreachable!("invalid code"), +        } +    }  }  /// State needed to parse thematic breaks. 
@@ -133,11 +146,11 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      match code { -        Code::Char(char) if char == '*' || char == '-' || char == '_' => at_break( +        Code::Char('*' | '-' | '_') => at_break(              tokenizer,              code,              Info { -                kind: Kind::from_char(char), +                kind: Kind::from_code(code),                  size: 0,              },          ), diff --git a/src/content/flow.rs b/src/content/flow.rs index 3ff948d..74c6a62 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -140,7 +140,7 @@ fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {              tokenizer.interrupt = false;              (State::Fn(Box::new(start)), None)          } -        _ => unreachable!("expected eol/eof after blank line `{:?}`", code), +        _ => unreachable!("expected eol/eof"),      }  } @@ -162,7 +162,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {              tokenizer.exit(TokenType::LineEnding);              (State::Fn(Box::new(start)), None)          } -        _ => unreachable!("unexpected non-eol/eof after flow `{:?}`", code), +        _ => unreachable!("expected eol/eof"),      }  } diff --git a/src/content/mod.rs b/src/content/mod.rs index 395e41b..ae8ad83 100644 --- a/src/content/mod.rs +++ b/src/content/mod.rs @@ -1,6 +1,5 @@  //! Content types found in markdown. -#[allow(clippy::module_inception)]  pub mod flow;  pub mod string;  pub mod text; diff --git a/src/content/text.rs b/src/content/text.rs index ecb6ae1..cf630f1 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -8,15 +8,15 @@  //!  //! *   [Attention][crate::construct::attention]  //! *   [Autolink][crate::construct::autolink] -//! *   [HTML (text)][crate::construct::html_text] +//! *   [Character escape][crate::construct::character_escape] +//! 
*   [Character reference][crate::construct::character_reference] +//! *   [Code (text)][crate::construct::code_text]  //! *   [Hard break (escape)][crate::construct::hard_break_escape]  //! *   [Hard break (trailing)][crate::construct::hard_break_trailing] -//! *   [Code (text)][crate::construct::code_text] +//! *   [HTML (text)][crate::construct::html_text]  //! *   [Label start (image)][crate::construct::label_start_image]  //! *   [Label start (link)][crate::construct::label_start_link]  //! *   [Label end][crate::construct::label_end] -//! *   [Character escape][crate::construct::character_escape] -//! *   [Character reference][crate::construct::character_reference]  use crate::construct::{      attention::start as attention, autolink::start as autolink, diff --git a/src/parser.rs b/src/parser.rs index 89a0de1..32689d6 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -3,7 +3,8 @@  use std::collections::HashSet;  // To do: this should start with `containers`, when they’re done.  use crate::content::flow::flow; -use crate::tokenizer::{as_codes, Code, Event, Point}; +use crate::tokenizer::{Code, Event, Point}; +use crate::util::codes::parse as parse_codes;  /// Information needed, in all content types, when parsing markdown.  /// @@ -22,7 +23,7 @@ pub struct ParseState {  /// Passes the codes back so the compiler can access the source.  pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) {      let mut parse_state = ParseState { -        codes: as_codes(value), +        codes: parse_codes(value),          definitions: HashSet::new(),      }; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 1fa94d7..f0f9ff0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -11,7 +11,6 @@  //! [`attempt`]: Tokenizer::attempt  //! [`check`]: Tokenizer::check -use crate::constant::TAB_SIZE;  use crate::parser::ParseState;  use std::collections::HashMap; @@ -2224,83 +2223,6 @@ fn feed_impl(      check_statefn_result((state, None))  } -/// Turn a string into codes. 
-pub fn as_codes(value: &str) -> Vec<Code> { -    let mut codes: Vec<Code> = vec![]; -    let mut at_start = true; -    let mut at_carriage_return = false; -    let mut column = 1; - -    for char in value.chars() { -        if at_start { -            if char == '\u{feff}' { -                // Ignore. -                continue; -            } - -            at_start = false; -        } - -        // Send a CRLF. -        if at_carriage_return && '\n' == char { -            at_carriage_return = false; -            codes.push(Code::CarriageReturnLineFeed); -        } else { -            // Send the previous CR: we’re not at a next `\n`. -            if at_carriage_return { -                at_carriage_return = false; -                codes.push(Code::Char('\r')); -            } - -            match char { -                // Send a replacement character. -                '\0' => { -                    column += 1; -                    codes.push(Code::Char('�')); -                } -                // Send a tab and virtual spaces. -                '\t' => { -                    let remainder = column % TAB_SIZE; -                    let mut virtual_spaces = if remainder == 0 { -                        0 -                    } else { -                        TAB_SIZE - remainder -                    }; -                    codes.push(Code::Char(char)); -                    column += 1; -                    while virtual_spaces > 0 { -                        codes.push(Code::VirtualSpace); -                        column += 1; -                        virtual_spaces -= 1; -                    } -                } -                // Send an LF. -                '\n' => { -                    column = 1; -                    codes.push(Code::Char(char)); -                } -                // Don’t send anything yet. 
-                '\r' => { -                    column = 1; -                    at_carriage_return = true; -                } -                // Send the char. -                _ => { -                    column += 1; -                    codes.push(Code::Char(char)); -                } -            } -        }; -    } - -    // Send the last CR: we’re not at a next `\n`. -    if at_carriage_return { -        codes.push(Code::Char('\r')); -    } - -    codes -} -  /// Check a [`StateFnResult`][], make sure its valid (that there are no bugs),  /// and clean a final eof passed back in `remainder`.  fn check_statefn_result(result: StateFnResult) -> StateFnResult { diff --git a/src/util/codes.rs b/src/util/codes.rs new file mode 100644 index 0000000..8a46d02 --- /dev/null +++ b/src/util/codes.rs @@ -0,0 +1,126 @@ +//! Utilities to deal with character codes. + +use crate::constant::TAB_SIZE; +use crate::tokenizer::Code; + +/// Turn a string into codes. +pub fn parse(value: &str) -> Vec<Code> { +    let mut codes: Vec<Code> = vec![]; +    let mut at_start = true; +    let mut at_carriage_return = false; +    let mut column = 1; + +    for char in value.chars() { +        if at_start { +            if char == '\u{feff}' { +                // Ignore. +                continue; +            } + +            at_start = false; +        } + +        // Send a CRLF. +        if at_carriage_return && '\n' == char { +            at_carriage_return = false; +            codes.push(Code::CarriageReturnLineFeed); +        } else { +            // Send the previous CR: we’re not at a next `\n`. +            if at_carriage_return { +                at_carriage_return = false; +                codes.push(Code::Char('\r')); +            } + +            match char { +                // Send a replacement character. 
+                '\0' => { +                    column += 1; +                    codes.push(Code::Char('�')); +                } +                // Send a tab and virtual spaces. +                '\t' => { +                    let remainder = column % TAB_SIZE; +                    let mut virtual_spaces = if remainder == 0 { +                        0 +                    } else { +                        TAB_SIZE - remainder +                    }; +                    codes.push(Code::Char(char)); +                    column += 1; +                    while virtual_spaces > 0 { +                        codes.push(Code::VirtualSpace); +                        column += 1; +                        virtual_spaces -= 1; +                    } +                } +                // Send an LF. +                '\n' => { +                    column = 1; +                    codes.push(Code::Char(char)); +                } +                // Don’t send anything yet. +                '\r' => { +                    column = 1; +                    at_carriage_return = true; +                } +                // Send the char. +                _ => { +                    column += 1; +                    codes.push(Code::Char(char)); +                } +            } +        }; +    } + +    // Send the last CR: we’re not at a next `\n`. +    if at_carriage_return { +        codes.push(Code::Char('\r')); +    } + +    codes +} + +/// Serialize codes, optionally expanding tabs. 
+pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { +    let mut at_tab = false; +    let mut index = 0; +    let mut value: Vec<char> = vec![]; + +    while index < codes.len() { +        let code = codes[index]; +        let mut at_tab_next = false; + +        match code { +            Code::CarriageReturnLineFeed => { +                value.push('\r'); +                value.push('\n'); +            } +            Code::Char(char) if char == '\n' || char == '\r' => { +                value.push(char); +            } +            Code::Char(char) if char == '\t' => { +                at_tab_next = true; +                value.push(if expand_tabs { ' ' } else { char }); +            } +            Code::VirtualSpace => { +                if !expand_tabs && at_tab { +                    index += 1; +                    continue; +                } +                value.push(' '); +            } +            Code::Char(char) => { +                value.push(char); +            } +            Code::None => { +                unreachable!("unexpected EOF code in codes"); +            } +        } + +        at_tab = at_tab_next; + +        index += 1; +    } + +    value.into_iter().collect() +} diff --git a/src/util/encode.rs b/src/util/encode.rs index 5762c22..a3bd589 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -21,11 +21,36 @@  ///  /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)  pub fn encode(value: &str) -> String { -    // To do: replacing 4 times might just be slow. -    // Perhaps we can walk the chars. 
-    value -        .replace('&', "&amp;") -        .replace('"', "&quot;") -        .replace('<', "&lt;") -        .replace('>', "&gt;") +    let mut result: Vec<&str> = vec![]; +    let mut start = 0; +    let mut index = 0; + +    for byte in value.bytes() { +        if let Some(replacement) = match byte { +            b'&' => Some("&amp;"), +            b'"' => Some("&quot;"), +            b'<' => Some("&lt;"), +            b'>' => Some("&gt;"), +            _ => None, +        } { +            if start != index { +                result.push(&value[start..index]); +            } + +            result.push(replacement); +            start = index + 1; +        } + +        index += 1; +    } + +    if start == 0 { +        value.to_string() +    } else { +        if start < index { +            result.push(&value[start..index]); +        } + +        result.join("") +    }  } diff --git a/src/util/mod.rs b/src/util/mod.rs index 68ef275..d1a0e01 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@  //! Utilities used when compiling markdown. +pub mod codes;  pub mod decode_character_reference;  pub mod edit_map;  pub mod encode; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index 4753f7b..123a3a9 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -39,7 +39,7 @@ pub fn normalize_identifier(value: &str) -> String {      // Collapse markdown whitespace and trim it.      for char in value.chars() {          match char { -            '\t' | '\r' | '\n' | ' ' => { +            '\t' | '\n' | '\r' | ' ' => {                  at_whitespace = true;              }              _ => { diff --git a/src/util/span.rs b/src/util/span.rs index 02811cc..32dd00f 100644 --- a/src/util/span.rs +++ b/src/util/span.rs @@ -1,20 +1,15 @@  //! Utilities to deal with semantic labels.  
use crate::tokenizer::{Code, Event, EventType}; +use crate::util::codes::serialize as serialize_codes;  /// A struct representing the span of an opening and closing event of a token.  #[derive(Debug)]  pub struct Span { -    // To do: probably needed in the future. -    // start: Point,      /// Absolute offset (and `index` in `codes`) of where this span starts.      pub start_index: usize, -    // To do: probably needed in the future. -    // end: Point,      /// Absolute offset (and `index` in `codes`) of where this span ends.      pub end_index: usize, -    // To do: probably needed in the future. -    // token_type: TokenType,  }  /// Get a span from an event. @@ -29,10 +24,8 @@ pub struct Span {  /// When `micromark` is used, this function never panics.  pub fn from_exit_event(events: &[Event], index: usize) -> Span {      let exit = &events[index]; -    // let end = exit.point.clone();      let end_index = exit.index;      let token_type = exit.token_type.clone(); -    // To do: support `enter` events if needed and walk forwards?      assert_eq!(          exit.event_type,          EventType::Exit, @@ -44,11 +37,8 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span {          let enter = &events[enter_index];          if enter.event_type == EventType::Enter && enter.token_type == token_type {              return Span { -                // start: enter.point.clone(),                  start_index: enter.index, -                // end,                  end_index, -                // token_type,              };          } @@ -65,48 +55,3 @@ pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {  pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {      &codes[span.start_index..span.end_index]  } - -/// Serialize a slice of codes, optionally expanding tabs. 
-fn serialize_codes(codes: &[Code], expand_tabs: bool) -> String { -    let mut at_tab = false; -    let mut index = 0; -    let mut value: Vec<char> = vec![]; - -    while index < codes.len() { -        let code = codes[index]; -        let mut at_tab_next = false; - -        match code { -            Code::CarriageReturnLineFeed => { -                value.push('\r'); -                value.push('\n'); -            } -            Code::Char(char) if char == '\n' || char == '\r' => { -                value.push(char); -            } -            Code::Char(char) if char == '\t' => { -                at_tab_next = true; -                value.push(if expand_tabs { ' ' } else { char }); -            } -            Code::VirtualSpace => { -                if !expand_tabs && at_tab { -                    index += 1; -                    continue; -                } -                value.push(' '); -            } -            Code::Char(char) => { -                value.push(char); -            } -            Code::None => { -                unreachable!("unexpected EOF code in codes"); -            } -        } - -        at_tab = at_tab_next; - -        index += 1; -    } - -    value.into_iter().collect() -} | 
