diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200 |
commit | 0eeff9148e327183e532752f46421a75506dd7a6 (patch) | |
tree | 4f0aed04f90aa759ce96a2e87aa719e7fa95c450 /src/construct/autolink.rs | |
parent | 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff) | |
download | markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.gz markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.bz2 markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.zip |
Refactor to improve states
* Remove custom kind wrappers, use plain bytes instead
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
Diffstat (limited to 'src/construct/autolink.rs')
-rw-r--r-- | src/construct/autolink.rs | 57 |
1 files changed, 29 insertions, 28 deletions
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index b843af8..c0514ae 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -137,12 +137,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if byte.is_ascii_alphabetic() => { + // ASCII alphabetic. + Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(scheme_or_email_atext)) } - Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer), - _ => State::Nok, + _ => email_atext(tokenizer), } } @@ -199,8 +199,8 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::AutolinkProtocol); end(tokenizer) } - Some(byte) if byte.is_ascii_control() => State::Nok, - None | Some(b' ') => State::Nok, + // ASCII control or space. + None | Some(b'\0'..=0x1F | b' ' | 0x7F) => State::Nok, Some(_) => { tokenizer.consume(); State::Fn(Box::new(url_inside)) @@ -220,7 +220,26 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) } - Some(byte) if is_ascii_atext(byte) => { + // ASCII atext. + // + // atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or + // a byte in the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 + // APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), + // U+002D DASH (`-`), U+002F SLASH (`/`), U+003D EQUALS TO (`=`), + // U+003F QUESTION MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE + // ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE + // (`~`). + // + // See: + // **\[RFC5322]**: + // [Internet Message Format](https://tools.ietf.org/html/rfc5322). + // P. Resnick. + // IETF. + // + // [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric + Some( + b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~', + ) => { tokenizer.consume(); State::Fn(Box::new(email_atext)) } @@ -236,7 +255,8 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { /// ``` fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size), + // ASCII alphanumeric. + Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size), _ => State::Nok, } } @@ -279,7 +299,8 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State { tokenizer.consume(); State::Fn(Box::new(move |t| email_value(t, size + 1))) } - Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { + // ASCII alphanumeric. + Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| email_label(t, size + 1))) } @@ -307,23 +328,3 @@ fn end(tokenizer: &mut Tokenizer) -> State { _ => unreachable!("expected `>`"), } } - -/// Check whether the character code represents an ASCII atext. -/// -/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in -/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`), -/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F -/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E -/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE -/// (`{`) to U+007E TILDE (`~`). -/// -/// See: -/// **\[RFC5322]**: -/// [Internet Message Format](https://tools.ietf.org/html/rfc5322). -/// P. Resnick. -/// IETF. -/// -/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric -fn is_ascii_atext(byte: u8) -> bool { - matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~') -} |