//! GFM: autolink literal occurs in the [text][] content type.
//!
//! ## Grammar
//!
//! Autolink literals form with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! gfm_autolink_literal ::= http_autolink | www_autolink | email_autolink
//!
//! ; Restriction: the code before must be `www_autolink_before`.
//! ; Restriction: the code after `.` must not be eof.
//! www_autolink ::= 3('w' | 'W') '.' [domain [path]]
//! www_autolink_before ::= eof | eol | space_or_tab | '(' | '*' | '_' | '[' | ']' | '~'
//!
//! ; Restriction: the code before must be `http_autolink_before`.
//! ; Restriction: the code after the protocol must be `http_autolink_protocol_after`.
//! http_autolink ::= ('h' | 'H') 2('t' | 'T') ('p' | 'P') ['s' | 'S'] ':' 2'/' domain [path]
//! http_autolink_before ::= byte - ascii_alpha
//! http_autolink_protocol_after ::= byte - eof - eol - ascii_control - unicode_whitespace - unicode_punctuation
//!
//! ; Restriction: the code before must be `email_autolink_before`.
//! ; Restriction: `ascii_digit` may not occur in the last label part of the label.
//! email_autolink ::= 1*('+' | '-' | '.' | '_' | ascii_alphanumeric) '@' 1*(1*label_segment label_dot_cont) 1*label_segment
//! email_autolink_before ::= byte - ascii_alpha - '/'
//!
//! ; Restriction: `_` may not occur in the last two domain parts.
//! domain ::= 1*(url_ampt_cont | domain_punct_cont | '-' | byte - eof - ascii_control - unicode_whitespace - unicode_punctuation)
//! ; Restriction: must not be followed by `punct`.
//! domain_punct_cont ::= '.' | '_'
//! ; Restriction: must not be followed by `char_ref`.
//! url_ampt_cont ::= '&'
//!
//! ; Restriction: a counter `balance = 0` is increased for every `(`, and decreased for every `)`.
//! ; Restriction: `)` must not be `paren_at_end`.
//! path ::= 1*(url_ampt_cont | path_punctuation_cont | '(' | ')' | byte - eof - eol - space_or_tab)
//! ; Restriction: must not be followed by `punct`.
//! path_punctuation_cont ::= trailing_punctuation - '<'
//! ; Restriction: must be followed by `punct` and `balance` must be less than `0`.
//! paren_at_end ::= ')'
//!
//! label_segment ::= label_dash_underscore_cont | ascii_alpha | ascii_digit
//! ; Restriction: if followed by `punct`, the whole email autolink is invalid.
//! label_dash_underscore_cont ::= '-' | '_'
//! ; Restriction: must not be followed by `punct`.
//! label_dot_cont ::= '.'
//!
//! punct ::= *trailing_punctuation ( byte - eof - eol - space_or_tab - '<' )
//! char_ref ::= *ascii_alpha ';' path_end
//! trailing_punctuation ::= '!' | '"' | '\'' | ')' | '*' | ',' | '.' | ':' | ';' | '<' | '?' | '_' | '~'
//! ```
//!
//! The grammar for GFM autolink literal is very relaxed: basically anything
//! except for whitespace is allowed after a prefix.
//! To use whitespace characters and otherwise impossible characters, in URLs,
//! you can use percent encoding:
//!
//! ```markdown
//! https://example.com/alpha%20bravo
//! ```
//!
//! Yields:
//!
//! ```html
//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
//! ```
//!
//! There are several cases where incorrect encoding of URLs would, in other
//! languages, result in a parse error.
//! In markdown, there are no errors, and URLs are normalized.
//! In addition, many characters are percent encoded
//! ([`sanitize_uri`][sanitize_uri]).
//! For example:
//!
//! ```markdown
//! www.a👍b%
//! ```
//!
//! Yields:
//!
//! ```html
//! <p><a href="http://www.a%F0%9F%91%8Db%25">www.a👍b%</a></p>
//! ```
//!
//! There is a big difference between how www and protocol literals work
//! compared to how email literals work.
//! The first two are done when parsing, and work like anything else in
//! markdown.
//! But email literals are handled afterwards: when everything is parsed, we
//! look back at the events to figure out if there were email addresses.
//! This particularly affects how they interleave with character escapes and
//! character references.
//!
//! ## HTML
//!
//! GFM autolink literals relate to the `<a>` element in HTML.
//! See [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info.
//! When an email autolink is used, the string `mailto:` is prepended when
//! generating the `href` attribute of the hyperlink.
//! When a www autolink is used, the string `http:` is prepended.
//!
//! ## Recommendation
//!
//! It is recommended to use labels ([label start link][label_start_link],
//! [label end][label_end]), either with a resource or a definition
//! ([definition][]), instead of autolink literals, as those allow relative
//! URLs and descriptive text to explain the URL in prose.
//!
//! ## Bugs
//!
//! GitHub’s own algorithm to parse autolink literals contains three bugs.
//! A smaller bug is left unfixed in this project for consistency.
//! Two main bugs are not present in this project.
//! The issues relating to autolink literals are:
//!
//! * [GFM autolink extension (`www.`, `https?://` parts): links don’t work when after bracket](https://github.com/github/cmark-gfm/issues/278)\
//! fixed here ✅
//! * [GFM autolink extension (`www.` part): uppercase does not match on issues/PRs/comments](https://github.com/github/cmark-gfm/issues/280)\
//! fixed here ✅
//! * [GFM autolink extension (`www.` part): the word `www` matches](https://github.com/github/cmark-gfm/issues/279)\
//! present here for consistency
//!
//! ## Tokens
//!
//! * [`GfmAutolinkLiteralProtocol`][Name::GfmAutolinkLiteralProtocol]
//! * [`GfmAutolinkLiteralWww`][Name::GfmAutolinkLiteralWww]
//! * [`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail]
//!
//! ## References
//!
//! * [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal)
//! * [*§ 6.9 Autolinks (extension)* in `GFM`](https://github.github.com/gfm/#autolinks-extension-)
//!
//! [text]: crate::construct::text
//! [definition]: crate::construct::definition
//! [attention]: crate::construct::attention
//! [label_start_link]: crate::construct::label_start_link
//! [label_end]: crate::construct::label_end
//! [sanitize_uri]: crate::util::sanitize_uri
//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
use crate::event::{Event, Kind, Name};
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use crate::util::{
classify_character::{classify_opt, Kind as CharacterKind},
slice::{char_after_index, Position, Slice},
};
use alloc::vec::Vec;
/// Start of a protocol (`http://`, `https://`) autolink literal.
///
/// Only matches when the construct is turned on, the current byte is `h` or
/// `H`, and the previous byte is not an ASCII letter.
///
/// ```markdown
/// > | https://example.com/a?b#c
///     ^
/// ```
pub fn protocol_start(tokenizer: &mut Tokenizer) -> State {
    let enabled = tokenizer
        .parse_state
        .options
        .constructs
        .gfm_autolink_literal;
    let at_h = matches!(tokenizer.current, Some(b'H' | b'h'));
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L214>.
    let after_letter = matches!(tokenizer.previous, Some(b'A'..=b'Z' | b'a'..=b'z'));

    if !enabled || !at_h || after_letter {
        return State::Nok;
    }

    tokenizer.enter(Name::GfmAutolinkLiteralProtocol);
    // Register the follow-up states: what to do after the whole literal, and
    // the domain that has to come after the prefix.
    tokenizer.attempt(
        State::Next(StateName::GfmAutolinkLiteralProtocolAfter),
        State::Nok,
    );
    tokenizer.attempt(
        State::Next(StateName::GfmAutolinkLiteralDomainInside),
        State::Nok,
    );
    // Remember where the prefix starts, so its size and name can be checked.
    tokenizer.tokenize_state.start = tokenizer.point.index;
    State::Retry(StateName::GfmAutolinkLiteralProtocolPrefixInside)
}
/// After a protocol autolink literal: close the token.
///
/// ```markdown
/// > | https://example.com/a?b#c
///                              ^
/// ```
pub fn protocol_after(tokenizer: &mut Tokenizer) -> State {
    // Everything matched: exit the token that `protocol_start` entered.
    tokenizer.exit(Name::GfmAutolinkLiteralProtocol);
    State::Ok
}
/// In the protocol name (`http` or `https`, in any case), up to the `:`.
///
/// ```markdown
/// > | https://example.com/a?b#c
///     ^^^^^
/// ```
pub fn protocol_prefix_inside(tokenizer: &mut Tokenizer) -> State {
    let current = tokenizer.current;

    // Keep eating letters while the name is not longer than `https`
    // (`5` is the size of `https`).
    if matches!(current, Some(b'A'..=b'Z' | b'a'..=b'z'))
        && tokenizer.point.index - tokenizer.tokenize_state.start < 5
    {
        tokenizer.consume();
        return State::Next(StateName::GfmAutolinkLiteralProtocolPrefixInside);
    }

    // The name is over, one way or another: take where it began and reset
    // the shared slot.
    let name_start = tokenizer.tokenize_state.start;
    tokenizer.tokenize_state.start = 0;

    if current != Some(b':') {
        return State::Nok;
    }

    let name = Slice::from_indices(
        tokenizer.parse_state.bytes,
        name_start,
        tokenizer.point.index,
    )
    .as_str()
    .to_ascii_lowercase();

    if matches!(name.as_str(), "http" | "https") {
        tokenizer.consume();
        State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside)
    } else {
        State::Nok
    }
}
/// In the two slashes after the protocol name and colon.
///
/// Uses `tokenize_state.size` to count the slashes seen so far, and resets it
/// on every exit from this state.
///
/// ```markdown
/// > | https://example.com/a?b#c
///           ^^
/// ```
pub fn protocol_slashes_inside(tokenizer: &mut Tokenizer) -> State {
    if tokenizer.current != Some(b'/') {
        tokenizer.tokenize_state.size = 0;
        return State::Nok;
    }

    tokenizer.consume();

    if tokenizer.tokenize_state.size == 0 {
        // That was the first slash: expect one more.
        tokenizer.tokenize_state.size = 1;
        State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside)
    } else {
        // That was the second slash: done.
        tokenizer.tokenize_state.size = 0;
        State::Ok
    }
}
/// Start of a www autolink literal.
///
/// Only matches when the construct is turned on, the current byte is `w` or
/// `W`, and the previous byte is eof, whitespace, or one of
/// `(`, `*`, `_`, `[`, `]`, `~`.
///
/// ```markdown
/// > | www.example.com/a?b#c
///     ^
/// ```
pub fn www_start(tokenizer: &mut Tokenizer) -> State {
    let enabled = tokenizer
        .parse_state
        .options
        .constructs
        .gfm_autolink_literal;
    let at_w = matches!(tokenizer.current, Some(b'W' | b'w'));
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>.
    let valid_before = matches!(
        tokenizer.previous,
        None | Some(b'\t' | b'\n' | b' ' | b'(' | b'*' | b'_' | b'[' | b']' | b'~')
    );

    if !(enabled && at_w && valid_before) {
        return State::Nok;
    }

    tokenizer.enter(Name::GfmAutolinkLiteralWww);
    tokenizer.attempt(
        State::Next(StateName::GfmAutolinkLiteralWwwAfter),
        State::Nok,
    );
    // Note: we *check*, so we can discard the `www.` we parsed.
    // If it worked, we consider it as a part of the domain.
    tokenizer.check(
        State::Next(StateName::GfmAutolinkLiteralDomainInside),
        State::Nok,
    );
    State::Retry(StateName::GfmAutolinkLiteralWwwPrefixInside)
}
/// After a www autolink literal: close the token.
///
/// ```markdown
/// > | www.example.com/a?b#c
///                          ^
/// ```
pub fn www_after(tokenizer: &mut Tokenizer) -> State {
    // Everything matched: exit the token that `www_start` entered.
    tokenizer.exit(Name::GfmAutolinkLiteralWww);
    State::Ok
}
/// In the www prefix: exactly three `w`s (in any case) and then a dot.
///
/// Uses `tokenize_state.size` to count the `w`s seen so far, and resets it
/// when the prefix is done or turns out to be invalid.
///
/// ```markdown
/// > | www.example.com
///     ^^^^
/// ```
pub fn www_prefix_inside(tokenizer: &mut Tokenizer) -> State {
    let seen = tokenizer.tokenize_state.size;

    match tokenizer.current {
        // Another `w`, and we still need one.
        Some(b'W' | b'w') if seen < 3 => {
            tokenizer.tokenize_state.size = seen + 1;
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralWwwPrefixInside)
        }
        // The dot, right after the third `w`.
        Some(b'.') if seen == 3 => {
            tokenizer.tokenize_state.size = 0;
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralWwwPrefixAfter)
        }
        // Anything else: not a www prefix.
        _ => {
            tokenizer.tokenize_state.size = 0;
            State::Nok
        }
    }
}
/// After the www prefix (`www.`).
///
/// ```markdown
/// > | www.example.com
///         ^
/// ```
pub fn www_prefix_after(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Nothing after the dot: no link.
        None => State::Nok,
        // If there is *anything*, we can link.
        Some(_) => State::Ok,
    }
}
/// In the domain.
///
/// ```markdown
/// > | https://example.com/a
///             ^^^^^^^^^^^
/// ```
pub fn domain_inside(tokenizer: &mut Tokenizer) -> State {
    let current = tokenizer.current;

    // A dot or underscore is potential trailing punctuation: look ahead to
    // see whether it (and possibly more trailing markers) is followed by an
    // end.
    // If so, the domain is over; if not, the marker is part of the domain.
    if matches!(current, Some(b'.' | b'_')) {
        tokenizer.check(
            State::Next(StateName::GfmAutolinkLiteralDomainAfter),
            State::Next(StateName::GfmAutolinkLiteralDomainAtPunctuation),
        );
        return State::Retry(StateName::GfmAutolinkLiteralTrail);
    }

    // Dashes and continuation bytes are fine.
    if matches!(current, Some(b'-' | 0x80..=0xBF)) {
        tokenizer.consume();
        return State::Next(StateName::GfmAutolinkLiteralDomainInside);
    }

    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
    let kind = byte_to_kind(
        tokenizer.parse_state.bytes,
        tokenizer.point.index,
        current,
    );

    if kind == CharacterKind::Other {
        // A regular character: remember that the domain is not empty.
        tokenizer.tokenize_state.seen = true;
        tokenizer.consume();
        State::Next(StateName::GfmAutolinkLiteralDomainInside)
    } else {
        // Whitespace, eof, or other punctuation: the domain is over.
        State::Retry(StateName::GfmAutolinkLiteralDomainAfter)
    }
}
/// In the domain, at a `.` or `_` that turned out not to be trailing.
///
/// Tracks underscores per segment: `tokenize_state.marker` holds `_` if the
/// current (last) segment has one, `tokenize_state.marker_b` holds the same
/// for the segment before it.
///
/// ```markdown
/// > | https://example.com
///                    ^
/// ```
pub fn domain_at_punctuation(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // There is an underscore in the last segment of the domain.
        Some(b'_') => {
            tokenizer.tokenize_state.marker = b'_';
        }
        // Otherwise, it’s a `.`: a new segment starts, so what was the last
        // segment becomes the penultimate one.
        _ => {
            tokenizer.tokenize_state.marker_b = tokenizer.tokenize_state.marker;
            tokenizer.tokenize_state.marker = 0;
        }
    }

    tokenizer.consume();
    State::Next(StateName::GfmAutolinkLiteralDomainInside)
}
/// After the domain: validate it, reset the domain state, and move on to the
/// path.
///
/// ```markdown
/// > | https://example.com/a
///                        ^
/// ```
pub fn domain_after(tokenizer: &mut Tokenizer) -> State {
    let state = &mut tokenizer.tokenize_state;

    // No underscores allowed in last two segments.
    let has_underscore = state.marker_b == b'_' || state.marker == b'_';
    // At least one character must be seen.
    // Note: GH says a dot is needed too, but that’s not true:
    // <https://github.com/github/cmark-gfm/issues/279>
    let is_empty = !state.seen;

    // Clean up, whether the domain was valid or not.
    state.seen = false;
    state.marker = 0;
    state.marker_b = 0;

    if has_underscore || is_empty {
        State::Nok
    } else {
        State::Retry(StateName::GfmAutolinkLiteralPathInside)
    }
}
/// In the path.
///
/// Uses `tokenize_state.size` to count opening parens; closing parens are
/// counted in `tokenize_state.size_b` by `path_at_punctuation`.
///
/// ```markdown
/// > | https://example.com/a
///                        ^^
/// ```
pub fn path_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Count opening parens.
        Some(b'(') => {
            tokenizer.tokenize_state.size += 1;
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralPathInside)
        }
        // Continuation bytes are fine, we’ve already checked the first one.
        Some(0x80..=0xBF) => {
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralPathInside)
        }
        // Potential trailing punctuation: look ahead to see whether this
        // marker, optionally followed by more trailing markers, is followed
        // by an end.
        Some(
            marker @ (b'!' | b'"' | b'&' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';'
            | b'<' | b'?' | b']' | b'_' | b'~'),
        ) => {
            // A closing paren that is at the end still belongs to the path
            // if we saw less closing parens than opening parens.
            let balances = marker == b')'
                && tokenizer.tokenize_state.size_b < tokenizer.tokenize_state.size;
            let when_trailing = if balances {
                StateName::GfmAutolinkLiteralPathAtPunctuation
            } else {
                StateName::GfmAutolinkLiteralPathAfter
            };

            tokenizer.check(
                State::Next(when_trailing),
                State::Next(StateName::GfmAutolinkLiteralPathAtPunctuation),
            );
            State::Retry(StateName::GfmAutolinkLiteralTrail)
        }
        current => {
            // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
            let kind = byte_to_kind(
                tokenizer.parse_state.bytes,
                tokenizer.point.index,
                current,
            );

            if kind == CharacterKind::Whitespace {
                // Whitespace (or eof) ends the path.
                State::Retry(StateName::GfmAutolinkLiteralPathAfter)
            } else {
                tokenizer.consume();
                State::Next(StateName::GfmAutolinkLiteralPathInside)
            }
        }
    }
}
/// In the path, at punctuation that turned out not to be trailing (or at a
/// closing paren that balances an earlier opening paren).
///
/// ```markdown
/// > | https://example.com/a"b
///                          ^
/// ```
pub fn path_at_punctuation(tokenizer: &mut Tokenizer) -> State {
    // Count closing parens.
    if matches!(tokenizer.current, Some(b')')) {
        tokenizer.tokenize_state.size_b += 1;
    }

    // The marker is part of the path.
    tokenizer.consume();
    State::Next(StateName::GfmAutolinkLiteralPathInside)
}
/// At the end of the path: reset the paren counters.
///
/// ```markdown
/// > | https://example.com/asd(qwe).
///                                 ^
/// ```
pub fn path_after(tokenizer: &mut Tokenizer) -> State {
    // `size` counted opening parens, `size_b` closing ones.
    tokenizer.tokenize_state.size_b = 0;
    tokenizer.tokenize_state.size = 0;
    State::Ok
}
/// In the trail of a domain or path.
///
/// Results in `Ok` if only trailing punctuation is left before an end
/// (whitespace, eof, `<`, or `]` before a potential resource or reference),
/// and in `Nok` if something else follows, meaning the URL continues.
///
/// ```markdown
/// > | https://example.com").
///                        ^
/// ```
pub fn trail(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // `<` is an end.
        Some(b'<') => State::Ok,
        // `&` followed by one or more alphabeticals and then a `;`, is
        // as a whole considered as trailing punctuation.
        // In all other cases, it is considered as continuation of the URL.
        Some(b'&') => {
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralTrailCharRefStart)
        }
        // Needed because we allow literals after `[`, as we fix:
        // <https://github.com/github/cmark-gfm/issues/278>.
        // Check that it is not followed by `(` or `[`.
        Some(b']') => {
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralTrailBracketAfter)
        }
        // Regular trailing punctuation.
        Some(
            b'!' | b'"' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';' | b'?' | b'_' | b'~',
        ) => {
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralTrail)
        }
        current => {
            let kind = byte_to_kind(
                tokenizer.parse_state.bytes,
                tokenizer.point.index,
                current,
            );

            // Whitespace is the end of the URL, anything else is continuation.
            if kind == CharacterKind::Whitespace {
                State::Ok
            } else {
                State::Nok
            }
        }
    }
}
/// In the trail, after `]`.
///
/// > 👉 **Note**: this deviates from `cmark-gfm` to fix a bug.
/// > See end of <https://github.com/github/cmark-gfm/issues/278> for more.
///
/// ```markdown
/// > | https://example.com](
///                         ^
/// ```
pub fn trail_bracket_after(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Whitespace or something that could start a resource or reference
        // is the end.
        None | Some(b'\t' | b'\n' | b' ' | b'(' | b'[') => State::Ok,
        // Switch back to trail otherwise.
        _ => State::Retry(StateName::GfmAutolinkLiteralTrail),
    }
}
/// In a character-reference-like trail, after `&`.
///
/// At least one ASCII letter is needed for this to look like a reference.
///
/// ```markdown
/// > | https://example.com&amp;).
///                         ^
/// ```
pub fn trail_char_ref_start(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(byte) if byte.is_ascii_alphabetic() => {
            State::Retry(StateName::GfmAutolinkLiteralTrailCharRefInside)
        }
        _ => State::Nok,
    }
}
/// In a character-reference-like trail: ASCII letters, closed by `;`.
///
/// ```markdown
/// > | https://example.com&amp;).
///                         ^^^^
/// ```
pub fn trail_char_ref_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Switch back to trail if this is well-formed.
        Some(b';') => {
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralTrail)
        }
        // More of the name.
        Some(byte) if byte.is_ascii_alphabetic() => {
            tokenizer.consume();
            State::Next(StateName::GfmAutolinkLiteralTrailCharRefInside)
        }
        // Not a reference after all.
        _ => State::Nok,
    }
}
/// Resolve: postprocess text to find email autolink literals.
///
/// Walks all events and, for every `Data` that is not inside a `Link`, looks
/// for `@` bytes.
/// Around each `@`, the local part is found by looking back
/// (`peek_bytes_atext`) and the domain by looking forward
/// (`peek_bytes_email_domain`, `peek_bytes_truncate`).
/// When both match, the `Data` enter/exit pair is replaced with
/// `GfmAutolinkLiteralEmail` events, with `Data` events for the text before,
/// between, and after the links.
///
/// Looking back never goes further than the end of the previous link found in
/// the same `Data`, so links cannot overlap.
/// For example, in `a@b.c+d@e.f`, the first link is `a@b.c`; without that
/// lower bound, looking back from the second `@` would walk across `+d` and
/// into `b.c`, which is already linked.
pub fn resolve(tokenizer: &mut Tokenizer) {
    tokenizer.map.consume(&mut tokenizer.events);

    let mut index = 0;
    // How deep we are in links: email literals do not form inside links.
    let mut links = 0;

    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.kind == Kind::Enter {
            if event.name == Name::Link {
                links += 1;
            }
        } else {
            if event.name == Name::Data && links == 0 {
                let slice = Slice::from_position(
                    tokenizer.parse_state.bytes,
                    &Position::from_exit_event(&tokenizer.events, index),
                );
                let bytes = slice.bytes;
                let mut byte_index = 0;
                let mut replace = Vec::new();
                // Point of the matching enter event: where the data starts.
                let mut point = tokenizer.events[index - 1].point.clone();
                let start_index = point.index;
                // End (relative to `bytes`) of the last link we found, or
                // `0` if there is none yet.
                let mut min = 0;

                while byte_index < bytes.len() {
                    if bytes[byte_index] == b'@' {
                        // `(start, end)` of the link, relative to `bytes`;
                        // an `end` of `0` means no link.
                        let mut range = (0, 0);

                        // Look back, but not into the previous link: only
                        // hand over the bytes from `min` on, and translate
                        // the result back.
                        if let Some(start) =
                            peek_bytes_atext(&bytes[min..], byte_index - min)
                        {
                            let start = start + min;

                            if let Some(end) = peek_bytes_email_domain(bytes, byte_index + 1) {
                                let end = peek_bytes_truncate(bytes, start, end);
                                range = (start, end);
                            }
                        }

                        if range.1 != 0 {
                            byte_index = range.1;

                            // If there is something between the last link
                            // (or the start) and this link.
                            if min != range.0 {
                                replace.push(Event {
                                    kind: Kind::Enter,
                                    name: Name::Data,
                                    point: point.clone(),
                                    link: None,
                                });
                                point = point
                                    .shift_to(tokenizer.parse_state.bytes, start_index + range.0);
                                replace.push(Event {
                                    kind: Kind::Exit,
                                    name: Name::Data,
                                    point: point.clone(),
                                    link: None,
                                });
                            }

                            // Add the link.
                            replace.push(Event {
                                kind: Kind::Enter,
                                name: Name::GfmAutolinkLiteralEmail,
                                point: point.clone(),
                                link: None,
                            });
                            point =
                                point.shift_to(tokenizer.parse_state.bytes, start_index + range.1);
                            replace.push(Event {
                                kind: Kind::Exit,
                                name: Name::GfmAutolinkLiteralEmail,
                                point: point.clone(),
                                link: None,
                            });
                            min = range.1;
                        }
                    }

                    byte_index += 1;
                }

                // If there was a link, and we have more bytes left.
                if min != 0 && min < bytes.len() {
                    replace.push(Event {
                        kind: Kind::Enter,
                        name: Name::Data,
                        point: point.clone(),
                        link: None,
                    });
                    replace.push(Event {
                        kind: Kind::Exit,
                        name: Name::Data,
                        point: event.point.clone(),
                        link: None,
                    });
                }

                // If there were links: swap the data enter and exit for the
                // new events.
                if !replace.is_empty() {
                    tokenizer.map.add(index - 1, 2, replace);
                }
            }

            if event.name == Name::Link {
                links -= 1;
            }
        }

        index += 1;
    }
}
// To do: add `xmpp`, `mailto` support.
/// Move back past atext (the part of an email address before `@`).
///
/// Moving back is only used when post processing text: so for the email address
/// algorithm.
///
/// Given the index of the `@` (`end`), results in the index where the atext
/// starts, or in `None` if there is no atext or if it is preceded by a slash.
///
/// ```markdown
/// > | a contact@example.org b
///              ^-- from
///       ^-- to
/// ```
fn peek_bytes_atext(bytes: &[u8], end: usize) -> Option<usize> {
    // Take simplified atext.
    // See `email_atext` in `autolink.rs` for a similar algorithm.
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L301>.
    let size = bytes[..end]
        .iter()
        .rev()
        .take_while(|&&byte| {
            matches!(byte, b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z')
        })
        .count();
    let from = end - size;

    // Do not allow a slash “inside” atext.
    // The reference code is a bit weird, but that’s what it results in.
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L307>.
    // Other than slash, every preceding character is allowed.
    let after_slash = from > 0 && bytes[from - 1] == b'/';

    if size == 0 || after_slash {
        None
    } else {
        Some(from)
    }
}
/// Move past an email domain (the part of an email address after `@`).
///
/// Peeking like this is only used when post processing text: so for the email
/// address algorithm.
///
/// Given the index right after the `@` (`start`), results in the index right
/// after the domain, or in `None` if there is no valid domain.
///
/// ```markdown
/// > | a contact@example.org b
///               ^-- from
///                          ^-- to
/// ```
fn peek_bytes_email_domain(bytes: &[u8], start: usize) -> Option<usize> {
    let mut seen_dot = false;
    let mut to = start;

    // Move past “domain”.
    // The reference code is a bit overly complex as it handles the `@`, of which there may be just one.
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L318>
    while let Some(&byte) = bytes.get(to) {
        let fits = match byte {
            // `-` and `_`.
            b'-' | b'_' => true,
            // Dot, but only when followed by alphanumerical (not `-` or `_`).
            b'.' => {
                let continues = bytes
                    .get(to + 1)
                    .map_or(false, u8::is_ascii_alphanumeric);
                seen_dot |= continues;
                continues
            }
            // Alphanumerical.
            _ => byte.is_ascii_alphanumeric(),
        };

        if !fits {
            break;
        }

        to += 1;
    }

    // Domain must not be empty, must include a dot, and must end in alphabetical or `.`.
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L332>.
    let valid_end = to > start && matches!(bytes[to - 1], b'.' | b'A'..=b'Z' | b'a'..=b'z');

    if seen_dot && valid_end {
        Some(to)
    } else {
        None
    }
}
/// Move back past punctuation.
///
/// Moving back is only used when post processing text: so for the email address
/// algorithm.
///
/// This is much more complex than needed, because GH allows a lot of
/// punctuation in the protocol and www algorithms.
/// However, those aren’t implemented like the email algo.
///
/// Given a potential link between `start` and `end`, results in the index
/// where the link actually ends:
///
/// * a `<` always ends the link
/// * trailing punctuation, including things that look like character
///   references (`&amp;`), is not part of the link
/// * …except for closing parens in that trailing punctuation that balance
///   opening parens in the link: those, and everything before them, are part
///   of the link
///
/// ```markdown
/// > | a contact@example.org”) b
///                            ^-- from
///                          ^-- to
/// ```
fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {
    let mut index = start;

    // A `<` ends the link.
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L42>
    while index < end {
        if bytes[index] == b'<' {
            end = index;
            break;
        }
        index += 1;
    }

    let mut split = end;

    // Move before trailing punctuation.
    while split > start {
        match bytes[split - 1] {
            b'!' | b'"' | b'&' | b'\'' | b')' | b',' | b'.' | b':' | b'<' | b'>' | b'?' | b']'
            | b'}' => {}
            // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L61>.
            // Note: we can’t move across actual references, because those have been parsed already.
            b';' => {
                let mut new_split = split - 1;
                // Move back past alphabeticals.
                while new_split > start && matches!(bytes[new_split - 1], b'A'..=b'Z' | b'a'..=b'z')
                {
                    new_split -= 1;
                }

                // Nonempty character reference: move before its `&`.
                if new_split > start && bytes[new_split - 1] == b'&' && new_split < split - 1 {
                    split = new_split - 1;
                    continue;
                }

                // Otherwise it’s just a `;`.
            }
            _ => break,
        }
        split -= 1;
    }

    // If there was trailing punctuation, try to balance parens.
    if split != end {
        let mut open = 0;
        let mut close = 0;
        let mut paren_index = start;

        // Count parens in `url` (not in trail).
        while paren_index < split {
            match bytes[paren_index] {
                b'(' => open += 1,
                b')' => close += 1,
                _ => {}
            }
            paren_index += 1;
        }

        let mut trail_index = split;

        // If there are more opening than closing parens, try to balance them
        // from the trail.
        while open > close && trail_index < end {
            if bytes[trail_index] == b')' {
                // The link runs up to *and including* this paren: split
                // after it, not before it.
                split = trail_index + 1;
                close += 1;
            }
            trail_index += 1;
        }
    }

    split
}
/// Classify a byte (or `char`).
///
/// `byte` is the byte at `index` in `bytes`, or `None` at eof.
/// Eof is treated as whitespace.
/// ASCII whitespace, punctuation, and alphanumericals are classified directly
/// from the byte; for anything else, the `char` at `index` is classified.
fn byte_to_kind(bytes: &[u8], index: usize, byte: Option<u8>) -> CharacterKind {
    match byte {
        None => CharacterKind::Whitespace,
        Some(byte) if byte.is_ascii_whitespace() => CharacterKind::Whitespace,
        Some(byte) if byte.is_ascii_punctuation() => CharacterKind::Punctuation,
        Some(byte) if byte.is_ascii_alphanumeric() => CharacterKind::Other,
        // Otherwise: either an ASCII control, or part of a non-ASCII `char`.
        Some(_) => classify_opt(char_after_index(bytes, index)),
    }
}