//! ```
//!
//! Yields:
//!
//! ```html
//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
//! ```
//!
//! There are several cases where incorrect encoding of URLs would, in other
//! languages, result in a parse error.
//! In markdown, there are no errors, and URLs are normalized.
//! In addition, many characters are percent encoded
//! ([`sanitize_uri`][sanitize_uri]).
//! For example:
//!
//! ```markdown
//! <https://a👍b%>
//! ```
//!
//! Yields:
//!
//! ```html
//! <p><a href="https://a%F0%9F%91%8Db%25">https://a👍b%</a></p>
//! ```
//!
//! Interestingly, there are a couple of things that are valid autolinks in
//! markdown but in HTML would be valid tags, such as `<svg:rect>` and
//! `<xml:lang/>`.
//! However, because `CommonMark` employs a naïve HTML parsing algorithm, those
//! are not considered HTML.
//!
//! While `CommonMark` restricts links from occurring in other links in the
//! case of labels (see [label end][label_end]), this restriction is not in
//! place for autolinks inside labels:
//!
//! ```markdown
//! [<https://example.com>](#)
//! ```
//!
//! Yields:
//!
//! ```html
//! <p><a href="#"><a href="https://example.com">https://example.com</a></a></p>
//! ```
//!
//! The generated output, in this case, is invalid according to HTML.
//! When a browser sees that markup, it will instead parse it as:
//!
//! ```html
//! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>
//! ```
//!
//! ## HTML
//!
//! Autolinks relate to the `<a>` element in HTML.
//! See [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info.
//! When an email autolink is used (so, without a protocol), the string
//! `mailto:` is prepended before the email, when generating the `href`
//! attribute of the hyperlink.
//!
//! ## Recommendation
//!
//! It is recommended to use labels ([label start link][label_start_link],
//! [label end][label_end]), either with a resource or a definition
//! ([definition][]), instead of autolinks, as those allow more characters in
//! URLs, and allow relative URLs and `www.` URLs.
//! They also allow for descriptive text to explain the URL in prose.
//!
//! ## Tokens
//!
//! * [`Autolink`][Name::Autolink]
//! * [`AutolinkEmail`][Name::AutolinkEmail]
//! * [`AutolinkMarker`][Name::AutolinkMarker]
//! * [`AutolinkProtocol`][Name::AutolinkProtocol]
//!
//! ## References
//!
//! * [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js)
//! * [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks)
//!
//! [text]: crate::construct::text
//! [definition]: crate::construct::definition
//! [label_start_link]: crate::construct::label_start_link
//! [label_end]: crate::construct::label_end
//! [autolink_scheme_size_max]: crate::util::constant::AUTOLINK_SCHEME_SIZE_MAX
//! [autolink_domain_size_max]: crate::util::constant::AUTOLINK_DOMAIN_SIZE_MAX
//! [sanitize_uri]: crate::util::sanitize_uri
//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
use crate::event::Name;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use crate::util::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
/// Start of an autolink.
///
/// Succeeds only when the autolink construct is enabled and the current byte
/// is `<`.
/// The opening marker is consumed and a protocol token is opened; if the
/// content later turns out to be an email address, `email_label` renames
/// that token.
///
/// ```markdown
/// > | a<https://example.com>b
///      ^
/// > | a<user@example.com>b
///      ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    let enabled = tokenizer.parse_state.options.constructs.autolink;

    if !enabled || tokenizer.current != Some(b'<') {
        return State::Nok;
    }

    // Whole construct.
    tokenizer.enter(Name::Autolink);

    // Opening `<`.
    tokenizer.enter(Name::AutolinkMarker);
    tokenizer.consume();
    tokenizer.exit(Name::AutolinkMarker);

    // Content between the markers.
    tokenizer.enter(Name::AutolinkProtocol);
    State::Next(StateName::AutolinkOpen)
}
/// After `<`, at protocol or atext.
///
/// An ASCII letter may begin either a scheme or an email local part, so it is
/// consumed and the decision is deferred to the next state.
/// An `@` is rejected outright: an email address needs at least one atext
/// byte before the at sign, so `<@example.com>` is not an autolink.
/// Anything else is retried as email atext.
///
/// ```markdown
/// > | a<https://example.com>b
///       ^
/// > | a<user@example.com>b
///       ^
/// ```
pub fn open(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // ASCII alphabetic.
        Some(b'A'..=b'Z' | b'a'..=b'z') => {
            tokenizer.consume();
            State::Next(StateName::AutolinkSchemeOrEmailAtext)
        }
        // `email_atext` accepts `@` at any position, including the first, so
        // an empty local part has to be refused here.
        Some(b'@') => State::Nok,
        _ => State::Retry(StateName::AutolinkEmailAtext),
    }
}
/// At second byte of protocol or atext.
///
/// When the byte could continue a scheme, the size counter is set to `1` and
/// the same byte is handed to the ambiguous scheme/atext state.
/// Otherwise the byte is retried as email atext.
///
/// ```markdown
/// > | a<https://example.com>b
///        ^
/// > | a<user@example.com>b
///        ^
/// ```
pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State {
    // ASCII alphanumeric and `+`, `-`, and `.`.
    let could_be_scheme = matches!(
        tokenizer.current,
        Some(byte) if byte.is_ascii_alphanumeric() || matches!(byte, b'+' | b'-' | b'.')
    );

    if could_be_scheme {
        // The letter consumed in `open` already counts toward the scheme.
        tokenizer.tokenize_state.size = 1;
        State::Retry(StateName::AutolinkSchemeInsideOrEmailAtext)
    } else {
        State::Retry(StateName::AutolinkEmailAtext)
    }
}
/// In ambiguous protocol or atext.
///
/// A `:` ends the scheme and moves on to the URL.
/// Scheme bytes are consumed while fewer than `AUTOLINK_SCHEME_SIZE_MAX`
/// have been counted.
/// Any other byte, or a scheme byte once the limit is reached, resets the
/// counter and is retried as email atext.
///
/// ```markdown
/// > | a<https://example.com>b
///        ^
/// > | a<user@example.com>b
///        ^
/// ```
pub fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer) -> State {
    let current = tokenizer.current;

    if current == Some(b':') {
        tokenizer.consume();
        tokenizer.tokenize_state.size = 0;
        return State::Next(StateName::AutolinkUrlInside);
    }

    // ASCII alphanumeric and `+`, `-`, and `.`.
    let scheme_byte = matches!(
        current,
        Some(byte) if byte.is_ascii_alphanumeric() || matches!(byte, b'+' | b'-' | b'.')
    );

    if scheme_byte && tokenizer.tokenize_state.size < AUTOLINK_SCHEME_SIZE_MAX {
        tokenizer.consume();
        tokenizer.tokenize_state.size += 1;
        return State::Next(StateName::AutolinkSchemeInsideOrEmailAtext);
    }

    // Not a scheme after all: clear the counter before trying email atext.
    tokenizer.tokenize_state.size = 0;
    State::Retry(StateName::AutolinkEmailAtext)
}
/// After protocol, in URL.
///
/// Consumes bytes until the closing `>`, which finishes the autolink.
/// End of input, ASCII control bytes, space, and `<` make the construct fail.
///
/// ```markdown
/// > | a<https://example.com>b
///             ^
/// ```
pub fn url_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Any byte that is not an ASCII control, space, `<`, or `>`.
        Some(byte) if !byte.is_ascii_control() && !matches!(byte, b' ' | b'<' | b'>') => {
            tokenizer.consume();
            State::Next(StateName::AutolinkUrlInside)
        }
        // Closing marker: close the content, emit the marker, close the
        // whole autolink.
        Some(b'>') => {
            tokenizer.exit(Name::AutolinkProtocol);
            tokenizer.enter(Name::AutolinkMarker);
            tokenizer.consume();
            tokenizer.exit(Name::AutolinkMarker);
            tokenizer.exit(Name::Autolink);
            State::Ok
        }
        // End of input, ASCII control, space, or `<`.
        _ => State::Nok,
    }
}
/// In email atext.
///
/// Consumes atext bytes of the local part; an `@` switches to the domain.
/// Any other byte makes the construct fail.
///
/// ```markdown
/// > | a<user.name@example.com>b
///             ^
/// ```
pub fn email_atext(tokenizer: &mut Tokenizer) -> State {
    // Both accepted cases consume the byte; they differ only in the state
    // that handles the byte after it.
    let next = match tokenizer.current {
        Some(b'@') => StateName::AutolinkEmailAtSignOrDot,
        // ASCII atext, written as byte ranges:
        //
        // * `#`..=`'`: `#`, `$`, `%`, `&`, and `'`
        // * `*` and `+`
        // * `-`..=`9`: `-`, `.`, `/`, and the ASCII digits
        // * `=` and `?`
        // * `A`..=`Z`: the ASCII uppercase letters
        // * `^`..=`~`: `^`, `_`, `` ` ``, the ASCII lowercase letters, `{`,
        //   `|`, `}`, and `~`
        //
        // See:
        // **\[RFC5322]**:
        // [Internet Message Format](https://tools.ietf.org/html/rfc5322).
        // P. Resnick.
        // IETF.
        //
        // NOTE(review): `!` is not in this set, although the email pattern
        // in `CommonMark` lists it — confirm whether that is intended.
        Some(
            b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~',
        ) => StateName::AutolinkEmailAtext,
        _ => return State::Nok,
    };

    tokenizer.consume();
    State::Next(next)
}
/// In label, after at-sign or dot.
///
/// A label has to begin with an ASCII alphanumeric; that byte is not
/// consumed here but handed to `email_value`.
/// Anything else makes the construct fail.
///
/// ```markdown
/// > | a<user.name@example.com>b
///                 ^       ^
/// ```
pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(byte) if byte.is_ascii_alphanumeric() => State::Retry(StateName::AutolinkEmailValue),
        _ => State::Nok,
    }
}
/// In label, where `.` and `>` are allowed.
///
/// A `.` starts the next label and resets the label size counter.
/// A `>` finishes the autolink as an email autolink.
/// Every other byte is handed to `email_value`.
///
/// ```markdown
/// > | a<user.name@ex-ample.com>b
///                  ^
/// ```
pub fn email_label(tokenizer: &mut Tokenizer) -> State {
    let current = tokenizer.current;

    if current == Some(b'.') {
        tokenizer.consume();
        tokenizer.tokenize_state.size = 0;
        return State::Next(StateName::AutolinkEmailAtSignOrDot);
    }

    if current != Some(b'>') {
        return State::Retry(StateName::AutolinkEmailValue);
    }

    // The content was opened as a protocol in `start`; now that it is known
    // to be an email address, rename the two events around it.
    // Presumably `exit` appends exactly one event at `exit_at`, with the
    // matching enter directly before it — TODO confirm against `Tokenizer`.
    let exit_at = tokenizer.events.len();
    tokenizer.exit(Name::AutolinkProtocol);
    tokenizer.events[exit_at - 1].name = Name::AutolinkEmail;
    tokenizer.events[exit_at].name = Name::AutolinkEmail;

    // Closing `>`.
    tokenizer.enter(Name::AutolinkMarker);
    tokenizer.consume();
    tokenizer.exit(Name::AutolinkMarker);

    tokenizer.exit(Name::Autolink);
    tokenizer.tokenize_state.size = 0;
    State::Ok
}
/// In label, where `.` and `>` are *not* allowed.
///
/// Though, this is also used in `email_label` to parse other values.
///
/// Accepts an ASCII alphanumeric or `-` while fewer than
/// `AUTOLINK_DOMAIN_SIZE_MAX` bytes of the label have been counted.
/// After a `-` this state is used again, so that a label cannot end in a
/// dash; after an alphanumeric, `email_label` takes over.
/// Any other situation resets the counter and fails.
///
/// ```markdown
/// > | a<user.name@ex-ample.com>b
///                    ^
/// ```
pub fn email_value(tokenizer: &mut Tokenizer) -> State {
    let has_room = tokenizer.tokenize_state.size < AUTOLINK_DOMAIN_SIZE_MAX;

    let next = match tokenizer.current {
        // A dash must be followed by another value byte.
        Some(b'-') if has_room => StateName::AutolinkEmailValue,
        // After an alphanumeric, `.` and `>` become acceptable.
        Some(byte) if has_room && byte.is_ascii_alphanumeric() => StateName::AutolinkEmailLabel,
        _ => {
            tokenizer.tokenize_state.size = 0;
            return State::Nok;
        }
    };

    tokenizer.tokenize_state.size += 1;
    tokenizer.consume();
    State::Next(next)
}