aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 12:37:25 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 12:37:25 +0200
commitefdf90959f78d1582da312bffbefaabb79f264b7 (patch)
treea36c7dfa72ec5cadfdb296d94aed2d06a871b701 /src/construct
parent17f4eec55ad0a5f74aedbcff6c2f0119ad52e584 (diff)
downloadmarkdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.gz
markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.bz2
markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.zip
Add autolinks
Diffstat (limited to '')
-rw-r--r--src/construct/autolink.rs327
-rw-r--r--src/construct/mod.rs1
2 files changed, 328 insertions, 0 deletions
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
new file mode 100644
index 0000000..24f2c20
--- /dev/null
+++ b/src/construct/autolink.rs
@@ -0,0 +1,327 @@
+//! Autolinks are a construct that occurs in the [text][] content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! autolink ::= '<' ( url | email ) '>'
+//!
+//! url ::= ascii_alphabetic 0*31( '+' | '-' | '.' | ascii_alphanumeric ) ':' *( code - ascii_control - '\r' - '\n' - ' ')
+//! email ::= 1*ascii_atext '@' domain *('.' domain)
+//! ; Restriction: up to (including) 63 characters are allowed in each domain.
+//! domain ::= ascii_alphanumeric *( ascii_alphanumeric | '-' ascii_alphanumeric )
+//! ascii_atext ::= ascii_alphanumeric | '#' .. '\'' | '*' | '+' | '-' | '.' | '/' | '=' | '?' | '^' .. '`' | '{' .. '~'
+//! ```
+//!
+//! Autolinks relate to the `<a>` element in HTML.
+//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info.
+//! When an email autolink is used (so, without a protocol), the string
+//! `mailto:` is prepended before the email, when generating the `href`
+//! attribute of the hyperlink.
+//!
+//! The maximum allowed size of a scheme is `31` (inclusive), which is defined
+//! in [`AUTOLINK_SCHEME_SIZE_MAX`][autolink_scheme_size_max].
+//! The maximum allowed size of a domain is `63` (inclusive), which is defined
+//! in [`AUTOLINK_DOMAIN_SIZE_MAX`][autolink_domain_size_max].
+//!
+//! The grammar for autolinks is quite strict and requires ASCII to be used
+//! (without, for example, spaces).
+//! To use non-ascii characters and otherwise impossible characters, in URLs,
+//! you can use percent encoding:
+//!
+//! ```markdown
+//! <https://example.com/alpha%20bravo>
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
+//! ```
+//!
+//! Interestingly, there are a couple of things that are valid autolinks in
+//! markdown but in HTML would be valid tags, such as `<svg:rect>` and
+//! `<xml:lang/>`.
+//! However, because CommonMark employs a naïve HTML parsing algorithm, those
+//! are not considered HTML.
+//!
+//! While CommonMark restricts links from occurring in other links in the case
+//! of bracketed links, this restriction is not in place for autolinks inside
+//! links:
+//!
+//! ```markdown
+//! [<https://example.com>](#)
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="#"><a href="https://example.com">https://example.com</a></a></p>
+//! ```
+//!
+//! The generated output, in this case, is invalid according to HTML.
+//! When a browser sees that markup, it will instead parse it as:
+//!
+//! ```html
+//! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>
+//! ```
+//!
+//! ## References
+//!
+//! * [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js)
+//! * [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks)
+//!
+//! [text]: crate::content::text
+//! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX
+//! [autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX
+//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
+//!
+//! <!-- To do: link to `encode` -->
+
+use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of an autolink.
+///
+/// Only `<` opens an autolink: the `Autolink` token is entered, the `<` is
+/// wrapped in an `AutolinkMarker` token, and an `AutolinkProtocol` token is
+/// opened before parsing continues in [`open`].
+/// Any other code is rejected.
+///
+/// ```markdown
+/// a|<https://example.com>b
+/// a|<user@example.com>b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char('<') = code {
+        tokenizer.enter(TokenType::Autolink);
+
+        // The opening marker gets a token of its own.
+        tokenizer.enter(TokenType::AutolinkMarker);
+        tokenizer.consume(code);
+        tokenizer.exit(TokenType::AutolinkMarker);
+
+        // Opened as a protocol for now; `email_label` retypes it when the
+        // value turns out to be an email address.
+        tokenizer.enter(TokenType::AutolinkProtocol);
+        (State::Fn(Box::new(open)), None)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// After `<`, before the protocol.
+///
+/// An ASCII letter may start either a scheme or an email name, so it is
+/// consumed and the decision is deferred to [`scheme_or_email_atext`].
+/// Any other atext character can only start an email name and is handed to
+/// [`email_atext`].
+/// Everything else is rejected.
+///
+/// ```markdown
+/// a<|https://example.com>b
+/// a<|user@example.com>b
+/// ```
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char(ch) = code {
+        if ch.is_ascii_alphabetic() {
+            tokenizer.consume(code);
+            return (State::Fn(Box::new(scheme_or_email_atext)), None);
+        }
+
+        if is_ascii_atext(ch) {
+            return email_atext(tokenizer, code);
+        }
+    }
+
+    (State::Nok, None)
+}
+
+/// After the first character of the protocol or email name.
+///
+/// If the current code could still belong to either a scheme or an email
+/// name, parsing continues in [`scheme_inside_or_email_atext`] with a size
+/// of `1` (the character already consumed by [`open`]).
+/// Otherwise only an email is possible and [`email_atext`] takes over.
+///
+/// ```markdown
+/// a<h|ttps://example.com>b
+/// a<u|ser@example.com>b
+/// ```
+pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // Characters that are valid in a scheme as well as in email atext.
+    let ambiguous = matches!(
+        code,
+        Code::Char(ch) if ch.is_ascii_alphanumeric() || matches!(ch, '+' | '-' | '.')
+    );
+
+    if !ambiguous {
+        return email_atext(tokenizer, code);
+    }
+
+    scheme_inside_or_email_atext(tokenizer, code, 1)
+}
+
+/// Inside an ambiguous protocol or email name.
+///
+/// `size` is the number of characters consumed so far that could belong to a
+/// scheme.
+/// A `:` settles the ambiguity in favor of a URL.
+/// A scheme character is consumed as long as `size` is below
+/// `AUTOLINK_SCHEME_SIZE_MAX`.
+/// Anything else (including a scheme character once the limit is reached) is
+/// handed to [`email_atext`].
+///
+/// ```markdown
+/// a<ht|tps://example.com>b
+/// a<us|er@example.com>b
+/// ```
+pub fn scheme_inside_or_email_atext(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    size: usize,
+) -> StateFnResult {
+    match code {
+        // End of the scheme: this is a URL.
+        Code::Char(':') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(url_inside)), None)
+        }
+        // Still ambiguous: valid in a scheme and in email atext.
+        Code::Char(ch)
+            if size < AUTOLINK_SCHEME_SIZE_MAX
+                && (ch.is_ascii_alphanumeric() || matches!(ch, '+' | '-' | '.')) =>
+        {
+            tokenizer.consume(code);
+            let next_size = size + 1;
+            (
+                State::Fn(Box::new(move |t, c| {
+                    scheme_inside_or_email_atext(t, c, next_size)
+                })),
+                None,
+            )
+        }
+        // No longer a possible scheme; try it as an email.
+        _ => email_atext(tokenizer, code),
+    }
+}
+
+/// Inside a URL, after the protocol.
+///
+/// A `>` closes the `AutolinkProtocol` token and hands over to [`end`].
+/// The end of the content, line endings, virtual spaces, ASCII control
+/// characters, spaces, and `<` are not allowed in the URL and make the
+/// autolink fail.
+/// Every other character is consumed as part of the URL.
+///
+/// ```markdown
+/// a<https:|//example.com>b
+/// ```
+pub fn url_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.exit(TokenType::AutolinkProtocol);
+            end(tokenizer, code)
+        }
+        Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
+        // CommonMark defines the part after the scheme as zero or more
+        // characters other than ASCII control characters, space, `<`, and
+        // `>`, so `<` must be rejected here as well.
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char(' ' | '<') => (State::Nok, None),
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(url_inside)), None)
+        }
+    }
+}
+
+/// Inside email atext.
+///
+/// An `@` ends the name and starts the first domain label, whose size
+/// counter begins at `0`.
+/// Any atext character is consumed and keeps the tokenizer in this state.
+/// Everything else is rejected.
+///
+/// ```markdown
+/// a<user.na|me@example.com>b
+/// ```
+pub fn email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char(ch) = code {
+        if ch == '@' {
+            tokenizer.consume(code);
+            return (
+                State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+                None,
+            );
+        }
+
+        if is_ascii_atext(ch) {
+            tokenizer.consume(code);
+            return (State::Fn(Box::new(email_atext)), None);
+        }
+    }
+
+    (State::Nok, None)
+}
+
+/// After an at-sign or a dot in the label.
+///
+/// A label has to start with an ASCII alphanumeric, which is passed on to
+/// [`email_value`] together with the current label `size`.
+/// Everything else is rejected.
+///
+/// ```markdown
+/// a<user.name@|example.com>b
+/// a<user.name@example.|com>b
+/// ```
+pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    if matches!(code, Code::Char(ch) if ch.is_ascii_alphanumeric()) {
+        email_value(tokenizer, code, size)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// In the label, where `.` and `>` are allowed.
+///
+/// A `.` starts a new label, resetting the size counter to `0`.
+/// A `>` finishes the email: the token that was opened as `AutolinkProtocol`
+/// is closed and both of its events are retyped to `AutolinkEmail` before
+/// [`end`] handles the closing marker.
+/// Everything else is checked by [`email_value`].
+///
+/// ```markdown
+/// a<user.name@ex|ample.com>b
+/// ```
+pub fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    if let Code::Char('.') = code {
+        tokenizer.consume(code);
+        return (
+            State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+            None,
+        );
+    }
+
+    if let Code::Char('>') = code {
+        // The most recent event is presumably the `AutolinkProtocol` enter
+        // from `start`; the `exit` below is expected to append its
+        // counterpart right after it.
+        let enter_index = tokenizer.events.len() - 1;
+        let exit_index = enter_index + 1;
+        tokenizer.exit(TokenType::AutolinkProtocol);
+
+        // It was an email all along: change the token type of both events.
+        tokenizer.events[enter_index].token_type = TokenType::AutolinkEmail;
+        tokenizer.events[exit_index].token_type = TokenType::AutolinkEmail;
+        return end(tokenizer, code);
+    }
+
+    email_value(tokenizer, code, size)
+}
+
+/// In the label, where `.` and `>` are *not* allowed.
+///
+/// Though, this is also used in `email_label` to parse other values.
+///
+/// A `-` or an ASCII alphanumeric is consumed as long as `size` is below
+/// `AUTOLINK_DOMAIN_SIZE_MAX`.
+/// After a `-` the tokenizer stays in this state, so a label cannot end on a
+/// dash; after an alphanumeric it moves to [`email_label`].
+/// Everything else is rejected.
+///
+/// ```markdown
+/// a<user.name@ex-|ample.com>b
+/// ```
+pub fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        Code::Char(ch)
+            if size < AUTOLINK_DOMAIN_SIZE_MAX && (ch == '-' || ch.is_ascii_alphanumeric()) =>
+        {
+            tokenizer.consume(code);
+            let next_size = size + 1;
+
+            if ch == '-' {
+                // A dash must be followed by more of the label.
+                (
+                    State::Fn(Box::new(move |t, c| email_value(t, c, next_size))),
+                    None,
+                )
+            } else {
+                (
+                    State::Fn(Box::new(move |t, c| email_label(t, c, next_size))),
+                    None,
+                )
+            }
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// At the `>`.
+///
+/// Wraps the closing `>` in an `AutolinkMarker` token, closes the `Autolink`
+/// token, and reports success.
+///
+/// # Panics
+///
+/// Panics when called with anything other than `>`; the states that call
+/// this function only do so at a `>`.
+///
+/// ```markdown
+/// a<https://example.com|>b
+/// a<user@example.com|>b
+/// ```
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if !matches!(code, Code::Char('>')) {
+        unreachable!("expected `>` at `end`");
+    }
+
+    tokenizer.enter(TokenType::AutolinkMarker);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::AutolinkMarker);
+    tokenizer.exit(TokenType::Autolink);
+    (State::Ok, None)
+}
+
+/// Check whether the character represents an ASCII atext.
+///
+/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), a
+/// character in the inclusive range U+0023 NUMBER SIGN (`#`) to U+0027
+/// APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D
+/// HYPHEN-MINUS (`-`), U+002E FULL STOP (`.`), U+002F SOLIDUS (`/`), U+003D
+/// EQUALS SIGN (`=`), U+003F QUESTION MARK (`?`), U+005E CIRCUMFLEX ACCENT
+/// (`^`), U+005F LOW LINE (`_`), U+0060 GRAVE ACCENT (`` ` ``), or a
+/// character in the inclusive range U+007B LEFT CURLY BRACKET (`{`) to
+/// U+007E TILDE (`~`).
+///
+/// See:
+/// **\[RFC5322]**:
+/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+/// P. Resnick.
+/// IETF.
+///
+/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+fn is_ascii_atext(x: char) -> bool {
+    // Letters and digits first, then the allowed punctuation.
+    x.is_ascii_alphanumeric()
+        || matches!(
+            x,
+            '#'..='\'' | '*' | '+' | '-' | '.' | '/' | '=' | '?' | '^' | '_' | '`' | '{'..='~'
+        )
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index d671db6..0bc8746 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -1,5 +1,6 @@
//! Constructs found in markdown.
+pub mod autolink;
pub mod blank_line;
pub mod character_escape;
pub mod character_reference;