Add autolinks

author: Titus Wormer <tituswormer@gmail.com> 2022-06-13 12:37:25 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-06-13 12:37:25 +0200
commit: efdf90959f78d1582da312bffbefaabb79f264b7 (patch)
tree: a36c7dfa72ec5cadfdb296d94aed2d06a871b701 /src/construct/autolink.rs
parent: 17f4eec55ad0a5f74aedbcff6c2f0119ad52e584 (diff)
download: markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.gz
markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.bz2
markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.zip
1 files changed, 327 insertions, 0 deletions
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
new file mode 100644
index 0000000..24f2c20
--- /dev/null
+++ b/src/construct/autolink.rs
@@ -0,0 +1,327 @@
+//! Autolinks are a construct that occurs in the [text][] content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! autolink ::= '<' ( url | email ) '>'
+//!
+//! url ::= ascii_alphabetic 1*31( '+' | '-' | '.' | ascii_alphanumeric ) ':' *( code - ascii_control - '\r' - '\n' - ' ')
+//! email ::= 1*ascii_atext '@' domain *('.' domain)
+//! ; Restriction: up to (including) 63 characters are allowed in each domain.
+//! domain ::= ascii_alphanumeric *( ascii_alphanumeric | '-' ascii_alphanumeric )
+//! ascii_atext ::= ascii_alphanumeric | '#' .. '\'' | '*' | '+' | '-' | '.' | '/' | '=' | '?' | '^' .. '`' | '{' .. '~'
+//! ```
+//!
+//! Autolinks relate to the `<a>` element in HTML.
+//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info.
+//! When an email autolink is used (so, without a protocol), the string
+//! `mailto:` is prepended before the email, when generating the `href`
+//! attribute of the hyperlink.
+//!
+//! The maximum allowed size of a scheme is `31` (inclusive), which is defined
+//! in [`AUTOLINK_SCHEME_SIZE_MAX`][autolink_scheme_size_max].
+//! The maximum allowed size of a domain is `63` (inclusive), which is defined
+//! in [`AUTOLINK_DOMAIN_SIZE_MAX`][autolink_domain_size_max].
+//!
+//! The grammar for autolinks is quite strict and requires ASCII to be used
+//! (without, for example, spaces).
+//! To use non-ASCII characters and otherwise impossible characters in URLs,
+//! you can use percent encoding:
+//!
+//! ```markdown
+//! <https://example.com/alpha%20bravo>
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
+//! ```
+//!
+//! Interestingly, there are a couple of things that are valid autolinks in
+//! markdown but in HTML would be valid tags, such as `<svg:rect>` and
+//! `<xml:lang/>`.
+//! However, because CommonMark employs a naïve HTML parsing algorithm, those
+//! are not considered HTML.
+//!
+//! While CommonMark restricts links from occurring in other links in the case
+//! of bracketed links, this restriction is not in place for autolinks inside
+//! links:
+//!
+//! ```markdown
+//! [<https://example.com>](#)
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="#"><a href="https://example.com">https://example.com</a></a></p>
+//! ```
+//!
+//! The generated output, in this case, is invalid according to HTML.
+//! When a browser sees that markup, it will instead parse it as:
+//!
+//! ```html
+//! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>
+//! ```
+//!
+//! ## References
+//!
+//! *   [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js)
+//! *   [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks)
+//!
+//! [text]: crate::content::text
+//! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX
+//! [autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX
+//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
+//!
+//! <!-- To do: link to `encode` -->
+
+use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Entry state of an autolink: only `<` can begin one.
+///
+/// ```markdown
+/// a|<https://example.com>b
+/// a|<user@example.com>b
+/// ```
+///
+/// On `<`, the `Autolink` token is entered, the `<` itself is consumed
+/// inside an `AutolinkMarker` token, an `AutolinkProtocol` token is entered,
+/// and tokenizing continues in [`open`].
+/// Any other code results in `State::Nok`.
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if !matches!(code, Code::Char('<')) {
+        return (State::Nok, None);
+    }
+
+    tokenizer.enter(TokenType::Autolink);
+    tokenizer.enter(TokenType::AutolinkMarker);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::AutolinkMarker);
+    tokenizer.enter(TokenType::AutolinkProtocol);
+    (State::Fn(Box::new(open)), None)
+}
+
+/// Directly after the opening `<`, at the first character of the value.
+///
+/// ```markdown
+/// a<|https://example.com>b
+/// a<|user@example.com>b
+/// ```
+///
+/// An ASCII letter is consumed and may start either a scheme or an email
+/// name, so it continues in [`scheme_or_email_atext`].
+/// Any other atext character can only belong to an email name and is handed
+/// (unconsumed) to [`email_atext`].
+/// Everything else results in `State::Nok`.
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char(first) = code {
+        if first.is_ascii_alphabetic() {
+            tokenizer.consume(code);
+            return (State::Fn(Box::new(scheme_or_email_atext)), None);
+        }
+
+        if is_ascii_atext(first) {
+            return email_atext(tokenizer, code);
+        }
+    }
+
+    (State::Nok, None)
+}
+
+/// At the second character, after one ASCII letter was consumed.
+///
+/// ```markdown
+/// a<h|ttps://example.com>b
+/// a<u|ser@example.com>b
+/// ```
+///
+/// `+`, `-`, `.`, and ASCII alphanumerics are valid in both a scheme and an
+/// email name, so they keep the ambiguity alive and are passed to
+/// [`scheme_inside_or_email_atext`] with a size of `1` (the letter already
+/// consumed).
+/// Every other code is passed to [`email_atext`].
+pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(ch) if matches!(ch, '+' | '-' | '.') || ch.is_ascii_alphanumeric() => {
+            scheme_inside_or_email_atext(tokenizer, code, 1)
+        }
+        _ => email_atext(tokenizer, code),
+    }
+}
+
+/// Inside a value that can still be either a scheme or an email name.
+///
+/// ```markdown
+/// a<ht|tps://example.com>b
+/// a<us|er@example.com>b
+/// ```
+///
+/// `size` is the number of characters consumed so far.
+///
+/// *   `:` settles the question: it is consumed and the rest is tokenized
+///     as a URL in [`url_inside`]
+/// *   `+`, `-`, `.`, and ASCII alphanumerics are consumed and stay in this
+///     state, as long as `size` is below `AUTOLINK_SCHEME_SIZE_MAX`
+/// *   everything else (including the characters above once the limit is
+///     hit) is passed to [`email_atext`]
+pub fn scheme_inside_or_email_atext(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    size: usize,
+) -> StateFnResult {
+    match code {
+        Code::Char(':') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(url_inside)), None)
+        }
+        Code::Char(ch)
+            if size < AUTOLINK_SCHEME_SIZE_MAX
+                && (matches!(ch, '+' | '-' | '.') || ch.is_ascii_alphanumeric()) =>
+        {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |t, c| {
+                    scheme_inside_or_email_atext(t, c, size + 1)
+                })),
+                None,
+            )
+        }
+        _ => email_atext(tokenizer, code),
+    }
+}
+
+/// Inside a URL, after the protocol.
+///
+/// ```markdown
+/// a<https:|//example.com>b
+/// ```
+///
+/// *   `>` closes the `AutolinkProtocol` token and finishes in [`end`]
+/// *   ASCII control characters, end of file, line endings, virtual spaces,
+///     spaces, and `<` are not allowed in the URL and result in `State::Nok`
+/// *   every other character is consumed and stays in this state
+pub fn url_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.exit(TokenType::AutolinkProtocol);
+            end(tokenizer, code)
+        }
+        Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
+        // CommonMark only allows characters *other than* ASCII control
+        // characters, space, `<`, and `>` after the scheme, so `<` must be
+        // rejected as well (`<https://a<b>` is not an autolink).
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char(' ' | '<') => (State::Nok, None),
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(url_inside)), None)
+        }
+    }
+}
+
+/// Inside the name part of an email address (before the `@`).
+///
+/// ```markdown
+/// a<user.na|me@example.com>b
+/// ```
+///
+/// `@` is consumed and starts the first domain label (with a size of `0`)
+/// in [`email_at_sign_or_dot`].
+/// Any atext character is consumed and stays in this state.
+/// Everything else results in `State::Nok`.
+pub fn email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    let at_sign = matches!(code, Code::Char('@'));
+    let atext = matches!(code, Code::Char(ch) if is_ascii_atext(ch));
+
+    if !at_sign && !atext {
+        return (State::Nok, None);
+    }
+
+    tokenizer.consume(code);
+
+    if at_sign {
+        (
+            State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+            None,
+        )
+    } else {
+        (State::Fn(Box::new(email_atext)), None)
+    }
+}
+
+/// At the start of a domain label, right after `@` or `.`.
+///
+/// ```markdown
+/// a<user.name@|example.com>b
+/// a<user.name@example.|com>b
+/// ```
+///
+/// A label has to start with an ASCII alphanumeric, which is passed
+/// (unconsumed, together with `size`) to [`email_value`].
+/// Everything else results in `State::Nok`.
+pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    let starts_label = matches!(code, Code::Char(ch) if ch.is_ascii_alphanumeric());
+
+    if starts_label {
+        email_value(tokenizer, code, size)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// In the label, where `.` and `>` are allowed.
+///
+/// ```markdown
+/// a<user.name@ex|ample.com>b
+/// ```
+///
+/// `size` is the number of characters consumed so far in the current label.
+///
+/// *   `.` is consumed and starts a new label (size `0`) in
+///     [`email_at_sign_or_dot`]
+/// *   `>` closes the token opened as `AutolinkProtocol`, relabels its two
+///     events as `AutolinkEmail`, and finishes in [`end`]
+/// *   everything else is passed on, with `size` unchanged, to
+///     [`email_value`]
+pub fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        Code::Char('.') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+                None,
+            )
+        }
+        Code::Char('>') => {
+            // Both indices are taken *before* `exit` is called.
+            // `tail_index` is one past the current last event — presumably
+            // where `exit` pushes its event (assumes `exit` adds exactly one
+            // event; TODO confirm against `Tokenizer::exit`).
+            let tail_index = tokenizer.events.len();
+            // `head_index` is the current last event — presumably the
+            // `enter` of `AutolinkProtocol` from `start`, which relies on no
+            // other events having been pushed since then.
+            let head_index = tokenizer.events.len() - 1;
+            tokenizer.exit(TokenType::AutolinkProtocol);
+            // Change the token type.
+            tokenizer.events[head_index].token_type = TokenType::AutolinkEmail;
+            tokenizer.events[tail_index].token_type = TokenType::AutolinkEmail;
+            end(tokenizer, code)
+        }
+        _ => email_value(tokenizer, code, size),
+    }
+}
+
+/// In the label, where `.` and `>` are *not* allowed.
+///
+/// This state is also where `email_label` sends every code that is neither
+/// `.` nor `>`.
+///
+/// ```markdown
+/// a<user.name@ex-|ample.com>b
+/// ```
+///
+/// `size` is the number of characters consumed so far in the current label;
+/// nothing is accepted once it reaches `AUTOLINK_DOMAIN_SIZE_MAX`.
+///
+/// *   `-` is consumed and stays in this state, so a label cannot end on a
+///     dash
+/// *   an ASCII alphanumeric is consumed and continues in [`email_label`]
+/// *   everything else results in `State::Nok`
+pub fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    let within_limit = size < AUTOLINK_DOMAIN_SIZE_MAX;
+
+    match code {
+        Code::Char('-') if within_limit => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |t, c| email_value(t, c, size + 1))),
+                None,
+            )
+        }
+        Code::Char(ch) if within_limit && ch.is_ascii_alphanumeric() => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |t, c| email_label(t, c, size + 1))),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// At the closing `>`.
+///
+/// ```markdown
+/// a<https://example.com|>b
+/// a<user@example.com|>b
+/// ```
+///
+/// Consumes the `>` inside an `AutolinkMarker` token, closes the `Autolink`
+/// token, and reports `State::Ok`.
+///
+/// # Panics
+///
+/// Panics when called with anything other than `>`: the other states in
+/// this module only call it at a `>`.
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if !matches!(code, Code::Char('>')) {
+        unreachable!("expected `>` at `end`");
+    }
+
+    tokenizer.enter(TokenType::AutolinkMarker);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::AutolinkMarker);
+    tokenizer.exit(TokenType::Autolink);
+    (State::Ok, None)
+}
+
+/// Check whether the character is an ASCII atext, as used in the name part
+/// of an email autolink.
+///
+/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or one
+/// of the following punctuation characters:
+///
+/// *   U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`)
+/// *   U+002A ASTERISK (`*`) and U+002B PLUS SIGN (`+`)
+/// *   U+002D DASH (`-`) to U+002F SLASH (`/`), which includes U+002E DOT
+///     (`.`)
+/// *   U+003D EQUALS TO (`=`) and U+003F QUESTION MARK (`?`)
+/// *   U+005E CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``)
+/// *   U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE (`~`)
+///
+/// See:
+/// **\[RFC5322]**:
+/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+/// P. Resnick.
+/// IETF.
+///
+/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+fn is_ascii_atext(x: char) -> bool {
+    let punctuation = matches!(
+        x,
+        '#'..='\'' | '*' | '+' | '-'..='/' | '=' | '?' | '^'..='`' | '{'..='~'
+    );
+
+    punctuation || x.is_ascii_alphanumeric()
+}
author	Titus Wormer <tituswormer@gmail.com>	2022-06-13 12:37:25 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-06-13 12:37:25 +0200
commit	efdf90959f78d1582da312bffbefaabb79f264b7 (patch)
tree	a36c7dfa72ec5cadfdb296d94aed2d06a871b701 /src/construct/autolink.rs
parent	17f4eec55ad0a5f74aedbcff6c2f0119ad52e584 (diff)
download	markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.gz markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.bz2 markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.zip