author    | Titus Wormer <tituswormer@gmail.com> | 2022-06-13 12:37:25 +0200
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-13 12:37:25 +0200
commit    | efdf90959f78d1582da312bffbefaabb79f264b7 (patch)
tree      | a36c7dfa72ec5cadfdb296d94aed2d06a871b701 /src
parent    | 17f4eec55ad0a5f74aedbcff6c2f0119ad52e584 (diff)
download  | markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.gz
          | markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.bz2
          | markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.zip
Add autolinks
Diffstat

-rw-r--r-- | src/compiler.rs           |  30
-rw-r--r-- | src/constant.rs           |  17
-rw-r--r-- | src/construct/autolink.rs | 327
-rw-r--r-- | src/construct/mod.rs      |   1
-rw-r--r-- | src/content/text.rs       |   9
-rw-r--r-- | src/tokenizer.rs          |   5
6 files changed, 383 insertions, 6 deletions
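
The behavior this commit adds can be exercised roughly as in the sketch below. It assumes the crate exposes a `micromark(&str) -> String` entry point; that API is not part of this diff, so the function name and the exact output shape are illustrative only.

```rust
// Sketch only: `micromark::micromark` as the public entry point is an
// assumption, since the crate's API surface is not shown in this diff.
use micromark::micromark;

fn main() {
    // URL autolink: the protocol slice becomes both the `href` and the label,
    // per the `TokenType::AutolinkProtocol` exit handler in `compiler.rs`.
    println!("{}", micromark("<https://example.com>"));
    // Expected to contain: <a href="https://example.com">https://example.com</a>

    // Email autolink: `mailto:` is prepended when generating the `href`,
    // per the `TokenType::AutolinkEmail` exit handler in `compiler.rs`.
    println!("{}", micromark("<user@example.com>"));
    // Expected to contain: <a href="mailto:user@example.com">user@example.com</a>
}
```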
diff --git a/src/compiler.rs b/src/compiler.rs
index 48983b6..df26f1b 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -89,7 +89,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 | TokenType::CharacterReferenceMarkerNumeric
                 | TokenType::CharacterReferenceMarkerHexadecimal
                 | TokenType::CharacterReferenceMarkerSemi
-                | TokenType::CharacterReferenceValue => {}
+                | TokenType::CharacterReferenceValue
+                | TokenType::Autolink
+                | TokenType::AutolinkMarker
+                | TokenType::AutolinkProtocol
+                | TokenType::AutolinkEmail => {}
                 #[allow(unreachable_patterns)]
                 _ => {
                     unreachable!("unhandled `enter` of TokenType {:?}", token_type)
@@ -108,7 +112,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 | TokenType::CharacterEscape
                 | TokenType::CharacterEscapeMarker
                 | TokenType::CharacterReference
-                | TokenType::CharacterReferenceMarkerSemi => {}
+                | TokenType::CharacterReferenceMarkerSemi
+                | TokenType::Autolink
+                | TokenType::AutolinkMarker => {}
                 TokenType::HtmlFlow => {
                     ignore_encode = false;
                 }
@@ -229,6 +235,26 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     atx_opening_sequence_size = None;
                     atx_heading_buffer = None;
                 }
+                TokenType::AutolinkProtocol => {
+                    let slice = slice_serialize(codes, &get_span(events, index), false);
+                    let buf = buf_tail_mut(buffers);
+                    // To do: options.allowDangerousProtocol ? undefined : protocolHref
+                    // let url = sanitize_uri(slice);
+                    let url = encode(&slice);
+                    buf.push(format!("<a href=\"{}\">", url));
+                    buf.push(encode(&slice));
+                    buf.push("</a>".to_string());
+                }
+                TokenType::AutolinkEmail => {
+                    let slice = slice_serialize(codes, &get_span(events, index), false);
+                    let buf = buf_tail_mut(buffers);
+                    // To do: options.allowDangerousProtocol ? undefined : protocolHref
+                    // let url = sanitize_uri(slice);
+                    let url = encode(&slice);
+                    buf.push(format!("<a href=\"mailto:{}\">", url));
+                    buf.push(encode(&slice));
+                    buf.push("</a>".to_string());
+                }
                 TokenType::ThematicBreak => {
                     buf_tail_mut(buffers).push("<hr />".to_string());
                 }
diff --git a/src/constant.rs b/src/constant.rs
index 332fdaf..c98c24d 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -27,6 +27,23 @@
 /// [code_indented]: crate::construct::code_indented
 pub const TAB_SIZE: usize = 4;
 
+/// The number of characters allowed in a protocol of an [autolink][].
+///
+/// The protocol part is the `xxx` in `<xxx://example.com>`.
+/// 32 characters is fine, 33 is too many.
+///
+/// [autolink]: crate::construct::autolink
+pub const AUTOLINK_SCHEME_SIZE_MAX: usize = 32;
+
+/// The number of characters allowed in a domain of an email [autolink][].
+///
+/// There can be multiple “domains”.
+/// A domain part is each `xxx` in `<example@xxx.xxx.xxx>`.
+/// 63 characters is fine, 64 is too many.
+///
+/// [autolink]: crate::construct::autolink
+pub const AUTOLINK_DOMAIN_SIZE_MAX: usize = 63;
+
 /// The number of markers needed for a [thematic break][thematic_break] to form.
 ///
 /// Like many things in markdown, the number is `3`.
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
new file mode 100644
index 0000000..24f2c20
--- /dev/null
+++ b/src/construct/autolink.rs
@@ -0,0 +1,327 @@
+//! Autolinks are a construct that occurs in the [text][] content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! autolink ::= '<' ( url | email ) '>'
+//!
+//! url ::= ascii_alphabetic 0*31( '+' '-' '.' ascii_alphanumeric ) ':' *( code - ascii_control - '\r' - '\n' - ' ')
+//! email ::= 1*ascii_atext '@' domain *('.' domain)
+//! ; Restriction: up to (including) 63 character are allowed in each domain.
+//! domain ::= ascii_alphanumeric *( ascii_alphanumeric | '-' ascii_alphanumeric )
+//! ascii_atext ::= ascii_alphanumeric | '#' .. '\'' | '*' | '+' | '-' | '/' | '=' | '?' | '^' .. '`' | '{' .. '~'
+//! ```
+//!
+//! Autolinks relate to the `<a>` element in HTML.
+//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info.
+//! When an email autolink is used (so, without a protocol), the string
+//! `mailto:` is prepended before the email, when generating the `href`
+//! attribute of the hyperlink.
+//!
+//! The maximum allowed size of a scheme is `31` (inclusive), which is defined
+//! in [`AUTOLINK_SCHEME_SIZE_MAX`][autolink_scheme_size_max].
+//! The maximum allowed size of a domain is `63` (inclusive), which is defined
+//! in [`AUTOLINK_DOMAIN_SIZE_MAX`][autolink_domain_size_max].
+//!
+//! The grammar for autolinks is quite strict and requires ASCII to be used
+//! (without, for example, spaces).
+//! To use non-ascii characters and otherwise impossible characters, in URLs,
+//! you can use percent encoding:
+//!
+//! ```markdown
+//! <https://example.com/alpha%20bravo>
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
+//! ```
+//!
+//! Interestingly, there are a couple of things that are valid autolinks in
+//! markdown but in HTML would be valid tags, such as `<svg:rect>` and
+//! `<xml:lang/>`.
+//! However, because CommonMark employs a naïve HTML parsing algorithm, those
+//! are not considered HTML.
+//!
+//! While CommonMark restricts links from occurring in other links in the case
+//! of bracketed links, this restriction is not in place for autolinks inside
+//! autolinks:
+//!
+//! ```markdown
+//! [<https://example.com>](#)
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="#"><a href="https://example.com">https://example.com</a></a></p>
+//! ```
+//!
+//! The generated output, in this case, is invalid according to HTML.
+//! When a browser sees that markup, it will instead parse it as:
+//!
+//! ```html
+//! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>
+//! ```
+//!
+//! ## References
+//!
+//! * [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js)
+//! * [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks)
+//!
+//! [text]: crate::content::text
+//! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX
+//! [autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX
+//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
+//!
+//! <!-- To do: link to `encode` -->
+
+use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of an autolink.
+///
+/// ```markdown
+/// a|<https://example.com>b
+/// a|<user@example.com>b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('<') => {
+            tokenizer.enter(TokenType::Autolink);
+            tokenizer.enter(TokenType::AutolinkMarker);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::AutolinkMarker);
+            tokenizer.enter(TokenType::AutolinkProtocol);
+            (State::Fn(Box::new(open)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<`, before the protocol.
+///
+/// ```markdown
+/// a<|https://example.com>b
+/// a<|user@example.com>b
+/// ```
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(scheme_or_email_atext)), None)
+        }
+        Code::Char(char) if is_ascii_atext(char) => email_atext(tokenizer, code),
+        _ => (State::Nok, None),
+    }
+}
+
+/// After the first character of the protocol or email name.
+///
+/// ```markdown
+/// a<h|ttps://example.com>b
+/// a<u|ser@example.com>b
+/// ```
+pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // Whether this character can be both a protocol and email atext.
+    let unknown = match code {
+        Code::Char('+' | '-' | '.') => true,
+        Code::Char(char) if char.is_ascii_alphanumeric() => true,
+        _ => false,
+    };
+
+    if unknown {
+        scheme_inside_or_email_atext(tokenizer, code, 1)
+    } else {
+        email_atext(tokenizer, code)
+    }
+}
+
+/// Inside an ambiguous protocol or email name.
+///
+/// ```markdown
+/// a<ht|tps://example.com>b
+/// a<us|er@example.com>b
+/// ```
+pub fn scheme_inside_or_email_atext(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    size: usize,
+) -> StateFnResult {
+    if let Code::Char(':') = code {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(url_inside)), None)
+    } else {
+        // Whether this character can be both a protocol and email atext.
+        let unknown = match code {
+            Code::Char('+' | '-' | '.') if size < AUTOLINK_SCHEME_SIZE_MAX => true,
+            Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_SCHEME_SIZE_MAX => {
+                true
+            }
+            _ => false,
+        };
+
+        if unknown {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |t, c| {
+                    scheme_inside_or_email_atext(t, c, size + 1)
+                })),
+                None,
+            )
+        } else {
+            email_atext(tokenizer, code)
+        }
+    }
+}
+
+/// Inside a URL, after the protocol.
+///
+/// ```markdown
+/// a<https:|//example.com>b
+/// ```
+pub fn url_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.exit(TokenType::AutolinkProtocol);
+            end(tokenizer, code)
+        }
+        Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
+        Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ') => {
+            (State::Nok, None)
+        }
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(url_inside)), None)
+        }
+    }
+}
+
+/// Inside email atext.
+///
+/// ```markdown
+/// a<user.na|me@example.com>b
+/// ```
+pub fn email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('@') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+                None,
+            )
+        }
+        Code::Char(char) if is_ascii_atext(char) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(email_atext)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After an at-sign or a dot in the label.
+///
+/// ```markdown
+/// a<user.name@|example.com>b
+/// a<user.name@example.|com>b
+/// ```
+pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        Code::Char(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, code, size),
+        _ => (State::Nok, None),
+    }
+}
+
+/// In the label, where `.` and `>` are allowed.
+///
+/// ```markdown
+/// a<user.name@ex|ample.com>b
+/// ```
+pub fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        Code::Char('.') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+                None,
+            )
+        }
+        Code::Char('>') => {
+            let tail_index = tokenizer.events.len();
+            let head_index = tokenizer.events.len() - 1;
+            tokenizer.exit(TokenType::AutolinkProtocol);
+            // Change the token type.
+            tokenizer.events[head_index].token_type = TokenType::AutolinkEmail;
+            tokenizer.events[tail_index].token_type = TokenType::AutolinkEmail;
+            end(tokenizer, code)
+        }
+        _ => email_value(tokenizer, code, size),
+    }
+}
+
+/// In the label, where `.` and `>` are *not* allowed.
+///
+/// Though, this is also used in `email_label` to parse other values.
+///
+/// ```markdown
+/// a<user.name@ex-|ample.com>b
+/// ```
+pub fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    let ok = match code {
+        Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => true,
+        Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => true,
+        _ => false,
+    };
+
+    if ok {
+        tokenizer.consume(code);
+        let func = if let Code::Char('-') = code {
+            email_value
+        } else {
+            email_label
+        };
+        (State::Fn(Box::new(move |t, c| func(t, c, size + 1))), None)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// At the `>`.
+///
+/// ```markdown
+/// a<https://example.com|>b
+/// a<user@example.com|>b
+/// ```
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.enter(TokenType::AutolinkMarker);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::AutolinkMarker);
+            tokenizer.exit(TokenType::Autolink);
+            (State::Ok, None)
+        }
+        _ => unreachable!("expected `>` at `end`"),
+    }
+}
+
+/// Check whether the character code represents an ASCII atext.
+///
+/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in
+/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
+/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F
+/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E
+/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE
+/// (`{`) to U+007E TILDE (`~`).
+///
+/// See:
+/// **\[RFC5322]**:
+/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+/// P. Resnick.
+/// IETF.
+///
+/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+fn is_ascii_atext(x: char) -> bool {
+    matches!(x, '#'..='\'' | '*' | '+' | '-'..='9' | '=' | '?' | 'A'..='Z' | '^'..='~')
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index d671db6..0bc8746 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -1,5 +1,6 @@
 //! Constructs found in markdown.
 
+pub mod autolink;
 pub mod blank_line;
 pub mod character_escape;
 pub mod character_reference;
diff --git a/src/content/text.rs b/src/content/text.rs
index 2c93b18..a7b40e7 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -5,7 +5,7 @@
 //!
 //! The constructs found in text are:
 //!
-//! * Autolink
+//! * [Autolink][crate::construct::autolink]
 //! * Attention
 //! * HTML (text)
 //! * Hard break escape
@@ -17,7 +17,8 @@
 //! * [Character reference][crate::construct::character_reference]
 
 use crate::construct::{
-    character_escape::start as character_escape, character_reference::start as character_reference,
+    autolink::start as autolink, character_escape::start as character_escape,
+    character_reference::start as character_reference,
 };
 use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 
@@ -33,7 +34,7 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
-        _ => tokenizer.attempt_2(character_reference, character_escape, |ok| {
+        _ => tokenizer.attempt_3(character_reference, character_escape, autolink, |ok| {
             Box::new(if ok { start } else { before_data })
         })(tokenizer, code),
     }
 }
@@ -68,7 +69,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
             (State::Ok, None)
         }
         // To do: somehow get these markers from constructs.
-        Code::Char('&' | '\\') => {
+        Code::Char('&' | '\\' | '<') => {
            tokenizer.exit(TokenType::Data);
            start(tokenizer, code)
        }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 4d235ed..4c1caa4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -20,6 +20,11 @@ use std::collections::HashMap;
 // To do: document each variant.
 #[derive(Debug, Clone, PartialEq)]
 pub enum TokenType {
+    Autolink,
+    AutolinkMarker,
+    AutolinkProtocol,
+    AutolinkEmail,
+
     AtxHeading,
     AtxHeadingSequence,
     AtxHeadingWhitespace,