aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 12:37:25 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-13 12:37:25 +0200
commitefdf90959f78d1582da312bffbefaabb79f264b7 (patch)
treea36c7dfa72ec5cadfdb296d94aed2d06a871b701 /src
parent17f4eec55ad0a5f74aedbcff6c2f0119ad52e584 (diff)
downloadmarkdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.gz
markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.tar.bz2
markdown-rs-efdf90959f78d1582da312bffbefaabb79f264b7.zip
Add autolinks
Diffstat (limited to '')
-rw-r--r--src/compiler.rs30
-rw-r--r--src/constant.rs17
-rw-r--r--src/construct/autolink.rs327
-rw-r--r--src/construct/mod.rs1
-rw-r--r--src/content/text.rs9
-rw-r--r--src/tokenizer.rs5
6 files changed, 383 insertions, 6 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 48983b6..df26f1b 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -89,7 +89,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::CharacterReferenceMarkerNumeric
| TokenType::CharacterReferenceMarkerHexadecimal
| TokenType::CharacterReferenceMarkerSemi
- | TokenType::CharacterReferenceValue => {}
+ | TokenType::CharacterReferenceValue
+ | TokenType::Autolink
+ | TokenType::AutolinkMarker
+ | TokenType::AutolinkProtocol
+ | TokenType::AutolinkEmail => {}
#[allow(unreachable_patterns)]
_ => {
unreachable!("unhandled `enter` of TokenType {:?}", token_type)
@@ -108,7 +112,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::CharacterEscape
| TokenType::CharacterEscapeMarker
| TokenType::CharacterReference
- | TokenType::CharacterReferenceMarkerSemi => {}
+ | TokenType::CharacterReferenceMarkerSemi
+ | TokenType::Autolink
+ | TokenType::AutolinkMarker => {}
TokenType::HtmlFlow => {
ignore_encode = false;
}
@@ -229,6 +235,26 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
atx_opening_sequence_size = None;
atx_heading_buffer = None;
}
+ TokenType::AutolinkProtocol => {
+ let slice = slice_serialize(codes, &get_span(events, index), false);
+ let buf = buf_tail_mut(buffers);
+ // To do: options.allowDangerousProtocol ? undefined : protocolHref
+ // let url = sanitize_uri(slice);
+ let url = encode(&slice);
+ buf.push(format!("<a href=\"{}\">", url));
+ buf.push(encode(&slice));
+ buf.push("</a>".to_string());
+ }
+ TokenType::AutolinkEmail => {
+ let slice = slice_serialize(codes, &get_span(events, index), false);
+ let buf = buf_tail_mut(buffers);
+ // To do: options.allowDangerousProtocol ? undefined : protocolHref
+ // let url = sanitize_uri(slice);
+ let url = encode(&slice);
+ buf.push(format!("<a href=\"mailto:{}\">", url));
+ buf.push(encode(&slice));
+ buf.push("</a>".to_string());
+ }
TokenType::ThematicBreak => {
buf_tail_mut(buffers).push("<hr />".to_string());
}
diff --git a/src/constant.rs b/src/constant.rs
index 332fdaf..c98c24d 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -27,6 +27,23 @@
/// [code_indented]: crate::construct::code_indented
pub const TAB_SIZE: usize = 4;
+/// The number of characters allowed in a protocol of an [autolink][].
+///
+/// The protocol part is the `xxx` in `<xxx://example.com>`.
+/// 32 characters is fine, 33 is too many.
+///
+/// [autolink]: crate::construct::autolink
+pub const AUTOLINK_SCHEME_SIZE_MAX: usize = 32;
+
+/// The number of characters allowed in a domain of an email [autolink][].
+///
+/// There can be multiple “domains”.
+/// A domain part is each `xxx` in `<example@xxx.xxx.xxx>`.
+/// 63 characters is fine, 64 is too many.
+///
+/// [autolink]: crate::construct::autolink
+pub const AUTOLINK_DOMAIN_SIZE_MAX: usize = 63;
+
/// The number of markers needed for a [thematic break][thematic_break] to form.
///
/// Like many things in markdown, the number is `3`.
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
new file mode 100644
index 0000000..24f2c20
--- /dev/null
+++ b/src/construct/autolink.rs
@@ -0,0 +1,327 @@
+//! Autolinks are a construct that occurs in the [text][] content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! autolink ::= '<' ( url | email ) '>'
+//!
+//! url ::= ascii_alphabetic 0*31( '+' | '-' | '.' | ascii_alphanumeric ) ':' *( code - ascii_control - '\r' - '\n' - ' ')
+//! email ::= 1*ascii_atext '@' domain *('.' domain)
+//! ; Restriction: up to (including) 63 characters are allowed in each domain.
+//! domain ::= ascii_alphanumeric *( ascii_alphanumeric | '-' ascii_alphanumeric )
+//! ascii_atext ::= ascii_alphanumeric | '#' .. '\'' | '*' | '+' | '-' .. '9' | '=' | '?' | '^' .. '`' | '{' .. '~'
+//! ```
+//!
+//! Autolinks relate to the `<a>` element in HTML.
+//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info.
+//! When an email autolink is used (so, without a protocol), the string
+//! `mailto:` is prepended before the email, when generating the `href`
+//! attribute of the hyperlink.
+//!
+//! The maximum allowed size of a scheme is `32` (inclusive), which is defined
+//! in [`AUTOLINK_SCHEME_SIZE_MAX`][autolink_scheme_size_max].
+//! The maximum allowed size of a domain is `63` (inclusive), which is defined
+//! in [`AUTOLINK_DOMAIN_SIZE_MAX`][autolink_domain_size_max].
+//!
+//! The grammar for autolinks is quite strict and requires ASCII to be used
+//! (without, for example, spaces).
+//! To use non-ascii characters and otherwise impossible characters, in URLs,
+//! you can use percent encoding:
+//!
+//! ```markdown
+//! <https://example.com/alpha%20bravo>
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
+//! ```
+//!
+//! Interestingly, there are a couple of things that are valid autolinks in
+//! markdown but in HTML would be valid tags, such as `<svg:rect>` and
+//! `<xml:lang/>`.
+//! However, because CommonMark employs a naïve HTML parsing algorithm, those
+//! are not considered HTML.
+//!
+//! While CommonMark restricts links from occurring in other links in the case
+//! of bracketed links, this restriction is not in place for autolinks inside
+//! autolinks:
+//!
+//! ```markdown
+//! [<https://example.com>](#)
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="#"><a href="https://example.com">https://example.com</a></a></p>
+//! ```
+//!
+//! The generated output, in this case, is invalid according to HTML.
+//! When a browser sees that markup, it will instead parse it as:
+//!
+//! ```html
+//! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>
+//! ```
+//!
+//! ## References
+//!
+//! * [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js)
+//! * [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks)
+//!
+//! [text]: crate::content::text
+//! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX
+//! [autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX
+//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
+//!
+//! <!-- To do: link to `encode` -->
+
+use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of an autolink.
+///
+/// ```markdown
+/// a|<https://example.com>b
+/// a|<user@example.com>b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('<') => {
+ tokenizer.enter(TokenType::Autolink);
+ tokenizer.enter(TokenType::AutolinkMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::AutolinkMarker);
+ tokenizer.enter(TokenType::AutolinkProtocol);
+ (State::Fn(Box::new(open)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After `<`, before the protocol.
+///
+/// ```markdown
+/// a<|https://example.com>b
+/// a<|user@example.com>b
+/// ```
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(scheme_or_email_atext)), None)
+ }
+ Code::Char(char) if is_ascii_atext(char) => email_atext(tokenizer, code),
+ _ => (State::Nok, None),
+ }
+}
+
+/// After the first character of the protocol or email name.
+///
+/// ```markdown
+/// a<h|ttps://example.com>b
+/// a<u|ser@example.com>b
+/// ```
+pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // Whether this character can be both a protocol and email atext.
+ let unknown = match code {
+ Code::Char('+' | '-' | '.') => true,
+ Code::Char(char) if char.is_ascii_alphanumeric() => true,
+ _ => false,
+ };
+
+ if unknown {
+ scheme_inside_or_email_atext(tokenizer, code, 1)
+ } else {
+ email_atext(tokenizer, code)
+ }
+}
+
+/// Inside an ambiguous protocol or email name.
+///
+/// ```markdown
+/// a<ht|tps://example.com>b
+/// a<us|er@example.com>b
+/// ```
+pub fn scheme_inside_or_email_atext(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ size: usize,
+) -> StateFnResult {
+ if let Code::Char(':') = code {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(url_inside)), None)
+ } else {
+ // Whether this character can be both a protocol and email atext.
+ let unknown = match code {
+ Code::Char('+' | '-' | '.') if size < AUTOLINK_SCHEME_SIZE_MAX => true,
+ Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_SCHEME_SIZE_MAX => {
+ true
+ }
+ _ => false,
+ };
+
+ if unknown {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| {
+ scheme_inside_or_email_atext(t, c, size + 1)
+ })),
+ None,
+ )
+ } else {
+ email_atext(tokenizer, code)
+ }
+ }
+}
+
+/// Inside a URL, after the protocol.
+///
+/// ```markdown
+/// a<https:|//example.com>b
+/// ```
+pub fn url_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ tokenizer.exit(TokenType::AutolinkProtocol);
+ end(tokenizer, code)
+ }
+ Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
+ Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ') => {
+ (State::Nok, None)
+ }
+ Code::Char(_) => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(url_inside)), None)
+ }
+ }
+}
+
+/// Inside email atext.
+///
+/// ```markdown
+/// a<user.na|me@example.com>b
+/// ```
+pub fn email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('@') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+ None,
+ )
+ }
+ Code::Char(char) if is_ascii_atext(char) => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(email_atext)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After an at-sign or a dot in the label.
+///
+/// ```markdown
+/// a<user.name@|example.com>b
+/// a<user.name@example.|com>b
+/// ```
+pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ match code {
+ Code::Char(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, code, size),
+ _ => (State::Nok, None),
+ }
+}
+
+/// In the label, where `.` and `>` are allowed.
+///
+/// ```markdown
+/// a<user.name@ex|ample.com>b
+/// ```
+pub fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ match code {
+ Code::Char('.') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+ None,
+ )
+ }
+ Code::Char('>') => {
+ let tail_index = tokenizer.events.len();
+ let head_index = tokenizer.events.len() - 1;
+ tokenizer.exit(TokenType::AutolinkProtocol);
+ // Change the token type.
+ tokenizer.events[head_index].token_type = TokenType::AutolinkEmail;
+ tokenizer.events[tail_index].token_type = TokenType::AutolinkEmail;
+ end(tokenizer, code)
+ }
+ _ => email_value(tokenizer, code, size),
+ }
+}
+
+/// In the label, where `.` and `>` are *not* allowed.
+///
+/// Though, this is also used in `email_label` to parse other values.
+///
+/// ```markdown
+/// a<user.name@ex-|ample.com>b
+/// ```
+pub fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ let ok = match code {
+ Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => true,
+ Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => true,
+ _ => false,
+ };
+
+ if ok {
+ tokenizer.consume(code);
+ let func = if let Code::Char('-') = code {
+ email_value
+ } else {
+ email_label
+ };
+ (State::Fn(Box::new(move |t, c| func(t, c, size + 1))), None)
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// At the `>`.
+///
+/// ```markdown
+/// a<https://example.com|>b
+/// a<user@example.com|>b
+/// ```
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ tokenizer.enter(TokenType::AutolinkMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::AutolinkMarker);
+ tokenizer.exit(TokenType::Autolink);
+ (State::Ok, None)
+ }
+ _ => unreachable!("expected `>` at `end`"),
+ }
+}
+
+/// Check whether the character code represents an ASCII atext.
+///
+/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in
+/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
+/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`) to U+0039
+/// DIGIT NINE (`9`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E
+/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE
+/// (`{`) to U+007E TILDE (`~`).
+///
+/// See:
+/// **\[RFC5322]**:
+/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+/// P. Resnick.
+/// IETF.
+///
+/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+fn is_ascii_atext(x: char) -> bool {
+ matches!(x, '#'..='\'' | '*' | '+' | '-'..='9' | '=' | '?' | 'A'..='Z' | '^'..='~')
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index d671db6..0bc8746 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -1,5 +1,6 @@
//! Constructs found in markdown.
+pub mod autolink;
pub mod blank_line;
pub mod character_escape;
pub mod character_reference;
diff --git a/src/content/text.rs b/src/content/text.rs
index 2c93b18..a7b40e7 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -5,7 +5,7 @@
//!
//! The constructs found in text are:
//!
-//! * Autolink
+//! * [Autolink][crate::construct::autolink]
//! * Attention
//! * HTML (text)
//! * Hard break escape
@@ -17,7 +17,8 @@
//! * [Character reference][crate::construct::character_reference]
use crate::construct::{
- character_escape::start as character_escape, character_reference::start as character_reference,
+ autolink::start as autolink, character_escape::start as character_escape,
+ character_reference::start as character_reference,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -33,7 +34,7 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
- _ => tokenizer.attempt_2(character_reference, character_escape, |ok| {
+ _ => tokenizer.attempt_3(character_reference, character_escape, autolink, |ok| {
Box::new(if ok { start } else { before_data })
})(tokenizer, code),
}
@@ -68,7 +69,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
(State::Ok, None)
}
// To do: somehow get these markers from constructs.
- Code::Char('&' | '\\') => {
+ Code::Char('&' | '\\' | '<') => {
tokenizer.exit(TokenType::Data);
start(tokenizer, code)
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 4d235ed..4c1caa4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -20,6 +20,11 @@ use std::collections::HashMap;
// To do: document each variant.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
+ Autolink,
+ AutolinkMarker,
+ AutolinkProtocol,
+ AutolinkEmail,
+
AtxHeading,
AtxHeadingSequence,
AtxHeadingWhitespace,