From efdf90959f78d1582da312bffbefaabb79f264b7 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 13 Jun 2022 12:37:25 +0200 Subject: Add autolinks --- readme.md | 5 +- src/compiler.rs | 30 ++++- src/constant.rs | 17 +++ src/construct/autolink.rs | 327 ++++++++++++++++++++++++++++++++++++++++++++++ src/construct/mod.rs | 1 + src/content/text.rs | 9 +- src/tokenizer.rs | 5 + tests/autolink.rs | 247 ++++++++++++++++++++++++++++++++++ 8 files changed, 633 insertions(+), 8 deletions(-) create mode 100644 src/construct/autolink.rs create mode 100644 tests/autolink.rs diff --git a/readme.md b/readme.md index 527170d..26035c4 100644 --- a/readme.md +++ b/readme.md @@ -68,6 +68,7 @@ cargo doc --document-private-items ### Small things +- [ ] (3) Encode urls - [ ] (1) Parse initial and final whitespace of paragraphs (in text) - [ ] (3) Clean compiler - [ ] (1) Optionally remove dangerous protocols when compiling @@ -96,7 +97,7 @@ cargo doc --document-private-items ### Constructs - [ ] (5) attention (strong, emphasis) (text) -- [ ] (1) autolink +- [x] autolink - [x] blank line - [ ] (5) block quote - [x] character escape @@ -137,7 +138,7 @@ cargo doc --document-private-items - [x] paragraph - [ ] (5) text - [ ] attention (strong, emphasis) (text) - - [ ] autolink + - [x] autolink - [x] character escape - [x] character reference - [ ] code (text) diff --git a/src/compiler.rs b/src/compiler.rs index 48983b6..df26f1b 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -89,7 +89,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::CharacterReferenceMarkerNumeric | TokenType::CharacterReferenceMarkerHexadecimal | TokenType::CharacterReferenceMarkerSemi - | TokenType::CharacterReferenceValue => {} + | TokenType::CharacterReferenceValue + | TokenType::Autolink + | TokenType::AutolinkMarker + | TokenType::AutolinkProtocol + | TokenType::AutolinkEmail => {} #[allow(unreachable_patterns)] _ => { unreachable!("unhandled `enter` of 
TokenType {:?}", token_type) @@ -108,7 +112,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::CharacterEscape | TokenType::CharacterEscapeMarker | TokenType::CharacterReference - | TokenType::CharacterReferenceMarkerSemi => {} + | TokenType::CharacterReferenceMarkerSemi + | TokenType::Autolink + | TokenType::AutolinkMarker => {} TokenType::HtmlFlow => { ignore_encode = false; } @@ -229,6 +235,26 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St atx_opening_sequence_size = None; atx_heading_buffer = None; } + TokenType::AutolinkProtocol => { + let slice = slice_serialize(codes, &get_span(events, index), false); + let buf = buf_tail_mut(buffers); + // To do: options.allowDangerousProtocol ? undefined : protocolHref + // let url = sanitize_uri(slice); + let url = encode(&slice); + buf.push(format!("", url)); + buf.push(encode(&slice)); + buf.push("".to_string()); + } + TokenType::AutolinkEmail => { + let slice = slice_serialize(codes, &get_span(events, index), false); + let buf = buf_tail_mut(buffers); + // To do: options.allowDangerousProtocol ? undefined : protocolHref + // let url = sanitize_uri(slice); + let url = encode(&slice); + buf.push(format!("", url)); + buf.push(encode(&slice)); + buf.push("".to_string()); + } TokenType::ThematicBreak => { buf_tail_mut(buffers).push("
".to_string()); } diff --git a/src/constant.rs b/src/constant.rs index 332fdaf..c98c24d 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -27,6 +27,23 @@ /// [code_indented]: crate::construct::code_indented pub const TAB_SIZE: usize = 4; +/// The number of characters allowed in a protocol of an [autolink][]. +/// +/// The protocol part is the `xxx` in `<xxx://example.com>`. +/// 32 characters is fine, 33 is too many. +/// +/// [autolink]: crate::construct::autolink +pub const AUTOLINK_SCHEME_SIZE_MAX: usize = 32; + +/// The number of characters allowed in a domain of an email [autolink][]. +/// +/// There can be multiple “domains”. +/// A domain part is each `xxx` in `<example@xxx.xxx.xxx>`. +/// 63 characters is fine, 64 is too many. +/// +/// [autolink]: crate::construct::autolink +pub const AUTOLINK_DOMAIN_SIZE_MAX: usize = 63; + /// The number of markers needed for a [thematic break][thematic_break] to form. /// /// Like many things in markdown, the number is `3`. diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs new file mode 100644 index 0000000..24f2c20 --- /dev/null +++ b/src/construct/autolink.rs @@ -0,0 +1,327 @@ +//! Autolinks are a construct that occurs in the [text][] content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! autolink ::= '<' ( url | email ) '>' +//! +//! url ::= ascii_alphabetic 0*31( '+' | '-' | '.' | ascii_alphanumeric ) ':' *( code - ascii_control - '\r' - '\n' - ' ') +//! email ::= 1*ascii_atext '@' domain *('.' domain) +//! ; Restriction: up to (including) 63 characters are allowed in each domain. +//! domain ::= ascii_alphanumeric *( ascii_alphanumeric | '-' ascii_alphanumeric ) +//! ascii_atext ::= ascii_alphanumeric | '#' .. '\'' | '*' | '+' | '-' | '/' | '=' | '?' | '^' .. '`' | '{' .. '~' +//! ``` +//! +//! Autolinks relate to the `<a>` element in HTML. +//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info. +//! When an email autolink is used (so, without a protocol), the string +//!
`mailto:` is prepended before the email, when generating the `href` +//! attribute of the hyperlink. +//! +//! The maximum allowed size of a scheme is `32` (inclusive), which is defined +//! in [`AUTOLINK_SCHEME_SIZE_MAX`][autolink_scheme_size_max]. +//! The maximum allowed size of a domain is `63` (inclusive), which is defined +//! in [`AUTOLINK_DOMAIN_SIZE_MAX`][autolink_domain_size_max]. +//! +//! The grammar for autolinks is quite strict and requires ASCII to be used +//! (without, for example, spaces). +//! To use non-ascii characters and otherwise impossible characters, in URLs, +//! you can use percent encoding: +//! +//! ```markdown +//! <https://example.com/alpha%20bravo> +//! ``` +//! +//! Yields: +//! +//! ```html +//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
+//! ``` +//! +//! Interestingly, there are a couple of things that are valid autolinks in +//! markdown but in HTML would be valid tags, such as `<svg:rect>` and +//! `<xml:lang/>`. +//! However, because CommonMark employs a naïve HTML parsing algorithm, those +//! are not considered HTML. +//! +//! While CommonMark restricts links from occurring in other links in the case +//! of bracketed links, this restriction is not in place for autolinks inside +//! links: +//! +//! ```markdown +//! [<https://example.com>](#) +//! ``` +//! +//! Yields: +//! +//! ```html +//! <p><a href="#"><a href="https://example.com">https://example.com</a></a></p>
+//! ``` +//! +//! The generated output, in this case, is invalid according to HTML. +//! When a browser sees that markup, it will instead parse it as: +//! +//! ```html +//! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>
+//! ``` +//! +//! ## References +//! +//! * [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js) +//! * [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks) +//! +//! [text]: crate::content::text +//! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX +//! [autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX +//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element +//! +//! + +use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of an autolink. +/// +/// ```markdown +/// a|b +/// a|b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('<') => { + tokenizer.enter(TokenType::Autolink); + tokenizer.enter(TokenType::AutolinkMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::AutolinkMarker); + tokenizer.enter(TokenType::AutolinkProtocol); + (State::Fn(Box::new(open)), None) + } + _ => (State::Nok, None), + } +} + +/// After `<`, before the protocol. +/// +/// ```markdown +/// a<|https://example.com>b +/// a<|user@example.com>b +/// ``` +pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char.is_ascii_alphabetic() => { + tokenizer.consume(code); + (State::Fn(Box::new(scheme_or_email_atext)), None) + } + Code::Char(char) if is_ascii_atext(char) => email_atext(tokenizer, code), + _ => (State::Nok, None), + } +} + +/// After the first character of the protocol or email name. +/// +/// ```markdown +/// ab +/// ab +/// ``` +pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + // Whether this character can be both a protocol and email atext. 
+ let unknown = match code { + Code::Char('+' | '-' | '.') => true, + Code::Char(char) if char.is_ascii_alphanumeric() => true, + _ => false, + }; + + if unknown { + scheme_inside_or_email_atext(tokenizer, code, 1) + } else { + email_atext(tokenizer, code) + } +} + +/// Inside an ambiguous protocol or email name. +/// +/// ```markdown +/// ab +/// ab +/// ``` +pub fn scheme_inside_or_email_atext( + tokenizer: &mut Tokenizer, + code: Code, + size: usize, +) -> StateFnResult { + if let Code::Char(':') = code { + tokenizer.consume(code); + (State::Fn(Box::new(url_inside)), None) + } else { + // Whether this character can be both a protocol and email atext. + let unknown = match code { + Code::Char('+' | '-' | '.') if size < AUTOLINK_SCHEME_SIZE_MAX => true, + Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_SCHEME_SIZE_MAX => { + true + } + _ => false, + }; + + if unknown { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |t, c| { + scheme_inside_or_email_atext(t, c, size + 1) + })), + None, + ) + } else { + email_atext(tokenizer, code) + } + } +} + +/// Inside a URL, after the protocol. +/// +/// ```markdown +/// ab +/// ``` +pub fn url_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.exit(TokenType::AutolinkProtocol); + end(tokenizer, code) + } + Code::Char(char) if char.is_ascii_control() => (State::Nok, None), + Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ') => { + (State::Nok, None) + } + Code::Char(_) => { + tokenizer.consume(code); + (State::Fn(Box::new(url_inside)), None) + } + } +} + +/// Inside email atext. 
+/// +/// ```markdown +/// ab +/// ``` +pub fn email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('@') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))), + None, + ) + } + Code::Char(char) if is_ascii_atext(char) => { + tokenizer.consume(code); + (State::Fn(Box::new(email_atext)), None) + } + _ => (State::Nok, None), + } +} + +/// After an at-sign or a dot in the label. +/// +/// ```markdown +/// ab +/// ab +/// ``` +pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { + match code { + Code::Char(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, code, size), + _ => (State::Nok, None), + } +} + +/// In the label, where `.` and `>` are allowed. +/// +/// ```markdown +/// ab +/// ``` +pub fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { + match code { + Code::Char('.') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))), + None, + ) + } + Code::Char('>') => { + let tail_index = tokenizer.events.len(); + let head_index = tokenizer.events.len() - 1; + tokenizer.exit(TokenType::AutolinkProtocol); + // Change the token type. + tokenizer.events[head_index].token_type = TokenType::AutolinkEmail; + tokenizer.events[tail_index].token_type = TokenType::AutolinkEmail; + end(tokenizer, code) + } + _ => email_value(tokenizer, code, size), + } +} + +/// In the label, where `.` and `>` are *not* allowed. +/// +/// Though, this is also used in `email_label` to parse other values. 
+/// +/// ```markdown +/// ab +/// ``` +pub fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { + let ok = match code { + Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => true, + Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => true, + _ => false, + }; + + if ok { + tokenizer.consume(code); + let func = if let Code::Char('-') = code { + email_value + } else { + email_label + }; + (State::Fn(Box::new(move |t, c| func(t, c, size + 1))), None) + } else { + (State::Nok, None) + } +} + +/// At the `>`. +/// +/// ```markdown +/// ab +/// ab +/// ``` +pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.enter(TokenType::AutolinkMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::AutolinkMarker); + tokenizer.exit(TokenType::Autolink); + (State::Ok, None) + } + _ => unreachable!("expected `>` at `end`"), + } +} + +/// Check whether the character code represents an ASCII atext. +/// +/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in +/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`), +/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F +/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E +/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE +/// (`{`) to U+007E TILDE (`~`). +/// +/// See: +/// **\[RFC5322]**: +/// [Internet Message Format](https://tools.ietf.org/html/rfc5322). +/// P. Resnick. +/// IETF. +/// +/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric +fn is_ascii_atext(x: char) -> bool { + matches!(x, '#'..='\'' | '*' | '+' | '-'..='9' | '=' | '?' | 'A'..='Z' | '^'..='~') +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index d671db6..0bc8746 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -1,5 +1,6 @@ //! Constructs found in markdown. 
+pub mod autolink; pub mod blank_line; pub mod character_escape; pub mod character_reference; diff --git a/src/content/text.rs b/src/content/text.rs index 2c93b18..a7b40e7 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -5,7 +5,7 @@ //! //! The constructs found in text are: //! -//! * Autolink +//! * [Autolink][crate::construct::autolink] //! * Attention //! * HTML (text) //! * Hard break escape @@ -17,7 +17,8 @@ //! * [Character reference][crate::construct::character_reference] use crate::construct::{ - character_escape::start as character_escape, character_reference::start as character_reference, + autolink::start as autolink, character_escape::start as character_escape, + character_reference::start as character_reference, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -33,7 +34,7 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), - _ => tokenizer.attempt_2(character_reference, character_escape, |ok| { + _ => tokenizer.attempt_3(character_reference, character_escape, autolink, |ok| { Box::new(if ok { start } else { before_data }) })(tokenizer, code), } @@ -68,7 +69,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { (State::Ok, None) } // To do: somehow get these markers from constructs. - Code::Char('&' | '\\') => { + Code::Char('&' | '\\' | '<') => { tokenizer.exit(TokenType::Data); start(tokenizer, code) } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 4d235ed..4c1caa4 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -20,6 +20,11 @@ use std::collections::HashMap; // To do: document each variant. 
#[derive(Debug, Clone, PartialEq)] pub enum TokenType { + Autolink, + AutolinkMarker, + AutolinkProtocol, + AutolinkEmail, + AtxHeading, AtxHeadingSequence, AtxHeadingWhitespace, diff --git a/tests/autolink.rs b/tests/autolink.rs new file mode 100644 index 0000000..fc49dcb --- /dev/null +++ b/tests/autolink.rs @@ -0,0 +1,247 @@ +extern crate micromark; +use micromark::micromark; + +#[test] +fn autolink() { + assert_eq!( + micromark("```\n<\n >\n```"), + "
<\n >\n
", + "should support fenced code w/ grave accents" + ); + + assert_eq!( + micromark(""), + "

http://foo.bar.baz

", + "should support protocol autolinks (1)" + ); + + assert_eq!( + micromark(""), + "

http://foo.bar.baz/test?q=hello&id=22&boolean

", + "should support protocol autolinks (2)" + ); + + assert_eq!( + micromark(""), + "

irc://foo.bar:2233/baz

", + "should support protocol autolinks w/ non-HTTP schemes" + ); + + assert_eq!( + micromark(""), + "

MAILTO:FOO@BAR.BAZ

", + "should support protocol autolinks in uppercase" + ); + + // To do: safety. + // assert_eq!( + // micromark("", {allowDangerousProtocol: true}), + // "

a+b+c:d

", + // "should support protocol autolinks w/ incorrect URIs (1)" + // ); + + // To do: safety. + // assert_eq!( + // micromark("", {allowDangerousProtocol: true}), + // "

made-up-scheme://foo,bar

", + // "should support protocol autolinks w/ incorrect URIs (2)" + // ); + + assert_eq!( + micromark(""), + "

http://../

", + "should support protocol autolinks w/ incorrect URIs (3)" + ); + + // To do: safety. + // assert_eq!( + // micromark("", {allowDangerousProtocol: true}), + // "

localhost:5001/foo

", + // "should support protocol autolinks w/ incorrect URIs (4)" + // ); + + assert_eq!( + micromark(""), + "

<http://foo.bar/baz bim>

", + "should not support protocol autolinks w/ spaces" + ); + + // To do: encode urls. + // assert_eq!( + // micromark(""), + // "

http://example.com/\\[\\

", + // "should not support character escapes in protocol autolinks" + // ); + + assert_eq!( + micromark(""), + "

foo@bar.example.com

", + "should support email autolinks (1)" + ); + + assert_eq!( + micromark(""), + "

foo+special@Bar.baz-bar0.com

", + "should support email autolinks (2)" + ); + + assert_eq!( + micromark(""), + "

a@b.c

", + "should support email autolinks (3)" + ); + + assert_eq!( + micromark(""), + "

<foo+@bar.example.com>

", + "should not support character escapes in email autolinks" + ); + + assert_eq!( + micromark("<>"), + "

<>

", + "should not support empty autolinks" + ); + + assert_eq!( + micromark("< http://foo.bar >"), + "

< http://foo.bar >

", + "should not support autolinks w/ space" + ); + + assert_eq!( + micromark(""), + "

<m:abc>

", + "should not support autolinks w/ a single character for a scheme" + ); + + assert_eq!( + micromark(""), + "

<foo.bar.baz>

", + "should not support autolinks w/o a colon or at sign" + ); + + assert_eq!( + micromark("http://example.com"), + "

http://example.com

", + "should not support protocol autolinks w/o angle brackets" + ); + + assert_eq!( + micromark("foo@bar.example.com"), + "

foo@bar.example.com

", + "should not support email autolinks w/o angle brackets" + ); + + // Extra: + assert_eq!( + micromark("<*@example.com>"), + "

*@example.com

", + "should support autolinks w/ atext (1)" + ); + assert_eq!( + micromark(""), + "

a*@example.com

", + "should support autolinks w/ atext (2)" + ); + assert_eq!( + micromark(""), + "

aa*@example.com

", + "should support autolinks w/ atext (3)" + ); + + assert_eq!( + micromark(""), + "

<aaa©@example.com>

", + "should support non-atext in email autolinks local part (1)" + ); + assert_eq!( + micromark(""), + "

<a*a©@example.com>

", + "should support non-atext in email autolinks local part (2)" + ); + + assert_eq!( + micromark(""), + "

<asd@.example.com>

", + "should not support a dot after an at sign in email autolinks" + ); + assert_eq!( + micromark(""), + "

<asd@e..xample.com>

", + "should not support a dot after another dot in email autolinks" + ); + + assert_eq!( + micromark( + "" + ), + "

asd@012345678901234567890123456789012345678901234567890123456789012

", + "should support 63 character in email autolinks domains" + ); + + assert_eq!( + micromark(""), + "

<asd@0123456789012345678901234567890123456789012345678901234567890123>

", + "should not support 64 character in email autolinks domains" + ); + + assert_eq!( + micromark( + "" + ), + "

asd@012345678901234567890123456789012345678901234567890123456789012.a

", + "should support a TLD after a 63 character domain in email autolinks" + ); + + assert_eq!( + micromark(""), + "

<asd@0123456789012345678901234567890123456789012345678901234567890123.a>

", + "should not support a TLD after a 64 character domain in email autolinks" + ); + + assert_eq!( + micromark( + "" + ), + "

asd@a.012345678901234567890123456789012345678901234567890123456789012

", + "should support a 63 character TLD in email autolinks" + ); + + assert_eq!( + micromark(""), + "

<asd@a.0123456789012345678901234567890123456789012345678901234567890123>

", + "should not support a 64 character TLD in email autolinks" + ); + + assert_eq!( + micromark(""), + "

<asd@-example.com>

", + "should not support a dash after `@` in email autolinks" + ); + + assert_eq!( + micromark(""), + "

asd@e-xample.com

", + "should support a dash after other domain characters in email autolinks" + ); + + assert_eq!( + micromark(""), + "

asd@e--xample.com

", + "should support a dash after another dash in email autolinks" + ); + + assert_eq!( + micromark(""), + "

<asd@example-.com>

", + "should not support a dash before a dot in email autolinks" + ); + + // To do: extensions. + // assert_eq!( + // micromark("", {extensions: [{disable: {null: ["autolink"]}}]}), + // "

<a@b.co>

", + // "should support turning off autolinks" + // ); +} -- cgit