Diffstat (limited to 'src')
-rw-r--r--  src/compiler.rs             30
-rw-r--r--  src/constant.rs             17
-rw-r--r--  src/construct/autolink.rs  327
-rw-r--r--  src/construct/mod.rs         1
-rw-r--r--  src/content/text.rs          9
-rw-r--r--  src/tokenizer.rs             5
6 files changed, 383 insertions, 6 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 48983b6..df26f1b 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -89,7 +89,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
             | TokenType::CharacterReferenceMarkerNumeric
             | TokenType::CharacterReferenceMarkerHexadecimal
             | TokenType::CharacterReferenceMarkerSemi
-            | TokenType::CharacterReferenceValue => {}
+            | TokenType::CharacterReferenceValue
+            | TokenType::Autolink
+            | TokenType::AutolinkMarker
+            | TokenType::AutolinkProtocol
+            | TokenType::AutolinkEmail => {}
             #[allow(unreachable_patterns)]
             _ => {
                 unreachable!("unhandled `enter` of TokenType {:?}", token_type)
@@ -108,7 +112,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
             | TokenType::CharacterEscape
             | TokenType::CharacterEscapeMarker
             | TokenType::CharacterReference
-            | TokenType::CharacterReferenceMarkerSemi => {}
+            | TokenType::CharacterReferenceMarkerSemi
+            | TokenType::Autolink
+            | TokenType::AutolinkMarker => {}
             TokenType::HtmlFlow => {
                 ignore_encode = false;
             }
@@ -229,6 +235,26 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 atx_opening_sequence_size = None;
                 atx_heading_buffer = None;
             }
+            TokenType::AutolinkProtocol => {
+                let slice = slice_serialize(codes, &get_span(events, index), false);
+                let buf = buf_tail_mut(buffers);
+                // To do: options.allowDangerousProtocol ? undefined : protocolHref
+                // let url = sanitize_uri(slice);
+                let url = encode(&slice);
+                buf.push(format!("<a href=\"{}\">", url));
+                buf.push(encode(&slice));
+                buf.push("</a>".to_string());
+            }
+            TokenType::AutolinkEmail => {
+                let slice = slice_serialize(codes, &get_span(events, index), false);
+                let buf = buf_tail_mut(buffers);
+                // To do: options.allowDangerousProtocol ? undefined : protocolHref
+                // let url = sanitize_uri(slice);
+                let url = encode(&slice);
+                buf.push(format!("<a href=\"mailto:{}\">", url));
+                buf.push(encode(&slice));
+                buf.push("</a>".to_string());
+            }
             TokenType::ThematicBreak => {
                 buf_tail_mut(buffers).push("<hr />".to_string());
             }
diff --git a/src/constant.rs b/src/constant.rs
index 332fdaf..c98c24d 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -27,6 +27,23 @@
 /// [code_indented]: crate::construct::code_indented
 pub const TAB_SIZE: usize = 4;
 
+/// The number of characters allowed in a protocol of an [autolink][].
+///
+/// The protocol part is the `xxx` in `<xxx://example.com>`.
+/// 32 characters is fine, 33 is too many.
+///
+/// [autolink]: crate::construct::autolink
+pub const AUTOLINK_SCHEME_SIZE_MAX: usize = 32;
+
+/// The number of characters allowed in a domain of an email [autolink][].
+///
+/// There can be multiple “domains”.
+/// A domain part is each `xxx` in `<example@xxx.xxx.xxx>`.
+/// 63 characters is fine, 64 is too many.
+///
+/// [autolink]: crate::construct::autolink
+pub const AUTOLINK_DOMAIN_SIZE_MAX: usize = 63;
+
 /// The number of markers needed for a [thematic break][thematic_break] to form.
 ///
 /// Like many things in markdown, the number is `3`.
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
new file mode 100644
index 0000000..24f2c20
--- /dev/null
+++ b/src/construct/autolink.rs
@@ -0,0 +1,327 @@
+//! Autolinks are a construct that occurs in the [text][] content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! autolink ::= '<' ( url | email ) '>'
+//!
+//! url ::= ascii_alphabetic 0*31( '+' | '-' | '.' | ascii_alphanumeric ) ':' *( code - ascii_control - '\r' - '\n' - ' ' )
+//! email ::= 1*ascii_atext '@' domain *( '.' domain )
+//! ; Restriction: up to (and including) 63 characters are allowed in each domain.
+//! domain ::= ascii_alphanumeric *( ascii_alphanumeric | '-' ascii_alphanumeric )
+//! ascii_atext ::= ascii_alphanumeric | '#' .. '\'' | '*' | '+' | '-' | '.' | '/' | '=' | '?' | '^' .. '`' | '{' .. '~'
+//! ```
+//!
+//! Autolinks relate to the `<a>` element in HTML.
+//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info.
+//! When an email autolink is used (so, without a protocol), the string
+//! `mailto:` is prepended to the email when generating the `href`
+//! attribute of the hyperlink.
+//!
+//! The maximum allowed size of a scheme is `32` (inclusive), which is defined
+//! in [`AUTOLINK_SCHEME_SIZE_MAX`][autolink_scheme_size_max].
+//! The maximum allowed size of a domain is `63` (inclusive), which is defined
+//! in [`AUTOLINK_DOMAIN_SIZE_MAX`][autolink_domain_size_max].
+//!
+//! The grammar for autolinks is quite strict and requires ASCII to be used
+//! (without, for example, spaces).
+//! To use non-ASCII characters and otherwise impossible characters in URLs,
+//! you can use percent encoding:
+//!
+//! ```markdown
+//! <https://example.com/alpha%20bravo>
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p>
+//! ```
+//!
+//! Interestingly, there are a couple of things that are valid autolinks in
+//! markdown but would be valid tags in HTML, such as `<svg:rect>` and
+//! `<xml:lang/>`.
+//! However, because CommonMark employs a naïve HTML parsing algorithm, those
+//! are not considered HTML.
+//!
+//! While CommonMark restricts links from occurring in other links in the case
+//! of bracketed links, this restriction does not apply when an autolink
+//! appears inside a link:
+//!
+//! ```markdown
+//! [<https://example.com>](#)
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p><a href="#"><a href="https://example.com">https://example.com</a></a></p>
+//! ```
+//!
+//! The generated output, in this case, is not valid HTML.
+//! When a browser sees that markup, it will instead parse it as:
+//!
+//! ```html
+//! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>
+//! ```
+//!
+//! ## References
+//!
+//! * [`autolink.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/autolink.js)
+//! * [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks)
+//!
+//! [text]: crate::content::text
+//! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX
+//! [autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX
+//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
+//!
+//! <!-- To do: link to `encode` -->
+
+use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of an autolink.
+///
+/// ```markdown
+/// a|<https://example.com>b
+/// a|<user@example.com>b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('<') => {
+            tokenizer.enter(TokenType::Autolink);
+            tokenizer.enter(TokenType::AutolinkMarker);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::AutolinkMarker);
+            tokenizer.enter(TokenType::AutolinkProtocol);
+            (State::Fn(Box::new(open)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After `<`, before the protocol.
+///
+/// ```markdown
+/// a<|https://example.com>b
+/// a<|user@example.com>b
+/// ```
+pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char.is_ascii_alphabetic() => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(scheme_or_email_atext)), None)
+        }
+        Code::Char(char) if is_ascii_atext(char) => email_atext(tokenizer, code),
+        _ => (State::Nok, None),
+    }
+}
+
+/// After the first character of the protocol or email name.
+///
+/// ```markdown
+/// a<h|ttps://example.com>b
+/// a<u|ser@example.com>b
+/// ```
+pub fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // Whether this character can be both a protocol and email atext.
+    let unknown = match code {
+        Code::Char('+' | '-' | '.') => true,
+        Code::Char(char) if char.is_ascii_alphanumeric() => true,
+        _ => false,
+    };
+
+    if unknown {
+        scheme_inside_or_email_atext(tokenizer, code, 1)
+    } else {
+        email_atext(tokenizer, code)
+    }
+}
+
+/// Inside an ambiguous protocol or email name.
+///
+/// ```markdown
+/// a<ht|tps://example.com>b
+/// a<us|er@example.com>b
+/// ```
+pub fn scheme_inside_or_email_atext(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    size: usize,
+) -> StateFnResult {
+    if let Code::Char(':') = code {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(url_inside)), None)
+    } else {
+        // Whether this character can be both a protocol and email atext.
+        let unknown = match code {
+            Code::Char('+' | '-' | '.') if size < AUTOLINK_SCHEME_SIZE_MAX => true,
+            Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_SCHEME_SIZE_MAX => {
+                true
+            }
+            _ => false,
+        };
+
+        if unknown {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |t, c| {
+                    scheme_inside_or_email_atext(t, c, size + 1)
+                })),
+                None,
+            )
+        } else {
+            email_atext(tokenizer, code)
+        }
+    }
+}
+
+/// Inside a URL, after the protocol.
+///
+/// ```markdown
+/// a<https:|//example.com>b
+/// ```
+pub fn url_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.exit(TokenType::AutolinkProtocol);
+            end(tokenizer, code)
+        }
+        Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
+        Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ') => {
+            (State::Nok, None)
+        }
+        Code::Char(_) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(url_inside)), None)
+        }
+    }
+}
+
+/// Inside email atext.
+///
+/// ```markdown
+/// a<user.na|me@example.com>b
+/// ```
+pub fn email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('@') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+                None,
+            )
+        }
+        Code::Char(char) if is_ascii_atext(char) => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(email_atext)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After an at-sign or a dot in the label.
+///
+/// ```markdown
+/// a<user.name@|example.com>b
+/// a<user.name@example.|com>b
+/// ```
+pub fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        Code::Char(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, code, size),
+        _ => (State::Nok, None),
+    }
+}
+
+/// In the label, where `.` and `>` are allowed.
+///
+/// ```markdown
+/// a<user.name@ex|ample.com>b
+/// ```
+pub fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        Code::Char('.') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|t, c| email_at_sign_or_dot(t, c, 0))),
+                None,
+            )
+        }
+        Code::Char('>') => {
+            let tail_index = tokenizer.events.len();
+            let head_index = tokenizer.events.len() - 1;
+            tokenizer.exit(TokenType::AutolinkProtocol);
+            // Change the token type.
+            tokenizer.events[head_index].token_type = TokenType::AutolinkEmail;
+            tokenizer.events[tail_index].token_type = TokenType::AutolinkEmail;
+            end(tokenizer, code)
+        }
+        _ => email_value(tokenizer, code, size),
+    }
+}
+
+/// In the label, where `.` and `>` are *not* allowed.
+///
+/// Though, this is also used by `email_label` to parse other values.
+///
+/// ```markdown
+/// a<user.name@ex-|ample.com>b
+/// ```
+pub fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    let ok = match code {
+        Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => true,
+        Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => true,
+        _ => false,
+    };
+
+    if ok {
+        tokenizer.consume(code);
+        let func = if let Code::Char('-') = code {
+            email_value
+        } else {
+            email_label
+        };
+        (State::Fn(Box::new(move |t, c| func(t, c, size + 1))), None)
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// At the `>`.
+///
+/// ```markdown
+/// a<https://example.com|>b
+/// a<user@example.com|>b
+/// ```
+pub fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            tokenizer.enter(TokenType::AutolinkMarker);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::AutolinkMarker);
+            tokenizer.exit(TokenType::Autolink);
+            (State::Ok, None)
+        }
+        _ => unreachable!("expected `>` at `end`"),
+    }
+}
+
+/// Check whether the character code represents an ASCII atext.
+///
+/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in
+/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
+/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002E
+/// DOT (`.`), U+002F SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION
+/// MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B
+/// LEFT CURLY BRACE (`{`) to U+007E TILDE (`~`).
+///
+/// See:
+/// **\[RFC5322]**:
+/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+/// P. Resnick.
+/// IETF.
+///
+/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+fn is_ascii_atext(x: char) -> bool {
+    matches!(x, '#'..='\'' | '*' | '+' | '-'..='9' | '=' | '?' | 'A'..='Z' | '^'..='~')
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index d671db6..0bc8746 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -1,5 +1,6 @@
 //! Constructs found in markdown.
 
+pub mod autolink;
 pub mod blank_line;
 pub mod character_escape;
 pub mod character_reference;
diff --git a/src/content/text.rs b/src/content/text.rs
index 2c93b18..a7b40e7 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -5,7 +5,7 @@
 //!
 //! The constructs found in text are:
 //!
-//! * Autolink
+//! * [Autolink][crate::construct::autolink]
 //! * Attention
 //! * HTML (text)
 //! * Hard break escape
@@ -17,7 +17,8 @@
 //! * [Character reference][crate::construct::character_reference]
 
 use crate::construct::{
-    character_escape::start as character_escape, character_reference::start as character_reference,
+    autolink::start as autolink, character_escape::start as character_escape,
+    character_reference::start as character_reference,
 };
 use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 
@@ -33,7 +34,7 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
-        _ => tokenizer.attempt_2(character_reference, character_escape, |ok| {
+        _ => tokenizer.attempt_3(character_reference, character_escape, autolink, |ok| {
             Box::new(if ok { start } else { before_data })
         })(tokenizer, code),
     }
@@ -68,7 +69,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
             (State::Ok, None)
         }
         // To do: somehow get these markers from constructs.
-        Code::Char('&' | '\\') => {
+        Code::Char('&' | '\\' | '<') => {
             tokenizer.exit(TokenType::Data);
             start(tokenizer, code)
         }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 4d235ed..4c1caa4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -20,6 +20,11 @@ use std::collections::HashMap;
 // To do: document each variant.
 #[derive(Debug, Clone, PartialEq)]
 pub enum TokenType {
+    Autolink,
+    AutolinkMarker,
+    AutolinkProtocol,
+    AutolinkEmail,
+
     AtxHeading,
     AtxHeadingSequence,
     AtxHeadingWhitespace,
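Taken together, these changes parse `<…>` autolinks in text content and compile the two resulting token types to `<a>` tags, prepending `mailto:` for the email form. Below is a minimal sketch of that behaviour, assuming the crate is named `micromark`, that `lib.rs` exposes a public `micromark(value: &str) -> String` entry point (neither appears in this diff), and that paragraphs compile to `<p>…</p>` as in the docs above.

```rust
// A sketch under the assumptions above; the `micromark` crate and function
// names are not shown in this diff and may differ.
use micromark::micromark;

fn main() {
    // URL autolink: the slice between `<` and `>` becomes both the `href`
    // and the link text (`TokenType::AutolinkProtocol` in `compiler.rs`).
    // Expected: <p><a href="https://example.com">https://example.com</a></p>
    println!("{}", micromark("<https://example.com>"));

    // Email autolink: `mailto:` is prepended when generating the `href`
    // (`TokenType::AutolinkEmail` in `compiler.rs`).
    // Expected: <p><a href="mailto:user@example.com">user@example.com</a></p>
    println!("{}", micromark("<user@example.com>"));

    // Schemes longer than `AUTOLINK_SCHEME_SIZE_MAX` (32) and domains longer
    // than `AUTOLINK_DOMAIN_SIZE_MAX` (63) do not form autolinks at all.
}
```

Note that the `// To do` comments in `compiler.rs` mark URL sanitization (`sanitize_uri` and an `allowDangerousProtocol`-style option) as still pending; until then the slice is only character-encoded before being written to the `href`.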