From 3d00bf57a225369120fd98bee36f65a541260da1 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 5 Sep 2022 15:03:24 +0200 Subject: Fix to implement GFM autolink literals exactly --- src/compiler.rs | 37 +- src/construct/gfm_autolink_literal.rs | 848 +++++++++--- src/construct/gfm_table.rs | 2 +- src/construct/text.rs | 20 +- src/state.rs | 60 + tests/gfm_autolink_literal.rs | 2442 ++++++++++++++++++++++++++++++++- 6 files changed, 3245 insertions(+), 164 deletions(-) diff --git a/src/compiler.rs b/src/compiler.rs index 681ec00..0ea1638 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -871,6 +871,7 @@ fn on_exit_autolink_email(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + false, ); } @@ -884,6 +885,7 @@ fn on_exit_autolink_protocol(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + false, ); } @@ -1154,6 +1156,7 @@ fn on_exit_gfm_autolink_literal_protocol(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + true, ); } @@ -1167,12 +1170,22 @@ fn on_exit_gfm_autolink_literal_www(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + true, ); } /// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail]. fn on_exit_gfm_autolink_literal_email(context: &mut CompileContext) { - on_exit_autolink_email(context); + generate_autolink( + context, + Some("mailto:"), + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + true, + ); } /// Handle [`Exit`][Kind::Exit]:[`GfmFootnoteCall`][Name::GfmFootnoteCall]. @@ -1822,8 +1835,24 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) { } /// Generate an autolink (used by unicode autolinks and GFM autolink literals). 
-fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) { - if !context.image_alt_inside { +fn generate_autolink( + context: &mut CompileContext, + protocol: Option<&str>, + value: &str, + is_gfm_literal: bool, +) { + let mut is_in_link = false; + let mut index = 0; + + while index < context.media_stack.len() { + if !context.media_stack[index].image { + is_in_link = true; + break; + } + index += 1; + } + + if !context.image_alt_inside && (!is_in_link || !is_gfm_literal) { context.push(", value context.push(&encode(value, context.encode_html)); - if !context.image_alt_inside { + if !context.image_alt_inside && (!is_in_link || !is_gfm_literal) { context.push(""); } } diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index 704c536..038330c 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -1,14 +1,621 @@ -//! To do. +//! GFM: autolink literal occurs in the [text][] content type. +//! +//! ## Grammar +//! +//! Autolink literals form with the following BNF +//! (see [construct][crate::construct] for character groups): +//! +//! ```bnf +//! gfm_autolink_literal ::= gfm_protocol_autolink | gfm_www_autolink | gfm_email_autolink +//! +//! ; Restriction: the code before must be `www_autolink_before`. +//! ; Restriction: the code after `.` must not be eof. +//! www_autolink ::= 3('w' | 'W') '.' [domain [path]] +//! www_autolink_before ::= eof | eol | space_or_tab | '(' | '*' | '_' | '[' | ']' | '~' +//! +//! ; Restriction: the code before must be `http_autolink_before`. +//! ; Restriction: the code after the protocol must be `http_autolink_protocol_after`. +//! http_autolink ::= ('h' | 'H') 2('t' | 'T') ('p' | 'P') ['s' | 'S'] ':' 2'/' domain [path] +//! http_autolink_before ::= byte - ascii_alpha +//! http_autolink_protocol_after ::= byte - eof - eol - ascii_control - unicode_whitespace - unicode_punctuation +//! +//! 
; Restriction: the code before must be `email_autolink_before`. +//! ; Restriction: `ascii_digit` may not occur in the last label part of the label. +//! email_autolink ::= 1*('+' | '-' | '.' | '_' | ascii_alphanumeric) '@' 1*(1*label_segment label_dot_cont) 1*label_segment +//! email_autolink_before ::= byte - ascii_alpha - '/' +//! +//! ; Restriction: `_` may not occur in the last two domain parts. +//! domain ::= 1*(url_ampt_cont | domain_punct_cont | '-' | byte - eof - ascii_control - unicode_whitespace - unicode_punctuation) +//! ; Restriction: must not be followed by `punct`. +//! domain_punct_cont ::= '.' | '_' +//! ; Restriction: must not be followed by `char-ref`. +//! url_ampt_cont ::= '&' +//! +//! ; Restriction: a counter `balance = 0` is increased for every `(`, and decreased for every `)`. +//! ; Restriction: `)` must not be `paren_at_end`. +//! path ::= 1*(url_ampt_cont | path_punctuation_cont | '(' | ')' | byte - eof - eol - space_or_tab) +//! ; Restriction: must not be followed by `punct`. +//! path_punctuation_cont ::= trailing_punctuation - '<' +//! ; Restriction: must be followed by `punct` and `balance` must be less than `0`. +//! paren_at_end ::= ')' +//! +//! label_segment ::= label_dash_underscore_cont | ascii_alpha | ascii_digit +//! ; Restriction: if followed by `punct`, the whole email autolink is invalid. +//! label_dash_underscore_cont ::= '-' | '_' +//! ; Restriction: must not be followed by `punct`. +//! label_dot_cont ::= '.' +//! +//! punct ::= *trailing_punctuation ( byte - eof - eol - space_or_tab - '<' ) +//! char_ref ::= *ascii_alpha ';' path_end +//! trailing_punctuation ::= '!' | '"' | '\'' | ')' | '*' | ',' | '.' | ':' | ';' | '<' | '?' | '_' | '~' +//! ``` +//! +//! The grammar for GFM autolink literal is very relaxed: basically anything +//! except for whitespace is allowed after a prefix. +//! To use whitespace characters and otherwise impossible characters, in URLs, +//! you can use percent encoding: +//! +//! 
```markdown +//! https://example.com/alpha%20bravo +//! ``` +//! +//! Yields: +//! +//! ```html +//!

https://example.com/alpha%20bravo

+//! ``` +//! +//! There are several cases where incorrect encoding of URLs would, in other +//! languages, result in a parse error. +//! In markdown, there are no errors, and URLs are normalized. +//! In addition, many characters are percent encoded +//! ([`sanitize_uri`][sanitize_uri]). +//! For example: +//! +//! ```markdown +//! www.a👍b% +//! ``` +//! +//! Yields: +//! +//! ```html +//!

www.a👍b%

+//! ``` +//! +//! There is a big difference between how www and protocol literals work +//! compared to how email literals work. +//! The first two are done when parsing, and work like anything else in +//! markdown. +//! But email literals are handled afterwards: when everything is parsed, we +//! look back at the events to figure out if there were email addresses. +//! This particularly affects how they interleave with character escapes and +//! character references. +//! +//! ## HTML +//! +//! GFM autolink literals relate to the `` element in HTML. +//! See [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info. +//! When an email autolink is used, the string `mailto:` is prepended when +//! generating the `href` attribute of the hyperlink. +//! When a www autolink is used, the string `http:` is prepended. +//! +//! ## Recommendation +//! +//! It is recommended to use labels ([label start link][label_start_link], +//! [label end][label_end]), either with a resource or a definition +//! ([definition][]), instead of autolink literals, as those allow relative +//! URLs and descriptive text to explain the URL in prose. +//! +//! ## Bugs +//! +//! GitHub’s own algorithm to parse autolink literals contains three bugs. +//! A smaller bug is left unfixed in this project for consistency. +//! Two main bugs are not present in this project. +//! The issues relating to autolink literals are: +//! +//! * [GFM autolink extension (`www.`, `https?://` parts): links don’t work when after bracket](https://github.com/github/cmark-gfm/issues/278)\ +//! fixed here ✅ +//! * [GFM autolink extension (`www.` part): uppercase does not match on issues/PRs/comments](https://github.com/github/cmark-gfm/issues/280)\ +//! fixed here ✅ +//! * [GFM autolink extension (`www.` part): the word `www` matches](https://github.com/github/cmark-gfm/issues/279)\ +//! present here for consistency +//! +//! ## Tokens +//! +//! 
* [`GfmAutolinkLiteralProtocol`][Name::GfmAutolinkLiteralProtocol] +//! * [`GfmAutolinkLiteralWww`][Name::GfmAutolinkLiteralWww] +//! * [`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail] +//! +//! ## References +//! +//! * [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal) +//! * [*§ 6.9 Autolinks (extension)* in `GFM`](https://github.github.com/gfm/#autolinks-extension-) +//! +//! [text]: crate::construct::text +//! [definition]: crate::construct::definition +//! [attention]: crate::construct::attention +//! [label_start_link]: crate::construct::label_start_link +//! [label_end]: crate::construct::label_end +//! [sanitize_uri]: crate::util::sanitize_uri +//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element use crate::event::{Event, Kind, Name}; +use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::util::classify_character::{classify, Kind as CharacterKind}; -use crate::util::slice::{Position, Slice}; +use crate::util::{ + classify_character::{classify_opt, Kind as CharacterKind}, + slice::{char_after_index, Position, Slice}, +}; use alloc::vec::Vec; -use core::str; -// To do: doc al functions. +/// Start of protocol autolink literal. +/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^ +/// ``` +pub fn protocol_start(tokenizer: &mut Tokenizer) -> State { + if tokenizer + .parse_state + .options + .constructs + .gfm_autolink_literal && + matches!(tokenizer.current, Some(b'H' | b'h')) + // Source: . 
+ && !matches!(tokenizer.previous, Some(b'A'..=b'Z' | b'a'..=b'z')) + { + tokenizer.enter(Name::GfmAutolinkLiteralProtocol); + tokenizer.attempt( + State::Next(StateName::GfmAutolinkLiteralProtocolAfter), + State::Nok, + ); + tokenizer.attempt( + State::Next(StateName::GfmAutolinkLiteralDomainInside), + State::Nok, + ); + tokenizer.tokenize_state.start = tokenizer.point.index; + State::Retry(StateName::GfmAutolinkLiteralProtocolPrefixInside) + } else { + State::Nok + } +} + +/// After a protocol autolink literal. +/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^ +/// ``` +pub fn protocol_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.exit(Name::GfmAutolinkLiteralProtocol); + State::Ok +} + +/// In protocol. +/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^^^^^ +/// ``` +pub fn protocol_prefix_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'A'..=b'Z' | b'a'..=b'z') + // `5` is size of `https` + if tokenizer.point.index - tokenizer.tokenize_state.start < 5 => + { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralProtocolPrefixInside) + } + Some(b':') => { + let slice = Slice::from_indices( + tokenizer.parse_state.bytes, + tokenizer.tokenize_state.start, + tokenizer.point.index, + ); + let name = slice.as_str().to_ascii_lowercase(); + + tokenizer.tokenize_state.start = 0; + + if name == "http" || name == "https" { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside) + } else { + State::Nok + } + } + _ => { + tokenizer.tokenize_state.start = 0; + State::Nok + } + } +} + +/// In protocol slashes. 
+/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^^ +/// ``` +pub fn protocol_slashes_inside(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(b'/') { + tokenizer.consume(); + if tokenizer.tokenize_state.size == 0 { + tokenizer.tokenize_state.size += 1; + State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside) + } else { + tokenizer.tokenize_state.size = 0; + State::Ok + } + } else { + tokenizer.tokenize_state.size = 0; + State::Nok + } +} +/// Start of www autolink literal. +/// +/// ```markdown +/// > | www.example.com/a?b#c +/// ^ +/// ``` +pub fn www_start(tokenizer: &mut Tokenizer) -> State { + if tokenizer + .parse_state + .options + .constructs + .gfm_autolink_literal && + matches!(tokenizer.current, Some(b'W' | b'w')) + // Source: . + && matches!(tokenizer.previous, None | Some(b'\t' | b'\n' | b' ' | b'(' | b'*' | b'_' | b'[' | b']' | b'~')) + { + tokenizer.enter(Name::GfmAutolinkLiteralWww); + tokenizer.attempt( + State::Next(StateName::GfmAutolinkLiteralWwwAfter), + State::Nok, + ); + // Note: we *check*, so we can discard the `www.` we parsed. + // If it worked, we consider it as a part of the domain. + tokenizer.check( + State::Next(StateName::GfmAutolinkLiteralDomainInside), + State::Nok, + ); + State::Retry(StateName::GfmAutolinkLiteralWwwPrefixInside) + } else { + State::Nok + } +} + +/// After a www autolink literal. +/// +/// ```markdown +/// > | www.example.com/a?b#c +/// ^ +/// ``` +pub fn www_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.exit(Name::GfmAutolinkLiteralWww); + State::Ok +} + +/// In www prefix. 
+/// +/// ```markdown +/// > | www.example.com +/// ^^^^ +/// ``` +pub fn www_prefix_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'.') if tokenizer.tokenize_state.size == 3 => { + tokenizer.tokenize_state.size = 0; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralWwwPrefixAfter) + } + Some(b'W' | b'w') if tokenizer.tokenize_state.size < 3 => { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralWwwPrefixInside) + } + _ => { + tokenizer.tokenize_state.size = 0; + State::Nok + } + } +} + +/// After www prefix. +/// +/// ```markdown +/// > | www.example.com +/// ^ +/// ``` +pub fn www_prefix_after(tokenizer: &mut Tokenizer) -> State { + // If there is *anything*, we can link. + if tokenizer.current == None { + State::Nok + } else { + State::Ok + } +} + +/// In domain. +/// +/// ```markdown +/// > | https://example.com/a +/// ^^^^^^^^^^^ +/// ``` +pub fn domain_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Check whether this marker, which is a trailing punctuation + // marker, optionally followed by more trailing markers, and then + // followed by an end. + Some(b'.' | b'_') => { + tokenizer.check( + State::Next(StateName::GfmAutolinkLiteralDomainAfter), + State::Next(StateName::GfmAutolinkLiteralDomainAtPunctuation), + ); + State::Retry(StateName::GfmAutolinkLiteralTrail) + } + // Dashes and continuation bytes are fine. + Some(b'-' | 0x80..=0xBF) => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralDomainInside) + } + _ => { + // Source: . 
+ if byte_to_kind( + tokenizer.parse_state.bytes, + tokenizer.point.index, + tokenizer.current, + ) == CharacterKind::Other + { + tokenizer.tokenize_state.seen = true; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralDomainInside) + } else { + State::Retry(StateName::GfmAutolinkLiteralDomainAfter) + } + } + } +} + +/// In domain, at potential trailing punctuation, that was not trailing. +/// +/// ```markdown +/// > | https://example.com +/// ^ +/// ``` +pub fn domain_at_punctuation(tokenizer: &mut Tokenizer) -> State { + // There is an underscore in the last segment of the domain + if matches!(tokenizer.current, Some(b'_')) { + tokenizer.tokenize_state.marker = b'_'; + } + // Otherwise, it’s a `.`: save the last segment underscore in the + // penultimate segment slot. + else { + tokenizer.tokenize_state.marker_b = tokenizer.tokenize_state.marker; + tokenizer.tokenize_state.marker = 0; + } + + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralDomainInside) +} + +/// After domain +/// +/// ```markdown +/// > | https://example.com/a +/// ^ +/// ``` +pub fn domain_after(tokenizer: &mut Tokenizer) -> State { + // No underscores allowed in last two segments. + let result = if tokenizer.tokenize_state.marker_b == b'_' + || tokenizer.tokenize_state.marker == b'_' + // At least one character must be seen. + || !tokenizer.tokenize_state.seen + // Note: that’s GH says a dot is needed, but it’s not true: + // + { + State::Nok + } else { + State::Retry(StateName::GfmAutolinkLiteralPathInside) + }; + + tokenizer.tokenize_state.seen = false; + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.marker_b = 0; + result +} + +/// In path. +/// +/// ```markdown +/// > | https://example.com/a +/// ^^ +/// ``` +pub fn path_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Continuation bytes are fine, we’ve already checked the first one. 
+ Some(0x80..=0xBF) => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) + } + // Count opening parens. + Some(b'(') => { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) + } + // Check whether this trailing punctuation marker is optionally + // followed by more trailing markers, and then followed + // by an end. + // If this is a paren (followed by trailing, then the end), we + // *continue* if we saw less closing parens than opening parens. + Some( + b'!' | b'"' | b'&' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';' | b'<' | b'?' + | b']' | b'_' | b'~', + ) => { + let next = if tokenizer.current == Some(b')') + && tokenizer.tokenize_state.size_b < tokenizer.tokenize_state.size + { + StateName::GfmAutolinkLiteralPathAtPunctuation + } else { + StateName::GfmAutolinkLiteralPathAfter + }; + tokenizer.check( + State::Next(next), + State::Next(StateName::GfmAutolinkLiteralPathAtPunctuation), + ); + State::Retry(StateName::GfmAutolinkLiteralTrail) + } + _ => { + // Source: . + if byte_to_kind( + tokenizer.parse_state.bytes, + tokenizer.point.index, + tokenizer.current, + ) == CharacterKind::Whitespace + { + State::Retry(StateName::GfmAutolinkLiteralPathAfter) + } else { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) + } + } + } +} + +/// In path, at potential trailing punctuation, that was not trailing. +/// +/// ```markdown +/// > | https://example.com/a"b +/// ^ +/// ``` +pub fn path_at_punctuation(tokenizer: &mut Tokenizer) -> State { + // Count closing parens. + if tokenizer.current == Some(b')') { + tokenizer.tokenize_state.size_b += 1; + } + + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) +} + +/// At end of path, reset parens. +/// +/// ```markdown +/// > | https://example.com/asd(qwe). 
+/// ^ +/// ``` +pub fn path_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.size_b = 0; + State::Ok +} + +/// In trail of domain or path. +/// +/// ```markdown +/// > | https://example.com"). +/// ^ +/// ``` +pub fn trail(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Regular trailing punctuation. + Some( + b'!' | b'"' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';' | b'?' | b'_' | b'~', + ) => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrail) + } + // `&` followed by one or more alphabeticals and then a `;`, is + // as a whole considered as trailing punctuation. + // In all other cases, it is considered as continuation of the URL. + Some(b'&') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrailCharRefStart) + } + // `<` is an end. + Some(b'<') => State::Ok, + // Needed because we allow literals after `[`, as we fix: + // . + // Check that it is not followed by `(` or `[`. + Some(b']') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrailBracketAfter) + } + _ => { + // Whitespace is the end of the URL, anything else is continuation. + if byte_to_kind( + tokenizer.parse_state.bytes, + tokenizer.point.index, + tokenizer.current, + ) == CharacterKind::Whitespace + { + State::Ok + } else { + State::Nok + } + } + } +} + +/// In trail, after `]`. +/// +/// > 👉 **Note**: this deviates from `cmark-gfm` to fix a bug. +/// > See end of for more. +/// +/// ```markdown +/// > | https://example.com]( +/// ^ +/// ``` +pub fn trail_bracket_after(tokenizer: &mut Tokenizer) -> State { + // Whitespace or something that could start a resource or reference is the end. + // Switch back to trail otherwise. + if matches!( + tokenizer.current, + None | Some(b'\t' | b'\n' | b' ' | b'(' | b'[') + ) { + State::Ok + } else { + State::Retry(StateName::GfmAutolinkLiteralTrail) + } +} + +/// In character-reference like trail, after `&`. 
+/// +/// ```markdown +/// > | https://example.com&). +/// ^ +/// ``` +pub fn trail_char_ref_start(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b'A'..=b'Z' | b'a'..=b'z')) { + State::Retry(StateName::GfmAutolinkLiteralTrailCharRefInside) + } else { + State::Nok + } +} + +/// In character-reference like trail. +/// +/// ```markdown +/// > | https://example.com&). +/// ^ +/// ``` +pub fn trail_char_ref_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'A'..=b'Z' | b'a'..=b'z') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrailCharRefInside) + } + // Switch back to trail if this is well-formed. + Some(b';') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrail) + } + _ => State::Nok, + } +} + +/// Resolve: postprocess text to find email autolink literals. pub fn resolve(tokenizer: &mut Tokenizer) { tokenizer.map.consume(&mut tokenizer.events); @@ -36,23 +643,30 @@ pub fn resolve(tokenizer: &mut Tokenizer) { let mut start = 0; while byte_index < bytes.len() { - if matches!(bytes[byte_index], b'H' | b'h' | b'W' | b'w' | b'@') { - if let Some(autolink) = peek(bytes, byte_index) { - byte_index = autolink.1; + if bytes[byte_index] == b'@' { + let mut range = (0, 0); + + if let Some(start) = peek_bytes_atext(bytes, byte_index) { + if let Some(end) = peek_bytes_email_domain(bytes, byte_index + 1) { + let end = peek_bytes_truncate(bytes, start, end); + range = (start, end); + } + } + + if range.1 != 0 { + byte_index = range.1; // If there is something between the last link // (or the start) and this link. 
- if start != autolink.0 { + if start != range.0 { replace.push(Event { kind: Kind::Enter, name: Name::Data, point: point.clone(), link: None, }); - point = point.shift_to( - tokenizer.parse_state.bytes, - start_index + autolink.0, - ); + point = point + .shift_to(tokenizer.parse_state.bytes, start_index + range.0); replace.push(Event { kind: Kind::Exit, name: Name::Data, @@ -64,19 +678,19 @@ pub fn resolve(tokenizer: &mut Tokenizer) { // Add the link. replace.push(Event { kind: Kind::Enter, - name: autolink.2.clone(), + name: Name::GfmAutolinkLiteralEmail, point: point.clone(), link: None, }); - point = point - .shift_to(tokenizer.parse_state.bytes, start_index + autolink.1); + point = + point.shift_to(tokenizer.parse_state.bytes, start_index + range.1); replace.push(Event { kind: Kind::Exit, - name: autolink.2.clone(), + name: Name::GfmAutolinkLiteralEmail, point: point.clone(), link: None, }); - start = autolink.1; + start = range.1; } } @@ -114,140 +728,19 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } } -fn peek(bytes: &[u8], index: usize) -> Option<(usize, usize, Name)> { - // Protocol. - if let Some(protocol_end) = peek_protocol(bytes, index) { - if let Some(domain_end) = peek_domain(bytes, protocol_end, true) { - let end = truncate(bytes, protocol_end, domain_end); - - // Cannot be empty. - if end != protocol_end { - return Some((index, end, Name::GfmAutolinkLiteralProtocol)); - } - } - } - - // Www. - if peek_www(bytes, index).is_some() { - // Note: we discard the `www.` we parsed, we now try to parse it as a domain. - let domain_end = peek_domain(bytes, index, false).unwrap_or(index); - let end = truncate(bytes, index, domain_end); - return Some((index, end, Name::GfmAutolinkLiteralWww)); - } - - // Email. 
- if bytes[index] == b'@' { - if let Some(start) = peek_atext(bytes, index) { - if let Some(end) = peek_email_domain(bytes, index + 1) { - let end = truncate(bytes, start, end); - return Some((start, end, Name::GfmAutolinkLiteralEmail)); - } - } - } - - None -} - -/// Move past `http://`, `https://`, case-insensitive. -fn peek_protocol(bytes: &[u8], mut index: usize) -> Option { - // `http` - if index + 3 < bytes.len() - && matches!(bytes[index], b'H' | b'h') - && matches!(bytes[index + 1], b'T' | b't') - && matches!(bytes[index + 2], b'T' | b't') - && matches!(bytes[index + 3], b'P' | b'p') - { - index += 4; - - // `s`, optional. - if index + 1 < bytes.len() && matches!(bytes[index], b'S' | b's') { - index += 1; - } - - // `://` - if index + 3 < bytes.len() - && bytes[index] == b':' - && bytes[index + 1] == b'/' - && bytes[index + 2] == b'/' - { - return Some(index + 3); - } - } - - None -} - -/// Move past `www.`, case-insensitive. -fn peek_www(bytes: &[u8], index: usize) -> Option { - // `www.` - if index + 3 < bytes.len() - // Source: . - && (index == 0 || matches!(bytes[index - 1], b'\t' | b'\n' | b'\r' | b' ' | b'(' | b'*' | b'_' | b'~')) - && matches!(bytes[index], b'W' | b'w') - && matches!(bytes[index + 1], b'W' | b'w') - && matches!(bytes[index + 2], b'W' | b'w') - && bytes[index + 3] == b'.' - { - Some(index + 4) - } else { - None - } -} - -/// Move past `example.com`. -fn peek_domain(bytes: &[u8], start: usize, allow_short: bool) -> Option { - let mut dots = false; - let mut penultime = false; - let mut last = false; - // To do: expose this from slice? - // To do: do it ourselves? , , , . - let char_indices = str::from_utf8(&bytes[start..]) - .unwrap() - .char_indices() - .collect::>(); - let mut index = 0; - - while index < char_indices.len() { - match char_indices[index].1 { - '_' => last = true, - '.' => { - penultime = last; - last = false; - dots = true; - } - '-' => {} - // Source: . 
- char if classify(char) == CharacterKind::Other => {} - _ => break, - } - - index += 1; - } - - // No underscores allowed in last two parts. - // A valid domain needs to have at least a dot. - if penultime || last || (!allow_short && !dots) { - None - } else { - // Now peek past `/path?search#hash` (anything except whitespace). - while index < char_indices.len() { - if classify(char_indices[index].1) == CharacterKind::Whitespace { - break; - } - - index += 1; - } - - Some(if index == char_indices.len() { - bytes.len() - } else { - start + char_indices[index].0 - }) - } -} - -/// Move back past `contact`. -fn peek_atext(bytes: &[u8], end: usize) -> Option { +// To do: add `xmpp`, `mailto` support. + +/// Move back past atext. +/// +/// Moving back is only used when post processing text: so for the email address +/// algorithm. +/// +/// ```markdown +/// > | a contact@example.org b +/// ^-- from +/// ^-- to +/// ``` +fn peek_bytes_atext(bytes: &[u8], end: usize) -> Option { let mut index = end; // Take simplified atext. @@ -270,8 +763,17 @@ fn peek_atext(bytes: &[u8], end: usize) -> Option { } } -/// Move past `example.com`. -fn peek_email_domain(bytes: &[u8], start: usize) -> Option { +/// Move past email domain. +/// +/// Peeking like this only used when post processing text: so for the email +/// address algorithm. +/// +/// ```markdown +/// > | a contact@example.org b +/// ^-- from +/// ^-- to +/// ``` +fn peek_bytes_email_domain(bytes: &[u8], start: usize) -> Option { let mut index = start; let mut dot = false; @@ -303,8 +805,21 @@ fn peek_email_domain(bytes: &[u8], start: usize) -> Option { } } -/// Split trialing stuff from a URL. -fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { +/// Move back past punctuation. +/// +/// Moving back is only used when post processing text: so for the email address +/// algorithm. +/// +/// This is much more complex that needed, because GH allows a lot of +/// punctuation in the protocol and www algorithms. 
+/// However, those aren’t implemented like the email algo. +/// +/// ```markdown +/// > | a contact@example.org”) b +/// ^-- from +/// ^-- to +/// ``` +fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { let mut index = start; // Source: @@ -379,3 +894,24 @@ fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { split } + +/// Classify a byte (or `char`). +fn byte_to_kind(bytes: &[u8], index: usize, byte: Option) -> CharacterKind { + match byte { + None => CharacterKind::Whitespace, + Some(byte) => { + if byte.is_ascii_whitespace() { + CharacterKind::Whitespace + } else if byte.is_ascii_punctuation() { + CharacterKind::Punctuation + } else if byte.is_ascii_alphanumeric() { + CharacterKind::Other + } else { + // Otherwise: seems to be an ASCII control, so it seems to be a + // non-ASCII `char`. + let char = char_after_index(bytes, index); + classify_opt(char) + } + } + } +} diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs index d7c2b69..27fbadf 100644 --- a/src/construct/gfm_table.rs +++ b/src/construct/gfm_table.rs @@ -191,7 +191,7 @@ //! This bug is not present in this project. //! The issue relating to tables is: //! -//! * [GFM tables: escaped escapes are incorrectly treated as escapes](https://github.com/github/cmark-gfm/issues/277)\ +//! * [GFM tables: escaped escapes are incorrectly treated as escapes](https://github.com/github/cmark-gfm/issues/277) //! //! ## Tokens //! diff --git a/src/construct/text.rs b/src/construct/text.rs index 3cb0f10..0168d02 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -29,17 +29,21 @@ use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; /// Characters that can start something in text. 
-const MARKERS: [u8; 11] = [ +const MARKERS: [u8; 15] = [ b'!', // `label_start_image` b'$', // `raw_text` (math (text)) b'&', // `character_reference` b'*', // `attention` (emphasis, strong) b'<', // `autolink`, `html_text` + b'H', // `gfm_autolink_literal` (`protocol` kind) + b'W', // `gfm_autolink_literal` (`www.` kind) b'[', // `label_start_link` b'\\', // `character_escape`, `hard_break_escape` b']', // `label_end`, `gfm_label_start_footnote` b'_', // `attention` (emphasis, strong) b'`', // `raw_text` (code (text)) + b'h', // `gfm_autolink_literal` (`protocol` kind) + b'w', // `gfm_autolink_literal` (`www.` kind) b'~', // `attention` (gfm strikethrough) ]; @@ -113,6 +117,20 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::AutolinkStart) } + Some(b'H' | b'h') => { + tokenizer.attempt( + State::Next(StateName::TextBefore), + State::Next(StateName::TextBeforeData), + ); + State::Retry(StateName::GfmAutolinkLiteralProtocolStart) + } + Some(b'W' | b'w') => { + tokenizer.attempt( + State::Next(StateName::TextBefore), + State::Next(StateName::TextBeforeData), + ); + State::Retry(StateName::GfmAutolinkLiteralWwwStart) + } Some(b'[') => { tokenizer.attempt( State::Next(StateName::TextBefore), diff --git a/src/state.rs b/src/state.rs index 5013ec8..d7c0c8a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -310,6 +310,29 @@ pub enum Name { StringBefore, StringBeforeData, + GfmAutolinkLiteralProtocolStart, + GfmAutolinkLiteralProtocolAfter, + GfmAutolinkLiteralProtocolPrefixInside, + GfmAutolinkLiteralProtocolSlashesInside, + + GfmAutolinkLiteralWwwStart, + GfmAutolinkLiteralWwwAfter, + GfmAutolinkLiteralWwwPrefixInside, + GfmAutolinkLiteralWwwPrefixAfter, + + GfmAutolinkLiteralDomainInside, + GfmAutolinkLiteralDomainAtPunctuation, + GfmAutolinkLiteralDomainAfter, + + GfmAutolinkLiteralPathInside, + GfmAutolinkLiteralPathAtPunctuation, + GfmAutolinkLiteralPathAfter, + + GfmAutolinkLiteralTrail, + GfmAutolinkLiteralTrailCharRefInside, + 
GfmAutolinkLiteralTrailCharRefStart, + GfmAutolinkLiteralTrailBracketAfter, + GfmTableStart, GfmTableHeadRowBefore, GfmTableHeadRowStart, @@ -686,6 +709,43 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::StringBefore => construct::string::before, Name::StringBeforeData => construct::string::before_data, + Name::GfmAutolinkLiteralProtocolStart => construct::gfm_autolink_literal::protocol_start, + Name::GfmAutolinkLiteralProtocolAfter => construct::gfm_autolink_literal::protocol_after, + Name::GfmAutolinkLiteralProtocolPrefixInside => { + construct::gfm_autolink_literal::protocol_prefix_inside + } + Name::GfmAutolinkLiteralProtocolSlashesInside => { + construct::gfm_autolink_literal::protocol_slashes_inside + } + + Name::GfmAutolinkLiteralWwwAfter => construct::gfm_autolink_literal::www_after, + Name::GfmAutolinkLiteralWwwStart => construct::gfm_autolink_literal::www_start, + Name::GfmAutolinkLiteralWwwPrefixInside => { + construct::gfm_autolink_literal::www_prefix_inside + } + Name::GfmAutolinkLiteralWwwPrefixAfter => construct::gfm_autolink_literal::www_prefix_after, + Name::GfmAutolinkLiteralDomainInside => construct::gfm_autolink_literal::domain_inside, + Name::GfmAutolinkLiteralDomainAtPunctuation => { + construct::gfm_autolink_literal::domain_at_punctuation + } + Name::GfmAutolinkLiteralDomainAfter => construct::gfm_autolink_literal::domain_after, + + Name::GfmAutolinkLiteralPathInside => construct::gfm_autolink_literal::path_inside, + Name::GfmAutolinkLiteralPathAtPunctuation => { + construct::gfm_autolink_literal::path_at_punctuation + } + Name::GfmAutolinkLiteralPathAfter => construct::gfm_autolink_literal::path_after, + Name::GfmAutolinkLiteralTrail => construct::gfm_autolink_literal::trail, + Name::GfmAutolinkLiteralTrailCharRefStart => { + construct::gfm_autolink_literal::trail_char_ref_start + } + Name::GfmAutolinkLiteralTrailCharRefInside => { + construct::gfm_autolink_literal::trail_char_ref_inside + } + 
Name::GfmAutolinkLiteralTrailBracketAfter => { + construct::gfm_autolink_literal::trail_bracket_after + } + Name::GfmTableStart => construct::gfm_table::start, Name::GfmTableHeadRowBefore => construct::gfm_table::head_row_before, Name::GfmTableHeadRowStart => construct::gfm_table::head_row_start, diff --git a/tests/gfm_autolink_literal.rs b/tests/gfm_autolink_literal.rs index 9551751..2e84e6d 100644 --- a/tests/gfm_autolink_literal.rs +++ b/tests/gfm_autolink_literal.rs @@ -41,6 +41,22 @@ fn gfm_autolink_literal() { "should support email urls if enabled" ); + assert_eq!( + micromark_with_options("[https://example.com](xxx)", &gfm), + "

https://example.com

", + "should not link protocol urls in links" + ); + assert_eq!( + micromark_with_options("[www.example.com](xxx)", &gfm), + "

www.example.com

", + "should not link www urls in links" + ); + assert_eq!( + micromark_with_options("[user@example.com](xxx)", &gfm), + "

user@example.com

", + "should not link email urls in links" + ); + assert_eq!( micromark_with_options("user@example.com", &gfm), "

user@example.com

", @@ -174,7 +190,7 @@ fn gfm_autolink_literal() { ); // Note: GH comments/issues/PRs do not link this, but Gists/readmes do. - // Fixing it would mean defiating from `cmark-gfm`: + // Fixing it would mean deviating from `cmark-gfm`: // Source: . // assert_eq!( // micromark_with_options(",www.example.com", &gfm), @@ -209,6 +225,55 @@ fn gfm_autolink_literal() { "should stop domains/paths at `<`" ); + assert_eq!( + micromark_with_options( + r###" +a www.example.com&xxx;b c + +a www.example.com&xxx;. b + +a www.example.com&xxxxxxxxx;. b + +a www.example.com&xxxxxxxxxx;. b + +a www.example.com&xxxxxxxxxxx;. b + +a www.example.com&xxx. b + +a www.example.com{. b + +a www.example.com&123. b + +a www.example.com&x. b + +a www.example.com. b + +a www.example.com&1. b + +a www.example.com&. b + +a www.example.com& b +"###, + &gfm + ), + r###"

a www.example.com&xxx;b c

+

a www.example.com&xxx;. b

+

a www.example.com&xxxxxxxxx;. b

+

a www.example.com&xxxxxxxxxx;. b

+

a www.example.com&xxxxxxxxxxx;. b

+

a www.example.com&xxx. b

+

a www.example.com&#123. b

+

a www.example.com&123. b

+

a www.example.com&x. b

+

a www.example.com&#1. b

+

a www.example.com&1. b

+

a www.example.com&. b

+

a www.example.com& b

+"###, + "should match “character references” like GitHub does" + ); + + // Note: this deviates from GFM, as is fixed. assert_eq!( micromark_with_options( r###" @@ -251,6 +316,2379 @@ fn gfm_autolink_literal() {

 https://example.com

 contact@example.com

"###, - "should interplay with brackets, links, and images" + "should match interplay with brackets, links, and images, like GitHub does (but without the bugs)" + ); + + assert_eq!( + micromark_with_options( + r###" +www.example.com/?=a(b)cccccc + +www.example.com/?=a(b(c)ccccc + +www.example.com/?=a(b(c)c)cccc + +www.example.com/?=a(b(c)c)c)ccc + +www.example.com/?q=a(business) + +www.example.com/?q=a(business))) + +(www.example.com/?q=a(business)) + +(www.example.com/?q=a(business) + +www.example.com/?q=a(business)". + +www.example.com/?q=a(business))) + +(www.example.com/?q=a(business))". + +(www.example.com/?q=a(business)".) + +(www.example.com/?q=a(business)". +"###, + &gfm + ), + r###"

www.example.com/?=a(b)cccccc

+

www.example.com/?=a(b(c)ccccc

+

www.example.com/?=a(b(c)c)cccc

+

www.example.com/?=a(b(c)c)c)ccc

+

www.example.com/?q=a(business)

+

www.example.com/?q=a(business)))

+

(www.example.com/?q=a(business))

+

(www.example.com/?q=a(business)

+

www.example.com/?q=a(business)".

+

www.example.com/?q=a(business)))

+

(www.example.com/?q=a(business))".

+

(www.example.com/?q=a(business)".)

+

(www.example.com/?q=a(business)".

+"###, + "should match parens like GitHub does" + ); + + // Note: this deviates from GFM. + // Here, the following issues are fixed: + // - + assert_eq!( + micromark_with_options( + r###" +# Literal autolinks + +## WWW autolinks + +w.commonmark.org + +ww.commonmark.org + +www.commonmark.org + +Www.commonmark.org + +wWw.commonmark.org + +wwW.commonmark.org + +WWW.COMMONMARK.ORG + +Visit www.commonmark.org/help for more information. + +Visit www.commonmark.org. + +Visit www.commonmark.org/a.b. + +www.aaa.bbb.ccc_ccc + +www.aaa_bbb.ccc + +www.aaa.bbb.ccc.ddd_ddd + +www.aaa.bbb.ccc_ccc.ddd + +www.aaa.bbb_bbb.ccc.ddd + +www.aaa_aaa.bbb.ccc.ddd + +Visit www.commonmark.org. + +Visit www.commonmark.org/a.b. + +www.google.com/search?q=Markup+(business) + +www.google.com/search?q=Markup+(business))) + +(www.google.com/search?q=Markup+(business)) + +(www.google.com/search?q=Markup+(business) + +www.google.com/search?q=(business))+ok + +www.google.com/search?q=commonmark&hl=en + +www.google.com/search?q=commonmark&hl;en + +www.google.com/search?q=commonmark&hl; + +www.commonmark.org/he should still be expanded. +"###, + &gfm + ), + r###"

Literal autolinks

+

WWW autolinks

+

w.commonmark.org

+

ww.commonmark.org

+

www.commonmark.org

+

Www.commonmark.org

+

wWw.commonmark.org

+

wwW.commonmark.org

+

WWW.COMMONMARK.ORG

+

Visit www.commonmark.org/help for more information.

+

Visit www.commonmark.org.

+

Visit www.commonmark.org/a.b.

+

www.aaa.bbb.ccc_ccc

+

www.aaa_bbb.ccc

+

www.aaa.bbb.ccc.ddd_ddd

+

www.aaa.bbb.ccc_ccc.ddd

+

www.aaa.bbb_bbb.ccc.ddd

+

www.aaa_aaa.bbb.ccc.ddd

+

Visit www.commonmark.org.

+

Visit www.commonmark.org/a.b.

+

www.google.com/search?q=Markup+(business)

+

www.google.com/search?q=Markup+(business)))

+

(www.google.com/search?q=Markup+(business))

+

(www.google.com/search?q=Markup+(business)

+

www.google.com/search?q=(business))+ok

+

www.google.com/search?q=commonmark&hl=en

+

www.google.com/search?q=commonmark&hl;en

+

www.google.com/search?q=commonmark&hl;

+

www.commonmark.org/he<lp

+

HTTP autolinks

+

hexample.com

+

htexample.com

+

httexample.com

+

httpexample.com

+

http:example.com

+

http:/example.com

+

https:/example.com

+

http://example.com

+

https://example.com

+

https://example

+

http://commonmark.org

+

(Visit https://encrypted.google.com/search?q=Markup+(business))

+

Email autolinks

+

No dot: foo@barbaz

+

No dot: foo@barbaz.

+

foo@bar.baz

+

hello@mail+xyz.example isn’t valid, but hello+xyz@mail.example is.

+

a.b-c_d@a.b

+

a.b-c_d@a.b.

+

a.b-c_d@a.b-

+

a.b-c_d@a.b_

+

a@a_b.c

+

a@a-b.c

+

Can’t end in an underscore followed by a period: aaa@a.b_.

+

Can contain an underscore followed by a period: aaa@a.b_.c

+

Link text should not be expanded

+

Visit www.example.com please.

+

Visit http://www.example.com please.

+

Mail example@example.com please.

+

link http://autolink should still be expanded.

+"###, + "should match base like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"H0. + +[https://a.com©b + +[www.a.com©b + +H1. + +[]https://a.com©b + +[]www.a.com©b + +H2. + +[] https://a.com©b + +[] www.a.com©b + +H3. + +[[https://a.com©b + +[[www.a.com©b + +H4. + +[[]https://a.com©b + +[[]www.a.com©b + +H5. + +[[]]https://a.com©b + +[[]]www.a.com©b +"###, + &gfm + ), + r###"

H0.

+

[https://a.com&copy;b

+

[www.a.com&copy;b

+

H1.

+

[]https://a.com&copy;b

+

[]www.a.com&copy;b

+

H2.

+

[] https://a.com&copy;b

+

[] www.a.com&copy;b

+

H3.

+

[[https://a.com&copy;b

+

[[www.a.com&copy;b

+

H4.

+

[[]https://a.com&copy;b

+

[[]www.a.com&copy;b

+

H5.

+

[[]]https://a.com&copy;b

+

[[]]www.a.com&copy;b

+"###, + "should match brackets like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options(r###"Image start. + +![https://a.com + +![http://a.com + +![www.a.com + +![a@b.c + +Image start and label end. + +![https://a.com] + +![http://a.com] + +![www.a.com] + +![a@b.c] + +Image label with reference (note: GH cleans hashes here, but we keep them in). + +![https://a.com][x] + +![http://a.com][x] + +![www.a.com][x] + +![a@b.c][x] + +[x]: # + +Image label with resource. + +![https://a.com]() + +![http://a.com]() + +![www.a.com]() + +![a@b.c]() + +Autolink literal after image. + +![a]() https://a.com + +![a]() http://a.com + +![a]() www.a.com + +![a]() a@b.c +"###, &gfm), + r###"

Image start.

+

![https://a.com

+

![http://a.com

+

![www.a.com

+

![a@b.c

+

Image start and label end.

+

![https://a.com]

+

![http://a.com]

+

![www.a.com]

+

![a@b.c]

+

Image label with reference (note: GH cleans hashes here, but we keep them in).

+

https://a.com

+

http://a.com

+

www.a.com

+

a@b.c

+

Image label with resource.

+

https://a.com

+

http://a.com

+

www.a.com

+

a@b.c

+

Autolink literal after image.

+

a https://a.com

+

a http://a.com

+

a www.a.com

+

a a@b.c

+"###, + "should match autolink literals combined w/ images like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options(r###"Link start. + +[https://a.com + +[http://a.com + +[www.a.com + +[a@b.c + +Label end. + +https://a.com] + +http://a.com] + +www.a.com] + +a@b.c] + +Link start and label end. + +[https://a.com] + +[http://a.com] + +[www.a.com] + +[a@b.c] + +What naĂŻvely seems like a label end (A). + +https://a.com`]` + +http://a.com`]` + +www.a.com`]` + +a@b.c`]` + +Link start and what naĂŻvely seems like a balanced brace (B). + +[https://a.com`]` + +[http://a.com`]` + +[www.a.com`]` + +[a@b.c`]` + +What naĂŻvely seems like a label end (C). + +https://a.com `]` + +http://a.com `]` + +www.a.com `]` + +a@b.c `]` + +Link start and what naĂŻvely seems like a balanced brace (D). + +[https://a.com `]` + +[http://a.com `]` + +[www.a.com `]` + +[a@b.c `]` + +Link label with reference. + +[https://a.com][x] + +[http://a.com][x] + +[www.a.com][x] + +[a@b.c][x] + +[x]: # + +Link label with resource. + +[https://a.com]() + +[http://a.com]() + +[www.a.com]() + +[a@b.c]() + +More in link. + +[a https://b.com c]() + +[a http://b.com c]() + +[a www.b.com c]() + +[a b@c.d e]() + +Autolink literal after link. + +[a]() https://a.com + +[a]() http://a.com + +[a]() www.a.com + +[a]() a@b.c +"###, &gfm), + r###"

Link start.

+

[https://a.com

+

[http://a.com

+

[www.a.com

+

[a@b.c

+

Label end.

+

https://a.com]

+

http://a.com]

+

www.a.com]

+

a@b.c]

+

Link start and label end.

+

[https://a.com]

+

[http://a.com]

+

[www.a.com]

+

[a@b.c]

+

What naĂŻvely seems like a label end (A).

+

https://a.com`]`

+

http://a.com`]`

+

www.a.com`]`

+

a@b.c]

+

Link start and what naĂŻvely seems like a balanced brace (B).

+

[https://a.com`]`

+

[http://a.com`]`

+

[www.a.com`]`

+

[a@b.c]

+

What naĂŻvely seems like a label end (C).

+

https://a.com ]

+

http://a.com ]

+

www.a.com ]

+

a@b.c ]

+

Link start and what naĂŻvely seems like a balanced brace (D).

+

[https://a.com ]

+

[http://a.com ]

+

[www.a.com ]

+

[a@b.c ]

+

Link label with reference.

+

https://a.com

+

http://a.com

+

www.a.com

+

a@b.c

+

Link label with resource.

+

https://a.com

+

http://a.com

+

www.a.com

+

a@b.c

+

More in link.

+

a https://b.com c

+

a http://b.com c

+

a www.b.com c

+

a b@c.d e

+

Autolink literal after link.

+

a https://a.com

+

a http://a.com

+

a www.a.com

+

a a@b.c

+"###, + "should match autolink literals combined w/ links like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"# “character reference” + +www.a&b (space) + +www.a&b! + +www.a&b" + +www.a&b# + +www.a&b$ + +www.a&b% + +www.a&b& + +www.a&b' + +www.a&b( + +www.a&b) + +www.a&b* + +www.a&b+ + +www.a&b, + +www.a&b- + +www.a&b + +www.a&b. + +www.a&b/ + +www.a&b: + +www.a&b; + +www.a&b< + +www.a&b= + +www.a&b> + +www.a&b? + +www.a&b@ + +www.a&b[ + +www.a&b\ + +www.a&b] + +www.a&b^ + +www.a&b_ + +www.a&b` + +www.a&b{ + +www.a&b| + +www.a&b} + +www.a&b~ +"###, + &gfm + ), + r###"

“character reference”

+

www.a&b (space)

+

www.a&b!

+

www.a&b"

+

www.a&b#

+

www.a&b$

+

www.a&b%

+

www.a&b&

+

www.a&b'

+

www.a&b(

+

www.a&b)

+

www.a&b*

+

www.a&b+

+

www.a&b,

+

www.a&b-

+

www.a&b

+

www.a&b.

+

www.a&b/

+

www.a&b:

+

www.a&b;

+

www.a&b<

+

www.a&b=

+

www.a&b>

+

www.a&b?

+

www.a&b@

+

www.a&b[

+

www.a&b\

+

www.a&b]

+

www.a&b^

+

www.a&b_

+

www.a&b`

+

www.a&b{

+

www.a&b|

+

www.a&b}

+

www.a&b~

+"###, + "should match “character references (named)” like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options(r###"# “character reference” + +www.a# (space) + +www.a#! + +www.a#" + +www.a## + +www.a#$ + +www.a#% + +www.a#& + +www.a#' + +www.a#( + +www.a#) + +www.a#* + +www.a#+ + +www.a#, + +www.a#- + +www.a# + +www.a#. + +www.a#/ + +www.a#: + +www.a# + +www.a#< + +www.a#= + +www.a#> + +www.a#? + +www.a#@ + +www.a#[ + +www.a#\ + +www.a#] + +www.a#^ + +www.a#_ + +www.a#` + +www.a#{ + +www.a#| + +www.a#} + +www.a#~ +"###, &gfm), + r###"

“character reference”

+

www.a&#35 (space)

+

www.a&#35!

+

www.a&#35"

+

www.a&#35#

+

www.a&#35$

+

www.a&#35%

+

www.a&#35&

+

www.a&#35'

+

www.a&#35(

+

www.a&#35)

+

www.a&#35*

+

www.a&#35+

+

www.a&#35,

+

www.a&#35-

+

www.a&#35

+

www.a&#35.

+

www.a&#35/

+

www.a&#35:

+

www.a&#35;

+

www.a&#35<

+

www.a&#35=

+

www.a&#35>

+

www.a&#35?

+

www.a&#35@

+

www.a&#35[

+

www.a&#35\

+

www.a&#35]

+

www.a&#35^

+

www.a&#35_

+

www.a&#35`

+

www.a&#35{

+

www.a&#35|

+

www.a&#35}

+

www.a&#35~

+"###, + "should match “character references (numeric)” like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"a@0.0 + +a@0.b + +a@a.29 + +a@a.b + +a@0.0.c + +react@0.11.1 + +react@0.12.0-rc1 + +react@0.14.0-alpha1 + +react@16.7.0-alpha.2 + +react@0.0.0-experimental-aae83a4b9 + +[ react@0.11.1 + +[ react@0.12.0-rc1 + +[ react@0.14.0-alpha1 + +[ react@16.7.0-alpha.2 + +[ react@0.0.0-experimental-aae83a4b9 +"###, + &gfm + ), + r###"

a@0.0

+

a@0.b

+

a@a.29

+

a@a.b

+

a@0.0.c

+

react@0.11.1

+

react@0.12.0-rc1

+

react@0.14.0-alpha1

+

react@16.7.0-alpha.2

+

react@0.0.0-experimental-aae83a4b9

+

[ react@0.11.1

+

[ react@0.12.0-rc1

+

[ react@0.14.0-alpha1

+

[ react@16.7.0-alpha.2

+

[ react@0.0.0-experimental-aae83a4b9

+"###, + "should match email TLD digits like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (2) + +http://a (space) + +http://a! + +http://a" + +http://a# + +http://a$ + +http://a% + +http://a& + +http://a' + +http://a( + +http://a) + +http://a* + +http://a+ + +http://a, + +http://a- + +http://a + +http://a. + +http://a/ + +http://a: + +http://a; + +http://a< + +http://a= + +http://a> + +http://a? + +http://a@ + +http://a[ + +http://a\ + +http://a] + +http://a^ + +http://a_ + +http://a` + +http://a{ + +http://a| + +http://a} + +http://a~ +"###, + &gfm + ), + r###"

httpshhh? (2)

+

http://a (space)

+

http://a!

+

http://a"

+

http://a#

+

http://a$

+

http://a%

+

http://a&

+

http://a'

+

http://a(

+

http://a)

+

http://a*

+

http://a+

+

http://a,

+

http://a-

+

http://a

+

http://a.

+

http://a/

+

http://a:

+

http://a;

+

http://a<

+

http://a=

+

http://a>

+

http://a?

+

http://a@

+

http://a[

+

http://a\

+

http://a]

+

http://a^

+

http://a_

+

http://a`

+

http://a{

+

http://a|

+

http://a}

+

http://a~

+"###, + "should match protocol domain continue like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (1) + +http:// (space) + +http://! + +http://" + +http://# + +http://$ + +http://% + +http://& + +http://' + +http://( + +http://) + +http://* + +http://+ + +http://, + +http://- + +http:// + +http://. + +http:/// + +http://: + +http://; + +http://< + +http://= + +http://> + +http://? + +http://@ + +http://[ + +http://\ + +http://] + +http://^ + +http://_ + +http://` + +http://{ + +http://| + +http://} + +http://~ +"###, + &gfm + ), + r###"

httpshhh? (1)

+

http:// (space)

+

http://!

+

http://"

+

http://#

+

http://$

+

http://%

+

http://&

+

http://'

+

http://(

+

http://)

+

http://*

+

http://+

+

http://,

+

http://-

+

http://

+

http://.

+

http:///

+

http://:

+

http://;

+

http://<

+

http://=

+

http://>

+

http://?

+

http://@

+

http://[

+

http://\

+

http://]

+

http://^

+

http://_

+

http://`

+

http://{

+

http://|

+

http://}

+

http://~

+"###, + "should match protocol domain start like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (4) + +http://a/b (space) + +http://a/b! + +http://a/b" + +http://a/b# + +http://a/b$ + +http://a/b% + +http://a/b& + +http://a/b' + +http://a/b( + +http://a/b) + +http://a/b* + +http://a/b+ + +http://a/b, + +http://a/b- + +http://a/b + +http://a/b. + +http://a/b/ + +http://a/b: + +http://a/b; + +http://a/b< + +http://a/b= + +http://a/b> + +http://a/b? + +http://a/b@ + +http://a/b[ + +http://a/b\ + +http://a/b] + +http://a/b^ + +http://a/b_ + +http://a/b` + +http://a/b{ + +http://a/b| + +http://a/b} + +http://a/b~ +"###, + &gfm + ), + r###"

httpshhh? (4)

+

http://a/b (space)

+

http://a/b!

+

http://a/b"

+

http://a/b#

+

http://a/b$

+

http://a/b%

+

http://a/b&

+

http://a/b'

+

http://a/b(

+

http://a/b)

+

http://a/b*

+

http://a/b+

+

http://a/b,

+

http://a/b-

+

http://a/b

+

http://a/b.

+

http://a/b/

+

http://a/b:

+

http://a/b;

+

http://a/b<

+

http://a/b=

+

http://a/b>

+

http://a/b?

+

http://a/b@

+

http://a/b[

+

http://a/b\

+

http://a/b]

+

http://a/b^

+

http://a/b_

+

http://a/b`

+

http://a/b{

+

http://a/b|

+

http://a/b}

+

http://a/b~

+"###, + "should match protocol path continue like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (3) + +http://a/ (space) + +http://a/! + +http://a/" + +http://a/# + +http://a/$ + +http://a/% + +http://a/& + +http://a/' + +http://a/( + +http://a/) + +http://a/* + +http://a/+ + +http://a/, + +http://a/- + +http://a/ + +http://a/. + +http://a// + +http://a/: + +http://a/; + +http://a/< + +http://a/= + +http://a/> + +http://a/? + +http://a/@ + +http://a/[ + +http://a/\ + +http://a/] + +http://a/^ + +http://a/_ + +http://a/` + +http://a/{ + +http://a/| + +http://a/} + +http://a/~ +"###, + &gfm + ), + r###"

httpshhh? (3)

+

http://a/ (space)

+

http://a/!

+

http://a/"

+

http://a/#

+

http://a/$

+

http://a/%

+

http://a/&

+

http://a/'

+

http://a/(

+

http://a/)

+

http://a/*

+

http://a/+

+

http://a/,

+

http://a/-

+

http://a/

+

http://a/.

+

http://a//

+

http://a/:

+

http://a/;

+

http://a/<

+

http://a/=

+

http://a/>

+

http://a/?

+

http://a/@

+

http://a/[

+

http://a/\

+

http://a/]

+

http://a/^

+

http://a/_

+

http://a/`

+

http://a/{

+

http://a/|

+

http://a/}

+

http://a/~

+"###, + "should match protocol path start like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"[www.example.com/a©](#) + +www.example.com/a© + +[www.example.com/a&bogus;](#) + +www.example.com/a&bogus; + +[www.example.com/a\.](#) + +www.example.com/a\. +"###, + &gfm + ), + r###"

www.example.com/a©

+

www.example.com/a©

+

www.example.com/a&bogus;

+

www.example.com/a&bogus;

+

www.example.com/a\.

+

www.example.com/a\.

+"###, + "should match links, autolink literals, and characters like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# “character reference” + +www.a/b&c (space) + +www.a/b&c! + +www.a/b&c" + +www.a/b&c# + +www.a/b&c$ + +www.a/b&c% + +www.a/b&c& + +www.a/b&c' + +www.a/b&c( + +www.a/b&c) + +www.a/b&c* + +www.a/b&c+ + +www.a/b&c, + +www.a/b&c- + +www.a/b&c + +www.a/b&c. + +www.a/b&c/ + +www.a/b&c: + +www.a/b&c; + +www.a/b&c< + +www.a/b&c= + +www.a/b&c> + +www.a/b&c? + +www.a/b&c@ + +www.a/b&c[ + +www.a/b&c\ + +www.a/b&c] + +www.a/b&c^ + +www.a/b&c_ + +www.a/b&c` + +www.a/b&c{ + +www.a/b&c| + +www.a/b&c} + +www.a/b&c~ +"###, + &gfm + ), + r###"

“character reference”

+

www.a/b&c (space)

+

www.a/b&c!

+

www.a/b&c"

+

www.a/b&c#

+

www.a/b&c$

+

www.a/b&c%

+

www.a/b&c&

+

www.a/b&c'

+

www.a/b&c(

+

www.a/b&c)

+

www.a/b&c*

+

www.a/b&c+

+

www.a/b&c,

+

www.a/b&c-

+

www.a/b&c

+

www.a/b&c.

+

www.a/b&c/

+

www.a/b&c:

+

www.a/b&c;

+

www.a/b&c<

+

www.a/b&c=

+

www.a/b&c>

+

www.a/b&c?

+

www.a/b&c@

+

www.a/b&c[

+

www.a/b&c\

+

www.a/b&c]

+

www.a/b&c^

+

www.a/b&c_

+

www.a/b&c`

+

www.a/b&c{

+

www.a/b&c|

+

www.a/b&c}

+

www.a/b&c~

+"###, + "should match character reference-like (named) things in paths like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# “character reference” + +www.a/b# (space) + +www.a/b#! + +www.a/b#" + +www.a/b## + +www.a/b#$ + +www.a/b#% + +www.a/b#& + +www.a/b#' + +www.a/b#( + +www.a/b#) + +www.a/b#* + +www.a/b#+ + +www.a/b#, + +www.a/b#- + +www.a/b# + +www.a/b#. + +www.a/b#/ + +www.a/b#: + +www.a/b# + +www.a/b#< + +www.a/b#= + +www.a/b#> + +www.a/b#? + +www.a/b#@ + +www.a/b#[ + +www.a/b#\ + +www.a/b#] + +www.a/b#^ + +www.a/b#_ + +www.a/b#` + +www.a/b#{ + +www.a/b#| + +www.a/b#} + +www.a/b#~ +"###, + &gfm + ), + r###"

“character reference”

+

www.a/b&#35 (space)

+

www.a/b&#35!

+

www.a/b&#35"

+

www.a/b&#35#

+

www.a/b&#35$

+

www.a/b&#35%

+

www.a/b&#35&

+

www.a/b&#35'

+

www.a/b&#35(

+

www.a/b&#35)

+

www.a/b&#35*

+

www.a/b&#35+

+

www.a/b&#35,

+

www.a/b&#35-

+

www.a/b&#35

+

www.a/b&#35.

+

www.a/b&#35/

+

www.a/b&#35:

+

www.a/b&#35;

+

www.a/b&#35<

+

www.a/b&#35=

+

www.a/b&#35>

+

www.a/b&#35?

+

www.a/b&#35@

+

www.a/b&#35[

+

www.a/b&#35\

+

www.a/b&#35]

+

www.a/b&#35^

+

www.a/b&#35_

+

www.a/b&#35`

+

www.a/b&#35{

+

www.a/b&#35|

+

www.a/b&#35}

+

www.a/b&#35~

+"###, + "should match character reference-like (numeric) things in paths like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"In autolink literal path or link end? + +[https://a.com/d]() + +[http://a.com/d]() + +[www.a.com/d]() + +https://a.com/d]() + +http://a.com/d]() + +www.a.com/d]() + +In autolink literal search or link end? + +[https://a.com?d]() + +[http://a.com?d]() + +[www.a.com?d]() + +https://a.com?d]() + +http://a.com?d]() + +www.a.com?d]() + +In autolink literal hash or link end? + +[https://a.com#d]() + +[http://a.com#d]() + +[www.a.com#d]() + +https://a.com#d]() + +http://a.com#d]() + +www.a.com#d]() +"###, + &gfm + ), + r###"

In autolink literal path or link end?

+

https://a.com/d

+

http://a.com/d

+

www.a.com/d

+

https://a.com/d]()

+

http://a.com/d]()

+

www.a.com/d]()

+

In autolink literal search or link end?

+

https://a.com?d

+

http://a.com?d

+

www.a.com?d

+

https://a.com?d]()

+

http://a.com?d]()

+

www.a.com?d]()

+

In autolink literal hash or link end?

+

https://a.com#d

+

http://a.com#d

+

www.a.com#d

+

https://a.com#d]()

+

http://a.com#d]()

+

www.a.com#d]()

+"###, + "should match path or link end like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"Last non-markdown ASCII whitespace (FF): noreply@example.com, http://example.com, https://example.com, www.example.com + +Last non-whitespace ASCII control (US): noreply@example.com, http://example.com, https://example.com, www.example.com + +First punctuation after controls: !noreply@example.com, !http://example.com, !https://example.com, !www.example.com + +Last punctuation before digits: /noreply@example.com, /http://example.com, /https://example.com, /www.example.com + +First digit: 0noreply@example.com, 0http://example.com, 0https://example.com, 0www.example.com + +First punctuation after digits: :noreply@example.com, :http://example.com, :https://example.com, :www.example.com + +Last punctuation before caps: @noreply@example.com, @http://example.com, @https://example.com, @www.example.com + +First uppercase: Anoreply@example.com, Ahttp://example.com, Ahttps://example.com, Awww.example.com + +Punctuation after uppercase: \noreply@example.com, \http://example.com, \https://example.com, \www.example.com + +Last punctuation before lowercase (1): `noreply@example.com; + +(2) `http://example.com; + +(3) `https://example.com; + +(4) `www.example.com; (broken up to prevent code from forming) + +First lowercase: anoreply@example.com, ahttp://example.com, ahttps://example.com, awww.example.com + +First punctuation after lowercase: {noreply@example.com, {http://example.com, {https://example.com, {www.example.com + +Last punctuation: ~noreply@example.com, ~http://example.com, ~https://example.com, ~www.example.com + +First non-ASCII unicode whitespace (0x80): …noreply@example.com, …http://example.com, …https://example.com, …www.example.com + +Last non-ASCII unicode whitespace (0x3000):  noreply@example.com,  http://example.com,  https://example.com,  www.example.com + +First non-ASCII punctuation: ¡noreply@example.com, 
¡http://example.com, ¡https://example.com, ¡www.example.com + +Last non-ASCII punctuation: ・noreply@example.com, ・http://example.com, ・https://example.com, ・www.example.com + +Some non-ascii: 中noreply@example.com, 中http://example.com, 中https://example.com, 中www.example.com + +Some more non-ascii: 🤷‍noreply@example.com, 🤷‍http://example.com, 🤷‍https://example.com, 🤷‍www.example.com +"###, + &gfm + ), + r###"

Last non-markdown ASCII whitespace (FF): noreply@example.com, http://example.com, https://example.com, www.example.com

+

Last non-whitespace ASCII control (US): noreply@example.com, http://example.com, https://example.com, www.example.com

+

First punctuation after controls: !noreply@example.com, !http://example.com, !https://example.com, !www.example.com

+

Last punctuation before digits: /noreply@example.com, /http://example.com, /https://example.com, /www.example.com

+

First digit: 0noreply@example.com, 0http://example.com, 0https://example.com, 0www.example.com

+

First punctuation after digits: :noreply@example.com, :http://example.com, :https://example.com, :www.example.com

+

Last punctuation before caps: @noreply@example.com, @http://example.com, @https://example.com, @www.example.com

+

First uppercase: Anoreply@example.com, Ahttp://example.com, Ahttps://example.com, Awww.example.com

+

Punctuation after uppercase: \noreply@example.com, \http://example.com, \https://example.com, \www.example.com

+

Last punctuation before lowercase (1): `noreply@example.com;

+

(2) `http://example.com;

+

(3) `https://example.com;

+

(4) `www.example.com; (broken up to prevent code from forming)

+

First lowercase: anoreply@example.com, ahttp://example.com, ahttps://example.com, awww.example.com

+

First punctuation after lowercase: {noreply@example.com, {http://example.com, {https://example.com, {www.example.com

+

Last punctuation: ~noreply@example.com, ~http://example.com, ~https://example.com, ~www.example.com

+

First non-ASCII unicode whitespace (0x80): Â…noreply@example.com, Â…http://example.com, Â…https://example.com, Â…www.example.com

+

Last non-ASCII unicode whitespace (0x3000):  noreply@example.com,  http://example.com,  https://example.com,  www.example.com

+

First non-ASCII punctuation: ¡noreply@example.com, ¡http://example.com, ¡https://example.com, ¡www.example.com

+

Last non-ASCII punctuation: ・noreply@example.com, ・http://example.com, ・https://example.com, ・www.example.com

+

Some non-ascii: 中noreply@example.com, 中http://example.com, 中https://example.com, 中www.example.com

+

Some more non-ascii: 🤷‍noreply@example.com, 🤷‍http://example.com, 🤷‍https://example.com, 🤷‍www.example.com

+"###, + "should match previous (complex) like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# HTTP + +https://a.b can start after EOF + +Can start after EOL: +https://a.b + +Can start after tab: https://a.b. + +Can start after space: https://a.b. + +Can start after left paren (https://a.b. + +Can start after asterisk *https://a.b. + +Can start after underscore *_https://a.b. + +Can start after tilde ~https://a.b. + +# www + +www.a.b can start after EOF + +Can start after EOL: +www.a.b + +Can start after tab: www.a.b. + +Can start after space: www.a.b. + +Can start after left paren (www.a.b. + +Can start after asterisk *www.a.b. + +Can start after underscore *_www.a.b. + +Can start after tilde ~www.a.b. + +# Email + +## Correct character before + +a@b.c can start after EOF + +Can start after EOL: +a@b.c + +Can start after tab: a@b.c. + +Can start after space: a@b.c. + +Can start after left paren(a@b.c. + +Can start after asterisk*a@b.c. + +While theoretically it’s possible to start at an underscore, that underscore +is part of the email, so it’s in fact part of the link: _a@b.c. + +Can start after tilde~a@b.c. + +## Others characters before + +While other characters before the email aren’t allowed by GFM, they work on +github.com: !a@b.c, "a@b.c, #a@b.c, $a@b.c, &a@b.c, 'a@b.c, )a@b.c, +a@b.c, +,a@b.c, -a@b.c, .a@b.c, /a@b.c, :a@b.c, ;a@b.c, a@b.c, ?a@b.c, +@a@b.c, \a@b.c, ]a@b.c, ^a@b.c, `a@b.c, {a@b.c, }a@b.c. + +## Commas + +See `https://github.com/remarkjs/remark/discussions/678`. + +,https://github.com + +[ ,https://github.com + +[asd] ,https://github.com +"###, + &gfm + ), + r###"

HTTP

+

https://a.b can start after EOF

+

Can start after EOL: +https://a.b

+

Can start after tab: https://a.b.

+

Can start after space: https://a.b.

+

Can start after left paren (https://a.b.

+

Can start after asterisk *https://a.b.

+

Can start after underscore *_https://a.b.

+

Can start after tilde ~https://a.b.

+

www

+

www.a.b can start after EOF

+

Can start after EOL: +www.a.b

+

Can start after tab: www.a.b.

+

Can start after space: www.a.b.

+

Can start after left paren (www.a.b.

+

Can start after asterisk *www.a.b.

+

Can start after underscore *_www.a.b.

+

Can start after tilde ~www.a.b.

+

Email

+

Correct character before

+

a@b.c can start after EOF

+

Can start after EOL: +a@b.c

+

Can start after tab: a@b.c.

+

Can start after space: a@b.c.

+

Can start after left paren(a@b.c.

+

Can start after asterisk*a@b.c.

+

While theoretically it’s possible to start at an underscore, that underscore +is part of the email, so it’s in fact part of the link: _a@b.c.

+

Can start after tilde~a@b.c.

+

Others characters before

+

While other characters before the email aren’t allowed by GFM, they work on +github.com: !a@b.c, "a@b.c, #a@b.c, $a@b.c, &a@b.c, 'a@b.c, )a@b.c, +a@b.c, +,a@b.c, -a@b.c, .a@b.c, /a@b.c, :a@b.c, ;a@b.c, <a@b.c, =a@b.c, >a@b.c, ?a@b.c, +@a@b.c, \a@b.c, ]a@b.c, ^a@b.c, `a@b.c, {a@b.c, }a@b.c.

+

Commas

+

See https://github.com/remarkjs/remark/discussions/678.

+

,https://github.com

+

[ ,https://github.com

+

[asd] ,https://github.com

+"###, + "should match previous like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf 2? + +www.a (space) + +www.a! + +www.a" + +www.a# + +www.a$ + +www.a% + +www.a& + +www.a' + +www.a( + +www.a) + +www.a* + +www.a+ + +www.a, + +www.a- + +www.a + +www.a. + +www.a/ + +www.a: + +www.a; + +www.a< + +www.a= + +www.a> + +www.a? + +www.a@ + +www.a[ + +www.a\ + +www.a] + +www.a^ + +www.a_ + +www.a` + +www.a{ + +www.a| + +www.a} + +www.a~ +"###, + &gfm + ), + r###"

wwwtf 2?

+

www.a (space)

+

www.a!

+

www.a"

+

www.a#

+

www.a$

+

www.a%

+

www.a&

+

www.a'

+

www.a(

+

www.a)

+

www.a*

+

www.a+

+

www.a,

+

www.a-

+

www.a

+

www.a.

+

www.a/

+

www.a:

+

www.a;

+

www.a<

+

www.a=

+

www.a>

+

www.a?

+

www.a@

+

www.a[

+

www.a\

+

www.a]

+

www.a^

+

www.a_

+

www.a`

+

www.a{

+

www.a|

+

www.a}

+

www.a~

+"###, + "should match www (domain continue) like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf 5? + +www.a. (space) + +www.a.! + +www.a." + +www.a.# + +www.a.$ + +www.a.% + +www.a.& + +www.a.' + +www.a.( + +www.a.) + +www.a.* + +www.a.+ + +www.a., + +www.a.- + +www.a. + +www.a.. + +www.a./ + +www.a.: + +www.a.; + +www.a.< + +www.a.= + +www.a.> + +www.a.? + +www.a.@ + +www.a.[ + +www.a.\ + +www.a.] + +www.a.^ + +www.a._ + +www.a.` + +www.a.{ + +www.a.| + +www.a.} + +www.a.~ +"###, + &gfm + ), + r###"

wwwtf 5?

+

www.a. (space)

+

www.a.!

+

www.a."

+

www.a.#

+

www.a.$

+

www.a.%

+

www.a.&

+

www.a.'

+

www.a.(

+

www.a.)

+

www.a.*

+

www.a.+

+

www.a.,

+

www.a.-

+

www.a.

+

www.a..

+

www.a./

+

www.a.:

+

www.a.;

+

www.a.<

+

www.a.=

+

www.a.>

+

www.a.?

+

www.a.@

+

www.a.[

+

www.a.\

+

www.a.]

+

www.a.^

+

www.a._

+

www.a.`

+

www.a.{

+

www.a.|

+

www.a.}

+

www.a.~

+"###, + "should match www (domain dot) like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf? + +www. (space) + +www.! + +www." + +www.# + +www.$ + +www.% + +www.& + +www.' + +www.( + +www.) + +www.* + +www.+ + +www., + +www.- + +www. + +www.. + +www./ + +www.: + +www.; + +www.< + +www.= + +www.> + +www.? + +www.@ + +www.[ + +www.\ + +www.] + +www.^ + +www._ + +www.` + +www.{ + +www.| + +www.} + +www.~ +"###, + &gfm + ), + r###"

wwwtf?

+

www. (space)

+

www.!

+

www."

+

www.#

+

www.$

+

www.%

+

www.&

+

www.'

+

www.(

+

www.)

+

www.*

+

www.+

+

www.,

+

www.-

+

www.

+

www..

+

www./

+

www.:

+

www.;

+

www.<

+

www.=

+

www.>

+

www.?

+

www.@

+

www.[

+

www.\

+

www.]

+

www.^

+

www._

+

www.`

+

www.{

+

www.|

+

www.}

+

www.~

+"###, + "should match www (domain start) like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf? (4) + +www.a/b (space) + +www.a/b! + +www.a/b" + +www.a/b# + +www.a/b$ + +www.a/b% + +www.a/b& + +www.a/b' + +www.a/b( + +www.a/b) + +www.a/b* + +www.a/b+ + +www.a/b, + +www.a/b- + +www.a/b + +www.a/b. + +www.a/b/ + +www.a/b: + +www.a/b; + +www.a/b< + +www.a/b= + +www.a/b> + +www.a/b? + +www.a/b@ + +www.a/b[ + +www.a/b\ + +www.a/b] + +www.a/b^ + +www.a/b_ + +www.a/b` + +www.a/b{ + +www.a/b| + +www.a/b} + +www.a/b~ +"###, + &gfm + ), + r###"

wwwtf? (4)

+

www.a/b (space)

+

www.a/b!

+

www.a/b"

+

www.a/b#

+

www.a/b$

+

www.a/b%

+

www.a/b&

+

www.a/b'

+

www.a/b(

+

www.a/b)

+

www.a/b*

+

www.a/b+

+

www.a/b,

+

www.a/b-

+

www.a/b

+

www.a/b.

+

www.a/b/

+

www.a/b:

+

www.a/b;

+

www.a/b<

+

www.a/b=

+

www.a/b>

+

www.a/b?

+

www.a/b@

+

www.a/b[

+

www.a/b\

+

www.a/b]

+

www.a/b^

+

www.a/b_

+

www.a/b`

+

www.a/b{

+

www.a/b|

+

www.a/b}

+

www.a/b~

+"###, + "should match www (path continue) like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf? (3) + +www.a/ (space) + +www.a/! + +www.a/" + +www.a/# + +www.a/$ + +www.a/% + +www.a/& + +www.a/' + +www.a/( + +www.a/) + +www.a/* + +www.a/+ + +www.a/, + +www.a/- + +www.a/ + +www.a/. + +www.a// + +www.a/: + +www.a/; + +www.a/< + +www.a/= + +www.a/> + +www.a/? + +www.a/@ + +www.a/[ + +www.a/\ + +www.a/] + +www.a/^ + +www.a/_ + +www.a/` + +www.a/{ + +www.a/| + +www.a/} + +www.a/~ +"###, + &gfm + ), + r###"

wwwtf? (3)

+

www.a/ (space)

+

www.a/!

+

www.a/"

+

www.a/#

+

www.a/$

+

www.a/%

+

www.a/&

+

www.a/'

+

www.a/(

+

www.a/)

+

www.a/*

+

www.a/+

+

www.a/,

+

www.a/-

+

www.a/

+

www.a/.

+

www.a//

+

www.a/:

+

www.a/;

+

www.a/<

+

www.a/=

+

www.a/>

+

www.a/?

+

www.a/@

+

www.a/[

+

www.a/\

+

www.a/]

+

www.a/^

+

www.a/_

+

www.a/`

+

www.a/{

+

www.a/|

+

www.a/}

+

www.a/~

+"###, + "should match www (path start) like GitHub does (except for the bracket bug)" ); } -- cgit