diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-09-05 15:03:24 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-09-05 15:03:24 +0200 |
commit | 3d00bf57a225369120fd98bee36f65a541260da1 (patch) | |
tree | 65780bdbc880f06ba3c92d8c5dbddbdd00ccb92e | |
parent | 16de10fe2395002644d685fdfcf76823346d1cc4 (diff) | |
download | markdown-rs-3d00bf57a225369120fd98bee36f65a541260da1.tar.gz markdown-rs-3d00bf57a225369120fd98bee36f65a541260da1.tar.bz2 markdown-rs-3d00bf57a225369120fd98bee36f65a541260da1.zip |
Fix to implement GFM autolink literals exactly
-rw-r--r-- | src/compiler.rs | 37 | ||||
-rw-r--r-- | src/construct/gfm_autolink_literal.rs | 848 | ||||
-rw-r--r-- | src/construct/gfm_table.rs | 2 | ||||
-rw-r--r-- | src/construct/text.rs | 20 | ||||
-rw-r--r-- | src/state.rs | 60 | ||||
-rw-r--r-- | tests/gfm_autolink_literal.rs | 2442 |
6 files changed, 3245 insertions, 164 deletions
diff --git a/src/compiler.rs b/src/compiler.rs index 681ec00..0ea1638 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -871,6 +871,7 @@ fn on_exit_autolink_email(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + false, ); } @@ -884,6 +885,7 @@ fn on_exit_autolink_protocol(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + false, ); } @@ -1154,6 +1156,7 @@ fn on_exit_gfm_autolink_literal_protocol(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + true, ); } @@ -1167,12 +1170,22 @@ fn on_exit_gfm_autolink_literal_www(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .as_str(), + true, ); } /// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail]. fn on_exit_gfm_autolink_literal_email(context: &mut CompileContext) { - on_exit_autolink_email(context); + generate_autolink( + context, + Some("mailto:"), + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + true, + ); } /// Handle [`Exit`][Kind::Exit]:[`GfmFootnoteCall`][Name::GfmFootnoteCall]. @@ -1822,8 +1835,24 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) { } /// Generate an autolink (used by unicode autolinks and GFM autolink literals). 
-fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) { - if !context.image_alt_inside { +fn generate_autolink( + context: &mut CompileContext, + protocol: Option<&str>, + value: &str, + is_gfm_literal: bool, +) { + let mut is_in_link = false; + let mut index = 0; + + while index < context.media_stack.len() { + if !context.media_stack[index].image { + is_in_link = true; + break; + } + index += 1; + } + + if !context.image_alt_inside && (!is_in_link || !is_gfm_literal) { context.push("<a href=\""); let url = if let Some(protocol) = protocol { format!("{}{}", protocol, value) @@ -1843,7 +1872,7 @@ fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value context.push(&encode(value, context.encode_html)); - if !context.image_alt_inside { + if !context.image_alt_inside && (!is_in_link || !is_gfm_literal) { context.push("</a>"); } } diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index 704c536..038330c 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -1,14 +1,621 @@ -//! To do. +//! GFM: autolink literal occurs in the [text][] content type. +//! +//! ## Grammar +//! +//! Autolink literals form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>): +//! +//! ```bnf +//! gfm_autolink_literal ::= gfm_protocol_autolink | gfm_www_autolink | gfm_email_autolink +//! +//! ; Restriction: the code before must be `www_autolink_before`. +//! ; Restriction: the code after `.` must not be eof. +//! www_autolink ::= 3('w' | 'W') '.' [domain [path]] +//! www_autolink_before ::= eof | eol | space_or_tab | '(' | '*' | '_' | '[' | ']' | '~' +//! +//! ; Restriction: the code before must be `http_autolink_before`. +//! ; Restriction: the code after the protocol must be `http_autolink_protocol_after`. +//! http_autolink ::= ('h' | 'H') 2('t' | 'T') ('p' | 'P') ['s' | 'S'] ':' 2'/' domain [path] +//! 
http_autolink_before ::= byte - ascii_alpha +//! http_autolink_protocol_after ::= byte - eof - eol - ascii_control - unicode_whitespace - unicode_punctuation +//! +//! ; Restriction: the code before must be `email_autolink_before`. +//! ; Restriction: `ascii_digit` may not occur in the last label part of the label. +//! email_autolink ::= 1*('+' | '-' | '.' | '_' | ascii_alphanumeric) '@' 1*(1*label_segment label_dot_cont) 1*label_segment +//! email_autolink_before ::= byte - ascii_alpha - '/' +//! +//! ; Restriction: `_` may not occur in the last two domain parts. +//! domain ::= 1*(url_ampt_cont | domain_punct_cont | '-' | byte - eof - ascii_control - unicode_whitespace - unicode_punctuation) +//! ; Restriction: must not be followed by `punct`. +//! domain_punct_cont ::= '.' | '_' +//! ; Restriction: must not be followed by `char-ref`. +//! url_ampt_cont ::= '&' +//! +//! ; Restriction: a counter `balance = 0` is increased for every `(`, and decreased for every `)`. +//! ; Restriction: `)` must not be `paren_at_end`. +//! path ::= 1*(url_ampt_cont | path_punctuation_cont | '(' | ')' | byte - eof - eol - space_or_tab) +//! ; Restriction: must not be followed by `punct`. +//! path_punctuation_cont ::= trailing_punctuation - '<' +//! ; Restriction: must be followed by `punct` and `balance` must be less than `0`. +//! paren_at_end ::= ')' +//! +//! label_segment ::= label_dash_underscore_cont | ascii_alpha | ascii_digit +//! ; Restriction: if followed by `punct`, the whole email autolink is invalid. +//! label_dash_underscore_cont ::= '-' | '_' +//! ; Restriction: must not be followed by `punct`. +//! label_dot_cont ::= '.' +//! +//! punct ::= *trailing_punctuation ( byte - eof - eol - space_or_tab - '<' ) +//! char_ref ::= *ascii_alpha ';' path_end +//! trailing_punctuation ::= '!' | '"' | '\'' | ')' | '*' | ',' | '.' | ':' | ';' | '<' | '?' | '_' | '~' +//! ``` +//! +//! The grammar for GFM autolink literal is very relaxed: basically anything +//! 
except for whitespace is allowed after a prefix. +//! To use whitespace characters and otherwise impossible characters, in URLs, +//! you can use percent encoding: +//! +//! ```markdown +//! https://example.com/alpha%20bravo +//! ``` +//! +//! Yields: +//! +//! ```html +//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p> +//! ``` +//! +//! There are several cases where incorrect encoding of URLs would, in other +//! languages, result in a parse error. +//! In markdown, there are no errors, and URLs are normalized. +//! In addition, many characters are percent encoded +//! ([`sanitize_uri`][sanitize_uri]). +//! For example: +//! +//! ```markdown +//! www.a👍b% +//! ``` +//! +//! Yields: +//! +//! ```html +//! <p><a href="http://www.a%F0%9F%91%8Db%25">www.a👍b%</a></p> +//! ``` +//! +//! There is a big difference between how www and protocol literals work +//! compared to how email literals work. +//! The first two are done when parsing, and work like anything else in +//! markdown. +//! But email literals are handled afterwards: when everything is parsed, we +//! look back at the events to figure out if there were email addresses. +//! This particularly affects how they interleave with character escapes and +//! character references. +//! +//! ## HTML +//! +//! GFM autolink literals relate to the `<a>` element in HTML. +//! See [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info. +//! When an email autolink is used, the string `mailto:` is prepended when +//! generating the `href` attribute of the hyperlink. +//! When a www autolink is used, the string `http:` is prepended. +//! +//! ## Recommendation +//! +//! It is recommended to use labels ([label start link][label_start_link], +//! [label end][label_end]), either with a resource or a definition +//! ([definition][]), instead of autolink literals, as those allow relative +//! URLs and descriptive text to explain the URL in prose. +//! +//! ## Bugs +//! 
+//! GitHub’s own algorithm to parse autolink literals contains three bugs. +//! A smaller bug is left unfixed in this project for consistency. +//! Two main bugs are not present in this project. +//! The issues relating to autolink literals are: +//! +//! * [GFM autolink extension (`www.`, `https?://` parts): links don’t work when after bracket](https://github.com/github/cmark-gfm/issues/278)\ +//! fixed here ✅ +//! * [GFM autolink extension (`www.` part): uppercase does not match on issues/PRs/comments](https://github.com/github/cmark-gfm/issues/280)\ +//! fixed here ✅ +//! * [GFM autolink extension (`www.` part): the word `www` matches](https://github.com/github/cmark-gfm/issues/279)\ +//! present here for consistency +//! +//! ## Tokens +//! +//! * [`GfmAutolinkLiteralProtocol`][Name::GfmAutolinkLiteralProtocol] +//! * [`GfmAutolinkLiteralWww`][Name::GfmAutolinkLiteralWww] +//! * [`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail] +//! +//! ## References +//! +//! * [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal) +//! * [*§ 6.9 Autolinks (extension)* in `GFM`](https://github.github.com/gfm/#autolinks-extension-) +//! +//! [text]: crate::construct::text +//! [definition]: crate::construct::definition +//! [attention]: crate::construct::attention +//! [label_start_link]: crate::construct::label_start_link +//! [label_end]: crate::construct::label_end +//! [sanitize_uri]: crate::util::sanitize_uri +//! 
[html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element use crate::event::{Event, Kind, Name}; +use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::util::classify_character::{classify, Kind as CharacterKind}; -use crate::util::slice::{Position, Slice}; +use crate::util::{ + classify_character::{classify_opt, Kind as CharacterKind}, + slice::{char_after_index, Position, Slice}, +}; use alloc::vec::Vec; -use core::str; -// To do: doc al functions. +/// Start of protocol autolink literal. +/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^ +/// ``` +pub fn protocol_start(tokenizer: &mut Tokenizer) -> State { + if tokenizer + .parse_state + .options + .constructs + .gfm_autolink_literal && + matches!(tokenizer.current, Some(b'H' | b'h')) + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L214>. + && !matches!(tokenizer.previous, Some(b'A'..=b'Z' | b'a'..=b'z')) + { + tokenizer.enter(Name::GfmAutolinkLiteralProtocol); + tokenizer.attempt( + State::Next(StateName::GfmAutolinkLiteralProtocolAfter), + State::Nok, + ); + tokenizer.attempt( + State::Next(StateName::GfmAutolinkLiteralDomainInside), + State::Nok, + ); + tokenizer.tokenize_state.start = tokenizer.point.index; + State::Retry(StateName::GfmAutolinkLiteralProtocolPrefixInside) + } else { + State::Nok + } +} + +/// After a protocol autolink literal. +/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^ +/// ``` +pub fn protocol_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.exit(Name::GfmAutolinkLiteralProtocol); + State::Ok +} + +/// In protocol. 
+/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^^^^^ +/// ``` +pub fn protocol_prefix_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'A'..=b'Z' | b'a'..=b'z') + // `5` is size of `https` + if tokenizer.point.index - tokenizer.tokenize_state.start < 5 => + { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralProtocolPrefixInside) + } + Some(b':') => { + let slice = Slice::from_indices( + tokenizer.parse_state.bytes, + tokenizer.tokenize_state.start, + tokenizer.point.index, + ); + let name = slice.as_str().to_ascii_lowercase(); + + tokenizer.tokenize_state.start = 0; + + if name == "http" || name == "https" { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside) + } else { + State::Nok + } + } + _ => { + tokenizer.tokenize_state.start = 0; + State::Nok + } + } +} + +/// In protocol slashes. +/// +/// ```markdown +/// > | https://example.com/a?b#c +/// ^^ +/// ``` +pub fn protocol_slashes_inside(tokenizer: &mut Tokenizer) -> State { + if tokenizer.current == Some(b'/') { + tokenizer.consume(); + if tokenizer.tokenize_state.size == 0 { + tokenizer.tokenize_state.size += 1; + State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside) + } else { + tokenizer.tokenize_state.size = 0; + State::Ok + } + } else { + tokenizer.tokenize_state.size = 0; + State::Nok + } +} +/// Start of www autolink literal. +/// +/// ```markdown +/// > | www.example.com/a?b#c +/// ^ +/// ``` +pub fn www_start(tokenizer: &mut Tokenizer) -> State { + if tokenizer + .parse_state + .options + .constructs + .gfm_autolink_literal && + matches!(tokenizer.current, Some(b'W' | b'w')) + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. 
+ && matches!(tokenizer.previous, None | Some(b'\t' | b'\n' | b' ' | b'(' | b'*' | b'_' | b'[' | b']' | b'~')) + { + tokenizer.enter(Name::GfmAutolinkLiteralWww); + tokenizer.attempt( + State::Next(StateName::GfmAutolinkLiteralWwwAfter), + State::Nok, + ); + // Note: we *check*, so we can discard the `www.` we parsed. + // If it worked, we consider it as a part of the domain. + tokenizer.check( + State::Next(StateName::GfmAutolinkLiteralDomainInside), + State::Nok, + ); + State::Retry(StateName::GfmAutolinkLiteralWwwPrefixInside) + } else { + State::Nok + } +} + +/// After a www autolink literal. +/// +/// ```markdown +/// > | www.example.com/a?b#c +/// ^ +/// ``` +pub fn www_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.exit(Name::GfmAutolinkLiteralWww); + State::Ok +} + +/// In www prefix. +/// +/// ```markdown +/// > | www.example.com +/// ^^^^ +/// ``` +pub fn www_prefix_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'.') if tokenizer.tokenize_state.size == 3 => { + tokenizer.tokenize_state.size = 0; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralWwwPrefixAfter) + } + Some(b'W' | b'w') if tokenizer.tokenize_state.size < 3 => { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralWwwPrefixInside) + } + _ => { + tokenizer.tokenize_state.size = 0; + State::Nok + } + } +} + +/// After www prefix. +/// +/// ```markdown +/// > | www.example.com +/// ^ +/// ``` +pub fn www_prefix_after(tokenizer: &mut Tokenizer) -> State { + // If there is *anything*, we can link. + if tokenizer.current == None { + State::Nok + } else { + State::Ok + } +} + +/// In domain. 
+/// +/// ```markdown +/// > | https://example.com/a +/// ^^^^^^^^^^^ +/// ``` +pub fn domain_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Check whether this marker, which is a trailing punctuation + // marker, optionally followed by more trailing markers, and then + // followed by an end. + Some(b'.' | b'_') => { + tokenizer.check( + State::Next(StateName::GfmAutolinkLiteralDomainAfter), + State::Next(StateName::GfmAutolinkLiteralDomainAtPunctuation), + ); + State::Retry(StateName::GfmAutolinkLiteralTrail) + } + // Dashes and continuation bytes are fine. + Some(b'-' | 0x80..=0xBF) => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralDomainInside) + } + _ => { + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. + if byte_to_kind( + tokenizer.parse_state.bytes, + tokenizer.point.index, + tokenizer.current, + ) == CharacterKind::Other + { + tokenizer.tokenize_state.seen = true; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralDomainInside) + } else { + State::Retry(StateName::GfmAutolinkLiteralDomainAfter) + } + } + } +} + +/// In domain, at potential trailing punctuation, that was not trailing. +/// +/// ```markdown +/// > | https://example.com +/// ^ +/// ``` +pub fn domain_at_punctuation(tokenizer: &mut Tokenizer) -> State { + // There is an underscore in the last segment of the domain + if matches!(tokenizer.current, Some(b'_')) { + tokenizer.tokenize_state.marker = b'_'; + } + // Otherwise, it’s a `.`: save the last segment underscore in the + // penultimate segment slot. 
+ else { + tokenizer.tokenize_state.marker_b = tokenizer.tokenize_state.marker; + tokenizer.tokenize_state.marker = 0; + } + + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralDomainInside) +} + +/// After domain +/// +/// ```markdown +/// > | https://example.com/a +/// ^ +/// ``` +pub fn domain_after(tokenizer: &mut Tokenizer) -> State { + // No underscores allowed in last two segments. + let result = if tokenizer.tokenize_state.marker_b == b'_' + || tokenizer.tokenize_state.marker == b'_' + // At least one character must be seen. + || !tokenizer.tokenize_state.seen + // Note: that’s GH says a dot is needed, but it’s not true: + // <https://github.com/github/cmark-gfm/issues/279> + { + State::Nok + } else { + State::Retry(StateName::GfmAutolinkLiteralPathInside) + }; + + tokenizer.tokenize_state.seen = false; + tokenizer.tokenize_state.marker = 0; + tokenizer.tokenize_state.marker_b = 0; + result +} + +/// In path. +/// +/// ```markdown +/// > | https://example.com/a +/// ^^ +/// ``` +pub fn path_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Continuation bytes are fine, we’ve already checked the first one. + Some(0x80..=0xBF) => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) + } + // Count opening parens. + Some(b'(') => { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) + } + // Check whether this trailing punctuation marker is optionally + // followed by more trailing markers, and then followed + // by an end. + // If this is a paren (followed by trailing, then the end), we + // *continue* if we saw less closing parens than opening parens. + Some( + b'!' | b'"' | b'&' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';' | b'<' | b'?' 
+ | b']' | b'_' | b'~', + ) => { + let next = if tokenizer.current == Some(b')') + && tokenizer.tokenize_state.size_b < tokenizer.tokenize_state.size + { + StateName::GfmAutolinkLiteralPathAtPunctuation + } else { + StateName::GfmAutolinkLiteralPathAfter + }; + tokenizer.check( + State::Next(next), + State::Next(StateName::GfmAutolinkLiteralPathAtPunctuation), + ); + State::Retry(StateName::GfmAutolinkLiteralTrail) + } + _ => { + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. + if byte_to_kind( + tokenizer.parse_state.bytes, + tokenizer.point.index, + tokenizer.current, + ) == CharacterKind::Whitespace + { + State::Retry(StateName::GfmAutolinkLiteralPathAfter) + } else { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) + } + } + } +} + +/// In path, at potential trailing punctuation, that was not trailing. +/// +/// ```markdown +/// > | https://example.com/a"b +/// ^ +/// ``` +pub fn path_at_punctuation(tokenizer: &mut Tokenizer) -> State { + // Count closing parens. + if tokenizer.current == Some(b')') { + tokenizer.tokenize_state.size_b += 1; + } + + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralPathInside) +} + +/// At end of path, reset parens. +/// +/// ```markdown +/// > | https://example.com/asd(qwe). +/// ^ +/// ``` +pub fn path_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.size_b = 0; + State::Ok +} + +/// In trail of domain or path. +/// +/// ```markdown +/// > | https://example.com"). +/// ^ +/// ``` +pub fn trail(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Regular trailing punctuation. + Some( + b'!' | b'"' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';' | b'?' | b'_' | b'~', + ) => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrail) + } + // `&` followed by one or more alphabeticals and then a `;`, is + // as a whole considered as trailing punctuation. 
+ // In all other cases, it is considered as continuation of the URL. + Some(b'&') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrailCharRefStart) + } + // `<` is an end. + Some(b'<') => State::Ok, + // Needed because we allow literals after `[`, as we fix: + // <https://github.com/github/cmark-gfm/issues/278>. + // Check that it is not followed by `(` or `[`. + Some(b']') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrailBracketAfter) + } + _ => { + // Whitespace is the end of the URL, anything else is continuation. + if byte_to_kind( + tokenizer.parse_state.bytes, + tokenizer.point.index, + tokenizer.current, + ) == CharacterKind::Whitespace + { + State::Ok + } else { + State::Nok + } + } + } +} + +/// In trail, after `]`. +/// +/// > 👉 **Note**: this deviates from `cmark-gfm` to fix a bug. +/// > See end of <https://github.com/github/cmark-gfm/issues/278> for more. +/// +/// ```markdown +/// > | https://example.com]( +/// ^ +/// ``` +pub fn trail_bracket_after(tokenizer: &mut Tokenizer) -> State { + // Whitespace or something that could start a resource or reference is the end. + // Switch back to trail otherwise. + if matches!( + tokenizer.current, + None | Some(b'\t' | b'\n' | b' ' | b'(' | b'[') + ) { + State::Ok + } else { + State::Retry(StateName::GfmAutolinkLiteralTrail) + } +} + +/// In character-reference like trail, after `&`. +/// +/// ```markdown +/// > | https://example.com&). +/// ^ +/// ``` +pub fn trail_char_ref_start(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b'A'..=b'Z' | b'a'..=b'z')) { + State::Retry(StateName::GfmAutolinkLiteralTrailCharRefInside) + } else { + State::Nok + } +} + +/// In character-reference like trail. +/// +/// ```markdown +/// > | https://example.com&). 
+/// ^ +/// ``` +pub fn trail_char_ref_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'A'..=b'Z' | b'a'..=b'z') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrailCharRefInside) + } + // Switch back to trail if this is well-formed. + Some(b';') => { + tokenizer.consume(); + State::Next(StateName::GfmAutolinkLiteralTrail) + } + _ => State::Nok, + } +} + +/// Resolve: postprocess text to find email autolink literals. pub fn resolve(tokenizer: &mut Tokenizer) { tokenizer.map.consume(&mut tokenizer.events); @@ -36,23 +643,30 @@ pub fn resolve(tokenizer: &mut Tokenizer) { let mut start = 0; while byte_index < bytes.len() { - if matches!(bytes[byte_index], b'H' | b'h' | b'W' | b'w' | b'@') { - if let Some(autolink) = peek(bytes, byte_index) { - byte_index = autolink.1; + if bytes[byte_index] == b'@' { + let mut range = (0, 0); + + if let Some(start) = peek_bytes_atext(bytes, byte_index) { + if let Some(end) = peek_bytes_email_domain(bytes, byte_index + 1) { + let end = peek_bytes_truncate(bytes, start, end); + range = (start, end); + } + } + + if range.1 != 0 { + byte_index = range.1; // If there is something between the last link // (or the start) and this link. - if start != autolink.0 { + if start != range.0 { replace.push(Event { kind: Kind::Enter, name: Name::Data, point: point.clone(), link: None, }); - point = point.shift_to( - tokenizer.parse_state.bytes, - start_index + autolink.0, - ); + point = point + .shift_to(tokenizer.parse_state.bytes, start_index + range.0); replace.push(Event { kind: Kind::Exit, name: Name::Data, @@ -64,19 +678,19 @@ pub fn resolve(tokenizer: &mut Tokenizer) { // Add the link. 
replace.push(Event { kind: Kind::Enter, - name: autolink.2.clone(), + name: Name::GfmAutolinkLiteralEmail, point: point.clone(), link: None, }); - point = point - .shift_to(tokenizer.parse_state.bytes, start_index + autolink.1); + point = + point.shift_to(tokenizer.parse_state.bytes, start_index + range.1); replace.push(Event { kind: Kind::Exit, - name: autolink.2.clone(), + name: Name::GfmAutolinkLiteralEmail, point: point.clone(), link: None, }); - start = autolink.1; + start = range.1; } } @@ -114,140 +728,19 @@ pub fn resolve(tokenizer: &mut Tokenizer) { } } -fn peek(bytes: &[u8], index: usize) -> Option<(usize, usize, Name)> { - // Protocol. - if let Some(protocol_end) = peek_protocol(bytes, index) { - if let Some(domain_end) = peek_domain(bytes, protocol_end, true) { - let end = truncate(bytes, protocol_end, domain_end); - - // Cannot be empty. - if end != protocol_end { - return Some((index, end, Name::GfmAutolinkLiteralProtocol)); - } - } - } - - // Www. - if peek_www(bytes, index).is_some() { - // Note: we discard the `www.` we parsed, we now try to parse it as a domain. - let domain_end = peek_domain(bytes, index, false).unwrap_or(index); - let end = truncate(bytes, index, domain_end); - return Some((index, end, Name::GfmAutolinkLiteralWww)); - } - - // Email. - if bytes[index] == b'@' { - if let Some(start) = peek_atext(bytes, index) { - if let Some(end) = peek_email_domain(bytes, index + 1) { - let end = truncate(bytes, start, end); - return Some((start, end, Name::GfmAutolinkLiteralEmail)); - } - } - } - - None -} - -/// Move past `http://`, `https://`, case-insensitive. -fn peek_protocol(bytes: &[u8], mut index: usize) -> Option<usize> { - // `http` - if index + 3 < bytes.len() - && matches!(bytes[index], b'H' | b'h') - && matches!(bytes[index + 1], b'T' | b't') - && matches!(bytes[index + 2], b'T' | b't') - && matches!(bytes[index + 3], b'P' | b'p') - { - index += 4; - - // `s`, optional. 
- if index + 1 < bytes.len() && matches!(bytes[index], b'S' | b's') { - index += 1; - } - - // `://` - if index + 3 < bytes.len() - && bytes[index] == b':' - && bytes[index + 1] == b'/' - && bytes[index + 2] == b'/' - { - return Some(index + 3); - } - } - - None -} - -/// Move past `www.`, case-insensitive. -fn peek_www(bytes: &[u8], index: usize) -> Option<usize> { - // `www.` - if index + 3 < bytes.len() - // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. - && (index == 0 || matches!(bytes[index - 1], b'\t' | b'\n' | b'\r' | b' ' | b'(' | b'*' | b'_' | b'~')) - && matches!(bytes[index], b'W' | b'w') - && matches!(bytes[index + 1], b'W' | b'w') - && matches!(bytes[index + 2], b'W' | b'w') - && bytes[index + 3] == b'.' - { - Some(index + 4) - } else { - None - } -} - -/// Move past `example.com`. -fn peek_domain(bytes: &[u8], start: usize, allow_short: bool) -> Option<usize> { - let mut dots = false; - let mut penultime = false; - let mut last = false; - // To do: expose this from slice? - // To do: do it ourselves? <https://github.com/commonmark/cmark/blob/8a023286198a7e408398e282f293e3b0baebb644/src/utf8.c#L150>, <https://doc.rust-lang.org/core/str/fn.next_code_point.html>, <https://www.reddit.com/r/rust/comments/4g2zu0/lazy_unicode_iterator_from_byte_iteratorslice/>, <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>. - let char_indices = str::from_utf8(&bytes[start..]) - .unwrap() - .char_indices() - .collect::<Vec<_>>(); - let mut index = 0; - - while index < char_indices.len() { - match char_indices[index].1 { - '_' => last = true, - '.' => { - penultime = last; - last = false; - dots = true; - } - '-' => {} - // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. - char if classify(char) == CharacterKind::Other => {} - _ => break, - } - - index += 1; - } - - // No underscores allowed in last two parts. - // A valid domain needs to have at least a dot. 
- if penultime || last || (!allow_short && !dots) { - None - } else { - // Now peek past `/path?search#hash` (anything except whitespace). - while index < char_indices.len() { - if classify(char_indices[index].1) == CharacterKind::Whitespace { - break; - } - - index += 1; - } - - Some(if index == char_indices.len() { - bytes.len() - } else { - start + char_indices[index].0 - }) - } -} - -/// Move back past `contact`. -fn peek_atext(bytes: &[u8], end: usize) -> Option<usize> { +// To do: add `xmpp`, `mailto` support. + +/// Move back past atext. +/// +/// Moving back is only used when post processing text: so for the email address +/// algorithm. +/// +/// ```markdown +/// > | a contact@example.org b +/// ^-- from +/// ^-- to +/// ``` +fn peek_bytes_atext(bytes: &[u8], end: usize) -> Option<usize> { let mut index = end; // Take simplified atext. @@ -270,8 +763,17 @@ fn peek_atext(bytes: &[u8], end: usize) -> Option<usize> { } } -/// Move past `example.com`. -fn peek_email_domain(bytes: &[u8], start: usize) -> Option<usize> { +/// Move past email domain. +/// +/// Peeking like this only used when post processing text: so for the email +/// address algorithm. +/// +/// ```markdown +/// > | a contact@example.org b +/// ^-- from +/// ^-- to +/// ``` +fn peek_bytes_email_domain(bytes: &[u8], start: usize) -> Option<usize> { let mut index = start; let mut dot = false; @@ -303,8 +805,21 @@ fn peek_email_domain(bytes: &[u8], start: usize) -> Option<usize> { } } -/// Split trialing stuff from a URL. -fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { +/// Move back past punctuation. +/// +/// Moving back is only used when post processing text: so for the email address +/// algorithm. +/// +/// This is much more complex that needed, because GH allows a lot of +/// punctuation in the protocol and www algorithms. +/// However, those aren’t implemented like the email algo. 
+/// +/// ```markdown +/// > | a contact@example.org”) b +/// ^-- from +/// ^-- to +/// ``` +fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { let mut index = start; // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L42> @@ -379,3 +894,24 @@ fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { split } + +/// Classify a byte (or `char`). +fn byte_to_kind(bytes: &[u8], index: usize, byte: Option<u8>) -> CharacterKind { + match byte { + None => CharacterKind::Whitespace, + Some(byte) => { + if byte.is_ascii_whitespace() { + CharacterKind::Whitespace + } else if byte.is_ascii_punctuation() { + CharacterKind::Punctuation + } else if byte.is_ascii_alphanumeric() { + CharacterKind::Other + } else { + // Otherwise: seems to be an ASCII control, so it seems to be a + // non-ASCII `char`. + let char = char_after_index(bytes, index); + classify_opt(char) + } + } + } +} diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs index d7c2b69..27fbadf 100644 --- a/src/construct/gfm_table.rs +++ b/src/construct/gfm_table.rs @@ -191,7 +191,7 @@ //! This bug is not present in this project. //! The issue relating to tables is: //! -//! * [GFM tables: escaped escapes are incorrectly treated as escapes](https://github.com/github/cmark-gfm/issues/277)\ +//! * [GFM tables: escaped escapes are incorrectly treated as escapes](https://github.com/github/cmark-gfm/issues/277) //! //! ## Tokens //! diff --git a/src/construct/text.rs b/src/construct/text.rs index 3cb0f10..0168d02 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -29,17 +29,21 @@ use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; /// Characters that can start something in text. 
-const MARKERS: [u8; 11] = [ +const MARKERS: [u8; 15] = [ b'!', // `label_start_image` b'$', // `raw_text` (math (text)) b'&', // `character_reference` b'*', // `attention` (emphasis, strong) b'<', // `autolink`, `html_text` + b'H', // `gfm_autolink_literal` (`protocol` kind) + b'W', // `gfm_autolink_literal` (`www.` kind) b'[', // `label_start_link` b'\\', // `character_escape`, `hard_break_escape` b']', // `label_end`, `gfm_label_start_footnote` b'_', // `attention` (emphasis, strong) b'`', // `raw_text` (code (text)) + b'h', // `gfm_autolink_literal` (`protocol` kind) + b'w', // `gfm_autolink_literal` (`www.` kind) b'~', // `attention` (gfm strikethrough) ]; @@ -113,6 +117,20 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::AutolinkStart) } + Some(b'H' | b'h') => { + tokenizer.attempt( + State::Next(StateName::TextBefore), + State::Next(StateName::TextBeforeData), + ); + State::Retry(StateName::GfmAutolinkLiteralProtocolStart) + } + Some(b'W' | b'w') => { + tokenizer.attempt( + State::Next(StateName::TextBefore), + State::Next(StateName::TextBeforeData), + ); + State::Retry(StateName::GfmAutolinkLiteralWwwStart) + } Some(b'[') => { tokenizer.attempt( State::Next(StateName::TextBefore), diff --git a/src/state.rs b/src/state.rs index 5013ec8..d7c0c8a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -310,6 +310,29 @@ pub enum Name { StringBefore, StringBeforeData, + GfmAutolinkLiteralProtocolStart, + GfmAutolinkLiteralProtocolAfter, + GfmAutolinkLiteralProtocolPrefixInside, + GfmAutolinkLiteralProtocolSlashesInside, + + GfmAutolinkLiteralWwwStart, + GfmAutolinkLiteralWwwAfter, + GfmAutolinkLiteralWwwPrefixInside, + GfmAutolinkLiteralWwwPrefixAfter, + + GfmAutolinkLiteralDomainInside, + GfmAutolinkLiteralDomainAtPunctuation, + GfmAutolinkLiteralDomainAfter, + + GfmAutolinkLiteralPathInside, + GfmAutolinkLiteralPathAtPunctuation, + GfmAutolinkLiteralPathAfter, + + GfmAutolinkLiteralTrail, + GfmAutolinkLiteralTrailCharRefInside, + 
GfmAutolinkLiteralTrailCharRefStart, + GfmAutolinkLiteralTrailBracketAfter, + GfmTableStart, GfmTableHeadRowBefore, GfmTableHeadRowStart, @@ -686,6 +709,43 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::StringBefore => construct::string::before, Name::StringBeforeData => construct::string::before_data, + Name::GfmAutolinkLiteralProtocolStart => construct::gfm_autolink_literal::protocol_start, + Name::GfmAutolinkLiteralProtocolAfter => construct::gfm_autolink_literal::protocol_after, + Name::GfmAutolinkLiteralProtocolPrefixInside => { + construct::gfm_autolink_literal::protocol_prefix_inside + } + Name::GfmAutolinkLiteralProtocolSlashesInside => { + construct::gfm_autolink_literal::protocol_slashes_inside + } + + Name::GfmAutolinkLiteralWwwAfter => construct::gfm_autolink_literal::www_after, + Name::GfmAutolinkLiteralWwwStart => construct::gfm_autolink_literal::www_start, + Name::GfmAutolinkLiteralWwwPrefixInside => { + construct::gfm_autolink_literal::www_prefix_inside + } + Name::GfmAutolinkLiteralWwwPrefixAfter => construct::gfm_autolink_literal::www_prefix_after, + Name::GfmAutolinkLiteralDomainInside => construct::gfm_autolink_literal::domain_inside, + Name::GfmAutolinkLiteralDomainAtPunctuation => { + construct::gfm_autolink_literal::domain_at_punctuation + } + Name::GfmAutolinkLiteralDomainAfter => construct::gfm_autolink_literal::domain_after, + + Name::GfmAutolinkLiteralPathInside => construct::gfm_autolink_literal::path_inside, + Name::GfmAutolinkLiteralPathAtPunctuation => { + construct::gfm_autolink_literal::path_at_punctuation + } + Name::GfmAutolinkLiteralPathAfter => construct::gfm_autolink_literal::path_after, + Name::GfmAutolinkLiteralTrail => construct::gfm_autolink_literal::trail, + Name::GfmAutolinkLiteralTrailCharRefStart => { + construct::gfm_autolink_literal::trail_char_ref_start + } + Name::GfmAutolinkLiteralTrailCharRefInside => { + construct::gfm_autolink_literal::trail_char_ref_inside + } + 
Name::GfmAutolinkLiteralTrailBracketAfter => { + construct::gfm_autolink_literal::trail_bracket_after + } + Name::GfmTableStart => construct::gfm_table::start, Name::GfmTableHeadRowBefore => construct::gfm_table::head_row_before, Name::GfmTableHeadRowStart => construct::gfm_table::head_row_start, diff --git a/tests/gfm_autolink_literal.rs b/tests/gfm_autolink_literal.rs index 9551751..2e84e6d 100644 --- a/tests/gfm_autolink_literal.rs +++ b/tests/gfm_autolink_literal.rs @@ -42,6 +42,22 @@ fn gfm_autolink_literal() { ); assert_eq!( + micromark_with_options("[https://example.com](xxx)", &gfm), + "<p><a href=\"xxx\">https://example.com</a></p>", + "should not link protocol urls in links" + ); + assert_eq!( + micromark_with_options("[www.example.com](xxx)", &gfm), + "<p><a href=\"xxx\">www.example.com</a></p>", + "should not link www urls in links" + ); + assert_eq!( + micromark_with_options("[user@example.com](xxx)", &gfm), + "<p><a href=\"xxx\">user@example.com</a></p>", + "should not link email urls in links" + ); + + assert_eq!( micromark_with_options("user@example.com", &gfm), "<p><a href=\"mailto:user@example.com\">user@example.com</a></p>", "should support a closing paren at TLD (email)" @@ -174,7 +190,7 @@ fn gfm_autolink_literal() { ); // Note: GH comments/issues/PRs do not link this, but Gists/readmes do. - // Fixing it would mean defiating from `cmark-gfm`: + // Fixing it would mean deviating from `cmark-gfm`: // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. // assert_eq!( // micromark_with_options(",www.example.com", &gfm), @@ -212,6 +228,55 @@ fn gfm_autolink_literal() { assert_eq!( micromark_with_options( r###" +a www.example.com&xxx;b c + +a www.example.com&xxx;. b + +a www.example.com&xxxxxxxxx;. b + +a www.example.com&xxxxxxxxxx;. b + +a www.example.com&xxxxxxxxxxx;. b + +a www.example.com&xxx. b + +a www.example.com{. b + +a www.example.com&123. b + +a www.example.com&x. b + +a www.example.com. 
b + +a www.example.com&1. b + +a www.example.com&. b + +a www.example.com& b +"###, + &gfm + ), + r###"<p>a <a href="http://www.example.com&xxx;b">www.example.com&xxx;b</a> c</p> +<p>a <a href="http://www.example.com">www.example.com</a>&xxx;. b</p> +<p>a <a href="http://www.example.com">www.example.com</a>&xxxxxxxxx;. b</p> +<p>a <a href="http://www.example.com">www.example.com</a>&xxxxxxxxxx;. b</p> +<p>a <a href="http://www.example.com">www.example.com</a>&xxxxxxxxxxx;. b</p> +<p>a <a href="http://www.example.com&xxx">www.example.com&xxx</a>. b</p> +<p>a <a href="http://www.example.com&#123">www.example.com&#123</a>. b</p> +<p>a <a href="http://www.example.com&123">www.example.com&123</a>. b</p> +<p>a <a href="http://www.example.com&x">www.example.com&x</a>. b</p> +<p>a <a href="http://www.example.com&#1">www.example.com&#1</a>. b</p> +<p>a <a href="http://www.example.com&1">www.example.com&1</a>. b</p> +<p>a <a href="http://www.example.com&">www.example.com&</a>. b</p> +<p>a <a href="http://www.example.com&">www.example.com&</a> b</p> +"###, + "should match “character references” like GitHub does" + ); + + // Note: this deviates from GFM, as <https://github.com/github/cmark-gfm/issues/278> is fixed. + assert_eq!( + micromark_with_options( + r###" [ www.example.com [ https://example.com @@ -251,6 +316,2379 @@ fn gfm_autolink_literal() { <p><img src="#" alt=" https://example.com " /></p> <p><img src="#" alt=" contact@example.com " /></p> "###, - "should interplay with brackets, links, and images" + "should match interplay with brackets, links, and images, like GitHub does (but without the bugs)" + ); + + assert_eq!( + micromark_with_options( + r###" +www.example.com/?=a(b)cccccc + +www.example.com/?=a(b(c)ccccc + +www.example.com/?=a(b(c)c)cccc + +www.example.com/?=a(b(c)c)c)ccc + +www.example.com/?q=a(business) + +www.example.com/?q=a(business))) + +(www.example.com/?q=a(business)) + +(www.example.com/?q=a(business) + +www.example.com/?q=a(business)". 
+ +www.example.com/?q=a(business))) + +(www.example.com/?q=a(business))". + +(www.example.com/?q=a(business)".) + +(www.example.com/?q=a(business)". +"###, + &gfm + ), + r###"<p><a href="http://www.example.com/?=a(b)cccccc">www.example.com/?=a(b)cccccc</a></p> +<p><a href="http://www.example.com/?=a(b(c)ccccc">www.example.com/?=a(b(c)ccccc</a></p> +<p><a href="http://www.example.com/?=a(b(c)c)cccc">www.example.com/?=a(b(c)c)cccc</a></p> +<p><a href="http://www.example.com/?=a(b(c)c)c)ccc">www.example.com/?=a(b(c)c)c)ccc</a></p> +<p><a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a></p> +<p><a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a>))</p> +<p>(<a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a>)</p> +<p>(<a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a></p> +<p><a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a>".</p> +<p><a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a>))</p> +<p>(<a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a>)".</p> +<p>(<a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a>".)</p> +<p>(<a href="http://www.example.com/?q=a(business)">www.example.com/?q=a(business)</a>".</p> +"###, + "should match parens like GitHub does" + ); + + // Note: this deviates from GFM. + // Here, the following issues are fixed: + // - <https://github.com/github/cmark-gfm/issues/280> + assert_eq!( + micromark_with_options( + r###" +# Literal autolinks + +## WWW autolinks + +w.commonmark.org + +ww.commonmark.org + +www.commonmark.org + +Www.commonmark.org + +wWw.commonmark.org + +wwW.commonmark.org + +WWW.COMMONMARK.ORG + +Visit www.commonmark.org/help for more information. + +Visit www.commonmark.org. + +Visit www.commonmark.org/a.b. 
+ +www.aaa.bbb.ccc_ccc + +www.aaa_bbb.ccc + +www.aaa.bbb.ccc.ddd_ddd + +www.aaa.bbb.ccc_ccc.ddd + +www.aaa.bbb_bbb.ccc.ddd + +www.aaa_aaa.bbb.ccc.ddd + +Visit www.commonmark.org. + +Visit www.commonmark.org/a.b. + +www.google.com/search?q=Markup+(business) + +www.google.com/search?q=Markup+(business))) + +(www.google.com/search?q=Markup+(business)) + +(www.google.com/search?q=Markup+(business) + +www.google.com/search?q=(business))+ok + +www.google.com/search?q=commonmark&hl=en + +www.google.com/search?q=commonmark&hl;en + +www.google.com/search?q=commonmark&hl; + +www.commonmark.org/he<lp + +## HTTP autolinks + +hexample.com + +htexample.com + +httexample.com + +httpexample.com + +http:example.com + +http:/example.com + +https:/example.com + +http://example.com + +https://example.com + +https://example + +http://commonmark.org + +(Visit https://encrypted.google.com/search?q=Markup+(business)) + +## Email autolinks + +No dot: foo@barbaz + +No dot: foo@barbaz. + +foo@bar.baz + +hello@mail+xyz.example isn’t valid, but hello+xyz@mail.example is. + +a.b-c_d@a.b + +a.b-c_d@a.b. + +a.b-c_d@a.b- + +a.b-c_d@a.b_ + +a@a_b.c + +a@a-b.c + +Can’t end in an underscore followed by a period: aaa@a.b_. + +Can contain an underscore followed by a period: aaa@a.b_.c + +## Link text should not be expanded + +[Visit www.example.com](http://www.example.com) please. + +[Visit http://www.example.com](http://www.example.com) please. + +[Mail example@example.com](mailto:example@example.com) please. + +[link]() <http://autolink> should still be expanded. 
+"###, + &gfm + ), + r###"<h1>Literal autolinks</h1> +<h2>WWW autolinks</h2> +<p>w.commonmark.org</p> +<p>ww.commonmark.org</p> +<p><a href="http://www.commonmark.org">www.commonmark.org</a></p> +<p><a href="http://Www.commonmark.org">Www.commonmark.org</a></p> +<p><a href="http://wWw.commonmark.org">wWw.commonmark.org</a></p> +<p><a href="http://wwW.commonmark.org">wwW.commonmark.org</a></p> +<p><a href="http://WWW.COMMONMARK.ORG">WWW.COMMONMARK.ORG</a></p> +<p>Visit <a href="http://www.commonmark.org/help">www.commonmark.org/help</a> for more information.</p> +<p>Visit <a href="http://www.commonmark.org">www.commonmark.org</a>.</p> +<p>Visit <a href="http://www.commonmark.org/a.b">www.commonmark.org/a.b</a>.</p> +<p>www.aaa.bbb.ccc_ccc</p> +<p>www.aaa_bbb.ccc</p> +<p>www.aaa.bbb.ccc.ddd_ddd</p> +<p>www.aaa.bbb.ccc_ccc.ddd</p> +<p><a href="http://www.aaa.bbb_bbb.ccc.ddd">www.aaa.bbb_bbb.ccc.ddd</a></p> +<p><a href="http://www.aaa_aaa.bbb.ccc.ddd">www.aaa_aaa.bbb.ccc.ddd</a></p> +<p>Visit <a href="http://www.commonmark.org">www.commonmark.org</a>.</p> +<p>Visit <a href="http://www.commonmark.org/a.b">www.commonmark.org/a.b</a>.</p> +<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p> +<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>))</p> +<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p> +<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p> +<p><a href="http://www.google.com/search?q=(business))+ok">www.google.com/search?q=(business))+ok</a></p> +<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&hl=en</a></p> +<p><a href="http://www.google.com/search?q=commonmark&hl;en">www.google.com/search?q=commonmark&hl;en</a></p> +<p><a 
href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&hl;</p> +<p><a href="http://www.commonmark.org/he">www.commonmark.org/he</a><lp</p> +<h2>HTTP autolinks</h2> +<p>hexample.com</p> +<p>htexample.com</p> +<p>httexample.com</p> +<p>httpexample.com</p> +<p>http:example.com</p> +<p>http:/example.com</p> +<p>https:/example.com</p> +<p><a href="http://example.com">http://example.com</a></p> +<p><a href="https://example.com">https://example.com</a></p> +<p><a href="https://example">https://example</a></p> +<p><a href="http://commonmark.org">http://commonmark.org</a></p> +<p>(Visit <a href="https://encrypted.google.com/search?q=Markup+(business)">https://encrypted.google.com/search?q=Markup+(business)</a>)</p> +<h2>Email autolinks</h2> +<p>No dot: foo@barbaz</p> +<p>No dot: foo@barbaz.</p> +<p><a href="mailto:foo@bar.baz">foo@bar.baz</a></p> +<p>hello@mail+xyz.example isn’t valid, but <a href="mailto:hello+xyz@mail.example">hello+xyz@mail.example</a> is.</p> +<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a></p> +<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a>.</p> +<p>a.b-c_d@a.b-</p> +<p>a.b-c_d@a.b_</p> +<p><a href="mailto:a@a_b.c">a@a_b.c</a></p> +<p><a href="mailto:a@a-b.c">a@a-b.c</a></p> +<p>Can’t end in an underscore followed by a period: aaa@a.b_.</p> +<p>Can contain an underscore followed by a period: <a href="mailto:aaa@a.b_.c">aaa@a.b_.c</a></p> +<h2>Link text should not be expanded</h2> +<p><a href="http://www.example.com">Visit www.example.com</a> please.</p> +<p><a href="http://www.example.com">Visit http://www.example.com</a> please.</p> +<p><a href="mailto:example@example.com">Mail example@example.com</a> please.</p> +<p><a href="">link</a> <a href="http://autolink">http://autolink</a> should still be expanded.</p> +"###, + "should match base like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"H0. + +[https://a.com©b + +[www.a.com©b + +H1. + +[]https://a.com©b + +[]www.a.com©b + +H2. 
+ +[] https://a.com©b + +[] www.a.com©b + +H3. + +[[https://a.com©b + +[[www.a.com©b + +H4. + +[[]https://a.com©b + +[[]www.a.com©b + +H5. + +[[]]https://a.com©b + +[[]]www.a.com©b +"###, + &gfm + ), + r###"<p>H0.</p> +<p>[<a href="https://a.com&copy;b">https://a.com&copy;b</a></p> +<p>[<a href="http://www.a.com&copy;b">www.a.com&copy;b</a></p> +<p>H1.</p> +<p>[]<a href="https://a.com&copy;b">https://a.com&copy;b</a></p> +<p>[]<a href="http://www.a.com&copy;b">www.a.com&copy;b</a></p> +<p>H2.</p> +<p>[] <a href="https://a.com&copy;b">https://a.com&copy;b</a></p> +<p>[] <a href="http://www.a.com&copy;b">www.a.com&copy;b</a></p> +<p>H3.</p> +<p>[[<a href="https://a.com&copy;b">https://a.com&copy;b</a></p> +<p>[[<a href="http://www.a.com&copy;b">www.a.com&copy;b</a></p> +<p>H4.</p> +<p>[[]<a href="https://a.com&copy;b">https://a.com&copy;b</a></p> +<p>[[]<a href="http://www.a.com&copy;b">www.a.com&copy;b</a></p> +<p>H5.</p> +<p>[[]]<a href="https://a.com&copy;b">https://a.com&copy;b</a></p> +<p>[[]]<a href="http://www.a.com&copy;b">www.a.com&copy;b</a></p> +"###, + "should match brackets like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options(r###"Image start. + +![https://a.com + +![http://a.com + +![www.a.com + +![a@b.c + +Image start and label end. + +![https://a.com] + +![http://a.com] + +![www.a.com] + +![a@b.c] + +Image label with reference (note: GH cleans hashes here, but we keep them in). + +![https://a.com][x] + +![http://a.com][x] + +![www.a.com][x] + +![a@b.c][x] + +[x]: # + +Image label with resource. + +![https://a.com]() + +![http://a.com]() + +![www.a.com]() + +![a@b.c]() + +Autolink literal after image. 
+ +![a]() https://a.com + +![a]() http://a.com + +![a]() www.a.com + +![a]() a@b.c +"###, &gfm), + r###"<p>Image start.</p> +<p>![<a href="https://a.com">https://a.com</a></p> +<p>![<a href="http://a.com">http://a.com</a></p> +<p>![<a href="http://www.a.com">www.a.com</a></p> +<p>![<a href="mailto:a@b.c">a@b.c</a></p> +<p>Image start and label end.</p> +<p>![<a href="https://a.com">https://a.com</a>]</p> +<p>![<a href="http://a.com">http://a.com</a>]</p> +<p>![<a href="http://www.a.com">www.a.com</a>]</p> +<p>![<a href="mailto:a@b.c">a@b.c</a>]</p> +<p>Image label with reference (note: GH cleans hashes here, but we keep them in).</p> +<p><img src="#" alt="https://a.com" /></p> +<p><img src="#" alt="http://a.com" /></p> +<p><img src="#" alt="www.a.com" /></p> +<p><img src="#" alt="a@b.c" /></p> +<p>Image label with resource.</p> +<p><img src="" alt="https://a.com" /></p> +<p><img src="" alt="http://a.com" /></p> +<p><img src="" alt="www.a.com" /></p> +<p><img src="" alt="a@b.c" /></p> +<p>Autolink literal after image.</p> +<p><img src="" alt="a" /> <a href="https://a.com">https://a.com</a></p> +<p><img src="" alt="a" /> <a href="http://a.com">http://a.com</a></p> +<p><img src="" alt="a" /> <a href="http://www.a.com">www.a.com</a></p> +<p><img src="" alt="a" /> <a href="mailto:a@b.c">a@b.c</a></p> +"###, + "should match autolink literals combined w/ images like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options(r###"Link start. + +[https://a.com + +[http://a.com + +[www.a.com + +[a@b.c + +Label end. + +https://a.com] + +http://a.com] + +www.a.com] + +a@b.c] + +Link start and label end. + +[https://a.com] + +[http://a.com] + +[www.a.com] + +[a@b.c] + +What naĂŻvely seems like a label end (A). + +https://a.com`]` + +http://a.com`]` + +www.a.com`]` + +a@b.c`]` + +Link start and what naĂŻvely seems like a balanced brace (B). 
+ +[https://a.com`]` + +[http://a.com`]` + +[www.a.com`]` + +[a@b.c`]` + +What naĂŻvely seems like a label end (C). + +https://a.com `]` + +http://a.com `]` + +www.a.com `]` + +a@b.c `]` + +Link start and what naĂŻvely seems like a balanced brace (D). + +[https://a.com `]` + +[http://a.com `]` + +[www.a.com `]` + +[a@b.c `]` + +Link label with reference. + +[https://a.com][x] + +[http://a.com][x] + +[www.a.com][x] + +[a@b.c][x] + +[x]: # + +Link label with resource. + +[https://a.com]() + +[http://a.com]() + +[www.a.com]() + +[a@b.c]() + +More in link. + +[a https://b.com c]() + +[a http://b.com c]() + +[a www.b.com c]() + +[a b@c.d e]() + +Autolink literal after link. + +[a]() https://a.com + +[a]() http://a.com + +[a]() www.a.com + +[a]() a@b.c +"###, &gfm), + r###"<p>Link start.</p> +<p>[<a href="https://a.com">https://a.com</a></p> +<p>[<a href="http://a.com">http://a.com</a></p> +<p>[<a href="http://www.a.com">www.a.com</a></p> +<p>[<a href="mailto:a@b.c">a@b.c</a></p> +<p>Label end.</p> +<p><a href="https://a.com">https://a.com</a>]</p> +<p><a href="http://a.com">http://a.com</a>]</p> +<p><a href="http://www.a.com">www.a.com</a>]</p> +<p><a href="mailto:a@b.c">a@b.c</a>]</p> +<p>Link start and label end.</p> +<p>[<a href="https://a.com">https://a.com</a>]</p> +<p>[<a href="http://a.com">http://a.com</a>]</p> +<p>[<a href="http://www.a.com">www.a.com</a>]</p> +<p>[<a href="mailto:a@b.c">a@b.c</a>]</p> +<p>What naĂŻvely seems like a label end (A).</p> +<p><a href="https://a.com%60%5D%60">https://a.com`]`</a></p> +<p><a href="http://a.com%60%5D%60">http://a.com`]`</a></p> +<p><a href="http://www.a.com%60%5D%60">www.a.com`]`</a></p> +<p><a href="mailto:a@b.c">a@b.c</a><code>]</code></p> +<p>Link start and what naĂŻvely seems like a balanced brace (B).</p> +<p>[<a href="https://a.com%60%5D%60">https://a.com`]`</a></p> +<p>[<a href="http://a.com%60%5D%60">http://a.com`]`</a></p> +<p>[<a href="http://www.a.com%60%5D%60">www.a.com`]`</a></p> +<p>[<a 
href="mailto:a@b.c">a@b.c</a><code>]</code></p> +<p>What naĂŻvely seems like a label end (C).</p> +<p><a href="https://a.com">https://a.com</a> <code>]</code></p> +<p><a href="http://a.com">http://a.com</a> <code>]</code></p> +<p><a href="http://www.a.com">www.a.com</a> <code>]</code></p> +<p><a href="mailto:a@b.c">a@b.c</a> <code>]</code></p> +<p>Link start and what naĂŻvely seems like a balanced brace (D).</p> +<p>[<a href="https://a.com">https://a.com</a> <code>]</code></p> +<p>[<a href="http://a.com">http://a.com</a> <code>]</code></p> +<p>[<a href="http://www.a.com">www.a.com</a> <code>]</code></p> +<p>[<a href="mailto:a@b.c">a@b.c</a> <code>]</code></p> +<p>Link label with reference.</p> +<p><a href="#">https://a.com</a></p> +<p><a href="#">http://a.com</a></p> +<p><a href="#">www.a.com</a></p> +<p><a href="#">a@b.c</a></p> +<p>Link label with resource.</p> +<p><a href="">https://a.com</a></p> +<p><a href="">http://a.com</a></p> +<p><a href="">www.a.com</a></p> +<p><a href="">a@b.c</a></p> +<p>More in link.</p> +<p><a href="">a https://b.com c</a></p> +<p><a href="">a http://b.com c</a></p> +<p><a href="">a www.b.com c</a></p> +<p><a href="">a b@c.d e</a></p> +<p>Autolink literal after link.</p> +<p><a href="">a</a> <a href="https://a.com">https://a.com</a></p> +<p><a href="">a</a> <a href="http://a.com">http://a.com</a></p> +<p><a href="">a</a> <a href="http://www.a.com">www.a.com</a></p> +<p><a href="">a</a> <a href="mailto:a@b.c">a@b.c</a></p> +"###, + "should match autolink literals combined w/ links like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"# “character reference” + +www.a&b (space) + +www.a&b! + +www.a&b" + +www.a&b# + +www.a&b$ + +www.a&b% + +www.a&b& + +www.a&b' + +www.a&b( + +www.a&b) + +www.a&b* + +www.a&b+ + +www.a&b, + +www.a&b- + +www.a&b + +www.a&b. + +www.a&b/ + +www.a&b: + +www.a&b; + +www.a&b< + +www.a&b= + +www.a&b> + +www.a&b? 
+ +www.a&b@ + +www.a&b[ + +www.a&b\ + +www.a&b] + +www.a&b^ + +www.a&b_ + +www.a&b` + +www.a&b{ + +www.a&b| + +www.a&b} + +www.a&b~ +"###, + &gfm + ), + r###"<h1>“character reference”</h1> +<p><a href="http://www.a&b">www.a&b</a> (space)</p> +<p><a href="http://www.a&b">www.a&b</a>!</p> +<p><a href="http://www.a&b">www.a&b</a>"</p> +<p><a href="http://www.a&b#">www.a&b#</a></p> +<p><a href="http://www.a&b$">www.a&b$</a></p> +<p><a href="http://www.a&b%25">www.a&b%</a></p> +<p><a href="http://www.a&b&">www.a&b&</a></p> +<p><a href="http://www.a&b">www.a&b</a>'</p> +<p><a href="http://www.a&b(">www.a&b(</a></p> +<p><a href="http://www.a&b">www.a&b</a>)</p> +<p><a href="http://www.a&b">www.a&b</a>*</p> +<p><a href="http://www.a&b+">www.a&b+</a></p> +<p><a href="http://www.a&b">www.a&b</a>,</p> +<p><a href="http://www.a&b-">www.a&b-</a></p> +<p><a href="http://www.a&b">www.a&b</a></p> +<p><a href="http://www.a&b">www.a&b</a>.</p> +<p><a href="http://www.a&b/">www.a&b/</a></p> +<p><a href="http://www.a&b">www.a&b</a>:</p> +<p><a href="http://www.a">www.a</a>&b;</p> +<p><a href="http://www.a&b">www.a&b</a><</p> +<p><a href="http://www.a&b=">www.a&b=</a></p> +<p><a href="http://www.a&b%3E">www.a&b></a></p> +<p><a href="http://www.a&b">www.a&b</a>?</p> +<p><a href="http://www.a&b@">www.a&b@</a></p> +<p><a href="http://www.a&b%5B">www.a&b[</a></p> +<p><a href="http://www.a&b%5C">www.a&b\</a></p> +<p><a href="http://www.a&b">www.a&b</a>]</p> +<p><a href="http://www.a&b%5E">www.a&b^</a></p> +<p><a href="http://www.a&b">www.a&b</a>_</p> +<p><a href="http://www.a&b%60">www.a&b`</a></p> +<p><a href="http://www.a&b%7B">www.a&b{</a></p> +<p><a href="http://www.a&b%7C">www.a&b|</a></p> +<p><a href="http://www.a&b%7D">www.a&b}</a></p> +<p><a href="http://www.a&b">www.a&b</a>~</p> +"###, + "should match “character references (named)” like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options(r###"# “character reference” + +www.a# (space) + +www.a#! 
+ +www.a#" + +www.a## + +www.a#$ + +www.a#% + +www.a#& + +www.a#' + +www.a#( + +www.a#) + +www.a#* + +www.a#+ + +www.a#, + +www.a#- + +www.a# + +www.a#. + +www.a#/ + +www.a#: + +www.a# + +www.a#< + +www.a#= + +www.a#> + +www.a#? + +www.a#@ + +www.a#[ + +www.a#\ + +www.a#] + +www.a#^ + +www.a#_ + +www.a#` + +www.a#{ + +www.a#| + +www.a#} + +www.a#~ +"###, &gfm), + r###"<h1>“character reference”</h1> +<p><a href="http://www.a&#35">www.a&#35</a> (space)</p> +<p><a href="http://www.a&#35">www.a&#35</a>!</p> +<p><a href="http://www.a&#35">www.a&#35</a>"</p> +<p><a href="http://www.a&#35#">www.a&#35#</a></p> +<p><a href="http://www.a&#35$">www.a&#35$</a></p> +<p><a href="http://www.a&#35%25">www.a&#35%</a></p> +<p><a href="http://www.a&#35&">www.a&#35&</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>'</p> +<p><a href="http://www.a&#35(">www.a&#35(</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>)</p> +<p><a href="http://www.a&#35">www.a&#35</a>*</p> +<p><a href="http://www.a&#35+">www.a&#35+</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>,</p> +<p><a href="http://www.a&#35-">www.a&#35-</a></p> +<p><a href="http://www.a&#35">www.a&#35</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>.</p> +<p><a href="http://www.a&#35/">www.a&#35/</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>:</p> +<p><a href="http://www.a&#35">www.a&#35</a>;</p> +<p><a href="http://www.a&#35">www.a&#35</a><</p> +<p><a href="http://www.a&#35=">www.a&#35=</a></p> +<p><a href="http://www.a&#35%3E">www.a&#35></a></p> +<p><a href="http://www.a&#35">www.a&#35</a>?</p> +<p><a href="http://www.a&#35@">www.a&#35@</a></p> +<p><a href="http://www.a&#35%5B">www.a&#35[</a></p> +<p><a href="http://www.a&#35%5C">www.a&#35\</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>]</p> +<p><a href="http://www.a&#35%5E">www.a&#35^</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>_</p> +<p><a href="http://www.a&#35%60">www.a&#35`</a></p> +<p><a href="http://www.a&#35%7B">www.a&#35{</a></p> +<p><a 
href="http://www.a&#35%7C">www.a&#35|</a></p> +<p><a href="http://www.a&#35%7D">www.a&#35}</a></p> +<p><a href="http://www.a&#35">www.a&#35</a>~</p> +"###, + "should match “character references (numeric)” like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"a@0.0 + +a@0.b + +a@a.29 + +a@a.b + +a@0.0.c + +react@0.11.1 + +react@0.12.0-rc1 + +react@0.14.0-alpha1 + +react@16.7.0-alpha.2 + +react@0.0.0-experimental-aae83a4b9 + +[ react@0.11.1 + +[ react@0.12.0-rc1 + +[ react@0.14.0-alpha1 + +[ react@16.7.0-alpha.2 + +[ react@0.0.0-experimental-aae83a4b9 +"###, + &gfm + ), + r###"<p>a@0.0</p> +<p><a href="mailto:a@0.b">a@0.b</a></p> +<p>a@a.29</p> +<p><a href="mailto:a@a.b">a@a.b</a></p> +<p><a href="mailto:a@0.0.c">a@0.0.c</a></p> +<p>react@0.11.1</p> +<p>react@0.12.0-rc1</p> +<p>react@0.14.0-alpha1</p> +<p>react@16.7.0-alpha.2</p> +<p>react@0.0.0-experimental-aae83a4b9</p> +<p>[ react@0.11.1</p> +<p>[ react@0.12.0-rc1</p> +<p>[ react@0.14.0-alpha1</p> +<p>[ react@16.7.0-alpha.2</p> +<p>[ react@0.0.0-experimental-aae83a4b9</p> +"###, + "should match email TLD digits like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (2) + +http://a (space) + +http://a! + +http://a" + +http://a# + +http://a$ + +http://a% + +http://a& + +http://a' + +http://a( + +http://a) + +http://a* + +http://a+ + +http://a, + +http://a- + +http://a + +http://a. + +http://a/ + +http://a: + +http://a; + +http://a< + +http://a= + +http://a> + +http://a? + +http://a@ + +http://a[ + +http://a\ + +http://a] + +http://a^ + +http://a_ + +http://a` + +http://a{ + +http://a| + +http://a} + +http://a~ +"###, + &gfm + ), + r###"<h1>httpshhh? 
(2)</h1> +<p><a href="http://a">http://a</a> (space)</p> +<p><a href="http://a">http://a</a>!</p> +<p><a href="http://a">http://a</a>"</p> +<p><a href="http://a#">http://a#</a></p> +<p><a href="http://a$">http://a$</a></p> +<p><a href="http://a%25">http://a%</a></p> +<p><a href="http://a&">http://a&</a></p> +<p><a href="http://a">http://a</a>'</p> +<p><a href="http://a(">http://a(</a></p> +<p><a href="http://a">http://a</a>)</p> +<p><a href="http://a">http://a</a>*</p> +<p><a href="http://a+">http://a+</a></p> +<p><a href="http://a">http://a</a>,</p> +<p><a href="http://a-">http://a-</a></p> +<p><a href="http://a">http://a</a></p> +<p><a href="http://a">http://a</a>.</p> +<p><a href="http://a/">http://a/</a></p> +<p><a href="http://a">http://a</a>:</p> +<p><a href="http://a">http://a</a>;</p> +<p><a href="http://a">http://a</a><</p> +<p><a href="http://a=">http://a=</a></p> +<p><a href="http://a%3E">http://a></a></p> +<p><a href="http://a">http://a</a>?</p> +<p><a href="http://a@">http://a@</a></p> +<p><a href="http://a%5B">http://a[</a></p> +<p><a href="http://a%5C">http://a\</a></p> +<p><a href="http://a">http://a</a>]</p> +<p><a href="http://a%5E">http://a^</a></p> +<p><a href="http://a">http://a</a>_</p> +<p><a href="http://a%60">http://a`</a></p> +<p><a href="http://a%7B">http://a{</a></p> +<p><a href="http://a%7C">http://a|</a></p> +<p><a href="http://a%7D">http://a}</a></p> +<p><a href="http://a">http://a</a>~</p> +"###, + "should match protocol domain continue like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (1) + +http:// (space) + +http://! + +http://" + +http://# + +http://$ + +http://% + +http://& + +http://' + +http://( + +http://) + +http://* + +http://+ + +http://, + +http://- + +http:// + +http://. + +http:/// + +http://: + +http://; + +http://< + +http://= + +http://> + +http://? 
+ +http://@ + +http://[ + +http://\ + +http://] + +http://^ + +http://_ + +http://` + +http://{ + +http://| + +http://} + +http://~ +"###, + &gfm + ), + r###"<h1>httpshhh? (1)</h1> +<p>http:// (space)</p> +<p>http://!</p> +<p>http://"</p> +<p>http://#</p> +<p>http://$</p> +<p>http://%</p> +<p>http://&</p> +<p>http://'</p> +<p>http://(</p> +<p>http://)</p> +<p>http://*</p> +<p>http://+</p> +<p>http://,</p> +<p>http://-</p> +<p>http://</p> +<p>http://.</p> +<p>http:///</p> +<p>http://:</p> +<p>http://;</p> +<p>http://<</p> +<p>http://=</p> +<p>http://></p> +<p>http://?</p> +<p>http://@</p> +<p>http://[</p> +<p>http://\</p> +<p>http://]</p> +<p>http://^</p> +<p>http://_</p> +<p>http://`</p> +<p>http://{</p> +<p>http://|</p> +<p>http://}</p> +<p>http://~</p> +"###, + "should match protocol domain start like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (4) + +http://a/b (space) + +http://a/b! + +http://a/b" + +http://a/b# + +http://a/b$ + +http://a/b% + +http://a/b& + +http://a/b' + +http://a/b( + +http://a/b) + +http://a/b* + +http://a/b+ + +http://a/b, + +http://a/b- + +http://a/b + +http://a/b. + +http://a/b/ + +http://a/b: + +http://a/b; + +http://a/b< + +http://a/b= + +http://a/b> + +http://a/b? + +http://a/b@ + +http://a/b[ + +http://a/b\ + +http://a/b] + +http://a/b^ + +http://a/b_ + +http://a/b` + +http://a/b{ + +http://a/b| + +http://a/b} + +http://a/b~ +"###, + &gfm + ), + r###"<h1>httpshhh? 
(4)</h1> +<p><a href="http://a/b">http://a/b</a> (space)</p> +<p><a href="http://a/b">http://a/b</a>!</p> +<p><a href="http://a/b">http://a/b</a>"</p> +<p><a href="http://a/b#">http://a/b#</a></p> +<p><a href="http://a/b$">http://a/b$</a></p> +<p><a href="http://a/b%25">http://a/b%</a></p> +<p><a href="http://a/b&">http://a/b&</a></p> +<p><a href="http://a/b">http://a/b</a>'</p> +<p><a href="http://a/b(">http://a/b(</a></p> +<p><a href="http://a/b">http://a/b</a>)</p> +<p><a href="http://a/b">http://a/b</a>*</p> +<p><a href="http://a/b+">http://a/b+</a></p> +<p><a href="http://a/b">http://a/b</a>,</p> +<p><a href="http://a/b-">http://a/b-</a></p> +<p><a href="http://a/b">http://a/b</a></p> +<p><a href="http://a/b">http://a/b</a>.</p> +<p><a href="http://a/b/">http://a/b/</a></p> +<p><a href="http://a/b">http://a/b</a>:</p> +<p><a href="http://a/b">http://a/b</a>;</p> +<p><a href="http://a/b">http://a/b</a><</p> +<p><a href="http://a/b=">http://a/b=</a></p> +<p><a href="http://a/b%3E">http://a/b></a></p> +<p><a href="http://a/b">http://a/b</a>?</p> +<p><a href="http://a/b@">http://a/b@</a></p> +<p><a href="http://a/b%5B">http://a/b[</a></p> +<p><a href="http://a/b%5C">http://a/b\</a></p> +<p><a href="http://a/b">http://a/b</a>]</p> +<p><a href="http://a/b%5E">http://a/b^</a></p> +<p><a href="http://a/b">http://a/b</a>_</p> +<p><a href="http://a/b%60">http://a/b`</a></p> +<p><a href="http://a/b%7B">http://a/b{</a></p> +<p><a href="http://a/b%7C">http://a/b|</a></p> +<p><a href="http://a/b%7D">http://a/b}</a></p> +<p><a href="http://a/b">http://a/b</a>~</p> +"###, + "should match protocol path continue like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# httpshhh? (3) + +http://a/ (space) + +http://a/! + +http://a/" + +http://a/# + +http://a/$ + +http://a/% + +http://a/& + +http://a/' + +http://a/( + +http://a/) + +http://a/* + +http://a/+ + +http://a/, + +http://a/- + +http://a/ + +http://a/. 
+ +http://a// + +http://a/: + +http://a/; + +http://a/< + +http://a/= + +http://a/> + +http://a/? + +http://a/@ + +http://a/[ + +http://a/\ + +http://a/] + +http://a/^ + +http://a/_ + +http://a/` + +http://a/{ + +http://a/| + +http://a/} + +http://a/~ +"###, + &gfm + ), + r###"<h1>httpshhh? (3)</h1> +<p><a href="http://a/">http://a/</a> (space)</p> +<p><a href="http://a/">http://a/</a>!</p> +<p><a href="http://a/">http://a/</a>"</p> +<p><a href="http://a/#">http://a/#</a></p> +<p><a href="http://a/$">http://a/$</a></p> +<p><a href="http://a/%25">http://a/%</a></p> +<p><a href="http://a/&">http://a/&</a></p> +<p><a href="http://a/">http://a/</a>'</p> +<p><a href="http://a/(">http://a/(</a></p> +<p><a href="http://a/">http://a/</a>)</p> +<p><a href="http://a/">http://a/</a>*</p> +<p><a href="http://a/+">http://a/+</a></p> +<p><a href="http://a/">http://a/</a>,</p> +<p><a href="http://a/-">http://a/-</a></p> +<p><a href="http://a/">http://a/</a></p> +<p><a href="http://a/">http://a/</a>.</p> +<p><a href="http://a//">http://a//</a></p> +<p><a href="http://a/">http://a/</a>:</p> +<p><a href="http://a/">http://a/</a>;</p> +<p><a href="http://a/">http://a/</a><</p> +<p><a href="http://a/=">http://a/=</a></p> +<p><a href="http://a/%3E">http://a/></a></p> +<p><a href="http://a/">http://a/</a>?</p> +<p><a href="http://a/@">http://a/@</a></p> +<p><a href="http://a/%5B">http://a/[</a></p> +<p><a href="http://a/%5C">http://a/\</a></p> +<p><a href="http://a/">http://a/</a>]</p> +<p><a href="http://a/%5E">http://a/^</a></p> +<p><a href="http://a/">http://a/</a>_</p> +<p><a href="http://a/%60">http://a/`</a></p> +<p><a href="http://a/%7B">http://a/{</a></p> +<p><a href="http://a/%7C">http://a/|</a></p> +<p><a href="http://a/%7D">http://a/}</a></p> +<p><a href="http://a/">http://a/</a>~</p> +"###, + "should match protocol path start like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"[www.example.com/a©](#) + +www.example.com/a© + +[www.example.com/a&bogus;](#) 
+ +www.example.com/a&bogus; + +[www.example.com/a\.](#) + +www.example.com/a\. +"###, + &gfm + ), + r###"<p><a href="#">www.example.com/a©</a></p> +<p><a href="http://www.example.com/a">www.example.com/a</a>©</p> +<p><a href="#">www.example.com/a&bogus;</a></p> +<p><a href="http://www.example.com/a">www.example.com/a</a>&bogus;</p> +<p><a href="#">www.example.com/a\.</a></p> +<p><a href="http://www.example.com/a%5C">www.example.com/a\</a>.</p> +"###, + "should match links, autolink literals, and characters like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# “character reference” + +www.a/b&c (space) + +www.a/b&c! + +www.a/b&c" + +www.a/b&c# + +www.a/b&c$ + +www.a/b&c% + +www.a/b&c& + +www.a/b&c' + +www.a/b&c( + +www.a/b&c) + +www.a/b&c* + +www.a/b&c+ + +www.a/b&c, + +www.a/b&c- + +www.a/b&c + +www.a/b&c. + +www.a/b&c/ + +www.a/b&c: + +www.a/b&c; + +www.a/b&c< + +www.a/b&c= + +www.a/b&c> + +www.a/b&c? + +www.a/b&c@ + +www.a/b&c[ + +www.a/b&c\ + +www.a/b&c] + +www.a/b&c^ + +www.a/b&c_ + +www.a/b&c` + +www.a/b&c{ + +www.a/b&c| + +www.a/b&c} + +www.a/b&c~ +"###, + &gfm + ), + r###"<h1>“character reference”</h1> +<p><a href="http://www.a/b&c">www.a/b&c</a> (space)</p> +<p><a href="http://www.a/b&c">www.a/b&c</a>!</p> +<p><a href="http://www.a/b&c">www.a/b&c</a>"</p> +<p><a href="http://www.a/b&c#">www.a/b&c#</a></p> +<p><a href="http://www.a/b&c$">www.a/b&c$</a></p> +<p><a href="http://www.a/b&c%25">www.a/b&c%</a></p> +<p><a href="http://www.a/b&c&">www.a/b&c&</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>'</p> +<p><a href="http://www.a/b&c(">www.a/b&c(</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>)</p> +<p><a href="http://www.a/b&c">www.a/b&c</a>*</p> +<p><a href="http://www.a/b&c+">www.a/b&c+</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>,</p> +<p><a href="http://www.a/b&c-">www.a/b&c-</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>.</p> +<p><a 
href="http://www.a/b&c/">www.a/b&c/</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>:</p> +<p><a href="http://www.a/b">www.a/b</a>&c;</p> +<p><a href="http://www.a/b&c">www.a/b&c</a><</p> +<p><a href="http://www.a/b&c=">www.a/b&c=</a></p> +<p><a href="http://www.a/b&c%3E">www.a/b&c></a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>?</p> +<p><a href="http://www.a/b&c@">www.a/b&c@</a></p> +<p><a href="http://www.a/b&c%5B">www.a/b&c[</a></p> +<p><a href="http://www.a/b&c%5C">www.a/b&c\</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>]</p> +<p><a href="http://www.a/b&c%5E">www.a/b&c^</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>_</p> +<p><a href="http://www.a/b&c%60">www.a/b&c`</a></p> +<p><a href="http://www.a/b&c%7B">www.a/b&c{</a></p> +<p><a href="http://www.a/b&c%7C">www.a/b&c|</a></p> +<p><a href="http://www.a/b&c%7D">www.a/b&c}</a></p> +<p><a href="http://www.a/b&c">www.a/b&c</a>~</p> +"###, + "should match character reference-like (named) things in paths like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# “character reference” + +www.a/b# (space) + +www.a/b#! + +www.a/b#" + +www.a/b## + +www.a/b#$ + +www.a/b#% + +www.a/b#& + +www.a/b#' + +www.a/b#( + +www.a/b#) + +www.a/b#* + +www.a/b#+ + +www.a/b#, + +www.a/b#- + +www.a/b# + +www.a/b#. + +www.a/b#/ + +www.a/b#: + +www.a/b# + +www.a/b#< + +www.a/b#= + +www.a/b#> + +www.a/b#? 
+ +www.a/b#@ + +www.a/b#[ + +www.a/b#\ + +www.a/b#] + +www.a/b#^ + +www.a/b#_ + +www.a/b#` + +www.a/b#{ + +www.a/b#| + +www.a/b#} + +www.a/b#~ +"###, + &gfm + ), + r###"<h1>“character reference”</h1> +<p><a href="http://www.a/b&#35">www.a/b&#35</a> (space)</p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>!</p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>"</p> +<p><a href="http://www.a/b&#35#">www.a/b&#35#</a></p> +<p><a href="http://www.a/b&#35$">www.a/b&#35$</a></p> +<p><a href="http://www.a/b&#35%25">www.a/b&#35%</a></p> +<p><a href="http://www.a/b&#35&">www.a/b&#35&</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>'</p> +<p><a href="http://www.a/b&#35(">www.a/b&#35(</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>)</p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>*</p> +<p><a href="http://www.a/b&#35+">www.a/b&#35+</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>,</p> +<p><a href="http://www.a/b&#35-">www.a/b&#35-</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>.</p> +<p><a href="http://www.a/b&#35/">www.a/b&#35/</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>:</p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>;</p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a><</p> +<p><a href="http://www.a/b&#35=">www.a/b&#35=</a></p> +<p><a href="http://www.a/b&#35%3E">www.a/b&#35></a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>?</p> +<p><a href="http://www.a/b&#35@">www.a/b&#35@</a></p> +<p><a href="http://www.a/b&#35%5B">www.a/b&#35[</a></p> +<p><a href="http://www.a/b&#35%5C">www.a/b&#35\</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>]</p> +<p><a href="http://www.a/b&#35%5E">www.a/b&#35^</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>_</p> +<p><a href="http://www.a/b&#35%60">www.a/b&#35`</a></p> +<p><a href="http://www.a/b&#35%7B">www.a/b&#35{</a></p> +<p><a href="http://www.a/b&#35%7C">www.a/b&#35|</a></p> +<p><a 
href="http://www.a/b&#35%7D">www.a/b&#35}</a></p> +<p><a href="http://www.a/b&#35">www.a/b&#35</a>~</p> +"###, + "should match character reference-like (numeric) things in paths like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"In autolink literal path or link end? + +[https://a.com/d]() + +[http://a.com/d]() + +[www.a.com/d]() + +https://a.com/d]() + +http://a.com/d]() + +www.a.com/d]() + +In autolink literal search or link end? + +[https://a.com?d]() + +[http://a.com?d]() + +[www.a.com?d]() + +https://a.com?d]() + +http://a.com?d]() + +www.a.com?d]() + +In autolink literal hash or link end? + +[https://a.com#d]() + +[http://a.com#d]() + +[www.a.com#d]() + +https://a.com#d]() + +http://a.com#d]() + +www.a.com#d]() +"###, + &gfm + ), + r###"<p>In autolink literal path or link end?</p> +<p><a href="">https://a.com/d</a></p> +<p><a href="">http://a.com/d</a></p> +<p><a href="">www.a.com/d</a></p> +<p><a href="https://a.com/d">https://a.com/d</a>]()</p> +<p><a href="http://a.com/d">http://a.com/d</a>]()</p> +<p><a href="http://www.a.com/d">www.a.com/d</a>]()</p> +<p>In autolink literal search or link end?</p> +<p><a href="">https://a.com?d</a></p> +<p><a href="">http://a.com?d</a></p> +<p><a href="">www.a.com?d</a></p> +<p><a href="https://a.com?d">https://a.com?d</a>]()</p> +<p><a href="http://a.com?d">http://a.com?d</a>]()</p> +<p><a href="http://www.a.com?d">www.a.com?d</a>]()</p> +<p>In autolink literal hash or link end?</p> +<p><a href="">https://a.com#d</a></p> +<p><a href="">http://a.com#d</a></p> +<p><a href="">www.a.com#d</a></p> +<p><a href="https://a.com#d">https://a.com#d</a>]()</p> +<p><a href="http://a.com#d">http://a.com#d</a>]()</p> +<p><a href="http://www.a.com#d">www.a.com#d</a>]()</p> +"###, + "should match path or link end like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"Last non-markdown ASCII whitespace (FF): noreply@example.com, http://example.com, https://example.com, 
www.example.com + +Last non-whitespace ASCII control (US): noreply@example.com, http://example.com, https://example.com, www.example.com + +First punctuation after controls: !noreply@example.com, !http://example.com, !https://example.com, !www.example.com + +Last punctuation before digits: /noreply@example.com, /http://example.com, /https://example.com, /www.example.com + +First digit: 0noreply@example.com, 0http://example.com, 0https://example.com, 0www.example.com + +First punctuation after digits: :noreply@example.com, :http://example.com, :https://example.com, :www.example.com + +Last punctuation before caps: @noreply@example.com, @http://example.com, @https://example.com, @www.example.com + +First uppercase: Anoreply@example.com, Ahttp://example.com, Ahttps://example.com, Awww.example.com + +Punctuation after uppercase: \noreply@example.com, \http://example.com, \https://example.com, \www.example.com + +Last punctuation before lowercase (1): `noreply@example.com; + +(2) `http://example.com; + +(3) `https://example.com; + +(4) `www.example.com; (broken up to prevent code from forming) + +First lowercase: anoreply@example.com, ahttp://example.com, ahttps://example.com, awww.example.com + +First punctuation after lowercase: {noreply@example.com, {http://example.com, {https://example.com, {www.example.com + +Last punctuation: ~noreply@example.com, ~http://example.com, ~https://example.com, ~www.example.com + +First non-ASCII unicode whitespace (0x80): Â…noreply@example.com, Â…http://example.com, Â…https://example.com, Â…www.example.com + +Last non-ASCII unicode whitespace (0x3000):  noreply@example.com,  http://example.com,  https://example.com,  www.example.com + +First non-ASCII punctuation: ¡noreply@example.com, ¡http://example.com, ¡https://example.com, ¡www.example.com + +Last non-ASCII punctuation: ・noreply@example.com, ・http://example.com, ・https://example.com, ・www.example.com + +Some non-ascii: ä¸noreply@example.com, ä¸http://example.com, 
ä¸https://example.com, ä¸www.example.com + +Some more non-ascii: 🤷‍noreply@example.com, 🤷‍http://example.com, 🤷‍https://example.com, 🤷‍www.example.com +"###, + &gfm + ), + r###"<p>Last non-markdown ASCII whitespace (FF): <a href="mailto:noreply@example.com">noreply@example.com</a>, <a href="http://example.com">http://example.com</a>, <a href="https://example.com">https://example.com</a>, www.example.com</p> +<p>Last non-whitespace ASCII control (US): <a href="mailto:noreply@example.com">noreply@example.com</a>, <a href="http://example.com">http://example.com</a>, <a href="https://example.com">https://example.com</a>, www.example.com</p> +<p>First punctuation after controls: !<a href="mailto:noreply@example.com">noreply@example.com</a>, !<a href="http://example.com">http://example.com</a>, !<a href="https://example.com">https://example.com</a>, !www.example.com</p> +<p>Last punctuation before digits: /noreply@example.com, /<a href="http://example.com">http://example.com</a>, /<a href="https://example.com">https://example.com</a>, /www.example.com</p> +<p>First digit: <a href="mailto:0noreply@example.com">0noreply@example.com</a>, 0<a href="http://example.com">http://example.com</a>, 0<a href="https://example.com">https://example.com</a>, 0www.example.com</p> +<p>First punctuation after digits: :<a href="mailto:noreply@example.com">noreply@example.com</a>, :<a href="http://example.com">http://example.com</a>, :<a href="https://example.com">https://example.com</a>, :www.example.com</p> +<p>Last punctuation before caps: @<a href="mailto:noreply@example.com">noreply@example.com</a>, @<a href="http://example.com">http://example.com</a>, @<a href="https://example.com">https://example.com</a>, @www.example.com</p> +<p>First uppercase: <a href="mailto:Anoreply@example.com">Anoreply@example.com</a>, Ahttp://example.com, Ahttps://example.com, Awww.example.com</p> +<p>Punctuation after uppercase: \<a href="mailto:noreply@example.com">noreply@example.com</a>, \<a 
href="http://example.com">http://example.com</a>, \<a href="https://example.com">https://example.com</a>, \www.example.com</p> +<p>Last punctuation before lowercase (1): `<a href="mailto:noreply@example.com">noreply@example.com</a>;</p> +<p>(2) `<a href="http://example.com">http://example.com</a>;</p> +<p>(3) `<a href="https://example.com">https://example.com</a>;</p> +<p>(4) `www.example.com; (broken up to prevent code from forming)</p> +<p>First lowercase: <a href="mailto:anoreply@example.com">anoreply@example.com</a>, ahttp://example.com, ahttps://example.com, awww.example.com</p> +<p>First punctuation after lowercase: {<a href="mailto:noreply@example.com">noreply@example.com</a>, {<a href="http://example.com">http://example.com</a>, {<a href="https://example.com">https://example.com</a>, {www.example.com</p> +<p>Last punctuation: ~<a href="mailto:noreply@example.com">noreply@example.com</a>, ~<a href="http://example.com">http://example.com</a>, ~<a href="https://example.com">https://example.com</a>, ~<a href="http://www.example.com">www.example.com</a></p> +<p>First non-ASCII unicode whitespace (0x80): Â…<a href="mailto:noreply@example.com">noreply@example.com</a>, Â…<a href="http://example.com">http://example.com</a>, Â…<a href="https://example.com">https://example.com</a>, Â…www.example.com</p> +<p>Last non-ASCII unicode whitespace (0x3000):  <a href="mailto:noreply@example.com">noreply@example.com</a>,  <a href="http://example.com">http://example.com</a>,  <a href="https://example.com">https://example.com</a>,  www.example.com</p> +<p>First non-ASCII punctuation: ¡<a href="mailto:noreply@example.com">noreply@example.com</a>, ¡<a href="http://example.com">http://example.com</a>, ¡<a href="https://example.com">https://example.com</a>, ¡www.example.com</p> +<p>Last non-ASCII punctuation: ・<a href="mailto:noreply@example.com">noreply@example.com</a>, ・<a href="http://example.com">http://example.com</a>, ・<a href="https://example.com">https://example.com</a>, 
・www.example.com</p> +<p>Some non-ascii: ä¸<a href="mailto:noreply@example.com">noreply@example.com</a>, ä¸<a href="http://example.com">http://example.com</a>, ä¸<a href="https://example.com">https://example.com</a>, ä¸www.example.com</p> +<p>Some more non-ascii: 🤷‍<a href="mailto:noreply@example.com">noreply@example.com</a>, 🤷‍<a href="http://example.com">http://example.com</a>, 🤷‍<a href="https://example.com">https://example.com</a>, 🤷‍www.example.com</p> +"###, + "should match previous (complex) like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# HTTP + +https://a.b can start after EOF + +Can start after EOL: +https://a.b + +Can start after tab: https://a.b. + +Can start after space: https://a.b. + +Can start after left paren (https://a.b. + +Can start after asterisk *https://a.b. + +Can start after underscore *_https://a.b. + +Can start after tilde ~https://a.b. + +# www + +www.a.b can start after EOF + +Can start after EOL: +www.a.b + +Can start after tab: www.a.b. + +Can start after space: www.a.b. + +Can start after left paren (www.a.b. + +Can start after asterisk *www.a.b. + +Can start after underscore *_www.a.b. + +Can start after tilde ~www.a.b. + +# Email + +## Correct character before + +a@b.c can start after EOF + +Can start after EOL: +a@b.c + +Can start after tab: a@b.c. + +Can start after space: a@b.c. + +Can start after left paren(a@b.c. + +Can start after asterisk*a@b.c. + +While theoretically it’s possible to start at an underscore, that underscore +is part of the email, so it’s in fact part of the link: _a@b.c. + +Can start after tilde~a@b.c. + +## Others characters before + +While other characters before the email aren’t allowed by GFM, they work on +github.com: !a@b.c, "a@b.c, #a@b.c, $a@b.c, &a@b.c, 'a@b.c, )a@b.c, +a@b.c, +,a@b.c, -a@b.c, .a@b.c, /a@b.c, :a@b.c, ;a@b.c, <a@b.c, =a@b.c, >a@b.c, ?a@b.c, +@a@b.c, \a@b.c, ]a@b.c, ^a@b.c, `a@b.c, {a@b.c, }a@b.c. 
+ +## Commas + +See `https://github.com/remarkjs/remark/discussions/678`. + +,https://github.com + +[ ,https://github.com + +[asd] ,https://github.com +"###, + &gfm + ), + r###"<h1>HTTP</h1> +<p><a href="https://a.b">https://a.b</a> can start after EOF</p> +<p>Can start after EOL: +<a href="https://a.b">https://a.b</a></p> +<p>Can start after tab: <a href="https://a.b">https://a.b</a>.</p> +<p>Can start after space: <a href="https://a.b">https://a.b</a>.</p> +<p>Can start after left paren (<a href="https://a.b">https://a.b</a>.</p> +<p>Can start after asterisk *<a href="https://a.b">https://a.b</a>.</p> +<p>Can start after underscore *_<a href="https://a.b">https://a.b</a>.</p> +<p>Can start after tilde ~<a href="https://a.b">https://a.b</a>.</p> +<h1>www</h1> +<p><a href="http://www.a.b">www.a.b</a> can start after EOF</p> +<p>Can start after EOL: +<a href="http://www.a.b">www.a.b</a></p> +<p>Can start after tab: <a href="http://www.a.b">www.a.b</a>.</p> +<p>Can start after space: <a href="http://www.a.b">www.a.b</a>.</p> +<p>Can start after left paren (<a href="http://www.a.b">www.a.b</a>.</p> +<p>Can start after asterisk *<a href="http://www.a.b">www.a.b</a>.</p> +<p>Can start after underscore *_<a href="http://www.a.b">www.a.b</a>.</p> +<p>Can start after tilde ~<a href="http://www.a.b">www.a.b</a>.</p> +<h1>Email</h1> +<h2>Correct character before</h2> +<p><a href="mailto:a@b.c">a@b.c</a> can start after EOF</p> +<p>Can start after EOL: +<a href="mailto:a@b.c">a@b.c</a></p> +<p>Can start after tab: <a href="mailto:a@b.c">a@b.c</a>.</p> +<p>Can start after space: <a href="mailto:a@b.c">a@b.c</a>.</p> +<p>Can start after left paren(<a href="mailto:a@b.c">a@b.c</a>.</p> +<p>Can start after asterisk*<a href="mailto:a@b.c">a@b.c</a>.</p> +<p>While theoretically it’s possible to start at an underscore, that underscore +is part of the email, so it’s in fact part of the link: <a href="mailto:_a@b.c">_a@b.c</a>.</p> +<p>Can start after tilde~<a 
href="mailto:a@b.c">a@b.c</a>.</p> +<h2>Others characters before</h2> +<p>While other characters before the email aren’t allowed by GFM, they work on +github.com: !<a href="mailto:a@b.c">a@b.c</a>, "<a href="mailto:a@b.c">a@b.c</a>, #<a href="mailto:a@b.c">a@b.c</a>, $<a href="mailto:a@b.c">a@b.c</a>, &<a href="mailto:a@b.c">a@b.c</a>, '<a href="mailto:a@b.c">a@b.c</a>, )<a href="mailto:a@b.c">a@b.c</a>, <a href="mailto:+a@b.c">+a@b.c</a>, +,<a href="mailto:a@b.c">a@b.c</a>, <a href="mailto:-a@b.c">-a@b.c</a>, <a href="mailto:.a@b.c">.a@b.c</a>, /a@b.c, :<a href="mailto:a@b.c">a@b.c</a>, ;<a href="mailto:a@b.c">a@b.c</a>, <<a href="mailto:a@b.c">a@b.c</a>, =<a href="mailto:a@b.c">a@b.c</a>, ><a href="mailto:a@b.c">a@b.c</a>, ?<a href="mailto:a@b.c">a@b.c</a>, +@<a href="mailto:a@b.c">a@b.c</a>, \<a href="mailto:a@b.c">a@b.c</a>, ]<a href="mailto:a@b.c">a@b.c</a>, ^<a href="mailto:a@b.c">a@b.c</a>, `<a href="mailto:a@b.c">a@b.c</a>, {<a href="mailto:a@b.c">a@b.c</a>, }<a href="mailto:a@b.c">a@b.c</a>.</p> +<h2>Commas</h2> +<p>See <code>https://github.com/remarkjs/remark/discussions/678</code>.</p> +<p>,<a href="https://github.com">https://github.com</a></p> +<p>[ ,<a href="https://github.com">https://github.com</a></p> +<p>[asd] ,<a href="https://github.com">https://github.com</a></p> +"###, + "should match previous like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf 2? + +www.a (space) + +www.a! + +www.a" + +www.a# + +www.a$ + +www.a% + +www.a& + +www.a' + +www.a( + +www.a) + +www.a* + +www.a+ + +www.a, + +www.a- + +www.a + +www.a. + +www.a/ + +www.a: + +www.a; + +www.a< + +www.a= + +www.a> + +www.a? 
+ +www.a@ + +www.a[ + +www.a\ + +www.a] + +www.a^ + +www.a_ + +www.a` + +www.a{ + +www.a| + +www.a} + +www.a~ +"###, + &gfm + ), + r###"<h1>wwwtf 2?</h1> +<p><a href="http://www.a">www.a</a> (space)</p> +<p><a href="http://www.a">www.a</a>!</p> +<p><a href="http://www.a">www.a</a>"</p> +<p><a href="http://www.a#">www.a#</a></p> +<p><a href="http://www.a$">www.a$</a></p> +<p><a href="http://www.a%25">www.a%</a></p> +<p><a href="http://www.a&">www.a&</a></p> +<p><a href="http://www.a">www.a</a>'</p> +<p><a href="http://www.a(">www.a(</a></p> +<p><a href="http://www.a">www.a</a>)</p> +<p><a href="http://www.a">www.a</a>*</p> +<p><a href="http://www.a+">www.a+</a></p> +<p><a href="http://www.a">www.a</a>,</p> +<p><a href="http://www.a-">www.a-</a></p> +<p><a href="http://www.a">www.a</a></p> +<p><a href="http://www.a">www.a</a>.</p> +<p><a href="http://www.a/">www.a/</a></p> +<p><a href="http://www.a">www.a</a>:</p> +<p><a href="http://www.a">www.a</a>;</p> +<p><a href="http://www.a">www.a</a><</p> +<p><a href="http://www.a=">www.a=</a></p> +<p><a href="http://www.a%3E">www.a></a></p> +<p><a href="http://www.a">www.a</a>?</p> +<p><a href="http://www.a@">www.a@</a></p> +<p><a href="http://www.a%5B">www.a[</a></p> +<p><a href="http://www.a%5C">www.a\</a></p> +<p><a href="http://www.a">www.a</a>]</p> +<p><a href="http://www.a%5E">www.a^</a></p> +<p><a href="http://www.a">www.a</a>_</p> +<p><a href="http://www.a%60">www.a`</a></p> +<p><a href="http://www.a%7B">www.a{</a></p> +<p><a href="http://www.a%7C">www.a|</a></p> +<p><a href="http://www.a%7D">www.a}</a></p> +<p><a href="http://www.a">www.a</a>~</p> +"###, + "should match www (domain continue) like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf 5? + +www.a. (space) + +www.a.! + +www.a." + +www.a.# + +www.a.$ + +www.a.% + +www.a.& + +www.a.' + +www.a.( + +www.a.) + +www.a.* + +www.a.+ + +www.a., + +www.a.- + +www.a. + +www.a.. 
+ +www.a./ + +www.a.: + +www.a.; + +www.a.< + +www.a.= + +www.a.> + +www.a.? + +www.a.@ + +www.a.[ + +www.a.\ + +www.a.] + +www.a.^ + +www.a._ + +www.a.` + +www.a.{ + +www.a.| + +www.a.} + +www.a.~ +"###, + &gfm + ), + r###"<h1>wwwtf 5?</h1> +<p><a href="http://www.a">www.a</a>. (space)</p> +<p><a href="http://www.a">www.a</a>.!</p> +<p><a href="http://www.a">www.a</a>."</p> +<p><a href="http://www.a.#">www.a.#</a></p> +<p><a href="http://www.a.$">www.a.$</a></p> +<p><a href="http://www.a.%25">www.a.%</a></p> +<p><a href="http://www.a.&">www.a.&</a></p> +<p><a href="http://www.a">www.a</a>.'</p> +<p><a href="http://www.a.(">www.a.(</a></p> +<p><a href="http://www.a">www.a</a>.)</p> +<p><a href="http://www.a">www.a</a>.*</p> +<p><a href="http://www.a.+">www.a.+</a></p> +<p><a href="http://www.a">www.a</a>.,</p> +<p><a href="http://www.a.-">www.a.-</a></p> +<p><a href="http://www.a">www.a</a>.</p> +<p><a href="http://www.a">www.a</a>..</p> +<p><a href="http://www.a./">www.a./</a></p> +<p><a href="http://www.a">www.a</a>.:</p> +<p><a href="http://www.a">www.a</a>.;</p> +<p><a href="http://www.a">www.a</a>.<</p> +<p><a href="http://www.a.=">www.a.=</a></p> +<p><a href="http://www.a.%3E">www.a.></a></p> +<p><a href="http://www.a">www.a</a>.?</p> +<p><a href="http://www.a.@">www.a.@</a></p> +<p><a href="http://www.a.%5B">www.a.[</a></p> +<p><a href="http://www.a.%5C">www.a.\</a></p> +<p><a href="http://www.a">www.a</a>.]</p> +<p><a href="http://www.a.%5E">www.a.^</a></p> +<p><a href="http://www.a">www.a</a>._</p> +<p><a href="http://www.a.%60">www.a.`</a></p> +<p><a href="http://www.a.%7B">www.a.{</a></p> +<p><a href="http://www.a.%7C">www.a.|</a></p> +<p><a href="http://www.a.%7D">www.a.}</a></p> +<p><a href="http://www.a">www.a</a>.~</p> +"###, + "should match www (domain dot) like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf? + +www. (space) + +www.! + +www." + +www.# + +www.$ + +www.% + +www.& + +www.' 
+ +www.( + +www.) + +www.* + +www.+ + +www., + +www.- + +www. + +www.. + +www./ + +www.: + +www.; + +www.< + +www.= + +www.> + +www.? + +www.@ + +www.[ + +www.\ + +www.] + +www.^ + +www._ + +www.` + +www.{ + +www.| + +www.} + +www.~ +"###, + &gfm + ), + r###"<h1>wwwtf?</h1> +<p><a href="http://www">www</a>. (space)</p> +<p><a href="http://www">www</a>.!</p> +<p><a href="http://www">www</a>."</p> +<p><a href="http://www.#">www.#</a></p> +<p><a href="http://www.$">www.$</a></p> +<p><a href="http://www.%25">www.%</a></p> +<p><a href="http://www.&">www.&</a></p> +<p><a href="http://www">www</a>.'</p> +<p><a href="http://www.(">www.(</a></p> +<p><a href="http://www">www</a>.)</p> +<p><a href="http://www">www</a>.*</p> +<p><a href="http://www.+">www.+</a></p> +<p><a href="http://www">www</a>.,</p> +<p><a href="http://www.-">www.-</a></p> +<p>www.</p> +<p><a href="http://www">www</a>..</p> +<p><a href="http://www./">www./</a></p> +<p><a href="http://www">www</a>.:</p> +<p><a href="http://www">www</a>.;</p> +<p><a href="http://www">www</a>.<</p> +<p><a href="http://www.=">www.=</a></p> +<p><a href="http://www.%3E">www.></a></p> +<p><a href="http://www">www</a>.?</p> +<p><a href="http://www.@">www.@</a></p> +<p><a href="http://www.%5B">www.[</a></p> +<p><a href="http://www.%5C">www.\</a></p> +<p><a href="http://www">www</a>.]</p> +<p><a href="http://www.%5E">www.^</a></p> +<p><a href="http://www">www</a>._</p> +<p><a href="http://www.%60">www.`</a></p> +<p><a href="http://www.%7B">www.{</a></p> +<p><a href="http://www.%7C">www.|</a></p> +<p><a href="http://www.%7D">www.}</a></p> +<p><a href="http://www">www</a>.~</p> +"###, + "should match www (domain start) like GitHub does" + ); + + assert_eq!( + micromark_with_options( + r###"# wwwtf? (4) + +www.a/b (space) + +www.a/b! + +www.a/b" + +www.a/b# + +www.a/b$ + +www.a/b% + +www.a/b& + +www.a/b' + +www.a/b( + +www.a/b) + +www.a/b* + +www.a/b+ + +www.a/b, + +www.a/b- + +www.a/b + +www.a/b. 
+ +www.a/b/ + +www.a/b: + +www.a/b; + +www.a/b< + +www.a/b= + +www.a/b> + +www.a/b? + +www.a/b@ + +www.a/b[ + +www.a/b\ + +www.a/b] + +www.a/b^ + +www.a/b_ + +www.a/b` + +www.a/b{ + +www.a/b| + +www.a/b} + +www.a/b~ +"###, + &gfm + ), + r###"<h1>wwwtf? (4)</h1> +<p><a href="http://www.a/b">www.a/b</a> (space)</p> +<p><a href="http://www.a/b">www.a/b</a>!</p> +<p><a href="http://www.a/b">www.a/b</a>"</p> +<p><a href="http://www.a/b#">www.a/b#</a></p> +<p><a href="http://www.a/b$">www.a/b$</a></p> +<p><a href="http://www.a/b%25">www.a/b%</a></p> +<p><a href="http://www.a/b&">www.a/b&</a></p> +<p><a href="http://www.a/b">www.a/b</a>'</p> +<p><a href="http://www.a/b(">www.a/b(</a></p> +<p><a href="http://www.a/b">www.a/b</a>)</p> +<p><a href="http://www.a/b">www.a/b</a>*</p> +<p><a href="http://www.a/b+">www.a/b+</a></p> +<p><a href="http://www.a/b">www.a/b</a>,</p> +<p><a href="http://www.a/b-">www.a/b-</a></p> +<p><a href="http://www.a/b">www.a/b</a></p> +<p><a href="http://www.a/b">www.a/b</a>.</p> +<p><a href="http://www.a/b/">www.a/b/</a></p> +<p><a href="http://www.a/b">www.a/b</a>:</p> +<p><a href="http://www.a/b">www.a/b</a>;</p> +<p><a href="http://www.a/b">www.a/b</a><</p> +<p><a href="http://www.a/b=">www.a/b=</a></p> +<p><a href="http://www.a/b%3E">www.a/b></a></p> +<p><a href="http://www.a/b">www.a/b</a>?</p> +<p><a href="http://www.a/b@">www.a/b@</a></p> +<p><a href="http://www.a/b%5B">www.a/b[</a></p> +<p><a href="http://www.a/b%5C">www.a/b\</a></p> +<p><a href="http://www.a/b">www.a/b</a>]</p> +<p><a href="http://www.a/b%5E">www.a/b^</a></p> +<p><a href="http://www.a/b">www.a/b</a>_</p> +<p><a href="http://www.a/b%60">www.a/b`</a></p> +<p><a href="http://www.a/b%7B">www.a/b{</a></p> +<p><a href="http://www.a/b%7C">www.a/b|</a></p> +<p><a href="http://www.a/b%7D">www.a/b}</a></p> +<p><a href="http://www.a/b">www.a/b</a>~</p> +"###, + "should match www (path continue) like GitHub does (except for the bracket bug)" + ); + + assert_eq!( + 
micromark_with_options( + r###"# wwwtf? (3) + +www.a/ (space) + +www.a/! + +www.a/" + +www.a/# + +www.a/$ + +www.a/% + +www.a/& + +www.a/' + +www.a/( + +www.a/) + +www.a/* + +www.a/+ + +www.a/, + +www.a/- + +www.a/ + +www.a/. + +www.a// + +www.a/: + +www.a/; + +www.a/< + +www.a/= + +www.a/> + +www.a/? + +www.a/@ + +www.a/[ + +www.a/\ + +www.a/] + +www.a/^ + +www.a/_ + +www.a/` + +www.a/{ + +www.a/| + +www.a/} + +www.a/~ +"###, + &gfm + ), + r###"<h1>wwwtf? (3)</h1> +<p><a href="http://www.a/">www.a/</a> (space)</p> +<p><a href="http://www.a/">www.a/</a>!</p> +<p><a href="http://www.a/">www.a/</a>"</p> +<p><a href="http://www.a/#">www.a/#</a></p> +<p><a href="http://www.a/$">www.a/$</a></p> +<p><a href="http://www.a/%25">www.a/%</a></p> +<p><a href="http://www.a/&">www.a/&</a></p> +<p><a href="http://www.a/">www.a/</a>'</p> +<p><a href="http://www.a/(">www.a/(</a></p> +<p><a href="http://www.a/">www.a/</a>)</p> +<p><a href="http://www.a/">www.a/</a>*</p> +<p><a href="http://www.a/+">www.a/+</a></p> +<p><a href="http://www.a/">www.a/</a>,</p> +<p><a href="http://www.a/-">www.a/-</a></p> +<p><a href="http://www.a/">www.a/</a></p> +<p><a href="http://www.a/">www.a/</a>.</p> +<p><a href="http://www.a//">www.a//</a></p> +<p><a href="http://www.a/">www.a/</a>:</p> +<p><a href="http://www.a/">www.a/</a>;</p> +<p><a href="http://www.a/">www.a/</a><</p> +<p><a href="http://www.a/=">www.a/=</a></p> +<p><a href="http://www.a/%3E">www.a/></a></p> +<p><a href="http://www.a/">www.a/</a>?</p> +<p><a href="http://www.a/@">www.a/@</a></p> +<p><a href="http://www.a/%5B">www.a/[</a></p> +<p><a href="http://www.a/%5C">www.a/\</a></p> +<p><a href="http://www.a/">www.a/</a>]</p> +<p><a href="http://www.a/%5E">www.a/^</a></p> +<p><a href="http://www.a/">www.a/</a>_</p> +<p><a href="http://www.a/%60">www.a/`</a></p> +<p><a href="http://www.a/%7B">www.a/{</a></p> +<p><a href="http://www.a/%7C">www.a/|</a></p> +<p><a href="http://www.a/%7D">www.a/}</a></p> +<p><a 
href="http://www.a/">www.a/</a>~</p> +"###, + "should match www (path start) like GitHub does (except for the bracket bug)" ); } |