diff options
Diffstat (limited to '')
| -rw-r--r-- | src/compiler.rs | 37 | ||||
| -rw-r--r-- | src/construct/gfm_autolink_literal.rs | 848 | ||||
| -rw-r--r-- | src/construct/gfm_table.rs | 2 | ||||
| -rw-r--r-- | src/construct/text.rs | 20 | ||||
| -rw-r--r-- | src/state.rs | 60 | 
5 files changed, 805 insertions, 162 deletions
| diff --git a/src/compiler.rs b/src/compiler.rs index 681ec00..0ea1638 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -871,6 +871,7 @@ fn on_exit_autolink_email(context: &mut CompileContext) {              &Position::from_exit_event(context.events, context.index),          )          .as_str(), +        false,      );  } @@ -884,6 +885,7 @@ fn on_exit_autolink_protocol(context: &mut CompileContext) {              &Position::from_exit_event(context.events, context.index),          )          .as_str(), +        false,      );  } @@ -1154,6 +1156,7 @@ fn on_exit_gfm_autolink_literal_protocol(context: &mut CompileContext) {              &Position::from_exit_event(context.events, context.index),          )          .as_str(), +        true,      );  } @@ -1167,12 +1170,22 @@ fn on_exit_gfm_autolink_literal_www(context: &mut CompileContext) {              &Position::from_exit_event(context.events, context.index),          )          .as_str(), +        true,      );  }  /// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail].  fn on_exit_gfm_autolink_literal_email(context: &mut CompileContext) { -    on_exit_autolink_email(context); +    generate_autolink( +        context, +        Some("mailto:"), +        Slice::from_position( +            context.bytes, +            &Position::from_exit_event(context.events, context.index), +        ) +        .as_str(), +        true, +    );  }  /// Handle [`Exit`][Kind::Exit]:[`GfmFootnoteCall`][Name::GfmFootnoteCall]. @@ -1822,8 +1835,24 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) {  }  /// Generate an autolink (used by unicode autolinks and GFM autolink literals). 
-fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) { -    if !context.image_alt_inside { +fn generate_autolink( +    context: &mut CompileContext, +    protocol: Option<&str>, +    value: &str, +    is_gfm_literal: bool, +) { +    let mut is_in_link = false; +    let mut index = 0; + +    while index < context.media_stack.len() { +        if !context.media_stack[index].image { +            is_in_link = true; +            break; +        } +        index += 1; +    } + +    if !context.image_alt_inside && (!is_in_link || !is_gfm_literal) {          context.push("<a href=\"");          let url = if let Some(protocol) = protocol {              format!("{}{}", protocol, value) @@ -1843,7 +1872,7 @@ fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value      context.push(&encode(value, context.encode_html)); -    if !context.image_alt_inside { +    if !context.image_alt_inside && (!is_in_link || !is_gfm_literal) {          context.push("</a>");      }  } diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index 704c536..038330c 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -1,14 +1,621 @@ -//! To do. +//! GFM: autolink literal occurs in the [text][] content type. +//! +//! ## Grammar +//! +//! Autolink literals form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>): +//! +//! ```bnf +//! gfm_autolink_literal ::= gfm_protocol_autolink | gfm_www_autolink | gfm_email_autolink +//! +//! ; Restriction: the code before must be `www_autolink_before`. +//! ; Restriction: the code after `.` must not be eof. +//! www_autolink ::= 3('w' | 'W') '.' [domain [path]] +//! www_autolink_before ::= eof | eol | space_or_tab | '(' | '*' | '_' | '[' | ']' | '~' +//! +//! ; Restriction: the code before must be `http_autolink_before`. +//! 
; Restriction: the code after the protocol must be `http_autolink_protocol_after`. +//! http_autolink ::= ('h' | 'H') 2('t' | 'T') ('p' | 'P') ['s' | 'S'] ':' 2'/' domain [path] +//! http_autolink_before ::= byte - ascii_alpha +//! http_autolink_protocol_after ::= byte - eof - eol - ascii_control - unicode_whitespace - unicode_punctuation +//! +//! ; Restriction: the code before must be `email_autolink_before`. +//! ; Restriction: `ascii_digit` may not occur in the last label part of the label. +//! email_autolink ::= 1*('+' | '-' | '.' | '_' | ascii_alphanumeric) '@' 1*(1*label_segment label_dot_cont) 1*label_segment +//! email_autolink_before ::= byte - ascii_alpha - '/' +//! +//! ; Restriction: `_` may not occur in the last two domain parts. +//! domain ::= 1*(url_ampt_cont | domain_punct_cont | '-' | byte - eof - ascii_control - unicode_whitespace - unicode_punctuation) +//! ; Restriction: must not be followed by `punct`. +//! domain_punct_cont ::= '.' | '_' +//! ; Restriction: must not be followed by `char-ref`. +//! url_ampt_cont ::= '&' +//! +//! ; Restriction: a counter `balance = 0` is increased for every `(`, and decreased for every `)`. +//! ; Restriction: `)` must not be `paren_at_end`. +//! path ::= 1*(url_ampt_cont | path_punctuation_cont | '(' | ')' | byte - eof - eol - space_or_tab) +//! ; Restriction: must not be followed by `punct`. +//! path_punctuation_cont ::= trailing_punctuation - '<' +//! ; Restriction: must be followed by `punct` and `balance` must be less than `0`. +//! paren_at_end ::= ')' +//! +//! label_segment ::= label_dash_underscore_cont | ascii_alpha | ascii_digit +//! ; Restriction: if followed by `punct`, the whole email autolink is invalid. +//! label_dash_underscore_cont ::= '-' | '_' +//! ; Restriction: must not be followed by `punct`. +//! label_dot_cont ::= '.' +//! +//! punct ::= *trailing_punctuation ( byte - eof - eol - space_or_tab - '<' ) +//! char_ref ::= *ascii_alpha ';' path_end +//! trailing_punctuation ::= '!' 
| '"' | '\'' | ')' | '*' | ',' | '.' | ':' | ';' | '<' | '?' | '_' | '~' +//! ``` +//! +//! The grammar for GFM autolink literal is very relaxed: basically anything +//! except for whitespace is allowed after a prefix. +//! To use whitespace characters and otherwise impossible characters, in URLs, +//! you can use percent encoding: +//! +//! ```markdown +//! https://example.com/alpha%20bravo +//! ``` +//! +//! Yields: +//! +//! ```html +//! <p><a href="https://example.com/alpha%20bravo">https://example.com/alpha%20bravo</a></p> +//! ``` +//! +//! There are several cases where incorrect encoding of URLs would, in other +//! languages, result in a parse error. +//! In markdown, there are no errors, and URLs are normalized. +//! In addition, many characters are percent encoded +//! ([`sanitize_uri`][sanitize_uri]). +//! For example: +//! +//! ```markdown +//! www.a👍b% +//! ``` +//! +//! Yields: +//! +//! ```html +//! <p><a href="http://www.a%F0%9F%91%8Db%25">www.a👍b%</a></p> +//! ``` +//! +//! There is a big difference between how www and protocol literals work +//! compared to how email literals work. +//! The first two are done when parsing, and work like anything else in +//! markdown. +//! But email literals are handled afterwards: when everything is parsed, we +//! look back at the events to figure out if there were email addresses. +//! This particularly affects how they interleave with character escapes and +//! character references. +//! +//! ## HTML +//! +//! GFM autolink literals relate to the `<a>` element in HTML. +//! See [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info. +//! When an email autolink is used, the string `mailto:` is prepended when +//! generating the `href` attribute of the hyperlink. +//! When a www autolink is used, the string `http:` is prepended. +//! +//! ## Recommendation +//! +//! It is recommended to use labels ([label start link][label_start_link], +//! 
[label end][label_end]), either with a resource or a definition +//! ([definition][]), instead of autolink literals, as those allow relative +//! URLs and descriptive text to explain the URL in prose. +//! +//! ## Bugs +//! +//! GitHub’s own algorithm to parse autolink literals contains three bugs. +//! A smaller bug is left unfixed in this project for consistency. +//! Two main bugs are not present in this project. +//! The issues relating to autolink literals are: +//! +//! *   [GFM autolink extension (`www.`, `https?://` parts): links don’t work when after bracket](https://github.com/github/cmark-gfm/issues/278)\ +//!     fixed here ✅ +//! *   [GFM autolink extension (`www.` part): uppercase does not match on issues/PRs/comments](https://github.com/github/cmark-gfm/issues/280)\ +//!     fixed here ✅ +//! *   [GFM autolink extension (`www.` part): the word `www` matches](https://github.com/github/cmark-gfm/issues/279)\ +//!     present here for consistency +//! +//! ## Tokens +//! +//! *   [`GfmAutolinkLiteralProtocol`][Name::GfmAutolinkLiteralProtocol] +//! *   [`GfmAutolinkLiteralWww`][Name::GfmAutolinkLiteralWww] +//! *   [`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail] +//! +//! ## References +//! +//! *   [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal) +//! *   [*§ 6.9 Autolinks (extension)* in `GFM`](https://github.github.com/gfm/#autolinks-extension-) +//! +//! [text]: crate::construct::text +//! [definition]: crate::construct::definition +//! [attention]: crate::construct::attention +//! [label_start_link]: crate::construct::label_start_link +//! [label_end]: crate::construct::label_end +//! [sanitize_uri]: crate::util::sanitize_uri +//! 
[html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element  use crate::event::{Event, Kind, Name}; +use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer; -use crate::util::classify_character::{classify, Kind as CharacterKind}; -use crate::util::slice::{Position, Slice}; +use crate::util::{ +    classify_character::{classify_opt, Kind as CharacterKind}, +    slice::{char_after_index, Position, Slice}, +};  use alloc::vec::Vec; -use core::str; -// To do: doc al functions. +/// Start of protocol autolink literal. +/// +/// ```markdown +/// > | https://example.com/a?b#c +///     ^ +/// ``` +pub fn protocol_start(tokenizer: &mut Tokenizer) -> State { +    if tokenizer +        .parse_state +        .options +        .constructs +        .gfm_autolink_literal && +        matches!(tokenizer.current, Some(b'H' | b'h')) +            // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L214>. +            && !matches!(tokenizer.previous, Some(b'A'..=b'Z' | b'a'..=b'z')) +    { +        tokenizer.enter(Name::GfmAutolinkLiteralProtocol); +        tokenizer.attempt( +            State::Next(StateName::GfmAutolinkLiteralProtocolAfter), +            State::Nok, +        ); +        tokenizer.attempt( +            State::Next(StateName::GfmAutolinkLiteralDomainInside), +            State::Nok, +        ); +        tokenizer.tokenize_state.start = tokenizer.point.index; +        State::Retry(StateName::GfmAutolinkLiteralProtocolPrefixInside) +    } else { +        State::Nok +    } +} + +/// After a protocol autolink literal. +/// +/// ```markdown +/// > | https://example.com/a?b#c +///                              ^ +/// ``` +pub fn protocol_after(tokenizer: &mut Tokenizer) -> State { +    tokenizer.exit(Name::GfmAutolinkLiteralProtocol); +    State::Ok +} + +/// In protocol. 
+/// +/// ```markdown +/// > | https://example.com/a?b#c +///     ^^^^^ +/// ``` +pub fn protocol_prefix_inside(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'A'..=b'Z' | b'a'..=b'z') +            // `5` is size of `https` +            if tokenizer.point.index - tokenizer.tokenize_state.start < 5 => +        { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralProtocolPrefixInside) +        } +        Some(b':') => { +            let slice = Slice::from_indices( +                tokenizer.parse_state.bytes, +                tokenizer.tokenize_state.start, +                tokenizer.point.index, +            ); +            let name = slice.as_str().to_ascii_lowercase(); + +            tokenizer.tokenize_state.start = 0; + +            if name == "http" || name == "https" { +                tokenizer.consume(); +                State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside) +            } else { +                State::Nok +            } +        } +        _ => { +            tokenizer.tokenize_state.start = 0; +            State::Nok +        } +    } +} + +/// In protocol slashes. +/// +/// ```markdown +/// > | https://example.com/a?b#c +///           ^^ +/// ``` +pub fn protocol_slashes_inside(tokenizer: &mut Tokenizer) -> State { +    if tokenizer.current == Some(b'/') { +        tokenizer.consume(); +        if tokenizer.tokenize_state.size == 0 { +            tokenizer.tokenize_state.size += 1; +            State::Next(StateName::GfmAutolinkLiteralProtocolSlashesInside) +        } else { +            tokenizer.tokenize_state.size = 0; +            State::Ok +        } +    } else { +        tokenizer.tokenize_state.size = 0; +        State::Nok +    } +} +/// Start of www autolink literal. 
+/// +/// ```markdown +/// > | www.example.com/a?b#c +///     ^ +/// ``` +pub fn www_start(tokenizer: &mut Tokenizer) -> State { +    if tokenizer +        .parse_state +        .options +        .constructs +        .gfm_autolink_literal && +        matches!(tokenizer.current, Some(b'W' | b'w')) +            // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. +            && matches!(tokenizer.previous, None | Some(b'\t' | b'\n' | b' ' | b'(' | b'*' | b'_' | b'[' | b']' | b'~')) +    { +        tokenizer.enter(Name::GfmAutolinkLiteralWww); +        tokenizer.attempt( +            State::Next(StateName::GfmAutolinkLiteralWwwAfter), +            State::Nok, +        ); +        // Note: we *check*, so we can discard the `www.` we parsed. +        // If it worked, we consider it as a part of the domain. +        tokenizer.check( +            State::Next(StateName::GfmAutolinkLiteralDomainInside), +            State::Nok, +        ); +        State::Retry(StateName::GfmAutolinkLiteralWwwPrefixInside) +    } else { +        State::Nok +    } +} + +/// After a www autolink literal. +/// +/// ```markdown +/// > | www.example.com/a?b#c +///                          ^ +/// ``` +pub fn www_after(tokenizer: &mut Tokenizer) -> State { +    tokenizer.exit(Name::GfmAutolinkLiteralWww); +    State::Ok +} + +/// In www prefix. 
+/// +/// ```markdown +/// > | www.example.com +///     ^^^^ +/// ``` +pub fn www_prefix_inside(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'.') if tokenizer.tokenize_state.size == 3 => { +            tokenizer.tokenize_state.size = 0; +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralWwwPrefixAfter) +        } +        Some(b'W' | b'w') if tokenizer.tokenize_state.size < 3 => { +            tokenizer.tokenize_state.size += 1; +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralWwwPrefixInside) +        } +        _ => { +            tokenizer.tokenize_state.size = 0; +            State::Nok +        } +    } +} + +/// After www prefix. +/// +/// ```markdown +/// > | www.example.com +///         ^ +/// ``` +pub fn www_prefix_after(tokenizer: &mut Tokenizer) -> State { +    // If there is *anything*, we can link. +    if tokenizer.current == None { +        State::Nok +    } else { +        State::Ok +    } +} + +/// In domain. +/// +/// ```markdown +/// > | https://example.com/a +///             ^^^^^^^^^^^ +/// ``` +pub fn domain_inside(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Check whether this marker, which is a trailing punctuation +        // marker, optionally followed by more trailing markers, and then +        // followed by an end. +        Some(b'.' | b'_') => { +            tokenizer.check( +                State::Next(StateName::GfmAutolinkLiteralDomainAfter), +                State::Next(StateName::GfmAutolinkLiteralDomainAtPunctuation), +            ); +            State::Retry(StateName::GfmAutolinkLiteralTrail) +        } +        // Dashes and continuation bytes are fine. 
+        Some(b'-' | 0x80..=0xBF) => { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralDomainInside) +        } +        _ => { +            // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. +            if byte_to_kind( +                tokenizer.parse_state.bytes, +                tokenizer.point.index, +                tokenizer.current, +            ) == CharacterKind::Other +            { +                tokenizer.tokenize_state.seen = true; +                tokenizer.consume(); +                State::Next(StateName::GfmAutolinkLiteralDomainInside) +            } else { +                State::Retry(StateName::GfmAutolinkLiteralDomainAfter) +            } +        } +    } +} + +/// In domain, at potential trailing punctuation, that was not trailing. +/// +/// ```markdown +/// > | https://example.com +///                    ^ +/// ``` +pub fn domain_at_punctuation(tokenizer: &mut Tokenizer) -> State { +    // There is an underscore in the last segment of the domain +    if matches!(tokenizer.current, Some(b'_')) { +        tokenizer.tokenize_state.marker = b'_'; +    } +    // Otherwise, it’s a `.`: save the last segment underscore in the +    // penultimate segment slot. +    else { +        tokenizer.tokenize_state.marker_b = tokenizer.tokenize_state.marker; +        tokenizer.tokenize_state.marker = 0; +    } + +    tokenizer.consume(); +    State::Next(StateName::GfmAutolinkLiteralDomainInside) +} + +/// After domain +/// +/// ```markdown +/// > | https://example.com/a +///                        ^ +/// ``` +pub fn domain_after(tokenizer: &mut Tokenizer) -> State { +    // No underscores allowed in last two segments. +    let result = if tokenizer.tokenize_state.marker_b == b'_' +        || tokenizer.tokenize_state.marker == b'_' +        // At least one character must be seen. 
+        || !tokenizer.tokenize_state.seen +    // Note: that’s GH says a dot is needed, but it’s not true: +    // <https://github.com/github/cmark-gfm/issues/279> +    { +        State::Nok +    } else { +        State::Retry(StateName::GfmAutolinkLiteralPathInside) +    }; + +    tokenizer.tokenize_state.seen = false; +    tokenizer.tokenize_state.marker = 0; +    tokenizer.tokenize_state.marker_b = 0; +    result +} + +/// In path. +/// +/// ```markdown +/// > | https://example.com/a +///                        ^^ +/// ``` +pub fn path_inside(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Continuation bytes are fine, we’ve already checked the first one. +        Some(0x80..=0xBF) => { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralPathInside) +        } +        // Count opening parens. +        Some(b'(') => { +            tokenizer.tokenize_state.size += 1; +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralPathInside) +        } +        // Check whether this trailing punctuation marker is optionally +        // followed by more trailing markers, and then followed +        // by an end. +        // If this is a paren (followed by trailing, then the end), we +        // *continue* if we saw less closing parens than opening parens. +        Some( +            b'!' | b'"' | b'&' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';' | b'<' | b'?' 
+            | b']' | b'_' | b'~', +        ) => { +            let next = if tokenizer.current == Some(b')') +                && tokenizer.tokenize_state.size_b < tokenizer.tokenize_state.size +            { +                StateName::GfmAutolinkLiteralPathAtPunctuation +            } else { +                StateName::GfmAutolinkLiteralPathAfter +            }; +            tokenizer.check( +                State::Next(next), +                State::Next(StateName::GfmAutolinkLiteralPathAtPunctuation), +            ); +            State::Retry(StateName::GfmAutolinkLiteralTrail) +        } +        _ => { +            // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. +            if byte_to_kind( +                tokenizer.parse_state.bytes, +                tokenizer.point.index, +                tokenizer.current, +            ) == CharacterKind::Whitespace +            { +                State::Retry(StateName::GfmAutolinkLiteralPathAfter) +            } else { +                tokenizer.consume(); +                State::Next(StateName::GfmAutolinkLiteralPathInside) +            } +        } +    } +} + +/// In path, at potential trailing punctuation, that was not trailing. +/// +/// ```markdown +/// > | https://example.com/a"b +///                          ^ +/// ``` +pub fn path_at_punctuation(tokenizer: &mut Tokenizer) -> State { +    // Count closing parens. +    if tokenizer.current == Some(b')') { +        tokenizer.tokenize_state.size_b += 1; +    } + +    tokenizer.consume(); +    State::Next(StateName::GfmAutolinkLiteralPathInside) +} + +/// At end of path, reset parens. +/// +/// ```markdown +/// > | https://example.com/asd(qwe). +///                                 ^ +/// ``` +pub fn path_after(tokenizer: &mut Tokenizer) -> State { +    tokenizer.tokenize_state.size = 0; +    tokenizer.tokenize_state.size_b = 0; +    State::Ok +} + +/// In trail of domain or path. 
+/// +/// ```markdown +/// > | https://example.com"). +///                        ^ +/// ``` +pub fn trail(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Regular trailing punctuation. +        Some( +            b'!' | b'"' | b'\'' | b')' | b'*' | b',' | b'.' | b':' | b';' | b'?' | b'_' | b'~', +        ) => { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralTrail) +        } +        // `&` followed by one or more alphabeticals and then a `;`, is +        // as a whole considered as trailing punctuation. +        // In all other cases, it is considered as continuation of the URL. +        Some(b'&') => { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralTrailCharRefStart) +        } +        // `<` is an end. +        Some(b'<') => State::Ok, +        // Needed because we allow literals after `[`, as we fix: +        // <https://github.com/github/cmark-gfm/issues/278>. +        // Check that it is not followed by `(` or `[`. +        Some(b']') => { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralTrailBracketAfter) +        } +        _ => { +            // Whitespace is the end of the URL, anything else is continuation. +            if byte_to_kind( +                tokenizer.parse_state.bytes, +                tokenizer.point.index, +                tokenizer.current, +            ) == CharacterKind::Whitespace +            { +                State::Ok +            } else { +                State::Nok +            } +        } +    } +} + +/// In trail, after `]`. +/// +/// > 👉 **Note**: this deviates from `cmark-gfm` to fix a bug. +/// > See end of <https://github.com/github/cmark-gfm/issues/278> for more. 
+/// +/// ```markdown +/// > | https://example.com]( +///                         ^ +/// ``` +pub fn trail_bracket_after(tokenizer: &mut Tokenizer) -> State { +    // Whitespace or something that could start a resource or reference is the end. +    // Switch back to trail otherwise. +    if matches!( +        tokenizer.current, +        None | Some(b'\t' | b'\n' | b' ' | b'(' | b'[') +    ) { +        State::Ok +    } else { +        State::Retry(StateName::GfmAutolinkLiteralTrail) +    } +} + +/// In character-reference like trail, after `&`. +/// +/// ```markdown +/// > | https://example.com&). +///                         ^ +/// ``` +pub fn trail_char_ref_start(tokenizer: &mut Tokenizer) -> State { +    if matches!(tokenizer.current, Some(b'A'..=b'Z' | b'a'..=b'z')) { +        State::Retry(StateName::GfmAutolinkLiteralTrailCharRefInside) +    } else { +        State::Nok +    } +} + +/// In character-reference like trail. +/// +/// ```markdown +/// > | https://example.com&). +///                         ^ +/// ``` +pub fn trail_char_ref_inside(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'A'..=b'Z' | b'a'..=b'z') => { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralTrailCharRefInside) +        } +        // Switch back to trail if this is well-formed. +        Some(b';') => { +            tokenizer.consume(); +            State::Next(StateName::GfmAutolinkLiteralTrail) +        } +        _ => State::Nok, +    } +} + +/// Resolve: postprocess text to find email autolink literals.  
pub fn resolve(tokenizer: &mut Tokenizer) {      tokenizer.map.consume(&mut tokenizer.events); @@ -36,23 +643,30 @@ pub fn resolve(tokenizer: &mut Tokenizer) {                  let mut start = 0;                  while byte_index < bytes.len() { -                    if matches!(bytes[byte_index], b'H' | b'h' | b'W' | b'w' | b'@') { -                        if let Some(autolink) = peek(bytes, byte_index) { -                            byte_index = autolink.1; +                    if bytes[byte_index] == b'@' { +                        let mut range = (0, 0); + +                        if let Some(start) = peek_bytes_atext(bytes, byte_index) { +                            if let Some(end) = peek_bytes_email_domain(bytes, byte_index + 1) { +                                let end = peek_bytes_truncate(bytes, start, end); +                                range = (start, end); +                            } +                        } + +                        if range.1 != 0 { +                            byte_index = range.1;                              // If there is something between the last link                              // (or the start) and this link. 
-                            if start != autolink.0 { +                            if start != range.0 {                                  replace.push(Event {                                      kind: Kind::Enter,                                      name: Name::Data,                                      point: point.clone(),                                      link: None,                                  }); -                                point = point.shift_to( -                                    tokenizer.parse_state.bytes, -                                    start_index + autolink.0, -                                ); +                                point = point +                                    .shift_to(tokenizer.parse_state.bytes, start_index + range.0);                                  replace.push(Event {                                      kind: Kind::Exit,                                      name: Name::Data, @@ -64,19 +678,19 @@ pub fn resolve(tokenizer: &mut Tokenizer) {                              // Add the link.                              
replace.push(Event {                                  kind: Kind::Enter, -                                name: autolink.2.clone(), +                                name: Name::GfmAutolinkLiteralEmail,                                  point: point.clone(),                                  link: None,                              }); -                            point = point -                                .shift_to(tokenizer.parse_state.bytes, start_index + autolink.1); +                            point = +                                point.shift_to(tokenizer.parse_state.bytes, start_index + range.1);                              replace.push(Event {                                  kind: Kind::Exit, -                                name: autolink.2.clone(), +                                name: Name::GfmAutolinkLiteralEmail,                                  point: point.clone(),                                  link: None,                              }); -                            start = autolink.1; +                            start = range.1;                          }                      } @@ -114,140 +728,19 @@ pub fn resolve(tokenizer: &mut Tokenizer) {      }  } -fn peek(bytes: &[u8], index: usize) -> Option<(usize, usize, Name)> { -    // Protocol. -    if let Some(protocol_end) = peek_protocol(bytes, index) { -        if let Some(domain_end) = peek_domain(bytes, protocol_end, true) { -            let end = truncate(bytes, protocol_end, domain_end); - -            // Cannot be empty. -            if end != protocol_end { -                return Some((index, end, Name::GfmAutolinkLiteralProtocol)); -            } -        } -    } - -    // Www. -    if peek_www(bytes, index).is_some() { -        // Note: we discard the `www.` we parsed, we now try to parse it as a domain. 
-        let domain_end = peek_domain(bytes, index, false).unwrap_or(index); -        let end = truncate(bytes, index, domain_end); -        return Some((index, end, Name::GfmAutolinkLiteralWww)); -    } - -    // Email. -    if bytes[index] == b'@' { -        if let Some(start) = peek_atext(bytes, index) { -            if let Some(end) = peek_email_domain(bytes, index + 1) { -                let end = truncate(bytes, start, end); -                return Some((start, end, Name::GfmAutolinkLiteralEmail)); -            } -        } -    } - -    None -} - -/// Move past `http://`, `https://`, case-insensitive. -fn peek_protocol(bytes: &[u8], mut index: usize) -> Option<usize> { -    // `http` -    if index + 3 < bytes.len() -        && matches!(bytes[index], b'H' | b'h') -        && matches!(bytes[index + 1], b'T' | b't') -        && matches!(bytes[index + 2], b'T' | b't') -        && matches!(bytes[index + 3], b'P' | b'p') -    { -        index += 4; - -        // `s`, optional. -        if index + 1 < bytes.len() && matches!(bytes[index], b'S' | b's') { -            index += 1; -        } - -        // `://` -        if index + 3 < bytes.len() -            && bytes[index] == b':' -            && bytes[index + 1] == b'/' -            && bytes[index + 2] == b'/' -        { -            return Some(index + 3); -        } -    } - -    None -} - -/// Move past `www.`, case-insensitive. -fn peek_www(bytes: &[u8], index: usize) -> Option<usize> { -    // `www.` -    if index + 3 < bytes.len() -        // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. -        && (index == 0 || matches!(bytes[index - 1], b'\t' | b'\n' | b'\r' | b' ' | b'(' | b'*' | b'_' | b'~')) -        && matches!(bytes[index], b'W' | b'w') -        && matches!(bytes[index + 1], b'W' | b'w') -        && matches!(bytes[index + 2], b'W' | b'w') -        && bytes[index + 3] == b'.' 
-    { -        Some(index + 4) -    } else { -        None -    } -} - -/// Move past `example.com`. -fn peek_domain(bytes: &[u8], start: usize, allow_short: bool) -> Option<usize> { -    let mut dots = false; -    let mut penultime = false; -    let mut last = false; -    // To do: expose this from slice? -    // To do: do it ourselves? <https://github.com/commonmark/cmark/blob/8a023286198a7e408398e282f293e3b0baebb644/src/utf8.c#L150>, <https://doc.rust-lang.org/core/str/fn.next_code_point.html>, <https://www.reddit.com/r/rust/comments/4g2zu0/lazy_unicode_iterator_from_byte_iteratorslice/>, <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>. -    let char_indices = str::from_utf8(&bytes[start..]) -        .unwrap() -        .char_indices() -        .collect::<Vec<_>>(); -    let mut index = 0; - -    while index < char_indices.len() { -        match char_indices[index].1 { -            '_' => last = true, -            '.' => { -                penultime = last; -                last = false; -                dots = true; -            } -            '-' => {} -            // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. -            char if classify(char) == CharacterKind::Other => {} -            _ => break, -        } - -        index += 1; -    } - -    // No underscores allowed in last two parts. -    // A valid domain needs to have at least a dot. -    if penultime || last || (!allow_short && !dots) { -        None -    } else { -        // Now peek past `/path?search#hash` (anything except whitespace). -        while index < char_indices.len() { -            if classify(char_indices[index].1) == CharacterKind::Whitespace { -                break; -            } - -            index += 1; -        } - -        Some(if index == char_indices.len() { -            bytes.len() -        } else { -            start + char_indices[index].0 -        }) -    } -} - -/// Move back past `contact`. 
-fn peek_atext(bytes: &[u8], end: usize) -> Option<usize> {
+// To do: add `xmpp`, `mailto` support.
+
+/// Move back past atext.
+///
+/// Moving back is only used when post processing text: so for the email address
+/// algorithm.
+///
+/// ```markdown
+/// > | a contact@example.org b
+///              ^-- from
+///       ^-- to
+/// ```
+fn peek_bytes_atext(bytes: &[u8], end: usize) -> Option<usize> {
     let mut index = end;

     // Take simplified atext.
@@ -270,8 +763,17 @@ fn peek_atext(bytes: &[u8], end: usize) -> Option<usize> {
     }
 }

-/// Move past `example.com`.
-fn peek_email_domain(bytes: &[u8], start: usize) -> Option<usize> {
+/// Move past email domain.
+///
+/// Peeking like this is only used when post processing text: so for the email
+/// address algorithm.
+///
+/// ```markdown
+/// > | a contact@example.org b
+///               ^-- from
+///                         ^-- to
+/// ```
+fn peek_bytes_email_domain(bytes: &[u8], start: usize) -> Option<usize> {
     let mut index = start;
     let mut dot = false;
@@ -303,8 +805,21 @@ fn peek_email_domain(bytes: &[u8], start: usize) -> Option<usize> {
     }
 }

-/// Split trialing stuff from a URL.
-fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {
+/// Move back past punctuation.
+///
+/// Moving back is only used when post processing text: so for the email address
+/// algorithm.
+///
+/// This is much more complex than needed, because GH allows a lot of
+/// punctuation in the protocol and www algorithms.
+/// However, those aren’t implemented like the email algo.
+/// +/// ```markdown +/// > | a contact@example.org”) b +///                           ^-- from +///                         ^-- to +/// ``` +fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {      let mut index = start;      // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L42> @@ -379,3 +894,24 @@ fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {      split  } + +/// Classify a byte (or `char`). +fn byte_to_kind(bytes: &[u8], index: usize, byte: Option<u8>) -> CharacterKind { +    match byte { +        None => CharacterKind::Whitespace, +        Some(byte) => { +            if byte.is_ascii_whitespace() { +                CharacterKind::Whitespace +            } else if byte.is_ascii_punctuation() { +                CharacterKind::Punctuation +            } else if byte.is_ascii_alphanumeric() { +                CharacterKind::Other +            } else { +                // Otherwise: seems to be an ASCII control, so it seems to be a +                // non-ASCII `char`. +                let char = char_after_index(bytes, index); +                classify_opt(char) +            } +        } +    } +} diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs index d7c2b69..27fbadf 100644 --- a/src/construct/gfm_table.rs +++ b/src/construct/gfm_table.rs @@ -191,7 +191,7 @@  //! This bug is not present in this project.  //! The issue relating to tables is:  //! -//! *   [GFM tables: escaped escapes are incorrectly treated as escapes](https://github.com/github/cmark-gfm/issues/277)\ +//! *   [GFM tables: escaped escapes are incorrectly treated as escapes](https://github.com/github/cmark-gfm/issues/277)  //!  //! ## Tokens  //! 
diff --git a/src/construct/text.rs b/src/construct/text.rs index 3cb0f10..0168d02 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -29,17 +29,21 @@ use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer;  /// Characters that can start something in text. -const MARKERS: [u8; 11] = [ +const MARKERS: [u8; 15] = [      b'!',  // `label_start_image`      b'$',  // `raw_text` (math (text))      b'&',  // `character_reference`      b'*',  // `attention` (emphasis, strong)      b'<',  // `autolink`, `html_text` +    b'H',  // `gfm_autolink_literal` (`protocol` kind) +    b'W',  // `gfm_autolink_literal` (`www.` kind)      b'[',  // `label_start_link`      b'\\', // `character_escape`, `hard_break_escape`      b']',  // `label_end`, `gfm_label_start_footnote`      b'_',  // `attention` (emphasis, strong)      b'`',  // `raw_text` (code (text)) +    b'h',  // `gfm_autolink_literal` (`protocol` kind) +    b'w',  // `gfm_autolink_literal` (`www.` kind)      b'~',  // `attention` (gfm strikethrough)  ]; @@ -113,6 +117,20 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {              );              State::Retry(StateName::AutolinkStart)          } +        Some(b'H' | b'h') => { +            tokenizer.attempt( +                State::Next(StateName::TextBefore), +                State::Next(StateName::TextBeforeData), +            ); +            State::Retry(StateName::GfmAutolinkLiteralProtocolStart) +        } +        Some(b'W' | b'w') => { +            tokenizer.attempt( +                State::Next(StateName::TextBefore), +                State::Next(StateName::TextBeforeData), +            ); +            State::Retry(StateName::GfmAutolinkLiteralWwwStart) +        }          Some(b'[') => {              tokenizer.attempt(                  State::Next(StateName::TextBefore), diff --git a/src/state.rs b/src/state.rs index 5013ec8..d7c0c8a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -310,6 +310,29 @@ pub enum Name {      
StringBefore,      StringBeforeData, +    GfmAutolinkLiteralProtocolStart, +    GfmAutolinkLiteralProtocolAfter, +    GfmAutolinkLiteralProtocolPrefixInside, +    GfmAutolinkLiteralProtocolSlashesInside, + +    GfmAutolinkLiteralWwwStart, +    GfmAutolinkLiteralWwwAfter, +    GfmAutolinkLiteralWwwPrefixInside, +    GfmAutolinkLiteralWwwPrefixAfter, + +    GfmAutolinkLiteralDomainInside, +    GfmAutolinkLiteralDomainAtPunctuation, +    GfmAutolinkLiteralDomainAfter, + +    GfmAutolinkLiteralPathInside, +    GfmAutolinkLiteralPathAtPunctuation, +    GfmAutolinkLiteralPathAfter, + +    GfmAutolinkLiteralTrail, +    GfmAutolinkLiteralTrailCharRefInside, +    GfmAutolinkLiteralTrailCharRefStart, +    GfmAutolinkLiteralTrailBracketAfter, +      GfmTableStart,      GfmTableHeadRowBefore,      GfmTableHeadRowStart, @@ -686,6 +709,43 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State {          Name::StringBefore => construct::string::before,          Name::StringBeforeData => construct::string::before_data, +        Name::GfmAutolinkLiteralProtocolStart => construct::gfm_autolink_literal::protocol_start, +        Name::GfmAutolinkLiteralProtocolAfter => construct::gfm_autolink_literal::protocol_after, +        Name::GfmAutolinkLiteralProtocolPrefixInside => { +            construct::gfm_autolink_literal::protocol_prefix_inside +        } +        Name::GfmAutolinkLiteralProtocolSlashesInside => { +            construct::gfm_autolink_literal::protocol_slashes_inside +        } + +        Name::GfmAutolinkLiteralWwwAfter => construct::gfm_autolink_literal::www_after, +        Name::GfmAutolinkLiteralWwwStart => construct::gfm_autolink_literal::www_start, +        Name::GfmAutolinkLiteralWwwPrefixInside => { +            construct::gfm_autolink_literal::www_prefix_inside +        } +        Name::GfmAutolinkLiteralWwwPrefixAfter => construct::gfm_autolink_literal::www_prefix_after, +        Name::GfmAutolinkLiteralDomainInside => 
construct::gfm_autolink_literal::domain_inside, +        Name::GfmAutolinkLiteralDomainAtPunctuation => { +            construct::gfm_autolink_literal::domain_at_punctuation +        } +        Name::GfmAutolinkLiteralDomainAfter => construct::gfm_autolink_literal::domain_after, + +        Name::GfmAutolinkLiteralPathInside => construct::gfm_autolink_literal::path_inside, +        Name::GfmAutolinkLiteralPathAtPunctuation => { +            construct::gfm_autolink_literal::path_at_punctuation +        } +        Name::GfmAutolinkLiteralPathAfter => construct::gfm_autolink_literal::path_after, +        Name::GfmAutolinkLiteralTrail => construct::gfm_autolink_literal::trail, +        Name::GfmAutolinkLiteralTrailCharRefStart => { +            construct::gfm_autolink_literal::trail_char_ref_start +        } +        Name::GfmAutolinkLiteralTrailCharRefInside => { +            construct::gfm_autolink_literal::trail_char_ref_inside +        } +        Name::GfmAutolinkLiteralTrailBracketAfter => { +            construct::gfm_autolink_literal::trail_bracket_after +        } +          Name::GfmTableStart => construct::gfm_table::start,          Name::GfmTableHeadRowBefore => construct::gfm_table::head_row_before,          Name::GfmTableHeadRowStart => construct::gfm_table::head_row_start, | 
