diff options
| author | 2022-08-25 13:16:45 +0200 | |
|---|---|---|
| committer | 2022-08-25 13:16:45 +0200 | |
| commit | 1e4c95079cb97b2b02440b21945c6d12741a7d19 (patch) | |
| tree | 4f6a4a179e72630c1cdd058f84498e32b9a433e0 /src/construct | |
| parent | 49b6a4e72516e8b2a8768e761a60a4f461802d69 (diff) | |
| download | markdown-rs-1e4c95079cb97b2b02440b21945c6d12741a7d19.tar.gz markdown-rs-1e4c95079cb97b2b02440b21945c6d12741a7d19.tar.bz2 markdown-rs-1e4c95079cb97b2b02440b21945c6d12741a7d19.zip | |
Add support for GFM footnotes
Diffstat (limited to '')
| -rw-r--r-- | src/construct/definition.rs | 12 | ||||
| -rw-r--r-- | src/construct/document.rs | 39 | ||||
| -rw-r--r-- | src/construct/gfm_footnote_definition.rs | 345 | ||||
| -rw-r--r-- | src/construct/gfm_label_start_footnote.rs | 91 | ||||
| -rw-r--r-- | src/construct/label_end.rs | 218 | ||||
| -rw-r--r-- | src/construct/label_start_image.rs | 54 | ||||
| -rw-r--r-- | src/construct/label_start_link.rs | 3 | ||||
| -rw-r--r-- | src/construct/list_item.rs | 2 | ||||
| -rw-r--r-- | src/construct/mod.rs | 5 | ||||
| -rw-r--r-- | src/construct/partial_label.rs | 28 | ||||
| -rw-r--r-- | src/construct/text.rs | 24 | 
11 files changed, 717 insertions, 104 deletions
| diff --git a/src/construct/definition.rs b/src/construct/definition.rs index e65d979..1d67635 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -175,14 +175,14 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State {      tokenizer.tokenize_state.token_2 = Name::Data;      tokenizer.tokenize_state.token_3 = Name::Data; -    tokenizer.tokenize_state.end = skip::to_back( -        &tokenizer.events, -        tokenizer.events.len() - 1, -        &[Name::DefinitionLabelString], -    ); -      match tokenizer.current {          Some(b':') => { +            tokenizer.tokenize_state.end = skip::to_back( +                &tokenizer.events, +                tokenizer.events.len() - 1, +                &[Name::DefinitionLabelString], +            ); +              tokenizer.enter(Name::DefinitionMarker);              tokenizer.consume();              tokenizer.exit(Name::DefinitionMarker); diff --git a/src/construct/document.rs b/src/construct/document.rs index b438808..9c76e46 100644 --- a/src/construct/document.rs +++ b/src/construct/document.rs @@ -1,12 +1,13 @@  //! The document content type.  //! -//! **Document** represents the containers, such as block quotes and lists, -//! which structure the document and contain other sections. +//! **Document** represents the containers, such as block quotes, list items, +//! or GFM footnotes, which structure the document and contain other sections.  //!  //! The constructs found in flow are:  //!  //! *   [Block quote][crate::construct::block_quote]  //! *   [List item][crate::construct::list_item] +//! 
*   [GFM: Footnote definition][crate::construct::gfm_footnote_definition]  use crate::event::{Content, Event, Kind, Link, Name};  use crate::state::{Name as StateName, State}; @@ -99,6 +100,7 @@ pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {          let name = match container.kind {              Container::BlockQuote => StateName::BlockQuoteContStart, +            Container::GfmFootnoteDefinition => StateName::GfmFootnoteDefinitionContStart,              Container::ListItem => StateName::ListItemContStart,          }; @@ -185,7 +187,7 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {  /// ```  pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {      // List item? -    // We replace the empty block quote container for this new list one. +    // We replace the empty block quote container for this new list item one.      tokenizer.tokenize_state.document_container_stack          [tokenizer.tokenize_state.document_continued] = ContainerState {          kind: Container::ListItem, @@ -200,14 +202,38 @@ pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State      State::Retry(StateName::ListItemStart)  } -/// At new container, but not a list (or block quote). +/// At new container, but not a block quote or list item.  //  /// ```markdown  /// > | a  ///     ^  /// ```  pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State { -    // It wasn’t a new block quote or a list. +    // Footnote definition? +    // We replace the empty list item container for this new footnote +    // definition one. 
+    tokenizer.tokenize_state.document_container_stack +        [tokenizer.tokenize_state.document_continued] = ContainerState { +        kind: Container::GfmFootnoteDefinition, +        blank_initial: false, +        size: 0, +    }; + +    tokenizer.attempt( +        State::Next(StateName::DocumentContainerNewAfter), +        State::Next(StateName::DocumentContainerNewBeforeNotGfmFootnoteDefinition), +    ); +    State::Retry(StateName::GfmFootnoteDefinitionStart) +} + +/// At new container, but not a block quote, list item, or footnote definition. +// +/// ```markdown +/// > | a +///     ^ +/// ``` +pub fn container_new_before_not_footnote_definition(tokenizer: &mut Tokenizer) -> State { +    // It wasn’t a new block quote, list item, or footnote definition.      // Swap the new container (in the middle) with the existing one (at the end).      // Drop what was in the middle.      tokenizer @@ -227,7 +253,7 @@ pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State {  ///       ^  /// ```  pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { -    // It was a new block quote or a list. +    // It was a new block quote, list item, or footnote definition.      // Swap the new container (in the middle) with the existing one (at the end).      // Take the new container.      let container = tokenizer @@ -453,6 +479,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {              let container = stack_close.pop().unwrap();              let name = match container.kind {                  Container::BlockQuote => Name::BlockQuote, +                Container::GfmFootnoteDefinition => Name::GfmFootnoteDefinition,                  Container::ListItem => Name::ListItem,              }; diff --git a/src/construct/gfm_footnote_definition.rs b/src/construct/gfm_footnote_definition.rs new file mode 100644 index 0000000..3715044 --- /dev/null +++ b/src/construct/gfm_footnote_definition.rs @@ -0,0 +1,345 @@ +//! 
GFM: Footnote definition occurs in the [document][] content type. +//! +//! ## Grammar +//! +//! Footnote definitions form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>): +//! +//! ```bnf +//! ; Restriction: `label` must start with `^` (and not be empty after it). +//! ; See the `label` construct for the BNF of that part. +//! gfm_footnote_definition_start ::= label ':' *space_or_tab +//! +//! ; Restriction: blank line allowed. +//! gfm_footnote_definition_cont ::= 4(space_or_tab) +//! ``` +//! +//! Further lines that are not prefixed with `gfm_footnote_definition_cont` +//! cause the footnote definition to be exited, except when those lines are +//! lazy continuation or blank. +//! Like so many things in markdown, footnote definitions, too, are complex. +//! See [*§ Phase 1: block structure* in `CommonMark`][commonmark_block] for +//! more on parsing details. +//! +//! See [`label`][label] for grammar, notes, and recommendations on that part. +//! +//! The `label` part is interpreted as the [string][] content type. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. +//! +//! Definitions match to calls through identifiers. +//! To match, both labels must be equal after normalizing with +//! [`normalize_identifier`][normalize_identifier]. +//! One definition can match to multiple calls. +//! Multiple definitions with the same, normalized, identifier are ignored: the +//! first definition is preferred. +//! To illustrate, the definition with the content of `x` wins: +//! +//! ```markdown +//! [^a]: x +//! [^a]: y +//! +//! [^a] +//! ``` +//! +//! Importantly, while labels *can* include [string][] content (character +//! escapes and character references), these are not considered when matching. +//! To illustrate, neither definition matches the call: +//! +//! ```markdown +//! [^a&b]: x +//! [^a\&b]: y +//! +//! [^a&b] +//! ``` +//! +//! 
Because footnote definitions are containers (like block quotes and list +//! items), they can contain more footnote definitions, and they can include +//! calls to themselves. +//! +//! ## HTML +//! +//! GFM footnote definitions do not, on their own, relate to anything in HTML. +//! When matched with a [label end][label_end], which in turn matches to a +//! [GFM label start (footnote)][gfm_label_start_footnote], the definition +//! relates to several elements in HTML. +//! +//! When one or more definitions are called, a footnote section is generated +//! at the end of the document, using `<section>`, `<h2>`, and `<ol>` elements: +//! +//! ```html +//! <section data-footnotes="" class="footnotes"><h2 id="footnote-label" class="sr-only">Footnotes</h2> +//! <ol>…</ol> +//! </section> +//! ``` +//! +//! Each definition is generated as a `<li>` in the `<ol>`, in the order they +//! were first called: +//! +//! ```html +//! <li id="user-content-fn-1">…</li> +//! ``` +//! +//! Backreferences are injected at the end of the first paragraph, or, when +//! there is no paragraph, at the end of the definition. +//! When a definition is called multiple times, multiple backreferences are +//! generated. +//! Further backreferences use an extra counter in the `href` attribute and +//! visually in a `<span>` after `↩`. +//! +//! ```html +//! <a href="#user-content-fnref-1" data-footnote-backref="" class="data-footnote-backref" aria-label="Back to content">↩</a> <a href="#user-content-fnref-1-2" data-footnote-backref="" class="data-footnote-backref" aria-label="Back to content">↩<sup>2</sup></a> +//! ``` +//! +//! See +//! [*§ 4.5.1 The `a` element*][html_a], +//! [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements*][html_h], +//! [*§ 4.4.8 The `li` element*][html_li], +//! [*§ 4.4.5 The `ol` element*][html_ol], +//! [*§ 4.4.1 The `p` element*][html_p], +//! [*§ 4.3.3 The `section` element*][html_section], and +//! [*§ 4.5.19 The `sub` and `sup` elements*][html_sup] +//! 
in the HTML spec for more info. +//! +//! ## Recommendation +//! +//! When authoring markdown with footnotes, it’s recommended to use words +//! instead of numbers (or letters or anything with an order) as calls. +//! That makes it easier to reuse and reorder footnotes. +//! +//! It’s recommended to place footnote definitions at the bottom of the document. +//! +//! ## Bugs +//! +//! GitHub’s own algorithm to parse footnote definitions contains several bugs. +//! These are not present in this project. +//! The issues relating to footnote definitions are: +//! +//! *   [Footnote reference call identifiers are trimmed, but definition identifiers aren’t](https://github.com/github/cmark-gfm/issues/237)\ +//!     — initial and final whitespace in labels causes them not to match +//! *   [Footnotes are matched case-insensitive, but links keep their casing, breaking them](https://github.com/github/cmark-gfm/issues/239)\ +//!     — using uppercase (or any character that will be percent encoded) in identifiers breaks links +//! *   [Colons in footnotes generate links w/o `href`](https://github.com/github/cmark-gfm/issues/250)\ +//!     — colons in identifiers generate broken links +//! *   [Character escape of `]` does not work in footnote identifiers](https://github.com/github/cmark-gfm/issues/240)\ +//!     — some character escapes don’t work +//! *   [Footnotes in links are broken](https://github.com/github/cmark-gfm/issues/249)\ +//!     — while `CommonMark` prevents links in links, GitHub does not prevent footnotes (which turn into links) in links +//! *   [Footnote-like brackets around image, break that image](https://github.com/github/cmark-gfm/issues/275)\ +//!     — images can’t be used in what looks like a footnote call +//! +//! ## Tokens +//! +//! *   [`DefinitionMarker`][Name::DefinitionMarker] +//! *   [`GfmFootnoteDefinition`][Name::GfmFootnoteDefinition] +//! *   [`GfmFootnoteDefinitionLabel`][Name::GfmFootnoteDefinitionLabel] +//! 
*   [`GfmFootnoteDefinitionLabelMarker`][Name::GfmFootnoteDefinitionLabelMarker] +//! *   [`GfmFootnoteDefinitionLabelString`][Name::GfmFootnoteDefinitionLabelString] +//! *   [`GfmFootnoteDefinitionMarker`][Name::GfmFootnoteDefinitionMarker] +//! *   [`GfmFootnoteDefinitionPrefix`][Name::GfmFootnoteDefinitionPrefix] +//! *   [`SpaceOrTab`][Name::SpaceOrTab] +//! +//! ## References +//! +//! *   [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote) +//! +//! > 👉 **Note**: Footnotes are not specified in GFM yet. +//! > See [`github/cmark-gfm#270`](https://github.com/github/cmark-gfm/issues/270) +//! > for the related issue. +//! +//! [document]: crate::construct::document +//! [string]: crate::construct::string +//! [character_reference]: crate::construct::character_reference +//! [character_escape]: crate::construct::character_escape +//! [label]: crate::construct::partial_label +//! [label_end]: crate::construct::label_end +//! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote +//! [commonmark_block]: https://spec.commonmark.org/0.30/#phase-1-block-structure +//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element +//! [html_h]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [html_li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element +//! [html_ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element +//! [html_p]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element +//! [html_section]: https://html.spec.whatwg.org/multipage/sections.html#the-section-element +//! 
[html_sup]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-sub-and-sup-elements + +use crate::construct::partial_space_or_tab::space_or_tab_min_max; +use crate::event::Name; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{ +    constant::TAB_SIZE, +    normalize_identifier::normalize_identifier, +    skip, +    slice::{Position, Slice}, +}; + +/// Start of GFM footnote definition. +/// +/// ```markdown +/// > | [^a]: b +///     ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { +    if tokenizer +        .parse_state +        .options +        .constructs +        .gfm_footnote_definition +    { +        tokenizer.enter(Name::GfmFootnoteDefinition); + +        if matches!(tokenizer.current, Some(b'\t' | b' ')) { +            tokenizer.attempt( +                State::Next(StateName::GfmFootnoteDefinitionLabelBefore), +                State::Nok, +            ); +            State::Retry(space_or_tab_min_max( +                tokenizer, +                1, +                if tokenizer.parse_state.options.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                }, +            )) +        } else { +            State::Retry(StateName::GfmFootnoteDefinitionLabelBefore) +        } +    } else { +        State::Nok +    } +} + +/// Before definition label (after optional whitespace). 
+/// +/// ```markdown +/// > | [^a]: b +///     ^ +/// ``` +pub fn label_before(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'[') => { +            tokenizer.tokenize_state.token_1 = Name::GfmFootnoteDefinitionLabel; +            tokenizer.tokenize_state.token_2 = Name::GfmFootnoteDefinitionLabelMarker; +            tokenizer.tokenize_state.token_3 = Name::GfmFootnoteDefinitionLabelString; +            tokenizer.tokenize_state.token_4 = Name::GfmFootnoteDefinitionMarker; +            tokenizer.tokenize_state.marker = b'^'; +            tokenizer.enter(Name::GfmFootnoteDefinitionPrefix); +            tokenizer.attempt( +                State::Next(StateName::GfmFootnoteDefinitionLabelAfter), +                State::Nok, +            ); +            State::Retry(StateName::LabelStart) +        } +        _ => State::Nok, +    } +} + +/// After definition label. +/// +/// ```markdown +/// > | [^a]: b +///         ^ +/// ``` +pub fn label_after(tokenizer: &mut Tokenizer) -> State { +    tokenizer.tokenize_state.token_1 = Name::Data; +    tokenizer.tokenize_state.token_2 = Name::Data; +    tokenizer.tokenize_state.token_3 = Name::Data; +    tokenizer.tokenize_state.token_4 = Name::Data; +    tokenizer.tokenize_state.marker = 0; + +    match tokenizer.current { +        Some(b':') => { +            let end = skip::to_back( +                &tokenizer.events, +                tokenizer.events.len() - 1, +                &[Name::GfmFootnoteDefinitionLabelString], +            ); + +            // Note: we don’t care about virtual spaces, so `as_str` is fine. +            let id = normalize_identifier( +                Slice::from_position( +                    tokenizer.parse_state.bytes, +                    &Position::from_exit_event(&tokenizer.events, end), +                ) +                .as_str(), +            ); + +            // Note: we don’t care about uniqueness. 
+            // It’s likely that that doesn’t happen very frequently. +            // It is more likely that it wastes precious time. +            tokenizer.tokenize_state.gfm_footnote_definitions.push(id); + +            tokenizer.enter(Name::DefinitionMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::DefinitionMarker); +            tokenizer.attempt( +                State::Next(StateName::GfmFootnoteDefinitionWhitespaceAfter), +                State::Nok, +            ); +            // Any whitespace after the marker is eaten, forming indented code +            // is not possible. +            // No space is also fine, just like a block quote marker. +            State::Next(space_or_tab_min_max(tokenizer, 0, usize::MAX)) +        } +        _ => State::Nok, +    } +} + +/// After definition prefix. +/// +/// ```markdown +/// > | [^a]: b +///           ^ +/// ``` +pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State { +    tokenizer.exit(Name::GfmFootnoteDefinitionPrefix); +    State::Ok +} + +/// Start of footnote definition continuation. +/// +/// ```markdown +///   | [^a]: b +/// > |     c +///     ^ +/// ``` +pub fn cont_start(tokenizer: &mut Tokenizer) -> State { +    tokenizer.check( +        State::Next(StateName::GfmFootnoteDefinitionContBlank), +        State::Next(StateName::GfmFootnoteDefinitionContFilled), +    ); +    State::Retry(StateName::BlankLineStart) +} + +/// Start of footnote definition continuation, at a blank line. +/// +/// ```markdown +///   | [^a]: b +/// > | ␠␠␊ +///     ^ +/// ``` +pub fn cont_blank(tokenizer: &mut Tokenizer) -> State { +    if matches!(tokenizer.current, Some(b'\t' | b' ')) { +        State::Retry(space_or_tab_min_max(tokenizer, 0, TAB_SIZE)) +    } else { +        State::Ok +    } +} + +/// Start of footnote definition continuation, at a filled line. 
+/// +/// ```markdown +///   | [^a]: b +/// > |     c +///     ^ +/// ``` +pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { +    if matches!(tokenizer.current, Some(b'\t' | b' ')) { +        // Consume exactly `TAB_SIZE`. +        State::Retry(space_or_tab_min_max(tokenizer, TAB_SIZE, TAB_SIZE)) +    } else { +        State::Nok +    } +} diff --git a/src/construct/gfm_label_start_footnote.rs b/src/construct/gfm_label_start_footnote.rs new file mode 100644 index 0000000..a3a0df6 --- /dev/null +++ b/src/construct/gfm_label_start_footnote.rs @@ -0,0 +1,91 @@ +//! Label start (footnote) occurs in the [text][] content type. +//! +//! ## Grammar +//! +//! Label start (footnote) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>): +//! +//! ```bnf +//! gfm_label_start_footnote ::= '[' '^' +//! ``` +//! +//! ## HTML +//! +//! Label start (footnote) does not, on its own, relate to anything in HTML. +//! When matched with a [label end][label_end], they together relate to `<sup>` +//! and `<a>` elements in HTML. +//! See [*§ 4.5.19 The `sub` and `sup` elements*][html_sup] and +//! [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info. +//! Without an end, the characters (`[^`) are output. +//! +//! ## Tokens +//! +//! *   [`LabelImage`][Name::LabelImage] +//! *   To do. +//! +//! ## References +//! +//! *   [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote) +//! +//! > 👉 **Note**: Footnotes are not specified in GFM yet. +//! > See [`github/cmark-gfm#270`](https://github.com/github/cmark-gfm/issues/270) +//! > for the related issue. +//! +//! [text]: crate::construct::text +//! [label_end]: crate::construct::label_end +//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element +//! 
[html_sup]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-sub-and-sup-elements + +use crate::event::Name; +use crate::resolve::Name as ResolveName; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::{LabelKind, LabelStart, Tokenizer}; + +/// Start of label (footnote) start. +/// +/// ```markdown +/// > | a [^b] c +///       ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { +    if tokenizer +        .parse_state +        .options +        .constructs +        .gfm_label_start_footnote +        && tokenizer.current == Some(b'[') +    { +        tokenizer.enter(Name::GfmFootnoteCallLabel); +        tokenizer.enter(Name::LabelMarker); +        tokenizer.consume(); +        tokenizer.exit(Name::LabelMarker); +        State::Next(StateName::GfmLabelStartFootnoteOpen) +    } else { +        State::Nok +    } +} + +/// After `[`, at `^`. +/// +/// ```markdown +/// > | a [^b] c +///        ^ +/// ``` +pub fn open(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'^') => { +            tokenizer.enter(Name::GfmFootnoteCallMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::GfmFootnoteCallMarker); +            tokenizer.exit(Name::GfmFootnoteCallLabel); +            tokenizer.tokenize_state.label_starts.push(LabelStart { +                kind: LabelKind::GfmFootnote, +                start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1), +                inactive: false, +            }); +            tokenizer.register_resolver_before(ResolveName::Label); +            State::Ok +        } +        _ => State::Nok, +    } +} diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 0ea745f..b5a6013 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -46,6 +46,8 @@  //! attribute in case of a [label start (link)][label_start_link], and an  //! `src` attribute in case of a [label start (image)][label_start_image].  
//! The title is formed, optionally, on either `<a>` or `<img>`. +//! When matched with a [gfm label start (footnote)][gfm_label_start_footnote], +//! no reference or resource can follow the label end.  //!  //! For info on how to encode characters in URLs, see  //! [`destination`][destination]. @@ -53,11 +55,13 @@  //! `<img>` when compiling, see  //! [`sanitize_uri`][sanitize_uri].  //! +//! In case of a matched [gfm label start (footnote)][gfm_label_start_footnote], +//! a counter is injected.  //! In case of a matched [label start (link)][label_start_link], the interpreted  //! content between it and the label end, is placed between the opening and  //! closing tags. -//! Otherwise, the text is also interpreted, but used *without* the resulting -//! tags: +//! In case of a matched [label start (image)][label_start_image], the text is +//! also interpreted, but used *without* the resulting tags:  //!  //! ```markdown  //! [a *b* c](#) @@ -75,8 +79,9 @@  //! It is possible to use images in links.  //! It’s somewhat possible to have links in images (the text will be used, not  //! the HTML, see above). -//! But it’s not possible to use links in links. -//! The “deepest” link wins. +//! But it’s not possible to use links (or footnotes, which result in links) +//! in links. +//! The “deepest” link (or footnote) wins.  //! To illustrate:  //!  //! ```markdown @@ -104,17 +109,26 @@  //! It can also match with [label start (image)][label_start_image], in which  //! case they form an `<img>` element.  //! See [*§ 4.8.3 The `img` element*][html_img] in the HTML spec for more info. +//! It can also match with [gfm label start (footnote)][gfm_label_start_footnote], +//! in which case they form `<sup>` and `<a>` elements in HTML. +//! See [*§ 4.5.19 The `sub` and `sup` elements*][html_sup] and +//! [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info.  //!  //! ## Recommendation  //! -//! It is recommended to use labels instead of [autolinks][autolink]. +//! 
It is recommended to use labels for links instead of [autolinks][autolink].  //! Labels allow more characters in URLs, and allow relative URLs and `www.`  //! URLs.  //! They also allow for descriptive text to explain the URL in prose.  //! +//! In footnotes, it’s recommended to use words instead of numbers (or letters +//! or anything with an order) as calls. +//! That makes it easier to reuse and reorder footnotes. +//!  //! ## Tokens  //!  //! *   [`Data`][Name::Data] +//! *   [`GfmFootnoteCall`][Name::GfmFootnoteCall]  //! *   [`Image`][Name::Image]  //! *   [`Label`][Name::Label]  //! *   [`LabelEnd`][Name::LabelEnd] @@ -140,10 +154,15 @@  //! ## References  //!  //! *   [`label-end.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/label-end.js) +//! *   [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote)  //! *   [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.30/#link-reference-definitions)  //! *   [*§ 6.3 Links* in `CommonMark`](https://spec.commonmark.org/0.30/#links)  //! *   [*§ 6.4 Images* in `CommonMark`](https://spec.commonmark.org/0.30/#images)  //! +//! > 👉 **Note**: Footnotes are not specified in GFM yet. +//! > See [`github/cmark-gfm#270`](https://github.com/github/cmark-gfm/issues/270) +//! > for the related issue. +//!  //! [string]: crate::construct::string  //! [text]: crate::construct::text  //! [destination]: crate::construct::partial_destination @@ -151,25 +170,28 @@  //! [label]: crate::construct::partial_label  //! [label_start_image]: crate::construct::label_start_image  //! [label_start_link]: crate::construct::label_start_link +//! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote  //! [definition]: crate::construct::definition  //! [autolink]: crate::construct::autolink  //! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri  //! 
[normalize_identifier]: crate::util::normalize_identifier::normalize_identifier  //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element  //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element +//! [html_sup]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-sub-and-sup-elements  use crate::construct::partial_space_or_tab_eol::space_or_tab_eol;  use crate::event::{Event, Kind, Name};  use crate::resolve::Name as ResolveName;  use crate::state::{Name as StateName, State}; -use crate::tokenizer::{Label, LabelStart, Tokenizer}; +use crate::tokenizer::{Label, LabelKind, LabelStart, Tokenizer};  use crate::util::{      constant::RESOURCE_DESTINATION_BALANCE_MAX,      normalize_identifier::normalize_identifier,      skip,      slice::{Position, Slice},  }; -use alloc::vec; +use alloc::{string::String, vec}; +extern crate std;  /// Start of label end.  /// @@ -190,7 +212,15 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {              tokenizer.tokenize_state.end = tokenizer.events.len(); -            // Mark as balanced if the info is inactive. +            // If the corresponding label (link) start is marked as inactive, +            // it means we’d be wrapping a link, like this: +            // +            // ```markdown +            // > | a [b [c](d) e](f) g. +            //                  ^ +            // ``` +            // +            // We can’t have that, so it’s just balanced brackets.              
if label_start.inactive {                  return State::Retry(StateName::LabelEndNok);              } @@ -220,19 +250,34 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  ///       ^  /// ```  pub fn after(tokenizer: &mut Tokenizer) -> State { -    let start = tokenizer.tokenize_state.label_starts.last().unwrap(); -    let defined = tokenizer -        .parse_state -        .definitions -        .contains(&normalize_identifier( -            // We don’t care about virtual spaces, so `indices` and `as_str` are fine. -            Slice::from_indices( -                tokenizer.parse_state.bytes, -                tokenizer.events[start.start.1].point.index, -                tokenizer.events[tokenizer.tokenize_state.end].point.index, -            ) -            .as_str(), -        )); +    let start_index = tokenizer.tokenize_state.label_starts.len() - 1; +    let start = &tokenizer.tokenize_state.label_starts[start_index]; + +    let indices = ( +        tokenizer.events[start.start.1].point.index, +        tokenizer.events[tokenizer.tokenize_state.end].point.index, +    ); + +    // We don’t care about virtual spaces, so `indices` and `as_str` are fine. +    let mut id = normalize_identifier( +        Slice::from_indices(tokenizer.parse_state.bytes, indices.0, indices.1).as_str(), +    ); + +    // See if this matches a footnote definition. +    if start.kind == LabelKind::GfmFootnote { +        if tokenizer.parse_state.gfm_footnote_definitions.contains(&id) { +            return State::Retry(StateName::LabelEndOk); +        } + +        // Nope, this might be a normal link? +        tokenizer.tokenize_state.label_starts[start_index].kind = LabelKind::GfmUndefinedFootnote; +        let mut new_id = String::new(); +        new_id.push('^'); +        new_id.push_str(&id); +        id = new_id; +    } + +    let defined = tokenizer.parse_state.definitions.contains(&id);      match tokenizer.current {          // Resource (`[asd](fgh)`)? 
@@ -302,17 +347,15 @@ pub fn ok(tokenizer: &mut Tokenizer) -> State {      // Remove the start.      let label_start = tokenizer.tokenize_state.label_starts.pop().unwrap(); -    let is_link = tokenizer.events[label_start.start.0].name == Name::LabelLink; - -    // If this is a link, we need to mark earlier link starts as no longer -    // viable for use (as they would otherwise contain a link). +    // If this is a link or footnote, we need to mark earlier link starts as no +    // longer viable for use (as they would otherwise contain a link).      // These link starts are still looking for balanced closing brackets, so -    // we can’t remove them. -    if is_link { +    // we can’t remove them, but we can mark them. +    if label_start.kind != LabelKind::Image {          let mut index = 0;          while index < tokenizer.tokenize_state.label_starts.len() {              let label_start = &mut tokenizer.tokenize_state.label_starts[index]; -            if tokenizer.events[label_start.start.0].name == Name::LabelLink { +            if label_start.kind != LabelKind::Image {                  label_start.inactive = true;              }              index += 1; @@ -320,6 +363,7 @@ pub fn ok(tokenizer: &mut Tokenizer) -> State {      }      tokenizer.tokenize_state.labels.push(Label { +        kind: label_start.kind,          start: label_start.start,          end: (tokenizer.tokenize_state.end, tokenizer.events.len() - 1),      }); @@ -342,9 +386,7 @@ pub fn ok(tokenizer: &mut Tokenizer) -> State {  /// ```  pub fn nok(tokenizer: &mut Tokenizer) -> State {      let start = tokenizer.tokenize_state.label_starts.pop().unwrap(); -      tokenizer.tokenize_state.label_starts_loose.push(start); -      tokenizer.tokenize_state.end = 0;      State::Nok  } @@ -615,120 +657,142 @@ pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State {      }  } -/// Resolve media. +/// Resolve images, links, and footnotes.  
/// -/// This turns matching label start (image, link) and label ends into links and -/// images, and turns unmatched label starts back into data. +/// This turns matching label starts and label ends into links, images, and +/// footnotes, and turns unmatched label starts back into data.  pub fn resolve(tokenizer: &mut Tokenizer) { -    let list = tokenizer.tokenize_state.label_starts.split_off(0); -    mark_as_data(tokenizer, &list); -    let list = tokenizer.tokenize_state.label_starts_loose.split_off(0); -    mark_as_data(tokenizer, &list); +    // Inject labels. +    let labels = tokenizer.tokenize_state.labels.split_off(0); +    inject_labels(tokenizer, &labels); +    // Handle loose starts. +    let starts = tokenizer.tokenize_state.label_starts.split_off(0); +    mark_as_data(tokenizer, &starts); +    let starts = tokenizer.tokenize_state.label_starts_loose.split_off(0); +    mark_as_data(tokenizer, &starts); -    let media = tokenizer.tokenize_state.labels.split_off(0); +    tokenizer.map.consume(&mut tokenizer.events); +} +/// Inject links/images/footnotes. +fn inject_labels(tokenizer: &mut Tokenizer, labels: &[Label]) {      // Add grouping events.      let mut index = 0; -    while index < media.len() { -        let media = &media[index]; -        // LabelLink:Enter or LabelImage:Enter. -        let group_enter_index = media.start.0; -        let group_enter_event = &tokenizer.events[group_enter_index]; -        // LabelLink:Exit or LabelImage:Exit. -        let text_enter_index = media.start.0 -            + (if group_enter_event.name == Name::LabelLink { -                4 -            } else { -                6 -            }); -        // LabelEnd:Enter. -        let text_exit_index = media.end.0; -        // LabelEnd:Exit. -        let label_exit_index = media.end.0 + 3; -        // Resource:Exit, etc. 
-        let group_end_index = media.end.1; - -        let group_name = if group_enter_event.name == Name::LabelLink { -            Name::Link -        } else { +    while index < labels.len() { +        let label = &labels[index]; +        let group_name = if label.kind == LabelKind::GfmFootnote { +            Name::GfmFootnoteCall +        } else if label.kind == LabelKind::Image {              Name::Image +        } else { +            Name::Link          }; +        // If this is a fine link, which starts with a footnote start that did +        // not match, we need to inject the caret as data. +        let mut caret = vec![]; + +        if label.kind == LabelKind::GfmUndefinedFootnote { +            // Add caret. +            caret.push(Event { +                kind: Kind::Enter, +                name: Name::Data, +                // Enter:GfmFootnoteCallMarker. +                point: tokenizer.events[label.start.1 - 2].point.clone().clone(), +                link: None, +            }); +            caret.push(Event { +                kind: Kind::Exit, +                name: Name::Data, +                // Exit:GfmFootnoteCallMarker. +                point: tokenizer.events[label.start.1 - 1].point.clone(), +                link: None, +            }); +            // Change and move label end. +            tokenizer.events[label.start.0].name = Name::LabelLink; +            tokenizer.events[label.start.1].name = Name::LabelLink; +            tokenizer.events[label.start.1].point = caret[0].point.clone(); +            // Remove the caret. +            // Enter:GfmFootnoteCallMarker, Exit:GfmFootnoteCallMarker. +            tokenizer.map.add(label.start.1 - 2, 2, vec![]); +        } +          // Insert a group enter and label enter.          
tokenizer.map.add( -            group_enter_index, +            label.start.0,              0,              vec![                  Event {                      kind: Kind::Enter,                      name: group_name.clone(), -                    point: group_enter_event.point.clone(), +                    point: tokenizer.events[label.start.0].point.clone(),                      link: None,                  },                  Event {                      kind: Kind::Enter,                      name: Name::Label, -                    point: group_enter_event.point.clone(), +                    point: tokenizer.events[label.start.0].point.clone(),                      link: None,                  },              ],          );          // Empty events not allowed. -        if text_enter_index != text_exit_index { -            // Insert a text enter. +        // Though: if this was what looked like a footnote, but didn’t match, +        // it’s a link instead, and we need to inject the `^`. +        if label.start.1 != label.end.0 || !caret.is_empty() {              tokenizer.map.add( -                text_enter_index, +                label.start.1 + 1,                  0,                  vec![Event {                      kind: Kind::Enter,                      name: Name::LabelText, -                    point: tokenizer.events[text_enter_index].point.clone(), +                    point: tokenizer.events[label.start.1].point.clone(),                      link: None,                  }],              ); - -            // Insert a text exit.              
tokenizer.map.add( -                text_exit_index, +                label.end.0,                  0,                  vec![Event {                      kind: Kind::Exit,                      name: Name::LabelText, -                    point: tokenizer.events[text_exit_index].point.clone(), +                    point: tokenizer.events[label.end.0].point.clone(),                      link: None,                  }],              );          } +        if !caret.is_empty() { +            tokenizer.map.add(label.start.1 + 1, 0, caret); +        } +          // Insert a label exit.          tokenizer.map.add( -            label_exit_index + 1, +            label.end.0 + 4,              0,              vec![Event {                  kind: Kind::Exit,                  name: Name::Label, -                point: tokenizer.events[label_exit_index].point.clone(), +                point: tokenizer.events[label.end.0 + 3].point.clone(),                  link: None,              }],          );          // Insert a group exit.          tokenizer.map.add( -            group_end_index + 1, +            label.end.1 + 1,              0,              vec![Event {                  kind: Kind::Exit,                  name: group_name, -                point: tokenizer.events[group_end_index].point.clone(), +                point: tokenizer.events[label.end.1].point.clone(),                  link: None,              }],          );          index += 1;      } - -    tokenizer.map.consume(&mut tokenizer.events);  }  /// Remove loose label starts. 
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index a8c9ac3..4511794 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -35,7 +35,7 @@  use crate::event::Name;  use crate::resolve::Name as ResolveName;  use crate::state::{Name as StateName, State}; -use crate::tokenizer::{LabelStart, Tokenizer}; +use crate::tokenizer::{LabelKind, LabelStart, Tokenizer};  /// Start of label (image) start.  /// @@ -68,14 +68,52 @@ pub fn open(tokenizer: &mut Tokenizer) -> State {              tokenizer.enter(Name::LabelMarker);              tokenizer.consume();              tokenizer.exit(Name::LabelMarker); -            tokenizer.exit(Name::LabelImage); -            tokenizer.tokenize_state.label_starts.push(LabelStart { -                start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1), -                inactive: false, -            }); -            tokenizer.register_resolver_before(ResolveName::Label); -            State::Ok +            State::Next(StateName::LabelStartImageAfter)          }          _ => State::Nok,      }  } + +/// After `![`. +/// +/// ```markdown +/// > | a ![b] c +///         ^ +/// ``` +/// +/// This is needed because, when GFM footnotes are enabled, images never +/// form when started with a `^`. 
+/// Instead, links form: +/// +/// ```markdown +/// ![^a](b) +/// +/// ![^a][b] +/// +/// [b]: c +/// ``` +/// +/// ```html +/// <p>!<a href=\"b\">^a</a></p> +/// <p>!<a href=\"c\">^a</a></p> +/// ``` +pub fn after(tokenizer: &mut Tokenizer) -> State { +    if tokenizer +        .parse_state +        .options +        .constructs +        .gfm_label_start_footnote +        && tokenizer.current == Some(b'^') +    { +        State::Nok +    } else { +        tokenizer.exit(Name::LabelImage); +        tokenizer.tokenize_state.label_starts.push(LabelStart { +            kind: LabelKind::Image, +            start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1), +            inactive: false, +        }); +        tokenizer.register_resolver_before(ResolveName::Label); +        State::Ok +    } +} diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs index 3aeb68b..3454724 100644 --- a/src/construct/label_start_link.rs +++ b/src/construct/label_start_link.rs @@ -34,7 +34,7 @@  use crate::event::Name;  use crate::resolve::Name as ResolveName;  use crate::state::State; -use crate::tokenizer::{LabelStart, Tokenizer}; +use crate::tokenizer::{LabelKind, LabelStart, Tokenizer};  /// Start of label (link) start.  /// @@ -52,6 +52,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {          tokenizer.exit(Name::LabelMarker);          tokenizer.exit(Name::LabelLink);          tokenizer.tokenize_state.label_starts.push(LabelStart { +            kind: LabelKind::Link,              start: (start, tokenizer.events.len() - 1),              inactive: false,          }); diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs index 39b5d13..658c2c7 100644 --- a/src/construct/list_item.rs +++ b/src/construct/list_item.rs @@ -17,7 +17,7 @@  //! ```  //!  //! Further lines that are not prefixed with `list_item_cont` cause the list -//! item to be exited, except when those lines are lazy continuation. +//! 
item to be exited, except when those lines are lazy continuation or blank.  //! Like so many things in markdown, list items too, are complex.  //! See [*§ Phase 1: block structure* in `CommonMark`][commonmark_block] for  //! more on parsing details. diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 7ac3899..c5002bb 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -59,6 +59,9 @@  //!  //! *   [frontmatter][]  //! *   [gfm autolink literal][gfm_autolink_literal] +//! *   [gfm footnote definition][gfm_footnote_definition] +//! *   [gfm task list item check][gfm_task_list_item_check] +//! *   [gfm label start footnote][gfm_label_start_footnote]  //!  //! There are also several small subroutines typically used in different places:  //! @@ -146,6 +149,8 @@ pub mod document;  pub mod flow;  pub mod frontmatter;  pub mod gfm_autolink_literal; +pub mod gfm_footnote_definition; +pub mod gfm_label_start_footnote;  pub mod gfm_task_list_item_check;  pub mod hard_break_escape;  pub mod heading_atx; diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 47ffd90..ab436b2 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -81,13 +81,37 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {              tokenizer.enter(tokenizer.tokenize_state.token_2.clone());              tokenizer.consume();              tokenizer.exit(tokenizer.tokenize_state.token_2.clone()); -            tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); -            State::Next(StateName::LabelAtBreak) +            State::Next(StateName::LabelAtMarker)          }          _ => State::Nok,      }  } +/// At an optional extra marker. +/// +/// Used for footnotes. +/// +/// ```markdown +/// > | [^a] +///      ^ +/// ``` +pub fn at_marker(tokenizer: &mut Tokenizer) -> State { +    // For footnotes (and potentially other custom things in the future), +    // We need to make sure there is a certain marker after `[`. 
+    if tokenizer.tokenize_state.marker == 0 { +        tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); +        State::Retry(StateName::LabelAtBreak) +    } else if tokenizer.current == Some(tokenizer.tokenize_state.marker) { +        tokenizer.enter(tokenizer.tokenize_state.token_4.clone()); +        tokenizer.consume(); +        tokenizer.exit(tokenizer.tokenize_state.token_4.clone()); +        tokenizer.enter(tokenizer.tokenize_state.token_3.clone()); +        State::Next(StateName::LabelAtBreak) +    } else { +        State::Nok +    } +} +  /// In label, at something, before something else.  ///  /// ```markdown diff --git a/src/construct/text.rs b/src/construct/text.rs index 65f55d4..5535e3f 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -11,6 +11,8 @@  //! *   [Character escape][crate::construct::character_escape]  //! *   [Character reference][crate::construct::character_reference]  //! *   [Code (text)][crate::construct::code_text] +//! *   [GFM: Label start (footnote)][crate::construct::gfm_label_start_footnote] +//! *   [GFM: Task list item check][crate::construct::gfm_task_list_item_check]  //! *   [Hard break (escape)][crate::construct::hard_break_escape]  //! *   [HTML (text)][crate::construct::html_text]  //! 
*   [Label start (image)][crate::construct::label_start_image] @@ -34,7 +36,7 @@ const MARKERS: [u8; 10] = [      b'<',  // `autolink`, `html_text`      b'[',  // `label_start_link`      b'\\', // `character_escape`, `hard_break_escape` -    b']',  // `label_end` +    b']',  // `label_end`, `gfm_label_start_footnote`      b'_',  // `attention`      b'`',  // `code_text`      b'~',  // `attention` (w/ `gfm_strikethrough`) @@ -104,9 +106,9 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {          Some(b'[') => {              tokenizer.attempt(                  State::Next(StateName::TextBefore), -                State::Next(StateName::TextBeforeData), +                State::Next(StateName::TextBeforeLabelStartLink),              ); -            State::Retry(StateName::LabelStartLinkStart) +            State::Retry(StateName::GfmLabelStartFootnoteStart)          }          Some(b'\\') => {              tokenizer.attempt( @@ -165,6 +167,22 @@ pub fn before_hard_break_escape(tokenizer: &mut Tokenizer) -> State {      State::Retry(StateName::HardBreakEscapeStart)  } +/// Before label start (link). +/// +/// At `[`, which wasn’t a GFM label start (footnote). +/// +/// ```markdown +/// > | [a](b) +///     ^ +/// ``` +pub fn before_label_start_link(tokenizer: &mut Tokenizer) -> State { +    tokenizer.attempt( +        State::Next(StateName::TextBefore), +        State::Next(StateName::TextBeforeData), +    ); +    State::Retry(StateName::LabelStartLinkStart) +} +  /// Before data.  ///  /// ```markdown | 
