From 1e4c95079cb97b2b02440b21945c6d12741a7d19 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 25 Aug 2022 13:16:45 +0200 Subject: Add support for GFM footnotes --- src/construct/gfm_footnote_definition.rs | 345 +++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 src/construct/gfm_footnote_definition.rs (limited to 'src/construct/gfm_footnote_definition.rs') diff --git a/src/construct/gfm_footnote_definition.rs b/src/construct/gfm_footnote_definition.rs new file mode 100644 index 0000000..3715044 --- /dev/null +++ b/src/construct/gfm_footnote_definition.rs @@ -0,0 +1,345 @@ +//! GFM: Footnote definition occurs in the [document][] content type. +//! +//! ## Grammar +//! +//! Footnote definitions form with the following BNF +//! (see [construct][crate::construct] for character groups): +//! +//! ```bnf +//! ; Restriction: `label` must start with `^` (and not be empty after it). +//! ; See the `label` construct for the BNF of that part. +//! gfm_footnote_definition_start ::= label ':' *space_or_tab +//! +//! ; Restriction: blank line allowed. +//! gfm_footnote_definition_cont ::= 4(space_or_tab) +//! ``` +//! +//! Further lines that are not prefixed with `gfm_footnote_definition_cont` +//! cause the footnote definition to be exited, except when those lines are +//! lazy continuation or blank. +//! Like so many things in markdown, footnote definitions too are complex. +//! See [*§ Phase 1: block structure* in `CommonMark`][commonmark_block] for +//! more on parsing details. +//! +//! See [`label`][label] for grammar, notes, and recommendations on that part. +//! +//! The `label` part is interpreted as the [string][] content type. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. +//! +//! Definitions match to calls through identifiers. +//! To match, both labels must be equal after normalizing with +//! [`normalize_identifier`][normalize_identifier]. +//! 
One definition can match to multiple calls. +//! Multiple definitions with the same, normalized, identifier are ignored: the +//! first definition is preferred. +//! To illustrate, the definition with the content of `x` wins: +//! +//! ```markdown +//! [^a]: x +//! [^a]: y +//! +//! [^a] +//! ``` +//! +//! Importantly, while labels *can* include [string][] content (character +//! escapes and character references), these are not considered when matching. +//! To illustrate, neither definition matches the call: +//! +//! ```markdown +//! [^a&amp;b]: x +//! [^a\&b]: y +//! +//! [^a&b] +//! ``` +//! +//! Because footnote definitions are containers (like block quotes and list +//! items), they can contain more footnote definitions, and they can include +//! calls to themselves. +//! +//! ## HTML +//! +//! GFM footnote definitions do not, on their own, relate to anything in HTML. +//! When matched with a [label end][label_end], which in turn matches to a +//! [GFM label start (footnote)][gfm_label_start_footnote], the definition +//! relates to several elements in HTML. +//! +//! When one or more definitions are called, a footnote section is generated +//! at the end of the document, using `<section>`, `<h2>`, and `<ol>` elements: +//! +//! ```html +//! <section data-footnotes="" class="footnotes"><h2 id="footnote-label" class="sr-only">Footnotes</h2> +//! <ol>…</ol> +//! </section> +//! ``` +//! +//! Each definition is generated as a `<li>` in the `<ol>`, in the order they +//! were first called: +//! +//! ```html +//! <li id="user-content-fn-1">…</li> +//! ``` +//! +//! Backreferences are injected at the end of the first paragraph, or, when +//! there is no paragraph, at the end of the definition. +//! When a definition is called multiple times, multiple backreferences are +//! generated. +//! Further backreferences use an extra counter in the `href` attribute and +//! visually in a `<sup>` after `↩`. +//! +//! ```html +//! <a href="#user-content-fnref-1" data-footnote-backref="" class="data-footnote-backref" aria-label="Back to content">↩</a> <a href="#user-content-fnref-1-2" data-footnote-backref="" class="data-footnote-backref" aria-label="Back to content">↩<sup>2</sup></a> +//! ``` +//! +//! See +//! [*§ 4.5.1 The `a` element*][html_a], +//! [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements*][html_h], +//! [*§ 4.4.8 The `li` element*][html_li], +//! [*§ 4.4.5 The `ol` element*][html_ol], +//! [*§ 4.4.1 The `p` element*][html_p], +//! [*§ 4.3.3 The `section` element*][html_section], and +//! [*§ 4.5.19 The `sub` and `sup` elements*][html_sup] +//! in the HTML spec for more info. +//! +//! ## Recommendation +//! +//! When authoring markdown with footnotes, it’s recommended to use words +//! instead of numbers (or letters or anything with an order) as calls. +//! That makes it easier to reuse and reorder footnotes. +//! +//! It’s recommended to place footnote definitions at the bottom of the document. +//! +//! ## Bugs +//! +//! GitHub’s own algorithm to parse footnote definitions contains several bugs. +//! These are not present in this project. +//! The issues relating to footnote definitions are: +//! +//! * [Footnote reference call identifiers are trimmed, but definition identifiers aren’t](https://github.com/github/cmark-gfm/issues/237)\ +//! — initial and final whitespace in labels causes them not to match +//! * [Footnotes are matched case-insensitive, but links keep their casing, breaking them](https://github.com/github/cmark-gfm/issues/239)\ +//! — using uppercase (or any character that will be percent encoded) in identifiers breaks links +//! * [Colons in footnotes generate links w/o `href`](https://github.com/github/cmark-gfm/issues/250)\ +//! — colons in identifiers generate broken links +//! 
* [Character escape of `]` does not work in footnote identifiers](https://github.com/github/cmark-gfm/issues/240)\ +//! — some character escapes don’t work +//! * [Footnotes in links are broken](https://github.com/github/cmark-gfm/issues/249)\ +//! — while `CommonMark` prevents links in links, GitHub does not prevent footnotes (which turn into links) in links +//! * [Footnote-like brackets around image, break that image](https://github.com/github/cmark-gfm/issues/275)\ +//! — images can’t be used in what looks like a footnote call +//! +//! ## Tokens +//! +//! * [`DefinitionMarker`][Name::DefinitionMarker] +//! * [`GfmFootnoteDefinition`][Name::GfmFootnoteDefinition] +//! * [`GfmFootnoteDefinitionLabel`][Name::GfmFootnoteDefinitionLabel] +//! * [`GfmFootnoteDefinitionLabelMarker`][Name::GfmFootnoteDefinitionLabelMarker] +//! * [`GfmFootnoteDefinitionLabelString`][Name::GfmFootnoteDefinitionLabelString] +//! * [`GfmFootnoteDefinitionMarker`][Name::GfmFootnoteDefinitionMarker] +//! * [`GfmFootnoteDefinitionPrefix`][Name::GfmFootnoteDefinitionPrefix] +//! * [`SpaceOrTab`][Name::SpaceOrTab] +//! +//! ## References +//! +//! * [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote) +//! +//! > 👉 **Note**: Footnotes are not specified in GFM yet. +//! > See [`github/cmark-gfm#270`](https://github.com/github/cmark-gfm/issues/270) +//! > for the related issue. +//! +//! [document]: crate::construct::document +//! [string]: crate::construct::string +//! [character_reference]: crate::construct::character_reference +//! [character_escape]: crate::construct::character_escape +//! [label]: crate::construct::partial_label +//! [label_end]: crate::construct::label_end +//! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote +//! [commonmark_block]: https://spec.commonmark.org/0.30/#phase-1-block-structure +//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element +//! 
[html_h]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [html_li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element +//! [html_ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element +//! [html_p]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element +//! [html_section]: https://html.spec.whatwg.org/multipage/sections.html#the-section-element +//! [html_sup]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-sub-and-sup-elements + +use crate::construct::partial_space_or_tab::space_or_tab_min_max; +use crate::event::Name; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{ + constant::TAB_SIZE, + normalize_identifier::normalize_identifier, + skip, + slice::{Position, Slice}, +}; + +/// Start of GFM footnote definition. +/// +/// ```markdown +/// > | [^a]: b +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + if tokenizer + .parse_state + .options + .constructs + .gfm_footnote_definition + { + tokenizer.enter(Name::GfmFootnoteDefinition); + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt( + State::Next(StateName::GfmFootnoteDefinitionLabelBefore), + State::Nok, + ); + State::Retry(space_or_tab_min_max( + tokenizer, + 1, + if tokenizer.parse_state.options.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )) + } else { + State::Retry(StateName::GfmFootnoteDefinitionLabelBefore) + } + } else { + State::Nok + } +} + +/// Before definition label (after optional whitespace). 
+/// +/// ```markdown +/// > | [^a]: b +/// ^ +/// ``` +pub fn label_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'[') => { + tokenizer.tokenize_state.token_1 = Name::GfmFootnoteDefinitionLabel; + tokenizer.tokenize_state.token_2 = Name::GfmFootnoteDefinitionLabelMarker; + tokenizer.tokenize_state.token_3 = Name::GfmFootnoteDefinitionLabelString; + tokenizer.tokenize_state.token_4 = Name::GfmFootnoteDefinitionMarker; + tokenizer.tokenize_state.marker = b'^'; + tokenizer.enter(Name::GfmFootnoteDefinitionPrefix); + tokenizer.attempt( + State::Next(StateName::GfmFootnoteDefinitionLabelAfter), + State::Nok, + ); + State::Retry(StateName::LabelStart) + } + _ => State::Nok, + } +} + +/// After definition label. +/// +/// ```markdown +/// > | [^a]: b +/// ^ +/// ``` +pub fn label_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.token_1 = Name::Data; + tokenizer.tokenize_state.token_2 = Name::Data; + tokenizer.tokenize_state.token_3 = Name::Data; + tokenizer.tokenize_state.token_4 = Name::Data; + tokenizer.tokenize_state.marker = 0; + + match tokenizer.current { + Some(b':') => { + let end = skip::to_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Name::GfmFootnoteDefinitionLabelString], + ); + + // Note: we don’t care about virtual spaces, so `as_str` is fine. + let id = normalize_identifier( + Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event(&tokenizer.events, end), + ) + .as_str(), + ); + + // Note: we don’t care about uniqueness. + // It’s likely that that doesn’t happen very frequently. + // It is more likely that it wastes precious time. 
+ tokenizer.tokenize_state.gfm_footnote_definitions.push(id); + + tokenizer.enter(Name::DefinitionMarker); + tokenizer.consume(); + tokenizer.exit(Name::DefinitionMarker); + tokenizer.attempt( + State::Next(StateName::GfmFootnoteDefinitionWhitespaceAfter), + State::Nok, + ); + // Any whitespace after the marker is eaten, forming indented code + // is not possible. + // No space is also fine, just like a block quote marker. + State::Next(space_or_tab_min_max(tokenizer, 0, usize::MAX)) + } + _ => State::Nok, + } +} + +/// After definition prefix. +/// +/// ```markdown +/// > | [^a]: b +/// ^ +/// ``` +pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.exit(Name::GfmFootnoteDefinitionPrefix); + State::Ok +} + +/// Start of footnote definition continuation. +/// +/// ```markdown +/// | [^a]: b +/// > | c +/// ^ +/// ``` +pub fn cont_start(tokenizer: &mut Tokenizer) -> State { + tokenizer.check( + State::Next(StateName::GfmFootnoteDefinitionContBlank), + State::Next(StateName::GfmFootnoteDefinitionContFilled), + ); + State::Retry(StateName::BlankLineStart) +} + +/// Start of footnote definition continuation, at a blank line. +/// +/// ```markdown +/// | [^a]: b +/// > | ␠␠␊ +/// ^ +/// ``` +pub fn cont_blank(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + State::Retry(space_or_tab_min_max(tokenizer, 0, TAB_SIZE)) + } else { + State::Ok + } +} + +/// Start of footnote definition continuation, at a filled line. +/// +/// ```markdown +/// | [^a]: b +/// > | c +/// ^ +/// ``` +pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + // Consume exactly `TAB_SIZE`. + State::Retry(space_or_tab_min_max(tokenizer, TAB_SIZE, TAB_SIZE)) + } else { + State::Nok + } +} -- cgit