diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-08-31 16:50:20 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-08-31 16:50:20 +0200 |
commit | b1590a4fb0c28fdb6af866ea79c186ea57284493 (patch) | |
tree | 61264dc36135e7dae34a04992a99b9f3f71e7b8e /src/construct | |
parent | 670f1d82e01ea2394b21d7d1857f41bdc67b3fce (diff) | |
download | markdown-rs-b1590a4fb0c28fdb6af866ea79c186ea57284493.tar.gz markdown-rs-b1590a4fb0c28fdb6af866ea79c186ea57284493.tar.bz2 markdown-rs-b1590a4fb0c28fdb6af866ea79c186ea57284493.zip |
Add support for GFM tables
Diffstat (limited to 'src/construct')
-rw-r--r-- | src/construct/document.rs | 16 | ||||
-rw-r--r-- | src/construct/flow.rs | 41 | ||||
-rw-r--r-- | src/construct/gfm_autolink_literal.rs | 1 | ||||
-rw-r--r-- | src/construct/gfm_footnote_definition.rs | 2 | ||||
-rw-r--r-- | src/construct/gfm_table.rs | 1042 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 1 | ||||
-rw-r--r-- | src/construct/label_end.rs | 1 | ||||
-rw-r--r-- | src/construct/mod.rs | 2 |
8 files changed, 1077 insertions, 29 deletions
diff --git a/src/construct/document.rs b/src/construct/document.rs index 9c76e46..e31e58d 100644 --- a/src/construct/document.rs +++ b/src/construct/document.rs @@ -269,6 +269,14 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State { exit_containers(tokenizer, &Phase::Prefix); } + // We are “piercing” into the flow with a new container. + tokenizer + .tokenize_state + .document_child + .as_mut() + .unwrap() + .pierce = true; + tokenizer .tokenize_state .document_container_stack @@ -398,12 +406,11 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { let mut stack_index = child.stack.len(); // Use two algo’s: one for when we’re suspended or in multiline things - // like definitions, another (b) for when we fed the line ending and closed - // a) + // like definitions, another for when we fed the line ending and closed. while !document_lazy_continuation_current && stack_index > 0 { stack_index -= 1; let name = &child.stack[stack_index]; - if name == &Name::Paragraph || name == &Name::Definition { + if name == &Name::Paragraph || name == &Name::Definition || name == &Name::GfmTableHead { document_lazy_continuation_current = true; } } @@ -418,6 +425,9 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State { } } + // Reset “piercing”. + child.pierce = false; + if child.lazy && tokenizer.tokenize_state.document_lazy_accepting_before && document_lazy_continuation_current diff --git a/src/construct/flow.rs b/src/construct/flow.rs index 3f1cd77..3f7bc9c 100644 --- a/src/construct/flow.rs +++ b/src/construct/flow.rs @@ -65,29 +65,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::HtmlFlowStart) } - // Note: `-` is also used in thematic breaks so it’s not included here. - Some(b'=') => { - tokenizer.attempt( - State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), - ); - State::Retry(StateName::HeadingSetextStart) - } - Some(b'[') => { - tokenizer.attempt( - State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeParagraph), - ); - State::Retry(StateName::DefinitionStart) - } // Actual parsing: blank line? Indented code? Indented anything? - // Also includes `-` which can be a setext heading underline or thematic break. - None | Some(b'\t' | b'\n' | b' ' | b'-') => State::Retry(StateName::FlowBlankLineBefore), - // Must be a paragraph. - Some(_) => { - tokenizer.attempt(State::Next(StateName::FlowAfter), State::Nok); - State::Retry(StateName::ParagraphStart) - } + // Tables, setext heading underlines, definitions, and paragraphs are + // particularly weird. + _ => State::Retry(StateName::FlowBlankLineBefore), } } @@ -185,11 +166,25 @@ pub fn before_heading_setext(tokenizer: &mut Tokenizer) -> State { pub fn before_thematic_break(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( State::Next(StateName::FlowAfter), - State::Next(StateName::FlowBeforeDefinition), + State::Next(StateName::FlowBeforeGfmTable), ); State::Retry(StateName::ThematicBreakStart) } +/// At GFM table. +/// +/// ```markdown +/// > | | a | +/// ^ +/// ``` +pub fn before_gfm_table(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt( + State::Next(StateName::FlowAfter), + State::Next(StateName::FlowBeforeDefinition), + ); + State::Retry(StateName::GfmTableStart) +} + /// At definition. /// /// ```markdown diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index 7fdeb01..704c536 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -5,7 +5,6 @@ use crate::tokenizer::Tokenizer; use crate::util::classify_character::{classify, Kind as CharacterKind}; use crate::util::slice::{Position, Slice}; use alloc::vec::Vec; -extern crate std; use core::str; // To do: doc al functions. diff --git a/src/construct/gfm_footnote_definition.rs b/src/construct/gfm_footnote_definition.rs index 3715044..cbe816f 100644 --- a/src/construct/gfm_footnote_definition.rs +++ b/src/construct/gfm_footnote_definition.rs @@ -141,7 +141,7 @@ //! //! ## References //! -//! * [`micromark-extension-gfm-task-list-item`](https://github.com/micromark/micromark-extension-gfm-footnote) +//! * [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote) //! //! > 👉 **Note**: Footnotes are not specified in GFM yet. //! > See [`github/cmark-gfm#270`](https://github.com/github/cmark-gfm/issues/270) diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs new file mode 100644 index 0000000..d7c2b69 --- /dev/null +++ b/src/construct/gfm_table.rs @@ -0,0 +1,1042 @@ +//! GFM: table occurs in the [flow][] content type. +//! +//! ## Grammar +//! +//! Tables form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>): +//! +//! ```bnf +//! gfm_table ::= gfm_table_head 0*(eol gfm_table_body_row) +//! +//! ; Restriction: both rows must have the same number of cells. +//! gfm_table_head ::= gfm_table_row eol gfm_table_delimiter_row +//! +//! gfm_table_row ::= ['|'] gfm_table_cell 0*('|' gfm_table_cell) ['|'] *space_or_tab +//! gfm_table_cell ::= *space_or_tab gfm_table_text *space_or_tab +//! gfm_table_text ::= 0*(line - '\\' - '|' | '\\' ['\\' | '|']) +// +//! gfm_table_delimiter_row ::= ['|'] gfm_table_delimiter_cell 0*('|' gfm_table_delimiter_cell) ['|'] *space_or_tab +//! gfm_table_delimiter_cell ::= *space_or_tab gfm_table_delimiter_value *space_or_tab +//! gfm_table_delimiter_value ::= [':'] 1*'-' [':'] +//! ``` +//! +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file). +//! +//! The above grammar shows that basically anything can be a cell or a row. +//! The main thing that makes something a row, is that it occurs directly before +//! or after a delimiter row, or after another row. +//! +//! It is not required for a table to have a body: it can end right after the +//! delimiter row. +//! +//! Each column can be marked with an alignment. +//! The alignment marker is a colon (`:`) used before and/or after delimiter row +//! filler. +//! To illustrate: +//! +//! ```markdown +//! | none | left | right | center | +//! | ---- | :--- | ----: | :----: | +//! ``` +//! +//! The number of cells in the delimiter row, is the number of columns of the +//! table. +//! Only the head row is required to have the same number of cells. +//! Body rows are not required to have a certain number of cells. +//! For body rows that have less cells than the number of columns of the table, +//! empty cells are injected. +//! When a row has more cells than the number of columns of the table, the +//! superfluous cells are dropped. +//! To illustrate: +//! +//! ```markdown +//! | a | b | +//! | - | - | +//! | c | +//! | d | e | f | +//! ``` +//! +//! Yields: +//! +//! ```html +//! <table> +//! <thead> +//! <tr> +//! <th>a</th> +//! <th>b</th> +//! </tr> +//! </thead> +//! <tbody> +//! <tr> +//! <td>c</td> +//! <td></td> +//! </tr> +//! <tr> +//! <td>d</td> +//! <td>e</td> +//! </tr> +//! </tbody> +//! </table> +//! ``` +//! +//! Each cell’s text is interpreted as the [text][] content type. +//! That means that it can include constructs such as [attention][attention]. +//! +//! The grammar for cells prohibits the use of `|` in them. +//! To use pipes in cells, encode them as a character reference or character +//! escape: `|` (or `|`, `|`, `|`, `|`) or +//! `\|`. +//! +//! Escapes will typically work, but they are not supported in +//! [code (text)][raw_text] (and the math (text) extension). +//! To work around this, GitHub came up with a rather weird “trick”. +//! When inside a table cell *and* inside code, escaped pipes *are* decoded. +//! To illustrate: +//! +//! ```markdown +//! | Name | Character | +//! | - | - | +//! | Left curly brace | `{` | +//! | Pipe | `\|` | +//! | Right curly brace | `}` | +//! ``` +//! +//! Yields: +//! +//! ```html +//! <table> +//! <thead> +//! <tr> +//! <th>Name</th> +//! <th>Character</th> +//! </tr> +//! </thead> +//! <tbody> +//! <tr> +//! <td>Left curly brace</td> +//! <td><code>{</code></td> +//! </tr> +//! <tr> +//! <td>Pipe</td> +//! <td><code>|</code></td> +//! </tr> +//! <tr> +//! <td>Right curly brace</td> +//! <td><code>}</code></td> +//! </tr> +//! </tbody> +//! </table> +//! ``` +//! +//! > 👉 **Note**: no other character can be escaped like this. +//! > Escaping pipes in code does not work when not inside a table, either. +//! +//! ## HTML +//! +//! GFM tables relate to several HTML elements: `<table>`, `<tbody>`, `<td>`, +//! `<th>`, `<thead>`, and `<tr>`. +//! See +//! [*§ 4.9.1 The `table` element*][html_table], +//! [*§ 4.9.5 The `tbody` element*][html_tbody], +//! [*§ 4.9.9 The `td` element*][html_td], +//! [*§ 4.9.10 The `th` element*][html_th], +//! [*§ 4.9.6 The `thead` element*][html_thead], and +//! [*§ 4.9.8 The `tr` element*][html_tr] +//! in the HTML spec for more info. +//! +//! If the the alignment of a column is left, right, or center, a deprecated +//! `align` attribute is added to each `<th>` and `<td>` element belonging to +//! that column. +//! That attribute is interpreted by browsers as if a CSS `text-align` property +//! was included, with its value set to that same keyword. +//! +//! ## Recommendation +//! +//! When authoring markdown with GFM tables, it’s recommended to *always* put +//! pipes around cells. +//! Without them, it can be hard to infer whether the table will work, how many +//! columns there are, and which column you are currently editing. +//! +//! It is recommended to not use many columns, as it results in very long lines, +//! making it hard to infer which column you are currently editing. +//! +//! For larger tables, particularly when cells vary in size, it is recommended +//! *not* to manually “pad” cell text. +//! While it can look better, it results in a lot of time spent realigning +//! everything when a new, longer cell is added or the longest cell removed, as +//! every row then must be changed. +//! Other than costing time, it also causes large diffs in Git. +//! +//! To illustrate, when authoring large tables, it is discouraged to pad cells +//! like this: +//! +//! ```markdown +//! | Alpha bravo charlie | delta | +//! | ------------------- | -----------------: | +//! | Echo | Foxtrot golf hotel | +//! ``` +//! +//! Instead, use single spaces (and single filler dashes): +//! +//! ```markdown +//! | Alpha bravo charlie | delta | +//! | - | -: | +//! | Echo | Foxtrot golf hotel | +//! ``` +//! +//! ## Bugs +//! +//! GitHub’s own algorithm to parse tables contains a bug. +//! This bug is not present in this project. +//! The issue relating to tables is: +//! +//! * [GFM tables: escaped escapes are incorrectly treated as escapes](https://github.com/github/cmark-gfm/issues/277)\ +//! +//! ## Tokens +//! +//! * [`GfmTable`][Name::GfmTable] +//! * [`GfmTableBody`][Name::GfmTableBody] +//! * [`GfmTableCell`][Name::GfmTableCell] +//! * [`GfmTableCellDivider`][Name::GfmTableCellDivider] +//! * [`GfmTableCellText`][Name::GfmTableCellText] +//! * [`GfmTableDelimiterCell`][Name::GfmTableDelimiterCell] +//! * [`GfmTableDelimiterCellValue`][Name::GfmTableDelimiterCellValue] +//! * [`GfmTableDelimiterFiller`][Name::GfmTableDelimiterFiller] +//! * [`GfmTableDelimiterMarker`][Name::GfmTableDelimiterMarker] +//! * [`GfmTableDelimiterRow`][Name::GfmTableDelimiterRow] +//! * [`GfmTableHead`][Name::GfmTableHead] +//! * [`GfmTableRow`][Name::GfmTableRow] +//! * [`LineEnding`][Name::LineEnding] +//! +//! ## References +//! +//! * [`micromark-extension-gfm-table`](https://github.com/micromark/micromark-extension-gfm-table) +//! * [*§ 4.10 Tables (extension)* in `GFM`](https://github.github.com/gfm/#tables-extension-) +//! +//! [flow]: crate::construct::flow +//! [text]: crate::construct::text +//! [attention]: crate::construct::attention +//! [raw_text]: crate::construct::raw_text +//! [html_table]: https://html.spec.whatwg.org/multipage/tables.html#the-table-element +//! [html_tbody]: https://html.spec.whatwg.org/multipage/tables.html#the-tbody-element +//! [html_td]: https://html.spec.whatwg.org/multipage/tables.html#the-td-element +//! [html_th]: https://html.spec.whatwg.org/multipage/tables.html#the-th-element +//! [html_thead]: https://html.spec.whatwg.org/multipage/tables.html#the-thead-element +//! [html_tr]: https://html.spec.whatwg.org/multipage/tables.html#the-tr-element + +use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; +use crate::event::{Content, Event, Kind, Link, Name}; +use crate::resolve::Name as ResolveName; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; +use alloc::vec; + +/// Start of a GFM table. +/// +/// If there is a valid table row or table head before, then we try to parse +/// another row. +/// Otherwise, we try to parse a head. +/// +/// ```markdown +/// > | | a | +/// ^ +/// | | - | +/// > | | b | +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + if tokenizer.parse_state.options.constructs.gfm_table { + if !tokenizer.pierce + && !tokenizer.events.is_empty() + && matches!( + tokenizer.events[skip_opt_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Name::LineEnding, Name::SpaceOrTab], + )] + .name, + Name::GfmTableHead | Name::GfmTableRow + ) + { + State::Retry(StateName::GfmTableBodyRowStart) + } else { + State::Retry(StateName::GfmTableHeadRowBefore) + } + } else { + State::Nok + } +} + +/// Before table head row. +/// +/// ```markdown +/// > | | a | +/// ^ +/// | | - | +/// | | b | +/// ``` +pub fn head_row_before(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::GfmTableHead); + tokenizer.enter(Name::GfmTableRow); + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt(State::Next(StateName::GfmTableHeadRowStart), State::Nok); + State::Retry(space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.options.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )) + } else { + State::Retry(StateName::GfmTableHeadRowStart) + } +} + +/// Before table head row, after whitespace. +/// +/// ```markdown +/// > | | a | +/// ^ +/// | | - | +/// | | b | +/// ``` +pub fn head_row_start(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // 4+ spaces. + Some(b'\t' | b' ') => State::Nok, + Some(b'|') => State::Retry(StateName::GfmTableHeadRowBreak), + _ => { + tokenizer.tokenize_state.seen = true; + State::Retry(StateName::GfmTableHeadRowBreak) + } + } +} + +/// At break in table head row. +/// +/// ```markdown +/// > | | a | +/// ^ +/// ^ +/// ^ +/// | | - | +/// | | b | +/// ``` +pub fn head_row_break(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None => { + tokenizer.tokenize_state.seen = false; + State::Nok + } + Some(b'\n') => { + // Feel free to interrupt: + tokenizer.interrupt = true; + tokenizer.exit(Name::GfmTableRow); + tokenizer.enter(Name::LineEnding); + tokenizer.consume(); + tokenizer.exit(Name::LineEnding); + State::Next(StateName::GfmTableHeadDelimiterStart) + } + Some(b'\t' | b' ') => { + tokenizer.attempt(State::Next(StateName::GfmTableHeadRowBreak), State::Nok); + State::Retry(space_or_tab(tokenizer)) + } + _ => { + // Whether a delimiter was seen. + if tokenizer.tokenize_state.seen { + tokenizer.tokenize_state.seen = false; + // Header cell count. + tokenizer.tokenize_state.size += 1; + } + + if tokenizer.current == Some(b'|') { + tokenizer.enter(Name::GfmTableCellDivider); + tokenizer.consume(); + tokenizer.exit(Name::GfmTableCellDivider); + // Whether a delimiter was seen. + tokenizer.tokenize_state.seen = true; + State::Next(StateName::GfmTableHeadRowBreak) + } else { + // Anything else is cell data. + tokenizer.enter(Name::Data); + State::Retry(StateName::GfmTableHeadRowData) + } + } + } +} + +/// In table head row data. +/// +/// ```markdown +/// > | | a | +/// ^ +/// | | - | +/// | | b | +/// ``` +pub fn head_row_data(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\t' | b'\n' | b' ' | b'|') => { + tokenizer.exit(Name::Data); + State::Retry(StateName::GfmTableHeadRowBreak) + } + _ => { + let name = if tokenizer.current == Some(b'\\') { + StateName::GfmTableHeadRowEscape + } else { + StateName::GfmTableHeadRowData + }; + tokenizer.consume(); + State::Next(name) + } + } +} + +/// In table head row escape. +/// +/// ```markdown +/// > | | a\-b | +/// ^ +/// | | ---- | +/// | | c | +/// ``` +pub fn head_row_escape(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\\' | b'|') => { + tokenizer.consume(); + State::Next(StateName::GfmTableHeadRowData) + } + _ => State::Retry(StateName::GfmTableHeadRowData), + } +} + +/// Before delimiter row. +/// +/// ```markdown +/// | | a | +/// > | | - | +/// ^ +/// | | b | +/// ``` +pub fn head_delimiter_start(tokenizer: &mut Tokenizer) -> State { + // Reset `interrupt`. + tokenizer.interrupt = false; + + if tokenizer.lazy || tokenizer.pierce { + State::Nok + } else { + tokenizer.enter(Name::GfmTableDelimiterRow); + // Track if we’ve seen a `:` or `|`. + tokenizer.tokenize_state.seen = false; + + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.attempt( + State::Next(StateName::GfmTableHeadDelimiterBefore), + State::Next(StateName::GfmTableHeadDelimiterNok), + ); + + State::Retry(space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.options.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )) + } + _ => State::Retry(StateName::GfmTableHeadDelimiterBefore), + } + } +} + +/// Before delimiter row, after optional whitespace. +/// +/// Reused when a `|` is found later, to parse another cell. +/// +/// ```markdown +/// | | a | +/// > | | - | +/// ^ +/// | | b | +/// ``` +pub fn head_delimiter_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'-' | b':') => State::Retry(StateName::GfmTableHeadDelimiterValueBefore), + Some(b'|') => { + tokenizer.tokenize_state.seen = true; + // If we start with a pipe, we open a cell marker. + tokenizer.enter(Name::GfmTableCellDivider); + tokenizer.consume(); + tokenizer.exit(Name::GfmTableCellDivider); + State::Next(StateName::GfmTableHeadDelimiterCellBefore) + } + // More whitespace / empty row not allowed at start. + _ => State::Retry(StateName::GfmTableHeadDelimiterNok), + } +} + +/// After `|`, before delimiter cell. +/// +/// ```markdown +/// | | a | +/// > | | - | +/// ^ +/// ``` +pub fn head_delimiter_cell_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.attempt( + State::Next(StateName::GfmTableHeadDelimiterValueBefore), + State::Nok, + ); + State::Retry(space_or_tab(tokenizer)) + } + _ => State::Retry(StateName::GfmTableHeadDelimiterValueBefore), + } +} + +/// Before delimiter cell value. +/// +/// ```markdown +/// | | a | +/// > | | - | +/// ^ +/// ``` +pub fn head_delimiter_value_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => State::Retry(StateName::GfmTableHeadDelimiterCellAfter), + Some(b':') => { + // Align: left. + tokenizer.tokenize_state.size_b += 1; + tokenizer.tokenize_state.seen = true; + tokenizer.enter(Name::GfmTableDelimiterMarker); + tokenizer.consume(); + tokenizer.exit(Name::GfmTableDelimiterMarker); + State::Next(StateName::GfmTableHeadDelimiterLeftAlignmentAfter) + } + Some(b'-') => { + // Align: none. + tokenizer.tokenize_state.size_b += 1; + State::Retry(StateName::GfmTableHeadDelimiterLeftAlignmentAfter) + } + _ => State::Retry(StateName::GfmTableHeadDelimiterNok), + } +} + +/// After delimiter cell left alignment marker. +/// +/// ```markdown +/// | | a | +/// > | | :- | +/// ^ +/// ``` +pub fn head_delimiter_left_alignment_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'-') => { + tokenizer.enter(Name::GfmTableDelimiterFiller); + State::Retry(StateName::GfmTableHeadDelimiterFiller) + } + // Anything else is not ok after the left-align colon. + _ => State::Retry(StateName::GfmTableHeadDelimiterNok), + } +} + +/// In delimiter cell filler. +/// +/// ```markdown +/// | | a | +/// > | | - | +/// ^ +/// ``` +pub fn head_delimiter_filler(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'-') => { + tokenizer.consume(); + State::Next(StateName::GfmTableHeadDelimiterFiller) + } + Some(b':') => { + // Align is `center` if it was `left`, `right` otherwise. + tokenizer.tokenize_state.seen = true; + tokenizer.exit(Name::GfmTableDelimiterFiller); + tokenizer.enter(Name::GfmTableDelimiterMarker); + tokenizer.consume(); + tokenizer.exit(Name::GfmTableDelimiterMarker); + State::Next(StateName::GfmTableHeadDelimiterRightAlignmentAfter) + } + _ => { + tokenizer.exit(Name::GfmTableDelimiterFiller); + State::Retry(StateName::GfmTableHeadDelimiterRightAlignmentAfter) + } + } +} + +/// After delimiter cell right alignment marker. +/// +/// ```markdown +/// | | a | +/// > | | -: | +/// ^ +/// ``` +pub fn head_delimiter_right_alignment_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.attempt( + State::Next(StateName::GfmTableHeadDelimiterCellAfter), + State::Nok, + ); + State::Retry(space_or_tab(tokenizer)) + } + _ => State::Retry(StateName::GfmTableHeadDelimiterCellAfter), + } +} + +/// After delimiter cell. +/// +/// ```markdown +/// | | a | +/// > | | -: | +/// ^ +/// ``` +pub fn head_delimiter_cell_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + // Exit when: + // * there was no `:` or `|` at all (it’s a thematic break or setext + // underline instead) + // * the header cell count is not the delimiter cell count + if !tokenizer.tokenize_state.seen + || tokenizer.tokenize_state.size != tokenizer.tokenize_state.size_b + { + State::Retry(StateName::GfmTableHeadDelimiterNok) + } else { + // Reset. + tokenizer.tokenize_state.seen = false; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.size_b = 0; + tokenizer.exit(Name::GfmTableDelimiterRow); + tokenizer.exit(Name::GfmTableHead); + tokenizer.register_resolver(ResolveName::GfmTable); + State::Ok + } + } + Some(b'|') => State::Retry(StateName::GfmTableHeadDelimiterBefore), + _ => State::Retry(StateName::GfmTableHeadDelimiterNok), + } +} + +/// In delimiter row, at a disallowed byte. +/// +/// ```markdown +/// | | a | +/// > | | x | +/// ^ +/// ``` +pub fn head_delimiter_nok(tokenizer: &mut Tokenizer) -> State { + // Reset. + tokenizer.tokenize_state.seen = false; + tokenizer.tokenize_state.size = 0; + tokenizer.tokenize_state.size_b = 0; + State::Nok +} + +/// Before table body row. +/// +/// ```markdown +/// | | a | +/// | | - | +/// > | | b | +/// ^ +/// ``` +pub fn body_row_start(tokenizer: &mut Tokenizer) -> State { + if tokenizer.lazy { + State::Nok + } else { + tokenizer.enter(Name::GfmTableRow); + + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.attempt(State::Next(StateName::GfmTableBodyRowBefore), State::Nok); + + State::Retry(space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.options.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )) + } + _ => State::Retry(StateName::GfmTableBodyRowBefore), + } + } +} + +/// Before table body row, after optional whitespace. +/// +/// ```markdown +/// | | a | +/// | | - | +/// > | | b | +/// ^ +/// ``` +pub fn body_row_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\t' | b' ') => State::Nok, + _ => State::Retry(StateName::GfmTableBodyRowBreak), + } +} + +/// At break in table body row. +/// +/// ```markdown +/// | | a | +/// | | - | +/// > | | b | +/// ^ +/// ^ +/// ^ +/// ``` +pub fn body_row_break(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\n') => { + tokenizer.exit(Name::GfmTableRow); + State::Ok + } + Some(b'\t' | b' ') => { + tokenizer.attempt(State::Next(StateName::GfmTableBodyRowBreak), State::Nok); + State::Retry(space_or_tab(tokenizer)) + } + Some(b'|') => { + tokenizer.enter(Name::GfmTableCellDivider); + tokenizer.consume(); + tokenizer.exit(Name::GfmTableCellDivider); + State::Next(StateName::GfmTableBodyRowBreak) + } + // Anything else is cell content. + _ => { + tokenizer.enter(Name::Data); + State::Retry(StateName::GfmTableBodyRowData) + } + } +} + +/// In table body row data. +/// +/// ```markdown +/// | | a | +/// | | - | +/// > | | b | +/// ^ +/// ``` +pub fn body_row_data(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + None | Some(b'\t' | b'\n' | b' ' | b'|') => { + tokenizer.exit(Name::Data); + State::Retry(StateName::GfmTableBodyRowBreak) + } + _ => { + let name = if tokenizer.current == Some(b'\\') { + StateName::GfmTableBodyRowEscape + } else { + StateName::GfmTableBodyRowData + }; + tokenizer.consume(); + State::Next(name) + } + } +} + +/// In table body row escape. +/// +/// ```markdown +/// | | a | +/// | | ---- | +/// > | | b\-c | +/// ^ +/// ``` +pub fn body_row_escape(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\\' | b'|') => { + tokenizer.consume(); + State::Next(StateName::GfmTableBodyRowData) + } + _ => State::Retry(StateName::GfmTableBodyRowData), + } +} + +/// Resolve GFM table. +pub fn resolve(tokenizer: &mut Tokenizer) { + let mut index = 0; + // let mut tables = vec![]; + let mut in_first_cell_awaiting_pipe = true; + let mut in_row = false; + let mut in_delimiter_row = false; + let mut last_cell = (0, 0, 0, 0); + let mut cell = (0, 0, 0, 0); + + let mut after_head_awaiting_first_body_row = false; + let mut last_table_end = 0; + let mut last_table_has_body = false; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.kind == Kind::Enter { + // Start of head. + if event.name == Name::GfmTableHead { + after_head_awaiting_first_body_row = false; + + // Inject previous (body end and) table end. + if last_table_end != 0 { + flush_table_end(tokenizer, last_table_end, last_table_has_body); + last_table_has_body = false; + last_table_end = 0; + } + + // Inject table start. + tokenizer.map.add( + index, + 0, + vec![Event { + kind: Kind::Enter, + name: Name::GfmTable, + point: tokenizer.events[index].point.clone(), + link: None, + }], + ); + } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + in_delimiter_row = event.name == Name::GfmTableDelimiterRow; + in_row = true; + in_first_cell_awaiting_pipe = true; + last_cell = (0, 0, 0, 0); + cell = (0, index + 1, 0, 0); + + // Inject table body start. + if after_head_awaiting_first_body_row { + after_head_awaiting_first_body_row = false; + last_table_has_body = true; + tokenizer.map.add( + index, + 0, + vec![Event { + kind: Kind::Enter, + name: Name::GfmTableBody, + point: tokenizer.events[index].point.clone(), + link: None, + }], + ); + } + } + // Cell data. + else if in_row + && (event.name == Name::Data + || event.name == Name::GfmTableDelimiterMarker + || event.name == Name::GfmTableDelimiterFiller) + { + in_first_cell_awaiting_pipe = false; + + // First value in cell. + if cell.2 == 0 { + if last_cell.1 != 0 { + cell.0 = cell.1; + flush_cell(tokenizer, last_cell, in_delimiter_row, None); + last_cell = (0, 0, 0, 0); + } + + cell.2 = index; + } + } else if event.name == Name::GfmTableCellDivider { + if in_first_cell_awaiting_pipe { + in_first_cell_awaiting_pipe = false; + } else { + if last_cell.1 != 0 { + cell.0 = cell.1; + flush_cell(tokenizer, last_cell, in_delimiter_row, None); + } + + last_cell = cell; + cell = (last_cell.1, index, 0, 0); + } + } + // Exit events. + } else if event.name == Name::GfmTableHead { + after_head_awaiting_first_body_row = true; + last_table_end = index; + } else if event.name == Name::GfmTableRow || event.name == Name::GfmTableDelimiterRow { + in_row = false; + last_table_end = index; + if last_cell.1 != 0 { + cell.0 = cell.1; + flush_cell(tokenizer, last_cell, in_delimiter_row, Some(index)); + } else if cell.1 != 0 { + flush_cell(tokenizer, cell, in_delimiter_row, Some(index)); + } + } else if in_row + && (event.name == Name::Data + || event.name == Name::GfmTableDelimiterMarker + || event.name == Name::GfmTableDelimiterFiller) + { + cell.3 = index; + } + + index += 1; + } + + if last_table_end != 0 { + flush_table_end(tokenizer, last_table_end, last_table_has_body); + } +} + +/// Generate a cell. +fn flush_cell( + tokenizer: &mut Tokenizer, + range: (usize, usize, usize, usize), + in_delimiter_row: bool, + row_end: Option<usize>, +) { + let group_name = if in_delimiter_row { + Name::GfmTableDelimiterCell + } else { + Name::GfmTableCell + }; + let value_name = if in_delimiter_row { + Name::GfmTableDelimiterCellValue + } else { + Name::GfmTableCellText + }; + + // Insert an exit for the previous cell, if there is one. + // + // ```markdown + // > | | aa | bb | cc | + // ^-- exit + // ^^^^-- this cell + // ``` + if range.0 != 0 { + tokenizer.map.add( + range.0, + 0, + vec![Event { + kind: Kind::Exit, + name: group_name.clone(), + point: tokenizer.events[range.0].point.clone(), + link: None, + }], + ); + } + + // Insert enter of this cell. + // + // ```markdown + // > | | aa | bb | cc | + // ^-- enter + // ^^^^-- this cell + // ``` + tokenizer.map.add( + range.1, + 0, + vec![Event { + kind: Kind::Enter, + name: group_name.clone(), + point: tokenizer.events[range.1].point.clone(), + link: None, + }], + ); + + // Insert text start at first data start and end at last data end, and + // remove events between. + // + // ```markdown + // > | | aa | bb | cc | + // ^-- enter + // ^-- exit + // ^^^^-- this cell + // ``` + if range.2 != 0 { + tokenizer.map.add( + range.2, + 0, + vec![Event { + kind: Kind::Enter, + name: value_name.clone(), + point: tokenizer.events[range.2].point.clone(), + link: None, + }], + ); + debug_assert_ne!(range.3, 0); + + if !in_delimiter_row { + tokenizer.events[range.2].link = Some(Link { + previous: None, + next: None, + content: Content::Text, + }); + + if !in_delimiter_row && range.3 > range.2 + 1 { + let a = range.2 + 1; + let b = range.3 - range.2 - 1; + tokenizer.map.add(a, b, vec![]); + } + } + + tokenizer.map.add( + range.3 + 1, + 0, + vec![Event { + kind: Kind::Exit, + name: value_name, + point: tokenizer.events[range.3].point.clone(), + link: None, + }], + ); + } + + // Insert an exit for the last cell, if at the row end. + // + // ```markdown + // > | | aa | bb | cc | + // ^-- exit + // ^^^^^^-- this cell (the last one contains two “between” parts) + // ``` + if let Some(row_end) = row_end { + tokenizer.map.add( + row_end, + 0, + vec![Event { + kind: Kind::Exit, + name: group_name, + point: tokenizer.events[row_end].point.clone(), + link: None, + }], + ); + } +} + +/// Generate table end (and table body end). +fn flush_table_end(tokenizer: &mut Tokenizer, index: usize, body: bool) { + let mut exits = vec![]; + + if body { + exits.push(Event { + kind: Kind::Exit, + name: Name::GfmTableBody, + point: tokenizer.events[index].point.clone(), + link: None, + }); + } + + exits.push(Event { + kind: Kind::Exit, + name: Name::GfmTable, + point: tokenizer.events[index].point.clone(), + link: None, + }); + + tokenizer.map.add(index + 1, 0, exits); +} diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index df1d4fb..e9cc759 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -87,6 +87,7 @@ use alloc::vec; pub fn start(tokenizer: &mut Tokenizer) -> State { if tokenizer.parse_state.options.constructs.heading_setext && !tokenizer.lazy + && !tokenizer.pierce // Require a paragraph before. && (!tokenizer.events.is_empty() && tokenizer.events[skip_opt_back( diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index b5a6013..8a9edfb 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -191,7 +191,6 @@ use crate::util::{ slice::{Position, Slice}, }; use alloc::{string::String, vec}; -extern crate std; /// Start of label end. /// diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 9add015..de88174 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -60,6 +60,7 @@ //! * [frontmatter][] //! * [gfm autolink literal][gfm_autolink_literal] //! * [gfm footnote definition][gfm_footnote_definition] +//! * [gfm table][gfm_table] //! * [gfm task list item check][gfm_task_list_item_check] //! * [gfm label start footnote][gfm_label_start_footnote] //! * math (text) (in `raw_text`) @@ -151,6 +152,7 @@ pub mod frontmatter; pub mod gfm_autolink_literal; pub mod gfm_footnote_definition; pub mod gfm_label_start_footnote; +pub mod gfm_table; pub mod gfm_task_list_item_check; pub mod hard_break_escape; pub mod heading_atx; |