From 6e20c3e79d4270fafb13a63af51eaffaa45c11e1 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 15 Aug 2022 11:24:06 +0200 Subject: Refactor to rename list construct to list item --- src/constant.rs | 5 +- src/construct/blank_line.rs | 4 +- src/construct/list.rs | 460 ---------------------------------------- src/construct/list_item.rs | 460 ++++++++++++++++++++++++++++++++++++++++ src/construct/mod.rs | 4 +- src/construct/thematic_break.rs | 4 +- src/content/document.rs | 6 +- src/event.rs | 12 +- src/lib.rs | 6 +- src/resolve.rs | 2 +- src/state.rs | 60 +++--- 11 files changed, 512 insertions(+), 511 deletions(-) delete mode 100644 src/construct/list.rs create mode 100644 src/construct/list_item.rs (limited to 'src') diff --git a/src/constant.rs b/src/constant.rs index 6ef851c..47cb50c 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -202,7 +202,8 @@ pub const HTML_RAW_SIZE_MAX: usize = 8; /// To safeguard performance, labels are capped at a large number: `999`. pub const LINK_REFERENCE_SIZE_MAX: usize = 999; -/// The max number of decimals allowed to form an (ordered) [list][] item. +/// The max number of decimals allowed to form an (ordered) +/// [list item][list-item]. /// /// `CommonMark` caps this at 10 digits (9 is fine, 10 not). /// This limit is imposed because bigger numbers result in integer overflows @@ -212,7 +213,7 @@ pub const LINK_REFERENCE_SIZE_MAX: usize = 999; /// /// * [*§ 5.2 List items* in `CommonMark`](https://spec.commonmark.org/0.30/#ordered-list-marker) /// -/// [list]: crate::construct::list +/// [list-item]: crate::construct::list_item pub const LIST_ITEM_VALUE_SIZE_MAX: usize = 10; /// Maximum allowed unbalanced parens in destination. diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 7f1d023..87d257d 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -12,7 +12,7 @@ //! such as between two [heading (atx)][heading-atx]s. //! 
Sometimes, whether blank lines are present, changes the behavior of how //! HTML is rendered, such as whether blank lines are present between list -//! items in a [list][]. +//! items in a [list][list-item]. //! More than one blank line is never needed in `CommonMark`. //! //! Because blank lines can be empty (line endings are not considered part of @@ -28,7 +28,7 @@ //! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) //! //! [heading-atx]: crate::construct::heading_atx -//! [list]: crate::construct::list +//! [list-item]: crate::construct::list_item //! [paragraph]: crate::construct::paragraph //! [flow]: crate::content::flow diff --git a/src/construct/list.rs b/src/construct/list.rs deleted file mode 100644 index 596330c..0000000 --- a/src/construct/list.rs +++ /dev/null @@ -1,460 +0,0 @@ -//! List is a construct that occurs in the [document][] content type. -//! -//! It forms with, roughly, the following BNF: -//! -//! ```bnf -//! ; Restriction: there must be `eol | space_or_tab` after the start. -//! ; Restriction: if the first line after the marker is not blank and starts with `5( space_or_tab )`, -//! ; only the first `space_or_tab` is part of the start. -//! list_item_start ::= '*' | '+' | '-' | 1*9( ascii_decimal ) ( '.' | ')' ) [ 1*4 space_or_tab ] -//! ; Restriction: blank line allowed, except when this is the first continuation after a blank start. -//! ; Restriction: if not blank, the line must be indented, exactly `n` times. -//! list_item_cont ::= [ n( space_or_tab ) ] -//! ``` -//! -//! Further lines that are not prefixed with `list_item_cont` cause the item -//! to be exited, except when those lines are lazy continuation. -//! Like so many things in markdown, list (items) too, are very complex. -//! See [*§ Phase 1: block structure*][commonmark-block] for more on parsing -//! details. -//! -//! Lists relates to the `
<li>`, `<ol>`, and `<ul>
        ` elements in HTML. -//! See [*§ 4.4.8 The `li` element*][html-li], -//! [*§ 4.4.5 The `ol` element*][html-ol], and -//! [*§ 4.4.7 The `ul` element*][html-ul] in the HTML spec for more info. -//! -//! ## Tokens -//! -//! * [`ListItem`][Name::ListItem] -//! * [`ListItemMarker`][Name::ListItemMarker] -//! * [`ListItemPrefix`][Name::ListItemPrefix] -//! * [`ListItemValue`][Name::ListItemValue] -//! * [`ListOrdered`][Name::ListOrdered] -//! * [`ListUnordered`][Name::ListUnordered] -//! -//! ## References -//! -//! * [`list.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/list.js) -//! * [*§ 5.2 List items* in `CommonMark`](https://spec.commonmark.org/0.30/#list-items) -//! * [*§ 5.3 Lists* in `CommonMark`](https://spec.commonmark.org/0.30/#lists) -//! -//! [document]: crate::content::document -//! [html-li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element -//! [html-ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element -//! [html-ul]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ul-element -//! [commonmark-block]: https://spec.commonmark.org/0.30/#phase-1-block-structure - -use crate::constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE}; -use crate::construct::partial_space_or_tab::space_or_tab_min_max; -use crate::event::{Kind, Name}; -use crate::resolve::Name as ResolveName; -use crate::state::{Name as StateName, State}; -use crate::tokenizer::Tokenizer; -use crate::util::{ - skip, - slice::{Position, Slice}, -}; - -/// Start of list item. 
-/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.parse_state.constructs.list { - tokenizer.enter(Name::ListItem); - - if matches!(tokenizer.current, Some(b'\t' | b' ')) { - tokenizer.attempt(State::Next(StateName::ListBefore), State::Nok); - State::Retry(space_or_tab_min_max( - tokenizer, - 0, - if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }, - )) - } else { - State::Retry(StateName::ListBefore) - } - } else { - State::Nok - } -} - -/// After optional whitespace, at list item prefix. -/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn before(tokenizer: &mut Tokenizer) -> State { - // Unordered. - if matches!(tokenizer.current, Some(b'*' | b'-')) { - tokenizer.check(State::Nok, State::Next(StateName::ListBeforeUnordered)); - State::Retry(StateName::ThematicBreakStart) - } else if tokenizer.current == Some(b'+') { - State::Retry(StateName::ListBeforeUnordered) - } - // Ordered. - else if tokenizer.current == Some(b'1') - || (matches!(tokenizer.current, Some(b'0'..=b'9')) && !tokenizer.interrupt) - { - State::Retry(StateName::ListBeforeOrdered) - } else { - State::Nok - } -} - -/// At unordered list item marker. -/// -/// The line is not a thematic break. -/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn before_unordered(tokenizer: &mut Tokenizer) -> State { - tokenizer.enter(Name::ListItemPrefix); - State::Retry(StateName::ListMarker) -} - -/// At ordered list item value. -/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn before_ordered(tokenizer: &mut Tokenizer) -> State { - tokenizer.enter(Name::ListItemPrefix); - tokenizer.enter(Name::ListItemValue); - State::Retry(StateName::ListValue) -} - -/// In ordered list item value. -/// -/// ```markdown -/// > | 1. a -/// ^ -/// ``` -pub fn value(tokenizer: &mut Tokenizer) -> State { - if matches!(tokenizer.current, Some(b'.' 
| b')')) - && (!tokenizer.interrupt || tokenizer.tokenize_state.size < 2) - { - tokenizer.exit(Name::ListItemValue); - State::Retry(StateName::ListMarker) - } else if matches!(tokenizer.current, Some(b'0'..=b'9')) - && tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX - { - tokenizer.tokenize_state.size += 1; - tokenizer.consume(); - State::Next(StateName::ListValue) - } else { - tokenizer.tokenize_state.size = 0; - State::Nok - } -} - -/// At list item marker. -/// -/// ```markdown -/// > | * a -/// ^ -/// > | 1. b -/// ^ -/// ``` -pub fn marker(tokenizer: &mut Tokenizer) -> State { - tokenizer.enter(Name::ListItemMarker); - tokenizer.consume(); - tokenizer.exit(Name::ListItemMarker); - State::Next(StateName::ListMarkerAfter) -} - -/// After list item marker. -/// -/// ```markdown -/// > | * a -/// ^ -/// > | 1. b -/// ^ -/// ``` -pub fn marker_after(tokenizer: &mut Tokenizer) -> State { - tokenizer.tokenize_state.size = 1; - tokenizer.check( - State::Next(StateName::ListAfter), - State::Next(StateName::ListMarkerAfterFilled), - ); - State::Retry(StateName::BlankLineStart) -} - -/// After list item marker. -/// -/// The marker is not followed by a blank line. -/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn marker_after_filled(tokenizer: &mut Tokenizer) -> State { - tokenizer.tokenize_state.size = 0; - - // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace. - tokenizer.attempt( - State::Next(StateName::ListAfter), - State::Next(StateName::ListPrefixOther), - ); - State::Retry(StateName::ListWhitespace) -} - -/// After marker, at whitespace. -/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn whitespace(tokenizer: &mut Tokenizer) -> State { - tokenizer.attempt(State::Next(StateName::ListWhitespaceAfter), State::Nok); - State::Retry(space_or_tab_min_max(tokenizer, 1, TAB_SIZE)) -} - -/// After acceptable whitespace. 
-/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State { - if let Some(b'\t' | b' ') = tokenizer.current { - State::Nok - } else { - State::Ok - } -} - -/// After marker, followed by no indent or more indent that needed. -/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn prefix_other(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Some(b'\t' | b' ') => { - tokenizer.enter(Name::SpaceOrTab); - tokenizer.consume(); - tokenizer.exit(Name::SpaceOrTab); - State::Next(StateName::ListAfter) - } - _ => State::Nok, - } -} - -/// After list item prefix. -/// -/// ```markdown -/// > | * a -/// ^ -/// ``` -pub fn after(tokenizer: &mut Tokenizer) -> State { - let blank = tokenizer.tokenize_state.size == 1; - tokenizer.tokenize_state.size = 0; - - if blank && tokenizer.interrupt { - State::Nok - } else { - let start = skip::to_back( - &tokenizer.events, - tokenizer.events.len() - 1, - &[Name::ListItem], - ); - let mut prefix = Slice::from_position( - tokenizer.parse_state.bytes, - &Position { - start: &tokenizer.events[start].point, - end: &tokenizer.point, - }, - ) - .len(); - - if blank { - prefix += 1; - } - - let container = &mut tokenizer.tokenize_state.document_container_stack - [tokenizer.tokenize_state.document_continued]; - - container.blank_initial = blank; - container.size = prefix; - - tokenizer.exit(Name::ListItemPrefix); - tokenizer.register_resolver_before(ResolveName::List); - State::Ok - } -} - -/// Start of list item continuation. -/// -/// ```markdown -/// | * a -/// > | b -/// ^ -/// ``` -pub fn cont_start(tokenizer: &mut Tokenizer) -> State { - tokenizer.check( - State::Next(StateName::ListContBlank), - State::Next(StateName::ListContFilled), - ); - State::Retry(StateName::BlankLineStart) -} - -/// Start of blank list item continuation. 
-/// -/// ```markdown -/// | * a -/// > | -/// ^ -/// | b -/// ``` -pub fn cont_blank(tokenizer: &mut Tokenizer) -> State { - let container = &mut tokenizer.tokenize_state.document_container_stack - [tokenizer.tokenize_state.document_continued]; - let size = container.size; - - if container.blank_initial { - State::Nok - } else if matches!(tokenizer.current, Some(b'\t' | b' ')) { - // Consume, optionally, at most `size`. - State::Retry(space_or_tab_min_max(tokenizer, 0, size)) - } else { - State::Ok - } -} - -/// Start of non-blank list item continuation. -/// -/// ```markdown -/// | * a -/// > | b -/// ^ -/// ``` -pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { - let container = &mut tokenizer.tokenize_state.document_container_stack - [tokenizer.tokenize_state.document_continued]; - let size = container.size; - - container.blank_initial = false; - - if matches!(tokenizer.current, Some(b'\t' | b' ')) { - // Consume exactly `size`. - State::Retry(space_or_tab_min_max(tokenizer, size, size)) - } else { - State::Nok - } -} - -/// Find adjacent list items with the same marker. -pub fn resolve(tokenizer: &mut Tokenizer) { - let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; - let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; - let mut index = 0; - let mut balance = 0; - - // Merge list items. - while index < tokenizer.events.len() { - let event = &tokenizer.events[index]; - - if event.name == Name::ListItem { - if event.kind == Kind::Enter { - let end = skip::opt(&tokenizer.events, index, &[Name::ListItem]) - 1; - let marker = skip::to(&tokenizer.events, index, &[Name::ListItemMarker]); - // Guaranteed to be a valid ASCII byte. 
- let marker = Slice::from_index( - tokenizer.parse_state.bytes, - tokenizer.events[marker].point.index, - ) - .head() - .unwrap(); - let current = (marker, balance, index, end); - - let mut list_index = lists_wip.len(); - let mut matched = false; - - while list_index > 0 { - list_index -= 1; - let previous = &lists_wip[list_index]; - let before = skip::opt( - &tokenizer.events, - previous.3 + 1, - &[ - Name::SpaceOrTab, - Name::LineEnding, - Name::BlankLineEnding, - Name::BlockQuotePrefix, - ], - ); - - if previous.0 == current.0 && previous.1 == current.1 && before == current.2 { - let previous_mut = &mut lists_wip[list_index]; - previous_mut.3 = current.3; - lists.append(&mut lists_wip.split_off(list_index + 1)); - matched = true; - break; - } - } - - if !matched { - let mut index = lists_wip.len(); - let mut exit = None; - - while index > 0 { - index -= 1; - - // If the current (new) item starts after where this - // item on the stack ends, we can remove it from the - // stack. - if current.2 > lists_wip[index].3 { - exit = Some(index); - } else { - break; - } - } - - if let Some(exit) = exit { - lists.append(&mut lists_wip.split_off(exit)); - } - - lists_wip.push(current); - } - - balance += 1; - } else { - balance -= 1; - } - } - - index += 1; - } - - lists.append(&mut lists_wip); - - // Inject events. - let mut index = 0; - while index < lists.len() { - let list_item = &lists[index]; - let mut list_start = tokenizer.events[list_item.2].clone(); - let mut list_end = tokenizer.events[list_item.3].clone(); - let name = match list_item.0 { - b'.' 
| b')' => Name::ListOrdered, - _ => Name::ListUnordered, - }; - list_start.name = name.clone(); - list_end.name = name; - - tokenizer.map.add(list_item.2, 0, vec![list_start]); - tokenizer.map.add(list_item.3 + 1, 0, vec![list_end]); - - index += 1; - } -} diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs new file mode 100644 index 0000000..5161254 --- /dev/null +++ b/src/construct/list_item.rs @@ -0,0 +1,460 @@ +//! List item is a construct that occurs in the [document][] content type. +//! +//! It forms with, roughly, the following BNF: +//! +//! ```bnf +//! ; Restriction: there must be `eol | space_or_tab` after the start. +//! ; Restriction: if the first line after the marker is not blank and starts with `5( space_or_tab )`, +//! ; only the first `space_or_tab` is part of the start. +//! list_item_start ::= '*' | '+' | '-' | 1*9( ascii_decimal ) ( '.' | ')' ) [ 1*4 space_or_tab ] +//! ; Restriction: blank line allowed, except when this is the first continuation after a blank start. +//! ; Restriction: if not blank, the line must be indented, exactly `n` times. +//! list_item_cont ::= [ n( space_or_tab ) ] +//! ``` +//! +//! Further lines that are not prefixed with `list_item_cont` cause the item +//! to be exited, except when those lines are lazy continuation. +//! Like so many things in markdown, list (items) too, are very complex. +//! See [*§ Phase 1: block structure*][commonmark-block] for more on parsing +//! details. +//! +//! Lists relates to the `
<li>`, `<ol>`, and `<ul>
            ` elements in HTML. +//! See [*§ 4.4.8 The `li` element*][html-li], +//! [*§ 4.4.5 The `ol` element*][html-ol], and +//! [*§ 4.4.7 The `ul` element*][html-ul] in the HTML spec for more info. +//! +//! ## Tokens +//! +//! * [`ListItem`][Name::ListItem] +//! * [`ListItemMarker`][Name::ListItemMarker] +//! * [`ListItemPrefix`][Name::ListItemPrefix] +//! * [`ListItemValue`][Name::ListItemValue] +//! * [`ListOrdered`][Name::ListOrdered] +//! * [`ListUnordered`][Name::ListUnordered] +//! +//! ## References +//! +//! * [`list.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/list.js) +//! * [*§ 5.2 List items* in `CommonMark`](https://spec.commonmark.org/0.30/#list-items) +//! * [*§ 5.3 Lists* in `CommonMark`](https://spec.commonmark.org/0.30/#lists) +//! +//! [document]: crate::content::document +//! [html-li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element +//! [html-ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element +//! [html-ul]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ul-element +//! [commonmark-block]: https://spec.commonmark.org/0.30/#phase-1-block-structure + +use crate::constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE}; +use crate::construct::partial_space_or_tab::space_or_tab_min_max; +use crate::event::{Kind, Name}; +use crate::resolve::Name as ResolveName; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{ + skip, + slice::{Position, Slice}, +}; + +/// Start of list item. 
+/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + if tokenizer.parse_state.constructs.list_item { + tokenizer.enter(Name::ListItem); + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt(State::Next(StateName::ListItemBefore), State::Nok); + State::Retry(space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )) + } else { + State::Retry(StateName::ListItemBefore) + } + } else { + State::Nok + } +} + +/// After optional whitespace, at list item prefix. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn before(tokenizer: &mut Tokenizer) -> State { + // Unordered. + if matches!(tokenizer.current, Some(b'*' | b'-')) { + tokenizer.check(State::Nok, State::Next(StateName::ListItemBeforeUnordered)); + State::Retry(StateName::ThematicBreakStart) + } else if tokenizer.current == Some(b'+') { + State::Retry(StateName::ListItemBeforeUnordered) + } + // Ordered. + else if tokenizer.current == Some(b'1') + || (matches!(tokenizer.current, Some(b'0'..=b'9')) && !tokenizer.interrupt) + { + State::Retry(StateName::ListItemBeforeOrdered) + } else { + State::Nok + } +} + +/// At unordered list item marker. +/// +/// The line is not a thematic break. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn before_unordered(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::ListItemPrefix); + State::Retry(StateName::ListItemMarker) +} + +/// At ordered list item value. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn before_ordered(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::ListItemPrefix); + tokenizer.enter(Name::ListItemValue); + State::Retry(StateName::ListItemValue) +} + +/// In ordered list item value. +/// +/// ```markdown +/// > | 1. a +/// ^ +/// ``` +pub fn value(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b'.' 
| b')')) + && (!tokenizer.interrupt || tokenizer.tokenize_state.size < 2) + { + tokenizer.exit(Name::ListItemValue); + State::Retry(StateName::ListItemMarker) + } else if matches!(tokenizer.current, Some(b'0'..=b'9')) + && tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX + { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::ListItemValue) + } else { + tokenizer.tokenize_state.size = 0; + State::Nok + } +} + +/// At list item marker. +/// +/// ```markdown +/// > | * a +/// ^ +/// > | 1. b +/// ^ +/// ``` +pub fn marker(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::ListItemMarker); + tokenizer.consume(); + tokenizer.exit(Name::ListItemMarker); + State::Next(StateName::ListItemMarkerAfter) +} + +/// After list item marker. +/// +/// ```markdown +/// > | * a +/// ^ +/// > | 1. b +/// ^ +/// ``` +pub fn marker_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.size = 1; + tokenizer.check( + State::Next(StateName::ListItemAfter), + State::Next(StateName::ListItemMarkerAfterFilled), + ); + State::Retry(StateName::BlankLineStart) +} + +/// After list item marker. +/// +/// The marker is not followed by a blank line. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn marker_after_filled(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.size = 0; + + // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace. + tokenizer.attempt( + State::Next(StateName::ListItemAfter), + State::Next(StateName::ListItemPrefixOther), + ); + State::Retry(StateName::ListItemWhitespace) +} + +/// After marker, at whitespace. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn whitespace(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt(State::Next(StateName::ListItemWhitespaceAfter), State::Nok); + State::Retry(space_or_tab_min_max(tokenizer, 1, TAB_SIZE)) +} + +/// After acceptable whitespace. 
+/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State { + if let Some(b'\t' | b' ') = tokenizer.current { + State::Nok + } else { + State::Ok + } +} + +/// After marker, followed by no indent or more indent that needed. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn prefix_other(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.enter(Name::SpaceOrTab); + tokenizer.consume(); + tokenizer.exit(Name::SpaceOrTab); + State::Next(StateName::ListItemAfter) + } + _ => State::Nok, + } +} + +/// After list item prefix. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn after(tokenizer: &mut Tokenizer) -> State { + let blank = tokenizer.tokenize_state.size == 1; + tokenizer.tokenize_state.size = 0; + + if blank && tokenizer.interrupt { + State::Nok + } else { + let start = skip::to_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Name::ListItem], + ); + let mut prefix = Slice::from_position( + tokenizer.parse_state.bytes, + &Position { + start: &tokenizer.events[start].point, + end: &tokenizer.point, + }, + ) + .len(); + + if blank { + prefix += 1; + } + + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + + container.blank_initial = blank; + container.size = prefix; + + tokenizer.exit(Name::ListItemPrefix); + tokenizer.register_resolver_before(ResolveName::List); + State::Ok + } +} + +/// Start of list item continuation. +/// +/// ```markdown +/// | * a +/// > | b +/// ^ +/// ``` +pub fn cont_start(tokenizer: &mut Tokenizer) -> State { + tokenizer.check( + State::Next(StateName::ListItemContBlank), + State::Next(StateName::ListItemContFilled), + ); + State::Retry(StateName::BlankLineStart) +} + +/// Start of blank list item continuation. 
+/// +/// ```markdown +/// | * a +/// > | +/// ^ +/// | b +/// ``` +pub fn cont_blank(tokenizer: &mut Tokenizer) -> State { + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + let size = container.size; + + if container.blank_initial { + State::Nok + } else if matches!(tokenizer.current, Some(b'\t' | b' ')) { + // Consume, optionally, at most `size`. + State::Retry(space_or_tab_min_max(tokenizer, 0, size)) + } else { + State::Ok + } +} + +/// Start of non-blank list item continuation. +/// +/// ```markdown +/// | * a +/// > | b +/// ^ +/// ``` +pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + let size = container.size; + + container.blank_initial = false; + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + // Consume exactly `size`. + State::Retry(space_or_tab_min_max(tokenizer, size, size)) + } else { + State::Nok + } +} + +/// Find adjacent list items with the same marker. +pub fn resolve(tokenizer: &mut Tokenizer) { + let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; + let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; + let mut index = 0; + let mut balance = 0; + + // Merge list items. + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.name == Name::ListItem { + if event.kind == Kind::Enter { + let end = skip::opt(&tokenizer.events, index, &[Name::ListItem]) - 1; + let marker = skip::to(&tokenizer.events, index, &[Name::ListItemMarker]); + // Guaranteed to be a valid ASCII byte. 
+ let marker = Slice::from_index( + tokenizer.parse_state.bytes, + tokenizer.events[marker].point.index, + ) + .head() + .unwrap(); + let current = (marker, balance, index, end); + + let mut list_index = lists_wip.len(); + let mut matched = false; + + while list_index > 0 { + list_index -= 1; + let previous = &lists_wip[list_index]; + let before = skip::opt( + &tokenizer.events, + previous.3 + 1, + &[ + Name::SpaceOrTab, + Name::LineEnding, + Name::BlankLineEnding, + Name::BlockQuotePrefix, + ], + ); + + if previous.0 == current.0 && previous.1 == current.1 && before == current.2 { + let previous_mut = &mut lists_wip[list_index]; + previous_mut.3 = current.3; + lists.append(&mut lists_wip.split_off(list_index + 1)); + matched = true; + break; + } + } + + if !matched { + let mut index = lists_wip.len(); + let mut exit = None; + + while index > 0 { + index -= 1; + + // If the current (new) item starts after where this + // item on the stack ends, we can remove it from the + // stack. + if current.2 > lists_wip[index].3 { + exit = Some(index); + } else { + break; + } + } + + if let Some(exit) = exit { + lists.append(&mut lists_wip.split_off(exit)); + } + + lists_wip.push(current); + } + + balance += 1; + } else { + balance -= 1; + } + } + + index += 1; + } + + lists.append(&mut lists_wip); + + // Inject events. + let mut index = 0; + while index < lists.len() { + let list_item = &lists[index]; + let mut list_start = tokenizer.events[list_item.2].clone(); + let mut list_end = tokenizer.events[list_item.3].clone(); + let name = match list_item.0 { + b'.' | b')' => Name::ListOrdered, + _ => Name::ListUnordered, + }; + list_start.name = name.clone(); + list_end.name = name; + + tokenizer.map.add(list_item.2, 0, vec![list_start]); + tokenizer.map.add(list_item.3 + 1, 0, vec![list_end]); + + index += 1; + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 0adf611..566bb30 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -32,7 +32,7 @@ //! 
* [label end][label_end] //! * [label start (image)][label_start_image] //! * [label start (link)][label_start_link] -//! * [list][] +//! * [list item][list_item] //! * [paragraph][] //! * [thematic break][thematic_break] //! @@ -84,7 +84,7 @@ pub mod html_text; pub mod label_end; pub mod label_start_image; pub mod label_start_link; -pub mod list; +pub mod list_item; pub mod paragraph; pub mod partial_bom; pub mod partial_data; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index f493b96..1b581ea 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -20,7 +20,7 @@ //! As using more than three markers has no effect other than wasting space, //! it is recommended to use exactly three markers. //! Thematic breaks formed with asterisks or dashes can interfere with -//! [list][]s if there is whitespace between them: `* * *` and `- - -`. +//! [list][list-item]s if there is whitespace between them: `* * *` and `- - -`. //! For these reasons, it is recommend to not use spaces or tabs between the //! markers. //! Thematic breaks formed with dashes (without whitespace) can also form @@ -45,7 +45,7 @@ //! //! [flow]: crate::content::flow //! [heading_setext]: crate::construct::heading_setext -//! [list]: crate::construct::list +//! [list-item]: crate::construct::list_item //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; diff --git a/src/content/document.rs b/src/content/document.rs index 41d60e2..f90aea7 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -6,7 +6,7 @@ //! The constructs found in flow are: //! //! * [Block quote][crate::construct::block_quote] -//! * [List][crate::construct::list] +//! 
* [List][crate::construct::list_item] use crate::event::{Content, Event, Kind, Link, Name, Point}; use crate::parser::ParseState; @@ -103,7 +103,7 @@ pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State { let name = match container.kind { Container::BlockQuote => StateName::BlockQuoteContStart, - Container::ListItem => StateName::ListContStart, + Container::ListItem => StateName::ListItemContStart, }; tokenizer.attempt( @@ -201,7 +201,7 @@ pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State State::Next(StateName::DocumentContainerNewAfter), State::Next(StateName::DocumentContainerNewBeforeNotList), ); - State::Retry(StateName::ListStart) + State::Retry(StateName::ListItemStart) } /// At new container, but not a list (or block quote). diff --git a/src/event.rs b/src/event.rs index be32b5b..8cdb959 100644 --- a/src/event.rs +++ b/src/event.rs @@ -1332,7 +1332,7 @@ pub enum Name { /// [`ListItemPrefix`][Name::ListItemPrefix], /// [flow content][crate::content::flow] /// * **Construct**: - /// [`list`][crate::construct::list] + /// [`list item`][crate::construct::list_item] /// /// ## Example /// @@ -1352,7 +1352,7 @@ pub enum Name { /// * **Content model**: /// void /// * **Construct**: - /// [`list`][crate::construct::list] + /// [`list item`][crate::construct::list_item] /// /// ## Example /// @@ -1374,7 +1374,7 @@ pub enum Name { /// [`ListItemValue`][Name::ListItemValue], /// [`SpaceOrTab`][Name::SpaceOrTab] /// * **Construct**: - /// [`list`][crate::construct::list] + /// [`list item`][crate::construct::list_item] /// /// ## Example /// @@ -1394,7 +1394,7 @@ pub enum Name { /// * **Content model**: /// void /// * **Construct**: - /// [`list`][crate::construct::list] + /// [`list item`][crate::construct::list_item] /// /// ## Example /// @@ -1416,7 +1416,7 @@ pub enum Name { /// [`LineEnding`][Name::LineEnding], /// [`SpaceOrTab`][Name::SpaceOrTab] /// * **Construct**: - /// [`list`][crate::construct::list] + /// [`list 
item`][crate::construct::list_item] /// /// ## Example /// @@ -1440,7 +1440,7 @@ pub enum Name { /// [`LineEnding`][Name::LineEnding], /// [`SpaceOrTab`][Name::SpaceOrTab] /// * **Construct**: - /// [`list`][crate::construct::list] + /// [`list item`][crate::construct::list_item] /// /// ## Example /// diff --git a/src/lib.rs b/src/lib.rs index 24a794b..f9f5326 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -218,13 +218,13 @@ pub struct Constructs { /// ^^^^ /// ``` pub label_end: bool, - /// List. + /// List items. /// /// ```markdown /// > | * a /// ^^^ /// ``` - pub list: bool, + pub list_item: bool, /// Thematic break. /// /// ```markdown @@ -256,7 +256,7 @@ impl Default for Constructs { label_start_image: true, label_start_link: true, label_end: true, - list: true, + list_item: true, thematic_break: true, } } diff --git a/src/resolve.rs b/src/resolve.rs index e7d63f9..edc92b2 100644 --- a/src/resolve.rs +++ b/src/resolve.rs @@ -23,7 +23,7 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) { Name::Attention => construct::attention::resolve, Name::HeadingAtx => construct::heading_atx::resolve, Name::HeadingSetext => construct::heading_setext::resolve, - Name::List => construct::list::resolve, + Name::List => construct::list_item::resolve, Name::Paragraph => construct::paragraph::resolve, Name::Data => construct::partial_data::resolve, Name::String => content::string::resolve, diff --git a/src/state.rs b/src/state.rs index 006ffe1..344a31e 100644 --- a/src/state.rs +++ b/src/state.rs @@ -240,21 +240,21 @@ pub enum Name { LabelStartLinkStart, - ListStart, - ListBefore, - ListBeforeOrdered, - ListBeforeUnordered, - ListValue, - ListMarker, - ListMarkerAfter, - ListAfter, - ListMarkerAfterFilled, - ListWhitespace, - ListPrefixOther, - ListWhitespaceAfter, - ListContStart, - ListContBlank, - ListContFilled, + ListItemStart, + ListItemBefore, + ListItemBeforeOrdered, + ListItemBeforeUnordered, + ListItemValue, + ListItemMarker, + ListItemMarkerAfter, + ListItemAfter, 
+ ListItemMarkerAfterFilled, + ListItemWhitespace, + ListItemPrefixOther, + ListItemWhitespaceAfter, + ListItemContStart, + ListItemContBlank, + ListItemContFilled, NonLazyContinuationStart, NonLazyContinuationAfter, @@ -552,21 +552,21 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::LabelStartImageOpen => construct::label_start_image::open, Name::LabelStartLinkStart => construct::label_start_link::start, - Name::ListStart => construct::list::start, - Name::ListBefore => construct::list::before, - Name::ListBeforeOrdered => construct::list::before_ordered, - Name::ListBeforeUnordered => construct::list::before_unordered, - Name::ListValue => construct::list::value, - Name::ListMarker => construct::list::marker, - Name::ListMarkerAfter => construct::list::marker_after, - Name::ListAfter => construct::list::after, - Name::ListMarkerAfterFilled => construct::list::marker_after_filled, - Name::ListWhitespace => construct::list::whitespace, - Name::ListWhitespaceAfter => construct::list::whitespace_after, - Name::ListPrefixOther => construct::list::prefix_other, - Name::ListContStart => construct::list::cont_start, - Name::ListContBlank => construct::list::cont_blank, - Name::ListContFilled => construct::list::cont_filled, + Name::ListItemStart => construct::list_item::start, + Name::ListItemBefore => construct::list_item::before, + Name::ListItemBeforeOrdered => construct::list_item::before_ordered, + Name::ListItemBeforeUnordered => construct::list_item::before_unordered, + Name::ListItemValue => construct::list_item::value, + Name::ListItemMarker => construct::list_item::marker, + Name::ListItemMarkerAfter => construct::list_item::marker_after, + Name::ListItemAfter => construct::list_item::after, + Name::ListItemMarkerAfterFilled => construct::list_item::marker_after_filled, + Name::ListItemWhitespace => construct::list_item::whitespace, + Name::ListItemWhitespaceAfter => construct::list_item::whitespace_after, + Name::ListItemPrefixOther => 
construct::list_item::prefix_other, + Name::ListItemContStart => construct::list_item::cont_start, + Name::ListItemContBlank => construct::list_item::cont_blank, + Name::ListItemContFilled => construct::list_item::cont_filled, Name::NonLazyContinuationStart => construct::partial_non_lazy_continuation::start, Name::NonLazyContinuationAfter => construct::partial_non_lazy_continuation::after, -- cgit