From 6e20c3e79d4270fafb13a63af51eaffaa45c11e1 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 15 Aug 2022 11:24:06 +0200 Subject: Refactor to rename list construct to list item --- src/construct/list_item.rs | 460 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 src/construct/list_item.rs (limited to 'src/construct/list_item.rs') diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs new file mode 100644 index 0000000..5161254 --- /dev/null +++ b/src/construct/list_item.rs @@ -0,0 +1,460 @@ +//! List item is a construct that occurs in the [document][] content type. +//! +//! It forms with, roughly, the following BNF: +//! +//! ```bnf +//! ; Restriction: there must be `eol | space_or_tab` after the start. +//! ; Restriction: if the first line after the marker is not blank and starts with `5( space_or_tab )`, +//! ; only the first `space_or_tab` is part of the start. +//! list_item_start ::= '*' | '+' | '-' | 1*9( ascii_decimal ) ( '.' | ')' ) [ 1*4 space_or_tab ] +//! ; Restriction: blank line allowed, except when this is the first continuation after a blank start. +//! ; Restriction: if not blank, the line must be indented, exactly `n` times. +//! list_item_cont ::= [ n( space_or_tab ) ] +//! ``` +//! +//! Further lines that are not prefixed with `list_item_cont` cause the item +//! to be exited, except when those lines are lazy continuation. +//! Like so many things in markdown, list (items) too, are very complex. +//! See [*§ Phase 1: block structure*][commonmark-block] for more on parsing +//! details. +//! +//! Lists relates to the `
<li>`, `<ol>`, and `<ul>
        ` elements in HTML. +//! See [*§ 4.4.8 The `li` element*][html-li], +//! [*§ 4.4.5 The `ol` element*][html-ol], and +//! [*§ 4.4.7 The `ul` element*][html-ul] in the HTML spec for more info. +//! +//! ## Tokens +//! +//! * [`ListItem`][Name::ListItem] +//! * [`ListItemMarker`][Name::ListItemMarker] +//! * [`ListItemPrefix`][Name::ListItemPrefix] +//! * [`ListItemValue`][Name::ListItemValue] +//! * [`ListOrdered`][Name::ListOrdered] +//! * [`ListUnordered`][Name::ListUnordered] +//! +//! ## References +//! +//! * [`list.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/list.js) +//! * [*§ 5.2 List items* in `CommonMark`](https://spec.commonmark.org/0.30/#list-items) +//! * [*§ 5.3 Lists* in `CommonMark`](https://spec.commonmark.org/0.30/#lists) +//! +//! [document]: crate::content::document +//! [html-li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element +//! [html-ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element +//! [html-ul]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ul-element +//! [commonmark-block]: https://spec.commonmark.org/0.30/#phase-1-block-structure + +use crate::constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE}; +use crate::construct::partial_space_or_tab::space_or_tab_min_max; +use crate::event::{Kind, Name}; +use crate::resolve::Name as ResolveName; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{ + skip, + slice::{Position, Slice}, +}; + +/// Start of list item. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + if tokenizer.parse_state.constructs.list_item { + tokenizer.enter(Name::ListItem); + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + tokenizer.attempt(State::Next(StateName::ListItemBefore), State::Nok); + State::Retry(space_or_tab_min_max( + tokenizer, + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + )) + } else { + State::Retry(StateName::ListItemBefore) + } + } else { + State::Nok + } +} + +/// After optional whitespace, at list item prefix. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn before(tokenizer: &mut Tokenizer) -> State { + // Unordered. + if matches!(tokenizer.current, Some(b'*' | b'-')) { + tokenizer.check(State::Nok, State::Next(StateName::ListItemBeforeUnordered)); + State::Retry(StateName::ThematicBreakStart) + } else if tokenizer.current == Some(b'+') { + State::Retry(StateName::ListItemBeforeUnordered) + } + // Ordered. + else if tokenizer.current == Some(b'1') + || (matches!(tokenizer.current, Some(b'0'..=b'9')) && !tokenizer.interrupt) + { + State::Retry(StateName::ListItemBeforeOrdered) + } else { + State::Nok + } +} + +/// At unordered list item marker. +/// +/// The line is not a thematic break. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn before_unordered(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::ListItemPrefix); + State::Retry(StateName::ListItemMarker) +} + +/// At ordered list item value. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn before_ordered(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::ListItemPrefix); + tokenizer.enter(Name::ListItemValue); + State::Retry(StateName::ListItemValue) +} + +/// In ordered list item value. +/// +/// ```markdown +/// > | 1. 
a +/// ^ +/// ``` +pub fn value(tokenizer: &mut Tokenizer) -> State { + if matches!(tokenizer.current, Some(b'.' | b')')) + && (!tokenizer.interrupt || tokenizer.tokenize_state.size < 2) + { + tokenizer.exit(Name::ListItemValue); + State::Retry(StateName::ListItemMarker) + } else if matches!(tokenizer.current, Some(b'0'..=b'9')) + && tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX + { + tokenizer.tokenize_state.size += 1; + tokenizer.consume(); + State::Next(StateName::ListItemValue) + } else { + tokenizer.tokenize_state.size = 0; + State::Nok + } +} + +/// At list item marker. +/// +/// ```markdown +/// > | * a +/// ^ +/// > | 1. b +/// ^ +/// ``` +pub fn marker(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Name::ListItemMarker); + tokenizer.consume(); + tokenizer.exit(Name::ListItemMarker); + State::Next(StateName::ListItemMarkerAfter) +} + +/// After list item marker. +/// +/// ```markdown +/// > | * a +/// ^ +/// > | 1. b +/// ^ +/// ``` +pub fn marker_after(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.size = 1; + tokenizer.check( + State::Next(StateName::ListItemAfter), + State::Next(StateName::ListItemMarkerAfterFilled), + ); + State::Retry(StateName::BlankLineStart) +} + +/// After list item marker. +/// +/// The marker is not followed by a blank line. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn marker_after_filled(tokenizer: &mut Tokenizer) -> State { + tokenizer.tokenize_state.size = 0; + + // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace. + tokenizer.attempt( + State::Next(StateName::ListItemAfter), + State::Next(StateName::ListItemPrefixOther), + ); + State::Retry(StateName::ListItemWhitespace) +} + +/// After marker, at whitespace. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn whitespace(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt(State::Next(StateName::ListItemWhitespaceAfter), State::Nok); + State::Retry(space_or_tab_min_max(tokenizer, 1, TAB_SIZE)) +} + +/// After acceptable whitespace. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State { + if let Some(b'\t' | b' ') = tokenizer.current { + State::Nok + } else { + State::Ok + } +} + +/// After marker, followed by no indent or more indent that needed. +/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn prefix_other(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.enter(Name::SpaceOrTab); + tokenizer.consume(); + tokenizer.exit(Name::SpaceOrTab); + State::Next(StateName::ListItemAfter) + } + _ => State::Nok, + } +} + +/// After list item prefix. 
+/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +pub fn after(tokenizer: &mut Tokenizer) -> State { + let blank = tokenizer.tokenize_state.size == 1; + tokenizer.tokenize_state.size = 0; + + if blank && tokenizer.interrupt { + State::Nok + } else { + let start = skip::to_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Name::ListItem], + ); + let mut prefix = Slice::from_position( + tokenizer.parse_state.bytes, + &Position { + start: &tokenizer.events[start].point, + end: &tokenizer.point, + }, + ) + .len(); + + if blank { + prefix += 1; + } + + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + + container.blank_initial = blank; + container.size = prefix; + + tokenizer.exit(Name::ListItemPrefix); + tokenizer.register_resolver_before(ResolveName::List); + State::Ok + } +} + +/// Start of list item continuation. +/// +/// ```markdown +/// | * a +/// > | b +/// ^ +/// ``` +pub fn cont_start(tokenizer: &mut Tokenizer) -> State { + tokenizer.check( + State::Next(StateName::ListItemContBlank), + State::Next(StateName::ListItemContFilled), + ); + State::Retry(StateName::BlankLineStart) +} + +/// Start of blank list item continuation. +/// +/// ```markdown +/// | * a +/// > | +/// ^ +/// | b +/// ``` +pub fn cont_blank(tokenizer: &mut Tokenizer) -> State { + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + let size = container.size; + + if container.blank_initial { + State::Nok + } else if matches!(tokenizer.current, Some(b'\t' | b' ')) { + // Consume, optionally, at most `size`. + State::Retry(space_or_tab_min_max(tokenizer, 0, size)) + } else { + State::Ok + } +} + +/// Start of non-blank list item continuation. +/// +/// ```markdown +/// | * a +/// > | b +/// ^ +/// ``` +pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { + let container = &mut tokenizer.tokenize_state.document_container_stack + [tokenizer.tokenize_state.document_continued]; + let size = container.size; + + container.blank_initial = false; + + if matches!(tokenizer.current, Some(b'\t' | b' ')) { + // Consume exactly `size`. + State::Retry(space_or_tab_min_max(tokenizer, size, size)) + } else { + State::Nok + } +} + +/// Find adjacent list items with the same marker. +pub fn resolve(tokenizer: &mut Tokenizer) { + let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; + let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; + let mut index = 0; + let mut balance = 0; + + // Merge list items. + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.name == Name::ListItem { + if event.kind == Kind::Enter { + let end = skip::opt(&tokenizer.events, index, &[Name::ListItem]) - 1; + let marker = skip::to(&tokenizer.events, index, &[Name::ListItemMarker]); + // Guaranteed to be a valid ASCII byte. 
+ let marker = Slice::from_index( + tokenizer.parse_state.bytes, + tokenizer.events[marker].point.index, + ) + .head() + .unwrap(); + let current = (marker, balance, index, end); + + let mut list_index = lists_wip.len(); + let mut matched = false; + + while list_index > 0 { + list_index -= 1; + let previous = &lists_wip[list_index]; + let before = skip::opt( + &tokenizer.events, + previous.3 + 1, + &[ + Name::SpaceOrTab, + Name::LineEnding, + Name::BlankLineEnding, + Name::BlockQuotePrefix, + ], + ); + + if previous.0 == current.0 && previous.1 == current.1 && before == current.2 { + let previous_mut = &mut lists_wip[list_index]; + previous_mut.3 = current.3; + lists.append(&mut lists_wip.split_off(list_index + 1)); + matched = true; + break; + } + } + + if !matched { + let mut index = lists_wip.len(); + let mut exit = None; + + while index > 0 { + index -= 1; + + // If the current (new) item starts after where this + // item on the stack ends, we can remove it from the + // stack. + if current.2 > lists_wip[index].3 { + exit = Some(index); + } else { + break; + } + } + + if let Some(exit) = exit { + lists.append(&mut lists_wip.split_off(exit)); + } + + lists_wip.push(current); + } + + balance += 1; + } else { + balance -= 1; + } + } + + index += 1; + } + + lists.append(&mut lists_wip); + + // Inject events. + let mut index = 0; + while index < lists.len() { + let list_item = &lists[index]; + let mut list_start = tokenizer.events[list_item.2].clone(); + let mut list_end = tokenizer.events[list_item.3].clone(); + let name = match list_item.0 { + b'.' | b')' => Name::ListOrdered, + _ => Name::ListUnordered, + }; + list_start.name = name.clone(); + list_end.name = name; + + tokenizer.map.add(list_item.2, 0, vec![list_start]); + tokenizer.map.add(list_item.3 + 1, 0, vec![list_end]); + + index += 1; + } +} -- cgit
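
The `list_item_start` and `list_item_cont` rules in the BNF above are easiest to see end to end. Below is a minimal usage sketch, assuming the crate's top-level `micromark` function (its public API is not part of this patch); it shows a continuation line indented by the prefix width so it stays inside the same item.

```rust
// Sketch only: assumes the crate exposes `micromark::micromark(&str) -> String`,
// which is not shown in this patch.
use micromark::micromark;

fn main() {
    // The prefix `1. ` is three bytes wide, so the continuation line below is
    // indented by three spaces and continues the same item (`list_item_cont`).
    let html = micromark("1. a\n   b\n");
    // Expected, per CommonMark: "<ol>\n<li>a\nb</li>\n</ol>"
    println!("{html}");
}
```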
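
The `blank_initial` restriction handled in `cont_blank` (a blank line may not be the first continuation of an item that itself starts blank) can be seen with the same assumed `micromark` entry point.

```rust
// Sketch only: same assumption about `micromark::micromark` as above.
use micromark::micromark;

fn main() {
    // The item starts with a blank line, so the next blank line ends it and
    // `foo` becomes a paragraph outside the list.
    let html = micromark("-\n\n  foo\n");
    // Expected, per CommonMark: "<ul>\n<li></li>\n</ul>\n<p>foo</p>"
    println!("{html}");
}
```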
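
`resolve` merges adjacent `ListItem` events into one `ListOrdered`/`ListUnordered` run only while the marker byte (and container balance) stays the same. The standalone sketch below illustrates just that grouping rule with made-up helper types, not the tokenizer's event machinery.

```rust
/// Group a run of list item markers into (marker, item count) pairs,
/// mirroring the "same marker, same list" rule used by `resolve`.
fn group_by_marker(markers: &[u8]) -> Vec<(u8, usize)> {
    let mut lists: Vec<(u8, usize)> = Vec::new();
    for &marker in markers {
        match lists.last_mut() {
            // Same marker as the previous item: extend the current list.
            Some((m, count)) if *m == marker => *count += 1,
            // Different marker (or no list yet): start a new list.
            _ => lists.push((marker, 1)),
        }
    }
    lists
}

fn main() {
    // Two `*` items form one unordered list; the `-` item starts another.
    assert_eq!(group_by_marker(b"**-"), vec![(b'*', 2), (b'-', 1)]);
}
```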