aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct/list_item.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/construct/list_item.rs')
-rw-r--r--src/construct/list_item.rs460
1 files changed, 460 insertions, 0 deletions
diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs
new file mode 100644
index 0000000..5161254
--- /dev/null
+++ b/src/construct/list_item.rs
@@ -0,0 +1,460 @@
+//! List item is a construct that occurs in the [document][] content type.
+//!
+//! It forms with, roughly, the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: there must be `eol | space_or_tab` after the start.
+//! ; Restriction: if the first line after the marker is not blank and starts with `5( space_or_tab )`,
+//! ; only the first `space_or_tab` is part of the start.
+//! list_item_start ::= '*' | '+' | '-' | 1*9( ascii_decimal ) ( '.' | ')' ) [ 1*4 space_or_tab ]
+//! ; Restriction: blank line allowed, except when this is the first continuation after a blank start.
+//! ; Restriction: if not blank, the line must be indented, exactly `n` times.
+//! list_item_cont ::= [ n( space_or_tab ) ]
+//! ```
+//!
+//! Further lines that are not prefixed with `list_item_cont` cause the item
+//! to be exited, except when those lines are lazy continuation.
+//! Like so many things in markdown, list (items) too, are very complex.
+//! See [*§ Phase 1: block structure*][commonmark-block] for more on parsing
+//! details.
+//!
+//! Lists relates to the `<li>`, `<ol>`, and `<ul>` elements in HTML.
+//! See [*§ 4.4.8 The `li` element*][html-li],
+//! [*§ 4.4.5 The `ol` element*][html-ol], and
+//! [*§ 4.4.7 The `ul` element*][html-ul] in the HTML spec for more info.
+//!
+//! ## Tokens
+//!
+//! * [`ListItem`][Name::ListItem]
+//! * [`ListItemMarker`][Name::ListItemMarker]
+//! * [`ListItemPrefix`][Name::ListItemPrefix]
+//! * [`ListItemValue`][Name::ListItemValue]
+//! * [`ListOrdered`][Name::ListOrdered]
+//! * [`ListUnordered`][Name::ListUnordered]
+//!
+//! ## References
+//!
+//! * [`list.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/list.js)
+//! * [*§ 5.2 List items* in `CommonMark`](https://spec.commonmark.org/0.30/#list-items)
+//! * [*§ 5.3 Lists* in `CommonMark`](https://spec.commonmark.org/0.30/#lists)
+//!
+//! [document]: crate::content::document
+//! [html-li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element
+//! [html-ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element
+//! [html-ul]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ul-element
+//! [commonmark-block]: https://spec.commonmark.org/0.30/#phase-1-block-structure
+
+use crate::constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE};
+use crate::construct::partial_space_or_tab::space_or_tab_min_max;
+use crate::event::{Kind, Name};
+use crate::resolve::Name as ResolveName;
+use crate::state::{Name as StateName, State};
+use crate::tokenizer::Tokenizer;
+use crate::util::{
+ skip,
+ slice::{Position, Slice},
+};
+
+/// Start of list item.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn start(tokenizer: &mut Tokenizer) -> State {
+ if tokenizer.parse_state.constructs.list_item {
+ tokenizer.enter(Name::ListItem);
+
+ if matches!(tokenizer.current, Some(b'\t' | b' ')) {
+ tokenizer.attempt(State::Next(StateName::ListItemBefore), State::Nok);
+ State::Retry(space_or_tab_min_max(
+ tokenizer,
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ))
+ } else {
+ State::Retry(StateName::ListItemBefore)
+ }
+ } else {
+ State::Nok
+ }
+}
+
+/// After optional whitespace, at list item prefix.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn before(tokenizer: &mut Tokenizer) -> State {
+ // Unordered.
+ if matches!(tokenizer.current, Some(b'*' | b'-')) {
+ tokenizer.check(State::Nok, State::Next(StateName::ListItemBeforeUnordered));
+ State::Retry(StateName::ThematicBreakStart)
+ } else if tokenizer.current == Some(b'+') {
+ State::Retry(StateName::ListItemBeforeUnordered)
+ }
+ // Ordered.
+ else if tokenizer.current == Some(b'1')
+ || (matches!(tokenizer.current, Some(b'0'..=b'9')) && !tokenizer.interrupt)
+ {
+ State::Retry(StateName::ListItemBeforeOrdered)
+ } else {
+ State::Nok
+ }
+}
+
+/// At unordered list item marker.
+///
+/// The line is not a thematic break.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn before_unordered(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.enter(Name::ListItemPrefix);
+ State::Retry(StateName::ListItemMarker)
+}
+
+/// At ordered list item value.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn before_ordered(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.enter(Name::ListItemPrefix);
+ tokenizer.enter(Name::ListItemValue);
+ State::Retry(StateName::ListItemValue)
+}
+
+/// In ordered list item value.
+///
+/// ```markdown
+/// > | 1. a
+/// ^
+/// ```
+pub fn value(tokenizer: &mut Tokenizer) -> State {
+ if matches!(tokenizer.current, Some(b'.' | b')'))
+ && (!tokenizer.interrupt || tokenizer.tokenize_state.size < 2)
+ {
+ tokenizer.exit(Name::ListItemValue);
+ State::Retry(StateName::ListItemMarker)
+ } else if matches!(tokenizer.current, Some(b'0'..=b'9'))
+ && tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX
+ {
+ tokenizer.tokenize_state.size += 1;
+ tokenizer.consume();
+ State::Next(StateName::ListItemValue)
+ } else {
+ tokenizer.tokenize_state.size = 0;
+ State::Nok
+ }
+}
+
+/// At list item marker.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// > | 1. b
+/// ^
+/// ```
+pub fn marker(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.enter(Name::ListItemMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::ListItemMarker);
+ State::Next(StateName::ListItemMarkerAfter)
+}
+
+/// After list item marker.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// > | 1. b
+/// ^
+/// ```
+pub fn marker_after(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.tokenize_state.size = 1;
+ tokenizer.check(
+ State::Next(StateName::ListItemAfter),
+ State::Next(StateName::ListItemMarkerAfterFilled),
+ );
+ State::Retry(StateName::BlankLineStart)
+}
+
+/// After list item marker.
+///
+/// The marker is not followed by a blank line.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn marker_after_filled(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.tokenize_state.size = 0;
+
+ // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace.
+ tokenizer.attempt(
+ State::Next(StateName::ListItemAfter),
+ State::Next(StateName::ListItemPrefixOther),
+ );
+ State::Retry(StateName::ListItemWhitespace)
+}
+
+/// After marker, at whitespace.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn whitespace(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.attempt(State::Next(StateName::ListItemWhitespaceAfter), State::Nok);
+ State::Retry(space_or_tab_min_max(tokenizer, 1, TAB_SIZE))
+}
+
+/// After acceptable whitespace.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
+ if let Some(b'\t' | b' ') = tokenizer.current {
+ State::Nok
+ } else {
+ State::Ok
+ }
+}
+
+/// After marker, followed by no indent or more indent that needed.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn prefix_other(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ Some(b'\t' | b' ') => {
+ tokenizer.enter(Name::SpaceOrTab);
+ tokenizer.consume();
+ tokenizer.exit(Name::SpaceOrTab);
+ State::Next(StateName::ListItemAfter)
+ }
+ _ => State::Nok,
+ }
+}
+
+/// After list item prefix.
+///
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
+pub fn after(tokenizer: &mut Tokenizer) -> State {
+ let blank = tokenizer.tokenize_state.size == 1;
+ tokenizer.tokenize_state.size = 0;
+
+ if blank && tokenizer.interrupt {
+ State::Nok
+ } else {
+ let start = skip::to_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Name::ListItem],
+ );
+ let mut prefix = Slice::from_position(
+ tokenizer.parse_state.bytes,
+ &Position {
+ start: &tokenizer.events[start].point,
+ end: &tokenizer.point,
+ },
+ )
+ .len();
+
+ if blank {
+ prefix += 1;
+ }
+
+ let container = &mut tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued];
+
+ container.blank_initial = blank;
+ container.size = prefix;
+
+ tokenizer.exit(Name::ListItemPrefix);
+ tokenizer.register_resolver_before(ResolveName::List);
+ State::Ok
+ }
+}
+
+/// Start of list item continuation.
+///
+/// ```markdown
+/// | * a
+/// > | b
+/// ^
+/// ```
+pub fn cont_start(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.check(
+ State::Next(StateName::ListItemContBlank),
+ State::Next(StateName::ListItemContFilled),
+ );
+ State::Retry(StateName::BlankLineStart)
+}
+
+/// Start of blank list item continuation.
+///
+/// ```markdown
+/// | * a
+/// > |
+/// ^
+/// | b
+/// ```
+pub fn cont_blank(tokenizer: &mut Tokenizer) -> State {
+ let container = &mut tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued];
+ let size = container.size;
+
+ if container.blank_initial {
+ State::Nok
+ } else if matches!(tokenizer.current, Some(b'\t' | b' ')) {
+ // Consume, optionally, at most `size`.
+ State::Retry(space_or_tab_min_max(tokenizer, 0, size))
+ } else {
+ State::Ok
+ }
+}
+
+/// Start of non-blank list item continuation.
+///
+/// ```markdown
+/// | * a
+/// > | b
+/// ^
+/// ```
+pub fn cont_filled(tokenizer: &mut Tokenizer) -> State {
+ let container = &mut tokenizer.tokenize_state.document_container_stack
+ [tokenizer.tokenize_state.document_continued];
+ let size = container.size;
+
+ container.blank_initial = false;
+
+ if matches!(tokenizer.current, Some(b'\t' | b' ')) {
+ // Consume exactly `size`.
+ State::Retry(space_or_tab_min_max(tokenizer, size, size))
+ } else {
+ State::Nok
+ }
+}
+
+/// Find adjacent list items with the same marker.
+pub fn resolve(tokenizer: &mut Tokenizer) {
+ let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![];
+ let mut lists: Vec<(u8, usize, usize, usize)> = vec![];
+ let mut index = 0;
+ let mut balance = 0;
+
+ // Merge list items.
+ while index < tokenizer.events.len() {
+ let event = &tokenizer.events[index];
+
+ if event.name == Name::ListItem {
+ if event.kind == Kind::Enter {
+ let end = skip::opt(&tokenizer.events, index, &[Name::ListItem]) - 1;
+ let marker = skip::to(&tokenizer.events, index, &[Name::ListItemMarker]);
+ // Guaranteed to be a valid ASCII byte.
+ let marker = Slice::from_index(
+ tokenizer.parse_state.bytes,
+ tokenizer.events[marker].point.index,
+ )
+ .head()
+ .unwrap();
+ let current = (marker, balance, index, end);
+
+ let mut list_index = lists_wip.len();
+ let mut matched = false;
+
+ while list_index > 0 {
+ list_index -= 1;
+ let previous = &lists_wip[list_index];
+ let before = skip::opt(
+ &tokenizer.events,
+ previous.3 + 1,
+ &[
+ Name::SpaceOrTab,
+ Name::LineEnding,
+ Name::BlankLineEnding,
+ Name::BlockQuotePrefix,
+ ],
+ );
+
+ if previous.0 == current.0 && previous.1 == current.1 && before == current.2 {
+ let previous_mut = &mut lists_wip[list_index];
+ previous_mut.3 = current.3;
+ lists.append(&mut lists_wip.split_off(list_index + 1));
+ matched = true;
+ break;
+ }
+ }
+
+ if !matched {
+ let mut index = lists_wip.len();
+ let mut exit = None;
+
+ while index > 0 {
+ index -= 1;
+
+ // If the current (new) item starts after where this
+ // item on the stack ends, we can remove it from the
+ // stack.
+ if current.2 > lists_wip[index].3 {
+ exit = Some(index);
+ } else {
+ break;
+ }
+ }
+
+ if let Some(exit) = exit {
+ lists.append(&mut lists_wip.split_off(exit));
+ }
+
+ lists_wip.push(current);
+ }
+
+ balance += 1;
+ } else {
+ balance -= 1;
+ }
+ }
+
+ index += 1;
+ }
+
+ lists.append(&mut lists_wip);
+
+ // Inject events.
+ let mut index = 0;
+ while index < lists.len() {
+ let list_item = &lists[index];
+ let mut list_start = tokenizer.events[list_item.2].clone();
+ let mut list_end = tokenizer.events[list_item.3].clone();
+ let name = match list_item.0 {
+ b'.' | b')' => Name::ListOrdered,
+ _ => Name::ListUnordered,
+ };
+ list_start.name = name.clone();
+ list_end.name = name;
+
+ tokenizer.map.add(list_item.2, 0, vec![list_start]);
+ tokenizer.map.add(list_item.3 + 1, 0, vec![list_end]);
+
+ index += 1;
+ }
+}