diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-28 14:18:17 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-28 14:18:17 +0200 |
commit | dfd11b1bc155ae1fba9975a90c2dc83dc07697b4 (patch) | |
tree | 0dd150365a6ae1df4c4845518efafe02ab61cb77 /src/construct | |
parent | a3dd207e3b1ebcbcb6cec0f703a695e51ae4ece0 (diff) | |
download | markdown-rs-dfd11b1bc155ae1fba9975a90c2dc83dc07697b4.tar.gz markdown-rs-dfd11b1bc155ae1fba9975a90c2dc83dc07697b4.tar.bz2 markdown-rs-dfd11b1bc155ae1fba9975a90c2dc83dc07697b4.zip |
Fix jumps in `edit_map`
* Use resolve more often (e.g., heading (atx, setext))
* Fix linking of whole phrasing (e.g., one big chunk of text in headings (atx,
setext), titles, and labels)
* Replace `ChunkText` and `ChunkString` with
`event.content_type: Option<ContentType>`
* Refactor to externalize `edit_map` from `label`
Diffstat (limited to 'src/construct')
-rw-r--r-- | src/construct/code_fenced.rs | 12 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 107 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 32 | ||||
-rw-r--r-- | src/construct/label_end.rs | 159 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 10 | ||||
-rw-r--r-- | src/construct/partial_destination.rs | 12 | ||||
-rw-r--r-- | src/construct/partial_label.rs | 54 | ||||
-rw-r--r-- | src/construct/partial_space_or_tab.rs | 161 | ||||
-rw-r--r-- | src/construct/partial_title.rs | 67 |
9 files changed, 316 insertions, 298 deletions
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 5b1426c..1602aad 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -103,7 +103,7 @@ use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; use crate::util::span::from_exit_event; /// Kind of fences. @@ -259,7 +259,7 @@ fn info_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResu } _ => { tokenizer.enter(TokenType::CodeFencedFenceInfo); - tokenizer.enter(TokenType::ChunkString); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::String)); info_inside(tokenizer, code, info, vec![]) } } @@ -280,13 +280,13 @@ fn info_inside( ) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); tokenizer.exit(TokenType::CodeFencedFenceInfo); tokenizer.exit(TokenType::CodeFencedFence); at_break(tokenizer, code, info) } Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); tokenizer.exit(TokenType::CodeFencedFenceInfo); tokenizer.attempt_opt(space_or_tab(), |t, c| meta_before(t, c, info))(tokenizer, code) } @@ -317,7 +317,7 @@ fn meta_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResu } _ => { tokenizer.enter(TokenType::CodeFencedFenceMeta); - tokenizer.enter(TokenType::ChunkString); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::String)); meta(tokenizer, code, info) } } @@ -333,7 +333,7 @@ fn meta_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResu fn meta(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { 
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); tokenizer.exit(TokenType::CodeFencedFenceMeta); tokenizer.exit(TokenType::CodeFencedFence); at_break(tokenizer, code, info) diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 1e5fe3d..2811894 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -40,7 +40,7 @@ //! * [`HeadingAtx`][TokenType::HeadingAtx] //! * [`HeadingAtxSequence`][TokenType::HeadingAtxSequence] //! * [`HeadingAtxText`][TokenType::HeadingAtxText] -//! * [`HeadingAtxSpaceOrTab`][TokenType::HeadingAtxSpaceOrTab] +//! * [`SpaceOrTab`][TokenType::SpaceOrTab] //! //! ## References //! @@ -54,11 +54,12 @@ //! [wiki-setext]: https://en.wikipedia.org/wiki/Setext //! [atx]: http://www.aaronsw.com/2002/atx/ -use super::partial_space_or_tab::{ - space_or_tab, space_or_tab_with_options, Options as SpaceOrTabOptions, -}; +use super::partial_space_or_tab::space_or_tab; use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{ + Code, ContentType, Event, EventType, State, StateFnResult, TokenType, Tokenizer, +}; +use crate::util::edit_map::EditMap; /// Start of a heading (atx). 
/// @@ -106,14 +107,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR } _ if rank > 0 => { tokenizer.exit(TokenType::HeadingAtxSequence); - tokenizer.go( - space_or_tab_with_options(SpaceOrTabOptions { - kind: TokenType::HeadingAtxSpaceOrTab, - min: 1, - max: usize::MAX, - }), - at_break, - )(tokenizer, code) + tokenizer.go(space_or_tab(), at_break)(tokenizer, code) } _ => (State::Nok, None), } @@ -132,23 +126,18 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HeadingAtx); + tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve)); (State::Ok, Some(vec![code])) } - Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.go( - space_or_tab_with_options(SpaceOrTabOptions { - kind: TokenType::HeadingAtxSpaceOrTab, - min: 1, - max: usize::MAX, - }), - at_break, - )(tokenizer, code), + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.go(space_or_tab(), at_break)(tokenizer, code) + } Code::Char('#') => { tokenizer.enter(TokenType::HeadingAtxSequence); further_sequence(tokenizer, code) } Code::Char(_) => { - tokenizer.enter(TokenType::HeadingAtxText); - tokenizer.enter(TokenType::ChunkText); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); data(tokenizer, code) } } @@ -179,8 +168,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { - tokenizer.exit(TokenType::ChunkText); - tokenizer.exit(TokenType::HeadingAtxText); + tokenizer.exit(TokenType::Data); at_break(tokenizer, code) } _ => { @@ -189,3 +177,72 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } } + +/// To do. 
+pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { + let mut edit_map = EditMap::new(); + let mut index = 0; + let mut heading_start: Option<usize> = None; + let mut data_start: Option<usize> = None; + let mut data_end: Option<usize> = None; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.token_type == TokenType::HeadingAtx { + if event.event_type == EventType::Enter { + heading_start = Some(index); + } else if let Some(start) = data_start { + // If `start` is some, `end` is too. + let end = data_end.unwrap(); + + edit_map.add( + start, + 0, + vec![Event { + event_type: EventType::Enter, + token_type: TokenType::HeadingAtxText, + point: tokenizer.events[start].point.clone(), + index: tokenizer.events[start].index, + previous: None, + next: None, + content_type: None, + }], + ); + + // Remove everything between the start and the end. + edit_map.add(start + 1, end - start - 1, vec![]); + + edit_map.add( + end + 1, + 0, + vec![Event { + event_type: EventType::Exit, + token_type: TokenType::HeadingAtxText, + point: tokenizer.events[end].point.clone(), + index: tokenizer.events[end].index, + previous: None, + next: None, + content_type: None, + }], + ); + + heading_start = None; + data_start = None; + data_end = None; + } + } else if heading_start.is_some() && event.token_type == TokenType::Data { + if event.event_type == EventType::Enter { + if data_start.is_none() { + data_start = Some(index); + } + } else { + data_end = Some(index); + } + } + + index += 1; + } + + edit_map.consume(&mut tokenizer.events) +} diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 06ce481..63f3c30 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -56,9 +56,9 @@ //! 
[atx]: http://www.aaronsw.com/2002/atx/ use crate::constant::TAB_SIZE; -use crate::construct::partial_space_or_tab::space_or_tab; +use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_with_options, Options}; use crate::subtokenize::link; -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; use crate::util::span::from_exit_event; /// Kind of underline. @@ -131,7 +131,7 @@ fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } _ => { tokenizer.enter(TokenType::HeadingSetextText); - tokenizer.enter(TokenType::ChunkText); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); text_inside(tokenizer, code) } } @@ -148,7 +148,7 @@ fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Nok, None), Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::Data); tokenizer.exit(TokenType::HeadingSetextText); tokenizer.attempt(underline_before, |ok| { Box::new(if ok { after } else { text_continue }) @@ -176,16 +176,23 @@ fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::LineEnding); + tokenizer.enter_with_content(TokenType::LineEnding, Some(ContentType::Text)); let index = tokenizer.events.len() - 1; link(&mut tokenizer.events, index); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); ( - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), text_line_start), - )), + State::Fn(Box::new(tokenizer.attempt_opt( + space_or_tab_with_options(Options { + kind: TokenType::SpaceOrTab, + min: 1, + max: usize::MAX, + content_type: Some(ContentType::Text), + connect: true, + }), + text_line_start, + ))), None, ) } @@ -201,18 +208,11 @@ fn text_continue(tokenizer: 
&mut Tokenizer, code: Code) -> StateFnResult { /// == /// ``` fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let index = tokenizer.events.len() - 2; - - // Link the whitespace, if it exists. - if tokenizer.events[index].token_type == TokenType::SpaceOrTab { - link(&mut tokenizer.events, index); - } - match code { // Blank lines not allowed. Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), _ => { - tokenizer.enter(TokenType::ChunkText); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); let index = tokenizer.events.len() - 1; link(&mut tokenizer.events, index); text_inside(tokenizer, code) diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 405858d..6e8e476 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -11,11 +11,10 @@ use crate::tokenizer::{ Code, Event, EventType, LabelStart, Media, State, StateFnResult, TokenType, Tokenizer, }; use crate::util::{ + edit_map::EditMap, normalize_identifier::normalize_identifier, span::{serialize, Span}, }; -/// To do: could we do without `HashMap`, so we don’t need `std`? -use std::collections::HashMap; #[derive(Debug)] struct Info { @@ -32,43 +31,45 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { let media: Vec<Media> = tokenizer.media_list.drain(..).collect(); left.append(&mut left_2); - let mut map: HashMap<usize, (usize, Vec<Event>)> = HashMap::new(); + let mut edit_map = EditMap::new(); let events = &tokenizer.events; + // Remove loose label starts. 
let mut index = 0; while index < left.len() { let label_start = &left[index]; let data_enter_index = label_start.start.0; let data_exit_index = label_start.start.1; - map.insert( + edit_map.add( data_enter_index, - ( - data_exit_index - data_enter_index, - vec![ - Event { - event_type: EventType::Enter, - token_type: TokenType::Data, - point: events[data_enter_index].point.clone(), - index: events[data_enter_index].index, - previous: None, - next: None, - }, - Event { - event_type: EventType::Exit, - token_type: TokenType::Data, - point: events[data_exit_index].point.clone(), - index: events[data_exit_index].index, - previous: None, - next: None, - }, - ], - ), + data_exit_index - data_enter_index, + vec![ + Event { + event_type: EventType::Enter, + token_type: TokenType::Data, + point: events[data_enter_index].point.clone(), + index: events[data_enter_index].index, + previous: None, + next: None, + content_type: None, + }, + Event { + event_type: EventType::Exit, + token_type: TokenType::Data, + point: events[data_exit_index].point.clone(), + index: events[data_exit_index].index, + previous: None, + next: None, + content_type: None, + }, + ], ); index += 1; } + // Add grouping events. let mut index = 0; while index < media.len() { let media = &media[index]; @@ -90,8 +91,7 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { let group_end_index = media.end.1; // Insert a group enter and label enter. - add( - &mut map, + edit_map.add( group_enter_index, 0, vec![ @@ -106,6 +106,7 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { index: group_enter_event.index, previous: None, next: None, + content_type: None, }, Event { event_type: EventType::Enter, @@ -114,6 +115,7 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { index: group_enter_event.index, previous: None, next: None, + content_type: None, }, ], ); @@ -121,8 +123,7 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { // Empty events not allowed. 
if text_enter_index != text_exit_index { // Insert a text enter. - add( - &mut map, + edit_map.add( text_enter_index, 0, vec![Event { @@ -132,12 +133,12 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { index: events[text_enter_index].index, previous: None, next: None, + content_type: None, }], ); // Insert a text exit. - add( - &mut map, + edit_map.add( text_exit_index, 0, vec![Event { @@ -147,13 +148,13 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { index: events[text_exit_index].index, previous: None, next: None, + content_type: None, }], ); } // Insert a label exit. - add( - &mut map, + edit_map.add( label_exit_index + 1, 0, vec![Event { @@ -163,12 +164,12 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { index: events[label_exit_index].index, previous: None, next: None, + content_type: None, }], ); // Insert a group exit. - add( - &mut map, + edit_map.add( group_end_index + 1, 0, vec![Event { @@ -178,81 +179,14 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { index: events[group_end_index].index, previous: None, next: None, + content_type: None, }], ); index += 1; } - let mut indices: Vec<&usize> = map.keys().collect(); - indices.sort_unstable(); - let mut next_events: Vec<Event> = vec![]; - let mut index_into_indices = 0; - let mut start = 0; - let events = &mut tokenizer.events; - let mut shift: i32 = 0; - - while index_into_indices < indices.len() { - let index = *indices[index_into_indices]; - - if start < index { - let append = &mut events[start..index].to_vec(); - let mut index = 0; - - while index < append.len() { - let ev = &mut append[index]; - - if let Some(x) = ev.previous { - let next = (x as i32 + shift) as usize; - ev.previous = Some(next); - println!("todo: y: previous {:?} {:?} {:?}", x, shift, start); - } - - if let Some(x) = ev.next { - let next = (x as i32 + shift) as usize; - ev.next = Some(next); - println!("todo: y: next {:?} {:?} {:?}", x, shift, start); - } - - 
index += 1; - } - - next_events.append(append); - } - - let (remove, add) = map.get(&index).unwrap(); - shift += (add.len() as i32) - (*remove as i32); - - if !add.is_empty() { - let append = &mut add.clone(); - let mut index = 0; - - while index < append.len() { - let ev = &mut append[index]; - - if let Some(x) = ev.previous { - println!("todo: x: previous {:?} {:?} {:?}", x, shift, start); - } - - if let Some(x) = ev.next { - println!("todo: x: next {:?} {:?} {:?}", x, shift, start); - } - - index += 1; - } - - next_events.append(append); - } - - start = index + remove; - index_into_indices += 1; - } - - if start < events.len() { - next_events.append(&mut events[start..].to_vec()); - } - - next_events + edit_map.consume(&mut tokenizer.events) } /// Start of label end. @@ -693,20 +627,3 @@ fn collapsed_reference_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnRes _ => (State::Nok, None), } } - -pub fn add( - map: &mut HashMap<usize, (usize, Vec<Event>)>, - index: usize, - mut remove: usize, - mut add: Vec<Event>, -) { - let curr = map.remove(&index); - - if let Some((curr_rm, mut curr_add)) = curr { - remove += curr_rm; - curr_add.append(&mut add); - add = curr_add; - } - - map.insert(index, (remove, add)); -} diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 13bd5aa..fea7052 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -39,7 +39,7 @@ use crate::construct::{ partial_space_or_tab::space_or_tab_min_max, thematic_break::start as thematic_break, }; use crate::subtokenize::link; -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; /// Before a paragraph. 
/// @@ -53,7 +53,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } _ => { tokenizer.enter(TokenType::Paragraph); - tokenizer.enter(TokenType::ChunkText); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); inside(tokenizer, code) } } @@ -86,8 +86,8 @@ fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); - tokenizer.exit(TokenType::ChunkText); - tokenizer.enter(TokenType::ChunkText); + tokenizer.exit(TokenType::Data); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::Text)); let index = tokenizer.events.len() - 1; link(&mut tokenizer.events, index); (State::Fn(Box::new(inside)), None) @@ -100,7 +100,7 @@ fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// *** /// ``` fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::Data); tokenizer.exit(TokenType::Paragraph); (State::Ok, Some(vec![code])) } diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 7887a44..05f5060 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -72,7 +72,7 @@ //! //! <!-- To do: link label end. --> -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; /// Configuration. 
/// @@ -134,7 +134,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code, options: Options) -> StateFn tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.raw.clone()); tokenizer.enter(info.options.string.clone()); - tokenizer.enter(TokenType::ChunkString); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::String)); raw(tokenizer, code, info) } } @@ -155,7 +155,7 @@ fn enclosed_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFn (State::Ok, None) } else { tokenizer.enter(info.options.string.clone()); - tokenizer.enter(TokenType::ChunkString); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::String)); enclosed(tokenizer, code, info) } } @@ -168,7 +168,7 @@ fn enclosed_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFn fn enclosed(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::Char('>') => { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); tokenizer.exit(info.options.string.clone()); enclosed_before(tokenizer, code, info) } @@ -222,7 +222,7 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { } Code::Char(')') => { if info.balance == 0 { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); tokenizer.exit(info.options.string.clone()); tokenizer.exit(info.options.raw.clone()); tokenizer.exit(info.options.destination); @@ -240,7 +240,7 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { if info.balance > 0 { (State::Nok, None) } else { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); tokenizer.exit(info.options.string.clone()); tokenizer.exit(info.options.raw.clone()); tokenizer.exit(info.options.destination); diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 1cb7d4b..dd8ee84 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ 
-55,10 +55,12 @@ // To do: pass token types in. +use super::partial_space_or_tab::{ + space_or_tab_one_line_ending_with_options, OneLineEndingOptions, +}; use crate::constant::LINK_REFERENCE_SIZE_MAX; -use crate::construct::partial_space_or_tab::space_or_tab; use crate::subtokenize::link; -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; /// Configuration. /// @@ -130,8 +132,18 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes tokenizer.exit(info.options.label); (State::Ok, None) } + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( + space_or_tab_one_line_ending_with_options(OneLineEndingOptions { + content_type: Some(ContentType::String), + connect: info.connect, + }), + |t, c| { + info.connect = true; + at_break(t, c, info) + }, + )(tokenizer, code), _ => { - tokenizer.enter(TokenType::ChunkString); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::String)); if info.connect { let index = tokenizer.events.len() - 1; @@ -145,30 +157,6 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes } } -/// After a line ending. -/// -/// ```markdown -/// [a -/// |b] -/// ``` -fn line_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { - tokenizer.attempt_opt(space_or_tab(), |t, c| line_begin(t, c, info))(tokenizer, code) -} - -/// After a line ending, after optional whitespace. -/// -/// ```markdown -/// [a -/// |b] -/// ``` -fn line_begin(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { - match code { - // Blank line not allowed. - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), - _ => at_break(tokenizer, code, info), - } -} - /// In a label, in text. 
/// /// ```markdown @@ -176,20 +164,14 @@ fn line_begin(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResul /// ``` fn label(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { match code { - Code::None | Code::Char('[' | ']') => { - tokenizer.exit(TokenType::ChunkString); + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '[' | ']') => { + tokenizer.exit(TokenType::Data); at_break(tokenizer, code, info) } _ if info.size > LINK_REFERENCE_SIZE_MAX => { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); at_break(tokenizer, code, info) } - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.consume(code); - info.size += 1; - tokenizer.exit(TokenType::ChunkString); - (State::Fn(Box::new(|t, c| line_start(t, c, info))), None) - } Code::VirtualSpace | Code::Char('\t' | ' ') => { tokenizer.consume(code); info.size += 1; diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index 43bdc53..8df7601 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -4,7 +4,8 @@ //! //! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js) -use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer}; +use crate::subtokenize::link; +use crate::tokenizer::{Code, ContentType, State, StateFn, StateFnResult, TokenType, Tokenizer}; /// Options to parse whitespace. #[derive(Debug)] @@ -15,6 +16,25 @@ pub struct Options { pub max: usize, /// Token type to use for whitespace events. pub kind: TokenType, + /// To do. + pub content_type: Option<ContentType>, + pub connect: bool, +} + +#[derive(Debug)] +pub struct OneLineEndingOptions { + /// To do. + pub content_type: Option<ContentType>, + pub connect: bool, +} + +/// Options to parse whitespace. 
+#[derive(Debug)] +struct OneLineInfo { + /// Whether something was seen. + connect: bool, + /// Configuration. + options: OneLineEndingOptions, } /// Options to parse whitespace. @@ -35,45 +55,6 @@ pub fn space_or_tab() -> Box<StateFn> { space_or_tab_min_max(1, usize::MAX) } -pub fn space_or_tab_one_line_ending() -> Box<StateFn> { - Box::new(|tokenizer, code| { - tokenizer.attempt(space_or_tab(), move |ok| { - Box::new(move |tokenizer, code| match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new(tokenizer.attempt_opt( - space_or_tab(), - move |_t, code| { - if !matches!( - code, - Code::None - | Code::CarriageReturnLineFeed - | Code::Char('\r' | '\n') - ) { - (State::Ok, Some(vec![code])) - } else { - (State::Nok, None) - } - }, - ))), - None, - ) - } - _ => { - if ok { - (State::Ok, Some(vec![code])) - } else { - (State::Nok, None) - } - } - }) - })(tokenizer, code) - }) -} - /// Between `x` and `y` `space_or_tab` /// /// ```bnf @@ -84,6 +65,8 @@ pub fn space_or_tab_min_max(min: usize, max: usize) -> Box<StateFn> { kind: TokenType::SpaceOrTab, min, max, + content_type: None, + connect: false, }) } @@ -104,7 +87,13 @@ pub fn space_or_tab_with_options(options: Options) -> Box<StateFn> { fn start(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { match code { Code::VirtualSpace | Code::Char('\t' | ' ') if info.options.max > 0 => { - tokenizer.enter(info.options.kind.clone()); + tokenizer.enter_with_content(info.options.kind.clone(), info.options.content_type); + + if info.options.content_type.is_some() { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } + tokenizer.consume(code); info.size += 1; (State::Fn(Box::new(|t, c| inside(t, c, info))), None) @@ -146,3 +135,93 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResul } } } + 
+pub fn space_or_tab_one_line_ending() -> Box<StateFn> { + space_or_tab_one_line_ending_with_options(OneLineEndingOptions { + content_type: None, + connect: false, + }) +} + +pub fn space_or_tab_one_line_ending_with_options(options: OneLineEndingOptions) -> Box<StateFn> { + Box::new(move |tokenizer, code| { + let mut info = OneLineInfo { + connect: false, + options, + }; + + tokenizer.attempt( + space_or_tab_with_options(Options { + kind: TokenType::SpaceOrTab, + min: 1, + max: usize::MAX, + content_type: info.options.content_type, + connect: info.options.connect, + }), + move |ok| { + if ok && info.options.content_type.is_some() { + info.connect = true; + } + + Box::new(move |tokenizer, code| match code { + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + at_eol(tokenizer, code, info) + } + _ => { + if ok { + (State::Ok, Some(vec![code])) + } else { + (State::Nok, None) + } + } + }) + }, + )(tokenizer, code) + }) +} + +fn at_eol(tokenizer: &mut Tokenizer, code: Code, mut info: OneLineInfo) -> StateFnResult { + match code { + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.enter_with_content(TokenType::LineEnding, info.options.content_type); + + if info.options.content_type.is_some() { + if info.connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else { + info.connect = true; + } + } + + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(tokenizer.attempt_opt( + space_or_tab_with_options(Options { + kind: TokenType::SpaceOrTab, + min: 1, + max: usize::MAX, + content_type: info.options.content_type, + connect: info.connect, + }), + after_eol, + ))), + None, + ) + } + _ => unreachable!("expected eol"), + } +} + +fn after_eol(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + // Blank line not allowed. 
+ if matches!( + code, + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') + ) { + (State::Nok, None) + } else { + (State::Ok, Some(vec![code])) + } +} diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 78ae311..b102f7e 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -31,9 +31,11 @@ //! //! <!-- To do: link label end. --> -use crate::construct::partial_space_or_tab::space_or_tab; -use crate::subtokenize::link_to; -use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use super::partial_space_or_tab::{ + space_or_tab_one_line_ending_with_options, OneLineEndingOptions, +}; +use crate::subtokenize::link; +use crate::tokenizer::{Code, ContentType, State, StateFnResult, TokenType, Tokenizer}; /// Configuration. /// @@ -108,8 +110,8 @@ impl Kind { /// State needed to parse titles. #[derive(Debug)] struct Info { - /// Whether we’ve seen our first `ChunkString`. - connect_index: Option<usize>, + /// Whether we’ve seen data. + connect: bool, /// Kind of title. kind: Kind, /// Configuration. 
@@ -127,7 +129,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code, options: Options) -> StateFn match code { Code::Char(char) if char == '"' || char == '\'' || char == '(' => { let info = Info { - connect_index: None, + connect: false, kind: Kind::from_char(char), options, }; @@ -181,14 +183,24 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes begin(tokenizer, code, info) } Code::None => (State::Nok, None), + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( + space_or_tab_one_line_ending_with_options(OneLineEndingOptions { + content_type: Some(ContentType::String), + connect: info.connect, + }), + |t, c| { + info.connect = true; + at_break(t, c, info) + }, + )(tokenizer, code), _ => { - tokenizer.enter(TokenType::ChunkString); + tokenizer.enter_with_content(TokenType::Data, Some(ContentType::String)); - if let Some(connect_index) = info.connect_index { + if info.connect { let index = tokenizer.events.len() - 1; - link_to(&mut tokenizer.events, connect_index, index); + link(&mut tokenizer.events, index); } else { - info.connect_index = Some(tokenizer.events.len() - 1); + info.connect = true; } title(tokenizer, code, info) @@ -196,30 +208,6 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes } } -/// After a line ending. -/// -/// ```markdown -/// "a -/// |b" -/// ``` -fn line_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { - tokenizer.attempt_opt(space_or_tab(), |t, c| line_begin(t, c, info))(tokenizer, code) -} - -/// After a line ending, after optional whitespace. -/// -/// ```markdown -/// "a -/// |b" -/// ``` -fn line_begin(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { - match code { - // Blank line not allowed. - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), - _ => at_break(tokenizer, code, info), - } -} - /// In title text. 
/// /// ```markdown @@ -228,18 +216,13 @@ fn line_begin(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResul fn title(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::Char(char) if char == info.kind.as_char() => { - tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::Data); at_break(tokenizer, code, info) } - Code::None => { - tokenizer.exit(TokenType::ChunkString); + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.exit(TokenType::Data); at_break(tokenizer, code, info) } - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.consume(code); - tokenizer.exit(TokenType::ChunkString); - (State::Fn(Box::new(|t, c| line_start(t, c, info))), None) - } Code::Char('\\') => { tokenizer.consume(code); (State::Fn(Box::new(|t, c| escape(t, c, info))), None) |