diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/compiler.rs | 213 | ||||
-rw-r--r-- | src/constant.rs | 7 | ||||
-rw-r--r-- | src/construct/definition.rs | 136 | ||||
-rw-r--r-- | src/construct/label_end.rs | 712 | ||||
-rw-r--r-- | src/construct/label_start_image.rs | 47 | ||||
-rw-r--r-- | src/construct/label_start_link.rs | 30 | ||||
-rw-r--r-- | src/construct/mod.rs | 11 | ||||
-rw-r--r-- | src/construct/partial_destination.rs | 3 | ||||
-rw-r--r-- | src/construct/partial_space_or_tab.rs | 39 | ||||
-rw-r--r-- | src/construct/partial_title.rs | 14 | ||||
-rw-r--r-- | src/content/flow.rs | 20 | ||||
-rw-r--r-- | src/content/text.rs | 17 | ||||
-rw-r--r-- | src/parser.rs | 20 | ||||
-rw-r--r-- | src/subtokenize.rs | 38 | ||||
-rw-r--r-- | src/tokenizer.rs | 111 | ||||
-rw-r--r-- | src/util/sanitize_uri.rs | 2 |
16 files changed, 1240 insertions, 180 deletions
diff --git a/src/compiler.rs b/src/compiler.rs index cfe749a..11dea29 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -1,5 +1,5 @@ //! Turn events into a string of HTML. -use crate::constant::SAFE_PROTOCOL_HREF; +use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}; use crate::construct::character_reference::Kind as CharacterReferenceKind; use crate::tokenizer::{Code, Event, EventType, TokenType}; use crate::util::{ @@ -17,6 +17,23 @@ pub enum LineEnding { LineFeed, } +/// To do. +#[derive(Debug)] +struct Media { + /// To do. + image: bool, + /// To do. + label_id: String, + /// To do. + label: String, + /// To do. + // reference_id: String, + /// To do. + destination: Option<String>, + /// To do. + title: Option<String>, +} + impl LineEnding { /// Turn the line ending into a [str]. fn as_str(&self) -> &str { @@ -168,7 +185,13 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { } else { Some(SAFE_PROTOCOL_HREF.to_vec()) }; + let protocol_src = if options.allow_dangerous_protocol { + None + } else { + Some(SAFE_PROTOCOL_SRC.to_vec()) + }; let mut line_ending_inferred: Option<LineEnding> = None; + let mut media_stack: Vec<Media> = vec![]; // let mut slurp_all_line_endings = false; while index < events.len() { @@ -257,7 +280,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { | TokenType::CodeFencedFenceMeta | TokenType::Definition | TokenType::HeadingAtxText - | TokenType::HeadingSetextText => { + | TokenType::HeadingSetextText + | TokenType::Label + | TokenType::ResourceTitleString => { buffer(buffers); } TokenType::CodeIndented => { @@ -287,6 +312,56 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { ignore_encode = true; } } + TokenType::Image => { + media_stack.push(Media { + image: true, + label_id: "".to_string(), + label: "".to_string(), + // reference_id: "".to_string(), + destination: None, + title: None, + }); + // tags = undefined // Disallow tags. + } + TokenType::Link => { + media_stack.push(Media { + image: false, + label_id: "".to_string(), + label: "".to_string(), + // reference_id: "".to_string(), + destination: None, + title: None, + }); + } + TokenType::Resource => { + buffer(buffers); // We can have line endings in the resource, ignore them. + let media = media_stack.last_mut().unwrap(); + media.destination = Some("".to_string()); + } + TokenType::ResourceDestinationString => { + buffer(buffers); + // Ignore encoding the result, as we’ll first percent encode the url and + // encode manually after. + ignore_encode = true; + } + TokenType::LabelImage + | TokenType::LabelImageMarker + | TokenType::LabelLink + | TokenType::LabelMarker + | TokenType::LabelEnd + | TokenType::ResourceMarker + | TokenType::ResourceDestination + | TokenType::ResourceDestinationLiteral + | TokenType::ResourceDestinationLiteralMarker + | TokenType::ResourceDestinationRaw + | TokenType::ResourceTitle + | TokenType::ResourceTitleMarker + | TokenType::Reference + | TokenType::ReferenceMarker + | TokenType::ReferenceString + | TokenType::LabelText => { + println!("ignore labels for now"); + } TokenType::Paragraph => { buf_tail_mut(buffers).push("<p>".to_string()); } @@ -324,14 +399,88 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { | TokenType::SpaceOrTab => { // Ignore. } + TokenType::LabelImage + | TokenType::LabelImageMarker + | TokenType::LabelLink + | TokenType::LabelMarker + | TokenType::LabelEnd + | TokenType::ResourceMarker + | TokenType::ResourceDestination + | TokenType::ResourceDestinationLiteral + | TokenType::ResourceDestinationLiteralMarker + | TokenType::ResourceDestinationRaw + | TokenType::ResourceTitle + | TokenType::ResourceTitleMarker + | TokenType::Reference + | TokenType::ReferenceMarker + | TokenType::ReferenceString => { + println!("ignore labels for now"); + } + TokenType::Label => { + let media = media_stack.last_mut().unwrap(); + media.label = resume(buffers); + } + TokenType::LabelText => { + let media = media_stack.last_mut().unwrap(); + media.label_id = serialize(codes, &from_exit_event(events, index), false); + } + TokenType::ResourceDestinationString => { + let media = media_stack.last_mut().unwrap(); + media.destination = Some(resume(buffers)); + ignore_encode = false; + } + TokenType::ResourceTitleString => { + let media = media_stack.last_mut().unwrap(); + media.title = Some(resume(buffers)); + } + TokenType::Image | TokenType::Link => { + // let mut is_in_image = false; + // let mut index = 0; + // Skip current. + // while index < (media_stack.len() - 1) { + // if media_stack[index].image { + // is_in_image = true; + // break; + // } + // index += 1; + // } + + // tags = is_in_image; + + let media = media_stack.pop().unwrap(); + println!("media: {:?}", media); + let buf = buf_tail_mut(buffers); + // To do: get from definition. + let destination = media.destination.unwrap(); + let title = if let Some(title) = media.title { + format!(" title=\"{}\"", title) + } else { + "".to_string() + }; + + if media.image { + buf.push(format!( + "<img src=\"{}\" alt=\"{}\"{} />", + sanitize_uri(&destination, &protocol_src), + media.label, + title + )); + } else { + buf.push(format!( + "<a href=\"{}\"{}>{}</a>", + sanitize_uri(&destination, &protocol_href), + title, + media.label + )); + } + } // Just output it. TokenType::CodeTextData | TokenType::Data | TokenType::CharacterEscapeValue => { // last_was_tag = false; - buf_tail_mut(buffers).push(encode(&serialize( - codes, - &from_exit_event(events, index), - false, - ))); + buf_tail_mut(buffers).push(encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + )); } TokenType::AutolinkEmail => { let slice = serialize(codes, &from_exit_event(events, index), false); @@ -340,7 +489,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { "<a href=\"mailto:{}\">", sanitize_uri(slice.as_str(), &protocol_href) )); - buf.push(encode(&slice)); + buf.push(encode_opt(&slice, ignore_encode)); buf.push("</a>".to_string()); } TokenType::AutolinkProtocol => { @@ -350,7 +499,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { "<a href=\"{}\">", sanitize_uri(slice.as_str(), &protocol_href) )); - buf.push(encode(&slice)); + buf.push(encode_opt(&slice, ignore_encode)); buf.push("</a>".to_string()); } TokenType::CharacterReferenceMarker => { @@ -377,7 +526,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { CharacterReferenceKind::Named => decode_named(ref_string), }; - buf_tail_mut(buffers).push(encode(&value)); + buf_tail_mut(buffers).push(encode_opt(&value, ignore_encode)); character_reference_kind = None; } TokenType::CodeFenced | TokenType::CodeIndented => { @@ -432,16 +581,15 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value)); // tag = true; } - TokenType::CodeFencedFenceMeta => { + TokenType::CodeFencedFenceMeta | TokenType::Resource => { resume(buffers); } TokenType::CodeFlowChunk => { code_flow_seen_data = Some(true); - buf_tail_mut(buffers).push(encode(&serialize( - codes, - &from_exit_event(events, index), - false, - ))); + buf_tail_mut(buffers).push(encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + )); } TokenType::CodeText => { let result = resume(buffers); @@ -492,11 +640,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { if let Some(buf) = atx_heading_buffer { atx_heading_buffer = Some( buf.to_string() - + &encode(&serialize( - codes, - &from_exit_event(events, index), - false, - )), + + &encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + ), ); } @@ -512,14 +659,14 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { if let Some(ref buf) = atx_heading_buffer { if !buf.is_empty() { - buf_tail_mut(buffers).push(encode(buf)); + buf_tail_mut(buffers).push(encode_opt(buf, ignore_encode)); atx_heading_buffer = Some("".to_string()); } } else { atx_heading_buffer = Some("".to_string()); } - buf_tail_mut(buffers).push(encode(&result)); + buf_tail_mut(buffers).push(encode_opt(&result, ignore_encode)); } TokenType::HeadingSetextText => { heading_setext_buffer = Some(resume(buffers)); @@ -540,7 +687,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { TokenType::HtmlFlowData | TokenType::HtmlTextData => { let slice = serialize(codes, &from_exit_event(events, index), false); // last_was_tag = false; - buf_tail_mut(buffers).push(if ignore_encode { slice } else { encode(&slice) }); + buf_tail_mut(buffers).push(encode_opt(&slice, ignore_encode)); } TokenType::LineEnding => { // if slurp_all_line_endings { @@ -549,11 +696,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { if slurp_one_line_ending { slurp_one_line_ending = false; } else { - buf_tail_mut(buffers).push(encode(&serialize( - codes, - &from_exit_event(events, index), - false, - ))); + buf_tail_mut(buffers).push(encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + )); } } TokenType::Paragraph => { @@ -605,6 +751,15 @@ fn buf_tail(buffers: &mut [Vec<String>]) -> &Vec<String> { buffers.last().expect("at least one buffer should exist") } +/// To do. +fn encode_opt(value: &str, ignore_encode: bool) -> String { + if ignore_encode { + value.to_string() + } else { + encode(value) + } +} + /// Add a line ending. fn line_ending(buffers: &mut [Vec<String>], default: &LineEnding) { let tail = buf_tail_mut(buffers); diff --git a/src/constant.rs b/src/constant.rs index 8e1acf3..5cb7826 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -193,6 +193,11 @@ pub const HTML_RAW_SIZE_MAX: usize = 8; /// To safeguard performance, labels are capped at a large number: `999`. pub const LINK_REFERENCE_SIZE_MAX: usize = 999; +/// To do. +/// See: <https://spec.commonmark.org/0.30/#link-destination>, +/// <https://github.com/remarkjs/react-markdown/issues/658#issuecomment-984345577>. +pub const LINK_RESOURCE_DESTINATION_BALANCE_MAX: usize = 32; + /// List of protocols allowed, when operating safely, as `href` on `a`. /// /// This list is based on what is allowed by GitHub. @@ -201,8 +206,6 @@ pub const SAFE_PROTOCOL_HREF: [&str; 6] = ["http", "https", "irc", "ircs", "mail /// List of protocols allowed, when operating safely, as `src` on `img`. /// /// This list is based on what is allowed by GitHub. -// To do: image. -#[allow(dead_code)] pub const SAFE_PROTOCOL_SRC: [&str; 2] = ["http", "https"]; /// The number of characters that form a tab stop. diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 92d275c..674bd65 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -115,7 +115,7 @@ use crate::construct::{ partial_destination::{start as destination, Options as DestinationOptions}, partial_label::{start as label, Options as LabelOptions}, - partial_space_or_tab::space_or_tab, + partial_space_or_tab::{space_or_tab, space_or_tab_one_line_ending}, partial_title::{start as title, Options as TitleOptions}, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -168,7 +168,7 @@ fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::DefinitionMarker); ( State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), marker_after), + tokenizer.go(space_or_tab_one_line_ending(), destination_before), )), None, ) @@ -177,31 +177,6 @@ fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// After the marker, after whitespace. -/// -/// ```markdown -/// [a]: |b "c" -/// -/// [a]: |␊ -/// b "c" -/// ``` -fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), destination_before), - )), - None, - ) - } - _ => destination_before(tokenizer, code), - } -} - /// Before a destination. /// /// ```markdown @@ -211,35 +186,23 @@ fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |b "c" /// ``` fn destination_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let event = tokenizer.events.last().unwrap(); - - // Whitespace. - if (event.token_type == TokenType::LineEnding || event.token_type == TokenType::SpaceOrTab) - // Blank line not ok. - && !matches!( - code, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') - ) { - tokenizer.go( - |t, c| { - destination( - t, - c, - DestinationOptions { - limit: usize::MAX, - destination: TokenType::DefinitionDestination, - literal: TokenType::DefinitionDestinationLiteral, - marker: TokenType::DefinitionDestinationLiteralMarker, - raw: TokenType::DefinitionDestinationRaw, - string: TokenType::DefinitionDestinationString, - }, - ) - }, - destination_after, - )(tokenizer, code) - } else { - (State::Nok, None) - } + tokenizer.go( + |t, c| { + destination( + t, + c, + DestinationOptions { + limit: usize::MAX, + destination: TokenType::DefinitionDestination, + literal: TokenType::DefinitionDestinationLiteral, + marker: TokenType::DefinitionDestinationLiteralMarker, + raw: TokenType::DefinitionDestinationRaw, + string: TokenType::DefinitionDestinationString, + }, + ) + }, + destination_after, + )(tokenizer, code) } /// After a destination. @@ -289,32 +252,7 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// "c" /// ``` fn title_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_opt(space_or_tab(), title_before_after_optional_whitespace)(tokenizer, code) -} - -/// Before a title, after optional whitespace. -/// -/// ```markdown -/// [a]: b |"c" -/// -/// [a]: b |␊ -/// "c" -/// ``` -fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), title_before_marker), - )), - None, - ) - } - _ => title_before_marker(tokenizer, code), - } + tokenizer.go(space_or_tab_one_line_ending(), title_before_marker)(tokenizer, code) } /// Before a title, after a line ending. @@ -324,26 +262,20 @@ fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) /// | "c" /// ``` fn title_before_marker(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let event = tokenizer.events.last().unwrap(); - - if event.token_type == TokenType::LineEnding || event.token_type == TokenType::SpaceOrTab { - tokenizer.go( - |t, c| { - title( - t, - c, - TitleOptions { - title: TokenType::DefinitionTitle, - marker: TokenType::DefinitionTitleMarker, - string: TokenType::DefinitionTitleString, - }, - ) - }, - title_after, - )(tokenizer, code) - } else { - (State::Nok, None) - } + tokenizer.go( + |t, c| { + title( + t, + c, + TitleOptions { + title: TokenType::DefinitionTitle, + marker: TokenType::DefinitionTitleMarker, + string: TokenType::DefinitionTitleString, + }, + ) + }, + title_after, + )(tokenizer, code) } /// After a title. diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs new file mode 100644 index 0000000..405858d --- /dev/null +++ b/src/construct/label_end.rs @@ -0,0 +1,712 @@ +//! To do + +use crate::constant::LINK_RESOURCE_DESTINATION_BALANCE_MAX; +use crate::construct::{ + partial_destination::{start as destination, Options as DestinationOptions}, + partial_label::{start as label, Options as LabelOptions}, + partial_space_or_tab::space_or_tab_one_line_ending, + partial_title::{start as title, Options as TitleOptions}, +}; +use crate::tokenizer::{ + Code, Event, EventType, LabelStart, Media, State, StateFnResult, TokenType, Tokenizer, +}; +use crate::util::{ + normalize_identifier::normalize_identifier, + span::{serialize, Span}, +}; +/// To do: could we do without `HashMap`, so we don’t need `std`? +use std::collections::HashMap; + +#[derive(Debug)] +struct Info { + /// To do. + label_start_index: usize, + /// To do. + media: Media, +} + +#[allow(clippy::too_many_lines)] +pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec<Event> { + let mut left: Vec<LabelStart> = tokenizer.label_start_list_loose.drain(..).collect(); + let mut left_2: Vec<LabelStart> = tokenizer.label_start_stack.drain(..).collect(); + let media: Vec<Media> = tokenizer.media_list.drain(..).collect(); + left.append(&mut left_2); + + let mut map: HashMap<usize, (usize, Vec<Event>)> = HashMap::new(); + let events = &tokenizer.events; + + let mut index = 0; + while index < left.len() { + let label_start = &left[index]; + let data_enter_index = label_start.start.0; + let data_exit_index = label_start.start.1; + + map.insert( + data_enter_index, + ( + data_exit_index - data_enter_index, + vec![ + Event { + event_type: EventType::Enter, + token_type: TokenType::Data, + point: events[data_enter_index].point.clone(), + index: events[data_enter_index].index, + previous: None, + next: None, + }, + Event { + event_type: EventType::Exit, + token_type: TokenType::Data, + point: events[data_exit_index].point.clone(), + index: events[data_exit_index].index, + previous: None, + next: None, + }, + ], + ), + ); + + index += 1; + } + + let mut index = 0; + while index < media.len() { + let media = &media[index]; + // LabelLink:Enter or LabelImage:Enter. + let group_enter_index = media.start.0; + let group_enter_event = &events[group_enter_index]; + // LabelLink:Exit or LabelImage:Exit. + let text_enter_index = media.start.0 + + (if group_enter_event.token_type == TokenType::LabelLink { + 4 + } else { + 6 + }); + // LabelEnd:Enter. + let text_exit_index = media.end.0; + // LabelEnd:Exit. + let label_exit_index = media.end.0 + 3; + // Resource:Exit, etc. + let group_end_index = media.end.1; + + // Insert a group enter and label enter. + add( + &mut map, + group_enter_index, + 0, + vec![ + Event { + event_type: EventType::Enter, + token_type: if group_enter_event.token_type == TokenType::LabelLink { + TokenType::Link + } else { + TokenType::Image + }, + point: group_enter_event.point.clone(), + index: group_enter_event.index, + previous: None, + next: None, + }, + Event { + event_type: EventType::Enter, + token_type: TokenType::Label, + point: group_enter_event.point.clone(), + index: group_enter_event.index, + previous: None, + next: None, + }, + ], + ); + + // Empty events not allowed. + if text_enter_index != text_exit_index { + // Insert a text enter. + add( + &mut map, + text_enter_index, + 0, + vec![Event { + event_type: EventType::Enter, + token_type: TokenType::LabelText, + point: events[text_enter_index].point.clone(), + index: events[text_enter_index].index, + previous: None, + next: None, + }], + ); + + // Insert a text exit. + add( + &mut map, + text_exit_index, + 0, + vec![Event { + event_type: EventType::Exit, + token_type: TokenType::LabelText, + point: events[text_exit_index].point.clone(), + index: events[text_exit_index].index, + previous: None, + next: None, + }], + ); + } + + // Insert a label exit. + add( + &mut map, + label_exit_index + 1, + 0, + vec![Event { + event_type: EventType::Exit, + token_type: TokenType::Label, + point: events[label_exit_index].point.clone(), + index: events[label_exit_index].index, + previous: None, + next: None, + }], + ); + + // Insert a group exit. + add( + &mut map, + group_end_index + 1, + 0, + vec![Event { + event_type: EventType::Exit, + token_type: TokenType::Link, + point: events[group_end_index].point.clone(), + index: events[group_end_index].index, + previous: None, + next: None, + }], + ); + + index += 1; + } + + let mut indices: Vec<&usize> = map.keys().collect(); + indices.sort_unstable(); + let mut next_events: Vec<Event> = vec![]; + let mut index_into_indices = 0; + let mut start = 0; + let events = &mut tokenizer.events; + let mut shift: i32 = 0; + + while index_into_indices < indices.len() { + let index = *indices[index_into_indices]; + + if start < index { + let append = &mut events[start..index].to_vec(); + let mut index = 0; + + while index < append.len() { + let ev = &mut append[index]; + + if let Some(x) = ev.previous { + let next = (x as i32 + shift) as usize; + ev.previous = Some(next); + println!("todo: y: previous {:?} {:?} {:?}", x, shift, start); + } + + if let Some(x) = ev.next { + let next = (x as i32 + shift) as usize; + ev.next = Some(next); + println!("todo: y: next {:?} {:?} {:?}", x, shift, start); + } + + index += 1; + } + + next_events.append(append); + } + + let (remove, add) = map.get(&index).unwrap(); + shift += (add.len() as i32) - (*remove as i32); + + if !add.is_empty() { + let append = &mut add.clone(); + let mut index = 0; + + while index < append.len() { + let ev = &mut append[index]; + + if let Some(x) = ev.previous { + println!("todo: x: previous {:?} {:?} {:?}", x, shift, start); + } + + if let Some(x) = ev.next { + println!("todo: x: next {:?} {:?} {:?}", x, shift, start); + } + + index += 1; + } + + next_events.append(append); + } + + start = index + remove; + index_into_indices += 1; + } + + if start < events.len() { + next_events.append(&mut events[start..].to_vec()); + } + + next_events +} + +/// Start of label end. +/// +/// ```markdown +/// [a|](b) c +/// [a|][b] c +/// [a|][] b +/// [a|] b +/// +/// [a]: z +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if Code::Char(']') == code { + let mut label_start_index: Option<usize> = None; + let mut index = tokenizer.label_start_stack.len(); + + while index > 0 { + index -= 1; + + if !tokenizer.label_start_stack[index].balanced { + label_start_index = Some(index); + break; + } + } + + // If there is an okay opening: + if let Some(label_start_index) = label_start_index { + let label_start = tokenizer + .label_start_stack + .get_mut(label_start_index) + .unwrap(); + + // Mark as balanced if the info is inactive. + if label_start.inactive { + return nok(tokenizer, code, label_start_index); + } + + let label_end_start = tokenizer.events.len(); + let info = Info { + label_start_index, + media: Media { + start: label_start.start, + end: (label_end_start, label_end_start + 3), + id: normalize_identifier(&serialize( + &tokenizer.parse_state.codes, + &Span { + start_index: tokenizer.events[label_start.start.1].index, + end_index: tokenizer.events[label_end_start - 1].index, + }, + false, + )), + }, + }; + + tokenizer.enter(TokenType::LabelEnd); + tokenizer.enter(TokenType::LabelMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelMarker); + tokenizer.exit(TokenType::LabelEnd); + + return (State::Fn(Box::new(move |t, c| after(t, c, info))), None); + } + } + + (State::Nok, None) +} + +/// After `]`. +/// +/// ```markdown +/// [a]|(b) c +/// [a]|[b] c +/// [a]|[] b +/// [a]| b +/// +/// [a]: z +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + // let label_start = tokenizer + // .label_start_stack + // .get_mut(info.label_start_index) + // .unwrap(); + // To do: figure out if defined or not. + let defined = false; + println!("to do: is `{:?}` defined?", info); + match code { + // Resource (`[asd](fgh)`)? + Code::Char('(') => tokenizer.attempt(resource, move |is_ok| { + Box::new(move |t, c| { + // Also fine if `defined`, as then it’s a valid shortcut. + if is_ok || defined { + ok(t, c, info) + } else { + nok(t, c, info.label_start_index) + } + }) + })(tokenizer, code), + // Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference? + Code::Char('[') => tokenizer.attempt(full_reference, move |is_ok| { + Box::new(move |t, c| { + if is_ok { + ok(t, c, info) + } else if defined { + reference_not_full(t, c, info) + } else { + nok(t, c, info.label_start_index) + } + }) + })(tokenizer, code), + // Shortcut reference: `[asd]`? + _ => { + if defined { + ok(tokenizer, code, info) + } else { + nok(tokenizer, code, info.label_start_index) + } + } + } +} + +/// After `]`, at `[`, but not at a full reference. +/// +/// > 👉 **Note**: we only get here if the label is defined. +/// +/// ```markdown +/// [a]|[] b +/// +/// [a]: z +/// ``` +fn reference_not_full(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + tokenizer.attempt(collapsed_reference, move |is_ok| { + Box::new(move |t, c| { + if is_ok { + ok(t, c, info) + } else { + nok(t, c, info.label_start_index) + } + }) + })(tokenizer, code) +} + +/// Done, we found something. +/// +/// ```markdown +/// [a](b)| c +/// [a][b]| c +/// [a][]| b +/// [a]| b +/// +/// [a]: z +/// ``` +fn ok(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { + println!( + "ok res, ref full, ref, collapsed, or ref shortcut: {:?}", + info.media + ); + // Remove this one and everything after it. + let mut left: Vec<LabelStart> = tokenizer + .label_start_stack + .drain(info.label_start_index..) + .collect(); + // Remove this one from `left`, as we’ll move it to `media_list`. + left.remove(0); + tokenizer.label_start_list_loose.append(&mut left); + + let is_link = tokenizer.events[info.media.start.0].token_type == TokenType::LabelLink; + + if is_link { + let mut index = 0; + while index < tokenizer.label_start_stack.len() { + let label_start = &mut tokenizer.label_start_stack[index]; + if tokenizer.events[label_start.start.0].token_type == TokenType::LabelLink { + label_start.inactive = true; + } + index += 1; + } + } + + info.media.end.1 = tokenizer.events.len() - 1; + tokenizer.media_list.push(info.media); + tokenizer.register_resolver("media".to_string(), Box::new(resolve_media)); + (State::Ok, Some(vec![code])) +} + +/// Done, it’s nothing. +/// +/// There was an okay opening, but we didn’t match anything. +/// +/// ```markdown +/// [a]|(b c +/// [a]|[b c +/// [b]|[ c +/// [b]| c +/// +/// [a]: z +/// ``` +fn nok(tokenizer: &mut Tokenizer, _code: Code, label_start_index: usize) -> StateFnResult { + let label_start = tokenizer + .label_start_stack + .get_mut(label_start_index) + .unwrap(); + println!("just balanced braces: {:?}", label_start); + label_start.balanced = true; + // To do: pop things off the list? + (State::Nok, None) +} + +/// Before a resource, at `(`. +/// +/// ```markdown +/// [a]|(b) c +/// ``` +fn resource(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('(') => { + tokenizer.enter(TokenType::Resource); + tokenizer.enter(TokenType::ResourceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ResourceMarker); + (State::Fn(Box::new(resource_start)), None) + } + _ => unreachable!("expected `(`"), + } +} + +/// At the start of a resource, after `(`, before a definition. +/// +/// ```markdown +/// [a](|b) c +/// ``` +fn resource_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt_opt(space_or_tab_one_line_ending(), resource_open)(tokenizer, code) +} + +/// At the start of a resource, after optional whitespace. +/// +/// ```markdown +/// [a](|b) c +/// ``` +fn resource_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(')') => resource_end(tokenizer, code), + _ => tokenizer.go( + |t, c| { + destination( + t, + c, + DestinationOptions { + limit: LINK_RESOURCE_DESTINATION_BALANCE_MAX, + destination: TokenType::ResourceDestination, + literal: TokenType::ResourceDestinationLiteral, + marker: TokenType::ResourceDestinationLiteralMarker, + raw: TokenType::ResourceDestinationRaw, + string: TokenType::ResourceDestinationString, + }, + ) + }, + destination_after, + )(tokenizer, code), + } +} + +/// In a resource, after a destination, before optional whitespace. +/// +/// ```markdown +/// [a](b|) c +/// [a](b| "c") d +/// ``` +fn destination_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt(space_or_tab_one_line_ending(), |ok| { + Box::new(if ok { resource_between } else { resource_end }) + })(tokenizer, code) +} + +/// In a resource, after a destination, after whitespace. +/// +/// ```markdown +/// [a](b |) c +/// [a](b |"c") d +/// ``` +fn resource_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('"' | '\'' | '(') => tokenizer.go( + |t, c| { + title( + t, + c, + TitleOptions { + title: TokenType::ResourceTitle, + marker: TokenType::ResourceTitleMarker, + string: TokenType::ResourceTitleString, + }, + ) + }, + title_after, + )(tokenizer, code), + _ => resource_end(tokenizer, code), + } +} + +/// In a resource, after a title. +/// +/// ```markdown +/// [a](b "c"|) d +/// ``` +fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt_opt(space_or_tab_one_line_ending(), resource_end)(tokenizer, code) +} + +/// In a resource, at the `)`. +/// +/// ```markdown +/// [a](b|) c +/// [a](b |) c +/// [a](b "c"|) d +/// ``` +fn resource_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(')') => { + tokenizer.enter(TokenType::ResourceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ResourceMarker); + tokenizer.exit(TokenType::Resource); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} + +/// In a reference (full), at the `[`. +/// +/// ```markdown +/// [a]|[b] +/// ``` +fn full_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => tokenizer.go( + |t, c| { + label( + t, + c, + LabelOptions { + label: TokenType::Reference, + marker: TokenType::ReferenceMarker, + string: TokenType::ReferenceString, + }, + ) + }, + full_reference_after, + )(tokenizer, code), + _ => unreachable!("expected `[`"), + } +} + +/// In a reference (full), after `]`. +/// +/// ```markdown +/// [a][b]| +/// ``` +fn full_reference_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let events = &tokenizer.events; + let mut index = events.len() - 1; + let mut start: Option<usize> = None; + let mut end: Option<usize> = None; + + while index > 0 { + index -= 1; + let event = &events[index]; + if event.token_type == TokenType::ReferenceString { + if event.event_type == EventType::Exit { + end = Some(event.index); + } else { + start = Some(event.index); + break; + } + } + } + + // Always found, otherwise we don’t get here. + let start = start.unwrap(); + let end = end.unwrap(); + + let id = normalize_identifier(&serialize( + &tokenizer.parse_state.codes, + &Span { + start_index: start, + end_index: end, + }, + false, + )); + println!("to do: is `{:?}` defined?", id); + let defined = false; + + if defined { + (State::Ok, Some(vec![code])) + } else { + (State::Nok, None) + } +} + +/// In a reference (collapsed), at the `[`. +/// +/// > 👉 **Note**: we only get here if the label is defined. +/// +/// ```markdown +/// [a]|[] +/// ``` +fn collapsed_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => { + tokenizer.enter(TokenType::Reference); + tokenizer.enter(TokenType::ReferenceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ReferenceMarker); + (State::Fn(Box::new(collapsed_reference_open)), None) + } + _ => (State::Nok, None), + } +} + +/// In a reference (collapsed), at the `]`. +/// +/// > 👉 **Note**: we only get here if the label is defined. +/// +/// ```markdown +/// [a][|] +/// ``` +fn collapsed_reference_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(']') => { + tokenizer.enter(TokenType::ReferenceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ReferenceMarker); + tokenizer.exit(TokenType::Reference); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} + +pub fn add( + map: &mut HashMap<usize, (usize, Vec<Event>)>, + index: usize, + mut remove: usize, + mut add: Vec<Event>, +) { + let curr = map.remove(&index); + + if let Some((curr_rm, mut curr_add)) = curr { + remove += curr_rm; + curr_add.append(&mut add); + add = curr_add; + } + + map.insert(index, (remove, add)); +} diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs new file mode 100644 index 0000000..2e96977 --- /dev/null +++ b/src/construct/label_start_image.rs @@ -0,0 +1,47 @@ +//! To do + +use super::label_end::resolve_media; +use crate::tokenizer::{Code, LabelStart, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of label (image) start. +/// +/// ```markdown +/// a |![ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('!') => { + tokenizer.enter(TokenType::LabelImage); + tokenizer.enter(TokenType::LabelImageMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelImageMarker); + (State::Fn(Box::new(open)), None) + } + _ => (State::Nok, None), + } +} + +/// After `!`, before a `[`. +/// +/// ```markdown +/// a !|[ b +/// ``` +pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => { + tokenizer.enter(TokenType::LabelMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelMarker); + tokenizer.exit(TokenType::LabelImage); + let end = tokenizer.events.len() - 1; + tokenizer.label_start_stack.push(LabelStart { + start: (end - 5, end), + balanced: false, + inactive: false, + }); + tokenizer.register_resolver("media".to_string(), Box::new(resolve_media)); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs new file mode 100644 index 0000000..35c9dcd --- /dev/null +++ b/src/construct/label_start_link.rs @@ -0,0 +1,30 @@ +//! To do + +use super::label_end::resolve_media; +use crate::tokenizer::{Code, LabelStart, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of label (link) start. +/// +/// ```markdown +/// a |[ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => { + let start = tokenizer.events.len(); + tokenizer.enter(TokenType::LabelLink); + tokenizer.enter(TokenType::LabelMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelMarker); + tokenizer.exit(TokenType::LabelLink); + tokenizer.label_start_stack.push(LabelStart { + start: (start, tokenizer.events.len() - 1), + balanced: false, + inactive: false, + }); + tokenizer.register_resolver("media".to_string(), Box::new(resolve_media)); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 9e5da0e..8565b2f 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -30,9 +30,9 @@ //! * [heading (setext)][heading_setext] //! * [html (flow)][html_flow] //! * [html (text)][html_text] -//! * label end -//! * label start (image) -//! * label start (link) +//! * [label end][label_end] +//! * [label start (image)][label_start_image] +//! * [label start (link)][label_start_link] //! * list //! * [paragraph][] //! * [thematic break][thematic_break] @@ -59,8 +59,6 @@ //! They also contain references to character as defined by [char][], so for //! example `ascii_punctuation` refers to //! [`char::is_ascii_punctuation`][char::is_ascii_punctuation]. -//! -//! pub mod autolink; pub mod blank_line; @@ -76,6 +74,9 @@ pub mod heading_atx; pub mod heading_setext; pub mod html_flow; pub mod html_text; +pub mod label_end; +pub mod label_start_image; +pub mod label_start_link; pub mod paragraph; pub mod partial_data; pub mod partial_destination; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 03dcbee..7887a44 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -267,11 +267,10 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { /// ```markdown /// a\|)b /// ``` -fn raw_escape(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { +fn raw_escape(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::Char('(' | ')' | '\\') => { tokenizer.consume(code); - info.balance += 1; (State::Fn(Box::new(move |t, c| raw(t, c, info))), None) } _ => raw(tokenizer, code, info), diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index 024a4b2..43bdc53 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -35,6 +35,45 @@ pub fn space_or_tab() -> Box<StateFn> { space_or_tab_min_max(1, usize::MAX) } +pub fn space_or_tab_one_line_ending() -> Box<StateFn> { + Box::new(|tokenizer, code| { + tokenizer.attempt(space_or_tab(), move |ok| { + Box::new(move |tokenizer, code| match code { + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(tokenizer.attempt_opt( + space_or_tab(), + move |_t, code| { + if !matches!( + code, + Code::None + | Code::CarriageReturnLineFeed + | Code::Char('\r' | '\n') + ) { + (State::Ok, Some(vec![code])) + } else { + (State::Nok, None) + } + }, + ))), + None, + ) + } + _ => { + if ok { + (State::Ok, Some(vec![code])) + } else { + (State::Nok, None) + } + } + }) + })(tokenizer, code) + }) +} + /// Between `x` and `y` `space_or_tab` /// /// ```bnf diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 3e61788..78ae311 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -32,7 +32,7 @@ //! <!-- To do: link label end. --> use crate::construct::partial_space_or_tab::space_or_tab; -use crate::subtokenize::link; +use crate::subtokenize::link_to; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Configuration. @@ -109,7 +109,7 @@ impl Kind { #[derive(Debug)] struct Info { /// Whether we’ve seen our first `ChunkString`. - connect: bool, + connect_index: Option<usize>, /// Kind of title. kind: Kind, /// Configuration. @@ -125,9 +125,9 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code, options: Options) -> StateFnResult { match code { - Code::Char(char) if char == '(' || char == '"' || char == '\'' => { + Code::Char(char) if char == '"' || char == '\'' || char == '(' => { let info = Info { - connect: false, + connect_index: None, kind: Kind::from_char(char), options, }; @@ -184,11 +184,11 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes _ => { tokenizer.enter(TokenType::ChunkString); - if info.connect { + if let Some(connect_index) = info.connect_index { let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); + link_to(&mut tokenizer.events, connect_index, index); } else { - info.connect = true; + info.connect_index = Some(tokenizer.events.len() - 1); } title(tokenizer, code, info) diff --git a/src/content/flow.rs b/src/content/flow.rs index e71d25a..546712f 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -26,6 +26,7 @@ use crate::construct::{ html_flow::start as html_flow, paragraph::start as paragraph, thematic_break::start as thematic_break, }; +use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer}; use crate::util::{ @@ -34,9 +35,10 @@ use crate::util::{ }; /// Turn `codes` as the flow content type into events. -pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { - let mut tokenizer = Tokenizer::new(point, index); - tokenizer.feed(codes, Box::new(start), true); +pub fn flow(parse_state: &ParseState, point: Point, index: usize) -> Vec<Event> { + let mut tokenizer = Tokenizer::new(point, index, parse_state); + + tokenizer.push(&parse_state.codes, Box::new(start), true); let mut index = 0; @@ -47,9 +49,14 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { && event.token_type == TokenType::DefinitionLabelString { let id = normalize_identifier( - serialize(codes, &from_exit_event(&tokenizer.events, index), false).as_str(), + serialize( + &parse_state.codes, + &from_exit_event(&tokenizer.events, index), + false, + ) + .as_str(), ); - println!("to do: use identifier {:?}", id); + println!("to do: use definition identifier {:?}", id); } index += 1; @@ -58,8 +65,9 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { let mut result = (tokenizer.events, false); while !result.1 { - result = subtokenize(result.0, codes); + result = subtokenize(result.0, parse_state); } + result.0 } diff --git a/src/content/text.rs b/src/content/text.rs index 1224064..5718617 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -21,15 +21,19 @@ use crate::construct::{ character_reference::start as character_reference, code_text::start as code_text, hard_break_escape::start as hard_break_escape, hard_break_trailing::start as hard_break_trailing, html_text::start as html_text, - partial_data::start as data, + label_end::start as label_end, label_start_image::start as label_start_image, + label_start_link::start as label_start_link, partial_data::start as data, }; use crate::tokenizer::{Code, State, StateFnResult, Tokenizer}; -const MARKERS: [Code; 5] = [ +const MARKERS: [Code; 8] = [ Code::Char(' '), // `hard_break_trailing` + Code::Char('!'), // `label_start_image` Code::Char('&'), // `character_reference` Code::Char('<'), // `autolink`, `html_text` + Code::Char('['), // `label_start_link` Code::Char('\\'), // `character_escape`, `hard_break_escape` + Code::Char(']'), // `label_end` Code::Char('`'), // `code_text` ]; @@ -47,13 +51,16 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None => (State::Ok, None), _ => tokenizer.attempt_n( vec![ - Box::new(character_reference), + Box::new(autolink), Box::new(character_escape), + Box::new(character_reference), + Box::new(code_text), Box::new(hard_break_escape), Box::new(hard_break_trailing), - Box::new(autolink), Box::new(html_text), - Box::new(code_text), + Box::new(label_end), + Box::new(label_start_image), + Box::new(label_start_link), ], |ok| Box::new(if ok { start } else { before_data }), )(tokenizer, code), diff --git a/src/parser.rs b/src/parser.rs index 49d99d3..32b7f36 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -4,14 +4,24 @@ use crate::content::flow::flow; use crate::tokenizer::{as_codes, Code, Event, Point}; +pub struct ParseState { + /// To do. + pub codes: Vec<Code>, + /// To do. + pub definitions: Vec<String>, +} + /// Turn a string of markdown into events. /// /// Passes the codes back so the compiler can access the source. pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) { - let codes = as_codes(value); - // To do: pass a reference to this around, and slices in the (back)feeding. Might be tough. + let parse_state = ParseState { + codes: as_codes(value), + definitions: vec![], + }; + let events = flow( - &codes, + &parse_state, Point { line: 1, column: 1, @@ -19,5 +29,7 @@ pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) { }, 0, ); - (events, codes) + + // To do: pass whole `parse_state` back? + (events, parse_state.codes) } diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 4ee2242..58db3c6 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -28,9 +28,8 @@ use std::collections::HashMap; use crate::content::{string::start as string, text::start as text}; -use crate::tokenizer::{ - Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer, -}; +use crate::parser::ParseState; +use crate::tokenizer::{Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer}; use crate::util::span; /// Create a link between two [`Event`][]s. @@ -39,25 +38,36 @@ use crate::util::span; /// This optimizes for the common case where the token at `index` is connected /// to the previous void token. pub fn link(events: &mut [Event], index: usize) { - let prev = &mut events[index - 2]; + link_to(events, index - 2, index); +} + +/// To do +pub fn link_to(events: &mut [Event], pevious: usize, next: usize) { + let prev = &mut events[pevious]; + // To do: force chunks? + // assert!( + // prev.token_type == TokenType::ChunkString || prev.token_type == TokenType::ChunkText, + // "{:?}", + // prev.token_type.to_owned() + // ); assert_eq!(prev.event_type, EventType::Enter); - prev.next = Some(index); + prev.next = Some(next); - let prev_ref = &events[index - 2]; - let prev_exit_ref = &events[index - 1]; + let prev_ref = &events[pevious]; + let prev_exit_ref = &events[pevious + 1]; assert_eq!(prev_exit_ref.event_type, EventType::Exit); assert_eq!(prev_exit_ref.token_type, prev_ref.token_type); - let curr = &mut events[index]; + let curr = &mut events[next]; assert_eq!(curr.event_type, EventType::Enter); - curr.previous = Some(index - 2); + curr.previous = Some(pevious); // Note: the exit of this event may not exist, so don’t check for that. } /// Parse linked events. /// /// Supposed to be called repeatedly, returns `1: true` when done. -pub fn subtokenize(mut events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) { +pub fn subtokenize(mut events: Vec<Event>, parse_state: &ParseState) -> (Vec<Event>, bool) { let mut index = 0; // Map of first chunks to their tokenizer. let mut head_to_tokenizer: HashMap<usize, Tokenizer> = HashMap::new(); @@ -83,7 +93,7 @@ pub fn subtokenize(mut events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) // Index into `events` pointing to a chunk. let mut index_opt: Option<usize> = Some(index); // Subtokenizer. - let mut tokenizer = Tokenizer::new(event.point.clone(), event.index); + let mut tokenizer = Tokenizer::new(event.point.clone(), event.index, parse_state); // Substate. let mut result: StateFnResult = ( State::Fn(Box::new(if event.token_type == TokenType::ChunkString { @@ -115,7 +125,11 @@ pub fn subtokenize(mut events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) _ => unreachable!("cannot be ok/nok"), }; - result = tokenizer.feed(span::codes(codes, &span), func, enter.next == None); + result = tokenizer.push( + span::codes(&parse_state.codes, &span), + func, + enter.next == None, + ); assert!(result.1.is_none(), "expected no remainder"); index_opt = enter.next; } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7b71308..a692a4d 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -15,6 +15,7 @@ use std::collections::HashMap; use crate::constant::TAB_SIZE; +use crate::parser::ParseState; /// Semantic label of a span. // To do: figure out how to share this so extensions can add their own stuff, @@ -1073,6 +1074,32 @@ pub enum TokenType { /// ^^^ /// ``` HtmlTextData, + /// To do, + LabelImage, + /// To do, + LabelImageMarker, + /// To do, + LabelLink, + /// To do, + LabelMarker, + LabelEnd, + Resource, + ResourceMarker, + ResourceDestination, + ResourceDestinationLiteral, + ResourceDestinationLiteralMarker, + ResourceDestinationRaw, + ResourceDestinationString, + ResourceTitle, + ResourceTitleMarker, + ResourceTitleString, + Reference, + ReferenceMarker, + ReferenceString, + Link, + Image, + Label, + LabelText, /// Line ending. /// /// ## Info @@ -1243,6 +1270,9 @@ pub type StateFn = dyn FnOnce(&mut Tokenizer, Code) -> StateFnResult; /// In certain cases, it can also yield back up parsed codes that were passed down. pub type StateFnResult = (State, Option<Vec<Code>>); +/// To do. +pub type Resolver = dyn FnOnce(&mut Tokenizer) -> Vec<Event>; + /// The result of a state. pub enum State { /// There is a future state: a boxed [`StateFn`][] to pass the next code to. @@ -1253,6 +1283,30 @@ pub enum State { Nok, } +/// To do. +#[derive(Debug)] +pub struct LabelStart { + /// To do. + pub start: (usize, usize), + /// A boolean used internally to figure out if a label start link can’t be + /// used (because links in links are incorrect). + pub inactive: bool, + /// A boolean used internally to figure out if a label is balanced: they’re + /// not media, it’s just balanced braces. + pub balanced: bool, +} + +/// To do. +#[derive(Debug)] +pub struct Media { + /// To do. + pub start: (usize, usize), + /// To do. + pub end: (usize, usize), + /// To do. + pub id: String, +} + /// The internal state of a tokenizer, not to be confused with states from the /// state machine, this instead is all the information about where we currently /// are and what’s going on. @@ -1272,9 +1326,10 @@ struct InternalState { point: Point, } +// #[derive(Debug)] + /// A tokenizer itself. -#[derive(Debug)] -pub struct Tokenizer { +pub struct Tokenizer<'a> { column_start: HashMap<usize, usize>, /// Track whether a character is expected to be consumed, and whether it’s /// actually consumed @@ -1295,11 +1350,22 @@ pub struct Tokenizer { index: usize, /// Current relative and absolute place in the file. point: Point, + /// To do. + pub parse_state: &'a ParseState, + /// To do. + pub label_start_stack: Vec<LabelStart>, + /// To do. + pub label_start_list_loose: Vec<LabelStart>, + /// To do. + pub media_list: Vec<Media>, + /// To do. + resolvers: Vec<Box<Resolver>>, + resolver_ids: Vec<String>, } -impl Tokenizer { +impl<'a> Tokenizer<'a> { /// Create a new tokenizer. - pub fn new(point: Point, index: usize) -> Tokenizer { + pub fn new(point: Point, index: usize, parse_state: &'a ParseState) -> Tokenizer { Tokenizer { previous: Code::None, current: Code::None, @@ -1309,6 +1375,20 @@ impl Tokenizer { point, stack: vec![], events: vec![], + parse_state, + label_start_stack: vec![], + label_start_list_loose: vec![], + media_list: vec![], + resolvers: vec![], + resolver_ids: vec![], + } + } + + /// To do. + pub fn register_resolver(&mut self, id: String, resolver: Box<Resolver>) { + if !self.resolver_ids.contains(&id) { + self.resolver_ids.push(id); + self.resolvers.push(resolver); } } @@ -1582,7 +1662,8 @@ impl Tokenizer { /// This is set up to support repeatedly calling `feed`, and thus streaming /// markdown into the state machine, and normally pauses after feeding. /// When `done: true` is passed, the EOF is fed. - pub fn feed( + // To do: call this `feed_impl`, and rename `push` to `feed`? + fn feed( &mut self, codes: &[Code], start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, @@ -1643,6 +1724,26 @@ impl Tokenizer { check_statefn_result((state, None)) } + + /// To do. + // To do: set a `drained` to prevent passing after draining? + pub fn push( + &mut self, + codes: &[Code], + start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + drain: bool, + ) -> StateFnResult { + let result = self.feed(codes, start, drain); + + if drain { + while !self.resolvers.is_empty() { + let resolver = self.resolvers.remove(0); + self.events = resolver(self); + } + } + + result + } } /// Internal utility to wrap states to also capture codes. diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index d66978e..55b15e4 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -115,7 +115,7 @@ fn normalize_uri(value: &str) -> String { result.push( buff[0..char.len_utf8()] .iter() - .map(|&byte| format!("%{:X}", byte)) + .map(|&byte| format!("%{:>02X}", byte)) .collect::<String>(), ); |