From a3dd207e3b1ebcbcb6cec0f703a695e51ae4ece0 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 24 Jun 2022 17:57:10 +0200 Subject: Add link, images (resource) This is still some messy code that needs cleaning up, but it adds support for links and images, of the resource kind (`[a](b)`). References (`[a][b]`) are parsed and will soon be supported, but need matching. * Fix bug to pad percent-encoded bytes when normalizing urls * Fix bug with escapes counting as balancing in destination * Add `space_or_tab_one_line_ending`, to parse whitespace including up to one line ending (but not a blank line) * Add `ParserState` to share codes, definitions, etc --- readme.md | 33 +- src/compiler.rs | 213 ++++++++-- src/constant.rs | 7 +- src/construct/definition.rs | 136 ++----- src/construct/label_end.rs | 712 ++++++++++++++++++++++++++++++++++ src/construct/label_start_image.rs | 47 +++ src/construct/label_start_link.rs | 30 ++ src/construct/mod.rs | 11 +- src/construct/partial_destination.rs | 3 +- src/construct/partial_space_or_tab.rs | 39 ++ src/construct/partial_title.rs | 14 +- src/content/flow.rs | 20 +- src/content/text.rs | 17 +- src/parser.rs | 20 +- src/subtokenize.rs | 38 +- src/tokenizer.rs | 111 +++++- src/util/sanitize_uri.rs | 2 +- tests/character_escape.rs | 11 +- tests/character_reference.rs | 24 +- tests/image.rs | 229 +++++++++++ tests/link_resource.rs | 464 ++++++++++++++++++++++ tests/misc_dangerous_protocol.rs | 324 ++++++++-------- tests/misc_tabs.rs | 66 ++-- tests/misc_url.rs | 107 +++-- 24 files changed, 2216 insertions(+), 462 deletions(-) create mode 100644 src/construct/label_end.rs create mode 100644 src/construct/label_start_image.rs create mode 100644 src/construct/label_start_link.rs create mode 100644 tests/image.rs create mode 100644 tests/link_resource.rs diff --git a/readme.md b/readme.md index e5bc638..6dd8cc5 100644 --- a/readme.md +++ b/readme.md @@ -82,9 +82,9 @@ cargo doc --document-private-items - [x] heading (setext) - [x] html 
(flow) - [x] html (text) -- [ ] (3) label end -- [ ] (3) label start (image) -- [ ] (3) label start (link) +- [x] label end +- [x] label start (image) +- [x] label start (link) - [ ] (8) list - [x] paragraph - [x] thematic break @@ -113,9 +113,9 @@ cargo doc --document-private-items - [x] hard break (escape) - [x] hard break (trailing) - [x] html (text) - - [ ] label end - - [ ] label start (image) - - [ ] label start (link) + - [x] label end + - [x] label start (image) + - [x] label start (link) - [x] string - [x] character escape - [x] character reference @@ -124,10 +124,28 @@ cargo doc --document-private-items #### Docs +- [ ] (1) Media in compiler (`Media`, `encode_opt`) +- [ ] (1) `LINK_RESOURCE_DESTINATION_BALANCE_MAX` in constants +- [ ] (1) `label_start_image`, `label_start_link` +- [ ] (1) `label_end` +- [ ] (1) `space_or_tab_one_line_ending` +- [ ] (1) `ParseState` +- [ ] (1) Image, Link, and other media token types; `LabelStart`, `Media` +- [ ] (1) Resolvers, push, feed, etc. - [ ] (1) Go through all bnf - [ ] (1) Go through all docs - [ ] (1) Add overview docs on how everything works +#### Refactor + +- [ ] (1) Move map handling from `resolve_media`, reuse in `subtokenize` +- [ ] (1) Clean shifting, assertions in the above helper +- [ ] (1) Clean `space_or_tab_one_line_ending` +- [ ] (1) Use `link_to` (and `space_or_tab_one_line_ending`) in more places? 
+ It’s probably better +- [ ] (1) Force chunks in `link_to`, disallowing `LineEnding` and such +- [ ] (1) Clean feeding, resolving + #### Parse - [ ] (1) Parse initial and final space_or_tab of paragraphs (in text)\ @@ -136,8 +154,7 @@ cargo doc --document-private-items `misc_tabs`, `thematic_break`) - [ ] (3) Interrupting (html flow complete) - [ ] (5) labels\ - test (`character_escape`, `character_reference`, `definition`, - `misc_dangerous_protocol`, `misc_tabs`, `misc_url`, `thematic_break`)\ + test (`character_escape`, `character_reference`, `definition`)\ link link reference (definition)\ link label end (destination, label, title)\ link label start (label) diff --git a/src/compiler.rs b/src/compiler.rs index cfe749a..11dea29 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -1,5 +1,5 @@ //! Turn events into a string of HTML. -use crate::constant::SAFE_PROTOCOL_HREF; +use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}; use crate::construct::character_reference::Kind as CharacterReferenceKind; use crate::tokenizer::{Code, Event, EventType, TokenType}; use crate::util::{ @@ -17,6 +17,23 @@ pub enum LineEnding { LineFeed, } +/// To do. +#[derive(Debug)] +struct Media { + /// To do. + image: bool, + /// To do. + label_id: String, + /// To do. + label: String, + /// To do. + // reference_id: String, + /// To do. + destination: Option, + /// To do. + title: Option, +} + impl LineEnding { /// Turn the line ending into a [str]. 
fn as_str(&self) -> &str { @@ -168,7 +185,13 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { } else { Some(SAFE_PROTOCOL_HREF.to_vec()) }; + let protocol_src = if options.allow_dangerous_protocol { + None + } else { + Some(SAFE_PROTOCOL_SRC.to_vec()) + }; let mut line_ending_inferred: Option = None; + let mut media_stack: Vec = vec![]; // let mut slurp_all_line_endings = false; while index < events.len() { @@ -257,7 +280,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { | TokenType::CodeFencedFenceMeta | TokenType::Definition | TokenType::HeadingAtxText - | TokenType::HeadingSetextText => { + | TokenType::HeadingSetextText + | TokenType::Label + | TokenType::ResourceTitleString => { buffer(buffers); } TokenType::CodeIndented => { @@ -287,6 +312,56 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { ignore_encode = true; } } + TokenType::Image => { + media_stack.push(Media { + image: true, + label_id: "".to_string(), + label: "".to_string(), + // reference_id: "".to_string(), + destination: None, + title: None, + }); + // tags = undefined // Disallow tags. + } + TokenType::Link => { + media_stack.push(Media { + image: false, + label_id: "".to_string(), + label: "".to_string(), + // reference_id: "".to_string(), + destination: None, + title: None, + }); + } + TokenType::Resource => { + buffer(buffers); // We can have line endings in the resource, ignore them. + let media = media_stack.last_mut().unwrap(); + media.destination = Some("".to_string()); + } + TokenType::ResourceDestinationString => { + buffer(buffers); + // Ignore encoding the result, as we’ll first percent encode the url and + // encode manually after. 
+ ignore_encode = true; + } + TokenType::LabelImage + | TokenType::LabelImageMarker + | TokenType::LabelLink + | TokenType::LabelMarker + | TokenType::LabelEnd + | TokenType::ResourceMarker + | TokenType::ResourceDestination + | TokenType::ResourceDestinationLiteral + | TokenType::ResourceDestinationLiteralMarker + | TokenType::ResourceDestinationRaw + | TokenType::ResourceTitle + | TokenType::ResourceTitleMarker + | TokenType::Reference + | TokenType::ReferenceMarker + | TokenType::ReferenceString + | TokenType::LabelText => { + println!("ignore labels for now"); + } TokenType::Paragraph => { buf_tail_mut(buffers).push("

".to_string()); } @@ -324,14 +399,88 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { | TokenType::SpaceOrTab => { // Ignore. } + TokenType::LabelImage + | TokenType::LabelImageMarker + | TokenType::LabelLink + | TokenType::LabelMarker + | TokenType::LabelEnd + | TokenType::ResourceMarker + | TokenType::ResourceDestination + | TokenType::ResourceDestinationLiteral + | TokenType::ResourceDestinationLiteralMarker + | TokenType::ResourceDestinationRaw + | TokenType::ResourceTitle + | TokenType::ResourceTitleMarker + | TokenType::Reference + | TokenType::ReferenceMarker + | TokenType::ReferenceString => { + println!("ignore labels for now"); + } + TokenType::Label => { + let media = media_stack.last_mut().unwrap(); + media.label = resume(buffers); + } + TokenType::LabelText => { + let media = media_stack.last_mut().unwrap(); + media.label_id = serialize(codes, &from_exit_event(events, index), false); + } + TokenType::ResourceDestinationString => { + let media = media_stack.last_mut().unwrap(); + media.destination = Some(resume(buffers)); + ignore_encode = false; + } + TokenType::ResourceTitleString => { + let media = media_stack.last_mut().unwrap(); + media.title = Some(resume(buffers)); + } + TokenType::Image | TokenType::Link => { + // let mut is_in_image = false; + // let mut index = 0; + // Skip current. + // while index < (media_stack.len() - 1) { + // if media_stack[index].image { + // is_in_image = true; + // break; + // } + // index += 1; + // } + + // tags = is_in_image; + + let media = media_stack.pop().unwrap(); + println!("media: {:?}", media); + let buf = buf_tail_mut(buffers); + // To do: get from definition. 
+ let destination = media.destination.unwrap(); + let title = if let Some(title) = media.title { + format!(" title=\"{}\"", title) + } else { + "".to_string() + }; + + if media.image { + buf.push(format!( + "\"{}\"{}", + sanitize_uri(&destination, &protocol_src), + media.label, + title + )); + } else { + buf.push(format!( + "{}", + sanitize_uri(&destination, &protocol_href), + title, + media.label + )); + } + } // Just output it. TokenType::CodeTextData | TokenType::Data | TokenType::CharacterEscapeValue => { // last_was_tag = false; - buf_tail_mut(buffers).push(encode(&serialize( - codes, - &from_exit_event(events, index), - false, - ))); + buf_tail_mut(buffers).push(encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + )); } TokenType::AutolinkEmail => { let slice = serialize(codes, &from_exit_event(events, index), false); @@ -340,7 +489,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { "", sanitize_uri(slice.as_str(), &protocol_href) )); - buf.push(encode(&slice)); + buf.push(encode_opt(&slice, ignore_encode)); buf.push("".to_string()); } TokenType::AutolinkProtocol => { @@ -350,7 +499,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { "", sanitize_uri(slice.as_str(), &protocol_href) )); - buf.push(encode(&slice)); + buf.push(encode_opt(&slice, ignore_encode)); buf.push("".to_string()); } TokenType::CharacterReferenceMarker => { @@ -377,7 +526,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { CharacterReferenceKind::Named => decode_named(ref_string), }; - buf_tail_mut(buffers).push(encode(&value)); + buf_tail_mut(buffers).push(encode_opt(&value, ignore_encode)); character_reference_kind = None; } TokenType::CodeFenced | TokenType::CodeIndented => { @@ -432,16 +581,15 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value)); // 
tag = true; } - TokenType::CodeFencedFenceMeta => { + TokenType::CodeFencedFenceMeta | TokenType::Resource => { resume(buffers); } TokenType::CodeFlowChunk => { code_flow_seen_data = Some(true); - buf_tail_mut(buffers).push(encode(&serialize( - codes, - &from_exit_event(events, index), - false, - ))); + buf_tail_mut(buffers).push(encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + )); } TokenType::CodeText => { let result = resume(buffers); @@ -492,11 +640,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { if let Some(buf) = atx_heading_buffer { atx_heading_buffer = Some( buf.to_string() - + &encode(&serialize( - codes, - &from_exit_event(events, index), - false, - )), + + &encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + ), ); } @@ -512,14 +659,14 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { if let Some(ref buf) = atx_heading_buffer { if !buf.is_empty() { - buf_tail_mut(buffers).push(encode(buf)); + buf_tail_mut(buffers).push(encode_opt(buf, ignore_encode)); atx_heading_buffer = Some("".to_string()); } } else { atx_heading_buffer = Some("".to_string()); } - buf_tail_mut(buffers).push(encode(&result)); + buf_tail_mut(buffers).push(encode_opt(&result, ignore_encode)); } TokenType::HeadingSetextText => { heading_setext_buffer = Some(resume(buffers)); @@ -540,7 +687,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { TokenType::HtmlFlowData | TokenType::HtmlTextData => { let slice = serialize(codes, &from_exit_event(events, index), false); // last_was_tag = false; - buf_tail_mut(buffers).push(if ignore_encode { slice } else { encode(&slice) }); + buf_tail_mut(buffers).push(encode_opt(&slice, ignore_encode)); } TokenType::LineEnding => { // if slurp_all_line_endings { @@ -549,11 +696,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { if 
slurp_one_line_ending { slurp_one_line_ending = false; } else { - buf_tail_mut(buffers).push(encode(&serialize( - codes, - &from_exit_event(events, index), - false, - ))); + buf_tail_mut(buffers).push(encode_opt( + &serialize(codes, &from_exit_event(events, index), false), + ignore_encode, + )); } } TokenType::Paragraph => { @@ -605,6 +751,15 @@ fn buf_tail(buffers: &mut [Vec]) -> &Vec { buffers.last().expect("at least one buffer should exist") } +/// To do. +fn encode_opt(value: &str, ignore_encode: bool) -> String { + if ignore_encode { + value.to_string() + } else { + encode(value) + } +} + /// Add a line ending. fn line_ending(buffers: &mut [Vec], default: &LineEnding) { let tail = buf_tail_mut(buffers); diff --git a/src/constant.rs b/src/constant.rs index 8e1acf3..5cb7826 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -193,6 +193,11 @@ pub const HTML_RAW_SIZE_MAX: usize = 8; /// To safeguard performance, labels are capped at a large number: `999`. pub const LINK_REFERENCE_SIZE_MAX: usize = 999; +/// To do. +/// See: , +/// . +pub const LINK_RESOURCE_DESTINATION_BALANCE_MAX: usize = 32; + /// List of protocols allowed, when operating safely, as `href` on `a`. /// /// This list is based on what is allowed by GitHub. @@ -201,8 +206,6 @@ pub const SAFE_PROTOCOL_HREF: [&str; 6] = ["http", "https", "irc", "ircs", "mail /// List of protocols allowed, when operating safely, as `src` on `img`. /// /// This list is based on what is allowed by GitHub. -// To do: image. -#[allow(dead_code)] pub const SAFE_PROTOCOL_SRC: [&str; 2] = ["http", "https"]; /// The number of characters that form a tab stop. 
diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 92d275c..674bd65 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -115,7 +115,7 @@ use crate::construct::{ partial_destination::{start as destination, Options as DestinationOptions}, partial_label::{start as label, Options as LabelOptions}, - partial_space_or_tab::space_or_tab, + partial_space_or_tab::{space_or_tab, space_or_tab_one_line_ending}, partial_title::{start as title, Options as TitleOptions}, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; @@ -168,7 +168,7 @@ fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::DefinitionMarker); ( State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), marker_after), + tokenizer.go(space_or_tab_one_line_ending(), destination_before), )), None, ) @@ -177,31 +177,6 @@ fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// After the marker, after whitespace. -/// -/// ```markdown -/// [a]: |b "c" -/// -/// [a]: |␊ -/// b "c" -/// ``` -fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), destination_before), - )), - None, - ) - } - _ => destination_before(tokenizer, code), - } -} - /// Before a destination. /// /// ```markdown @@ -211,35 +186,23 @@ fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |b "c" /// ``` fn destination_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let event = tokenizer.events.last().unwrap(); - - // Whitespace. - if (event.token_type == TokenType::LineEnding || event.token_type == TokenType::SpaceOrTab) - // Blank line not ok. 
- && !matches!( - code, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') - ) { - tokenizer.go( - |t, c| { - destination( - t, - c, - DestinationOptions { - limit: usize::MAX, - destination: TokenType::DefinitionDestination, - literal: TokenType::DefinitionDestinationLiteral, - marker: TokenType::DefinitionDestinationLiteralMarker, - raw: TokenType::DefinitionDestinationRaw, - string: TokenType::DefinitionDestinationString, - }, - ) - }, - destination_after, - )(tokenizer, code) - } else { - (State::Nok, None) - } + tokenizer.go( + |t, c| { + destination( + t, + c, + DestinationOptions { + limit: usize::MAX, + destination: TokenType::DefinitionDestination, + literal: TokenType::DefinitionDestinationLiteral, + marker: TokenType::DefinitionDestinationLiteralMarker, + raw: TokenType::DefinitionDestinationRaw, + string: TokenType::DefinitionDestinationString, + }, + ) + }, + destination_after, + )(tokenizer, code) } /// After a destination. @@ -289,32 +252,7 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// "c" /// ``` fn title_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_opt(space_or_tab(), title_before_after_optional_whitespace)(tokenizer, code) -} - -/// Before a title, after optional whitespace. -/// -/// ```markdown -/// [a]: b |"c" -/// -/// [a]: b |␊ -/// "c" -/// ``` -fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new( - tokenizer.attempt_opt(space_or_tab(), title_before_marker), - )), - None, - ) - } - _ => title_before_marker(tokenizer, code), - } + tokenizer.go(space_or_tab_one_line_ending(), title_before_marker)(tokenizer, code) } /// Before a title, after a line ending. 
@@ -324,26 +262,20 @@ fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) /// | "c" /// ``` fn title_before_marker(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let event = tokenizer.events.last().unwrap(); - - if event.token_type == TokenType::LineEnding || event.token_type == TokenType::SpaceOrTab { - tokenizer.go( - |t, c| { - title( - t, - c, - TitleOptions { - title: TokenType::DefinitionTitle, - marker: TokenType::DefinitionTitleMarker, - string: TokenType::DefinitionTitleString, - }, - ) - }, - title_after, - )(tokenizer, code) - } else { - (State::Nok, None) - } + tokenizer.go( + |t, c| { + title( + t, + c, + TitleOptions { + title: TokenType::DefinitionTitle, + marker: TokenType::DefinitionTitleMarker, + string: TokenType::DefinitionTitleString, + }, + ) + }, + title_after, + )(tokenizer, code) } /// After a title. diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs new file mode 100644 index 0000000..405858d --- /dev/null +++ b/src/construct/label_end.rs @@ -0,0 +1,712 @@ +//! To do + +use crate::constant::LINK_RESOURCE_DESTINATION_BALANCE_MAX; +use crate::construct::{ + partial_destination::{start as destination, Options as DestinationOptions}, + partial_label::{start as label, Options as LabelOptions}, + partial_space_or_tab::space_or_tab_one_line_ending, + partial_title::{start as title, Options as TitleOptions}, +}; +use crate::tokenizer::{ + Code, Event, EventType, LabelStart, Media, State, StateFnResult, TokenType, Tokenizer, +}; +use crate::util::{ + normalize_identifier::normalize_identifier, + span::{serialize, Span}, +}; +/// To do: could we do without `HashMap`, so we don’t need `std`? +use std::collections::HashMap; + +#[derive(Debug)] +struct Info { + /// To do. + label_start_index: usize, + /// To do. 
+ media: Media, +} + +#[allow(clippy::too_many_lines)] +pub fn resolve_media(tokenizer: &mut Tokenizer) -> Vec { + let mut left: Vec = tokenizer.label_start_list_loose.drain(..).collect(); + let mut left_2: Vec = tokenizer.label_start_stack.drain(..).collect(); + let media: Vec = tokenizer.media_list.drain(..).collect(); + left.append(&mut left_2); + + let mut map: HashMap)> = HashMap::new(); + let events = &tokenizer.events; + + let mut index = 0; + while index < left.len() { + let label_start = &left[index]; + let data_enter_index = label_start.start.0; + let data_exit_index = label_start.start.1; + + map.insert( + data_enter_index, + ( + data_exit_index - data_enter_index, + vec![ + Event { + event_type: EventType::Enter, + token_type: TokenType::Data, + point: events[data_enter_index].point.clone(), + index: events[data_enter_index].index, + previous: None, + next: None, + }, + Event { + event_type: EventType::Exit, + token_type: TokenType::Data, + point: events[data_exit_index].point.clone(), + index: events[data_exit_index].index, + previous: None, + next: None, + }, + ], + ), + ); + + index += 1; + } + + let mut index = 0; + while index < media.len() { + let media = &media[index]; + // LabelLink:Enter or LabelImage:Enter. + let group_enter_index = media.start.0; + let group_enter_event = &events[group_enter_index]; + // LabelLink:Exit or LabelImage:Exit. + let text_enter_index = media.start.0 + + (if group_enter_event.token_type == TokenType::LabelLink { + 4 + } else { + 6 + }); + // LabelEnd:Enter. + let text_exit_index = media.end.0; + // LabelEnd:Exit. + let label_exit_index = media.end.0 + 3; + // Resource:Exit, etc. + let group_end_index = media.end.1; + + // Insert a group enter and label enter. 
+ add( + &mut map, + group_enter_index, + 0, + vec![ + Event { + event_type: EventType::Enter, + token_type: if group_enter_event.token_type == TokenType::LabelLink { + TokenType::Link + } else { + TokenType::Image + }, + point: group_enter_event.point.clone(), + index: group_enter_event.index, + previous: None, + next: None, + }, + Event { + event_type: EventType::Enter, + token_type: TokenType::Label, + point: group_enter_event.point.clone(), + index: group_enter_event.index, + previous: None, + next: None, + }, + ], + ); + + // Empty events not allowed. + if text_enter_index != text_exit_index { + // Insert a text enter. + add( + &mut map, + text_enter_index, + 0, + vec![Event { + event_type: EventType::Enter, + token_type: TokenType::LabelText, + point: events[text_enter_index].point.clone(), + index: events[text_enter_index].index, + previous: None, + next: None, + }], + ); + + // Insert a text exit. + add( + &mut map, + text_exit_index, + 0, + vec![Event { + event_type: EventType::Exit, + token_type: TokenType::LabelText, + point: events[text_exit_index].point.clone(), + index: events[text_exit_index].index, + previous: None, + next: None, + }], + ); + } + + // Insert a label exit. + add( + &mut map, + label_exit_index + 1, + 0, + vec![Event { + event_type: EventType::Exit, + token_type: TokenType::Label, + point: events[label_exit_index].point.clone(), + index: events[label_exit_index].index, + previous: None, + next: None, + }], + ); + + // Insert a group exit. 
+ add( + &mut map, + group_end_index + 1, + 0, + vec![Event { + event_type: EventType::Exit, + token_type: TokenType::Link, + point: events[group_end_index].point.clone(), + index: events[group_end_index].index, + previous: None, + next: None, + }], + ); + + index += 1; + } + + let mut indices: Vec<&usize> = map.keys().collect(); + indices.sort_unstable(); + let mut next_events: Vec = vec![]; + let mut index_into_indices = 0; + let mut start = 0; + let events = &mut tokenizer.events; + let mut shift: i32 = 0; + + while index_into_indices < indices.len() { + let index = *indices[index_into_indices]; + + if start < index { + let append = &mut events[start..index].to_vec(); + let mut index = 0; + + while index < append.len() { + let ev = &mut append[index]; + + if let Some(x) = ev.previous { + let next = (x as i32 + shift) as usize; + ev.previous = Some(next); + println!("todo: y: previous {:?} {:?} {:?}", x, shift, start); + } + + if let Some(x) = ev.next { + let next = (x as i32 + shift) as usize; + ev.next = Some(next); + println!("todo: y: next {:?} {:?} {:?}", x, shift, start); + } + + index += 1; + } + + next_events.append(append); + } + + let (remove, add) = map.get(&index).unwrap(); + shift += (add.len() as i32) - (*remove as i32); + + if !add.is_empty() { + let append = &mut add.clone(); + let mut index = 0; + + while index < append.len() { + let ev = &mut append[index]; + + if let Some(x) = ev.previous { + println!("todo: x: previous {:?} {:?} {:?}", x, shift, start); + } + + if let Some(x) = ev.next { + println!("todo: x: next {:?} {:?} {:?}", x, shift, start); + } + + index += 1; + } + + next_events.append(append); + } + + start = index + remove; + index_into_indices += 1; + } + + if start < events.len() { + next_events.append(&mut events[start..].to_vec()); + } + + next_events +} + +/// Start of label end. 
+/// +/// ```markdown +/// [a|](b) c +/// [a|][b] c +/// [a|][] b +/// [a|] b +/// +/// [a]: z +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if Code::Char(']') == code { + let mut label_start_index: Option = None; + let mut index = tokenizer.label_start_stack.len(); + + while index > 0 { + index -= 1; + + if !tokenizer.label_start_stack[index].balanced { + label_start_index = Some(index); + break; + } + } + + // If there is an okay opening: + if let Some(label_start_index) = label_start_index { + let label_start = tokenizer + .label_start_stack + .get_mut(label_start_index) + .unwrap(); + + // Mark as balanced if the info is inactive. + if label_start.inactive { + return nok(tokenizer, code, label_start_index); + } + + let label_end_start = tokenizer.events.len(); + let info = Info { + label_start_index, + media: Media { + start: label_start.start, + end: (label_end_start, label_end_start + 3), + id: normalize_identifier(&serialize( + &tokenizer.parse_state.codes, + &Span { + start_index: tokenizer.events[label_start.start.1].index, + end_index: tokenizer.events[label_end_start - 1].index, + }, + false, + )), + }, + }; + + tokenizer.enter(TokenType::LabelEnd); + tokenizer.enter(TokenType::LabelMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelMarker); + tokenizer.exit(TokenType::LabelEnd); + + return (State::Fn(Box::new(move |t, c| after(t, c, info))), None); + } + } + + (State::Nok, None) +} + +/// After `]`. +/// +/// ```markdown +/// [a]|(b) c +/// [a]|[b] c +/// [a]|[] b +/// [a]| b +/// +/// [a]: z +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + // let label_start = tokenizer + // .label_start_stack + // .get_mut(info.label_start_index) + // .unwrap(); + // To do: figure out if defined or not. + let defined = false; + println!("to do: is `{:?}` defined?", info); + match code { + // Resource (`[asd](fgh)`)? 
+ Code::Char('(') => tokenizer.attempt(resource, move |is_ok| { + Box::new(move |t, c| { + // Also fine if `defined`, as then it’s a valid shortcut. + if is_ok || defined { + ok(t, c, info) + } else { + nok(t, c, info.label_start_index) + } + }) + })(tokenizer, code), + // Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference? + Code::Char('[') => tokenizer.attempt(full_reference, move |is_ok| { + Box::new(move |t, c| { + if is_ok { + ok(t, c, info) + } else if defined { + reference_not_full(t, c, info) + } else { + nok(t, c, info.label_start_index) + } + }) + })(tokenizer, code), + // Shortcut reference: `[asd]`? + _ => { + if defined { + ok(tokenizer, code, info) + } else { + nok(tokenizer, code, info.label_start_index) + } + } + } +} + +/// After `]`, at `[`, but not at a full reference. +/// +/// > 👉 **Note**: we only get here if the label is defined. +/// +/// ```markdown +/// [a]|[] b +/// +/// [a]: z +/// ``` +fn reference_not_full(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + tokenizer.attempt(collapsed_reference, move |is_ok| { + Box::new(move |t, c| { + if is_ok { + ok(t, c, info) + } else { + nok(t, c, info.label_start_index) + } + }) + })(tokenizer, code) +} + +/// Done, we found something. +/// +/// ```markdown +/// [a](b)| c +/// [a][b]| c +/// [a][]| b +/// [a]| b +/// +/// [a]: z +/// ``` +fn ok(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { + println!( + "ok res, ref full, ref, collapsed, or ref shortcut: {:?}", + info.media + ); + // Remove this one and everything after it. + let mut left: Vec = tokenizer + .label_start_stack + .drain(info.label_start_index..) + .collect(); + // Remove this one from `left`, as we’ll move it to `media_list`. 
+ left.remove(0); + tokenizer.label_start_list_loose.append(&mut left); + + let is_link = tokenizer.events[info.media.start.0].token_type == TokenType::LabelLink; + + if is_link { + let mut index = 0; + while index < tokenizer.label_start_stack.len() { + let label_start = &mut tokenizer.label_start_stack[index]; + if tokenizer.events[label_start.start.0].token_type == TokenType::LabelLink { + label_start.inactive = true; + } + index += 1; + } + } + + info.media.end.1 = tokenizer.events.len() - 1; + tokenizer.media_list.push(info.media); + tokenizer.register_resolver("media".to_string(), Box::new(resolve_media)); + (State::Ok, Some(vec![code])) +} + +/// Done, it’s nothing. +/// +/// There was an okay opening, but we didn’t match anything. +/// +/// ```markdown +/// [a]|(b c +/// [a]|[b c +/// [b]|[ c +/// [b]| c +/// +/// [a]: z +/// ``` +fn nok(tokenizer: &mut Tokenizer, _code: Code, label_start_index: usize) -> StateFnResult { + let label_start = tokenizer + .label_start_stack + .get_mut(label_start_index) + .unwrap(); + println!("just balanced braces: {:?}", label_start); + label_start.balanced = true; + // To do: pop things off the list? + (State::Nok, None) +} + +/// Before a resource, at `(`. +/// +/// ```markdown +/// [a]|(b) c +/// ``` +fn resource(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('(') => { + tokenizer.enter(TokenType::Resource); + tokenizer.enter(TokenType::ResourceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ResourceMarker); + (State::Fn(Box::new(resource_start)), None) + } + _ => unreachable!("expected `(`"), + } +} + +/// At the start of a resource, after `(`, before a definition. +/// +/// ```markdown +/// [a](|b) c +/// ``` +fn resource_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt_opt(space_or_tab_one_line_ending(), resource_open)(tokenizer, code) +} + +/// At the start of a resource, after optional whitespace. 
+/// +/// ```markdown +/// [a](|b) c +/// ``` +fn resource_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(')') => resource_end(tokenizer, code), + _ => tokenizer.go( + |t, c| { + destination( + t, + c, + DestinationOptions { + limit: LINK_RESOURCE_DESTINATION_BALANCE_MAX, + destination: TokenType::ResourceDestination, + literal: TokenType::ResourceDestinationLiteral, + marker: TokenType::ResourceDestinationLiteralMarker, + raw: TokenType::ResourceDestinationRaw, + string: TokenType::ResourceDestinationString, + }, + ) + }, + destination_after, + )(tokenizer, code), + } +} + +/// In a resource, after a destination, before optional whitespace. +/// +/// ```markdown +/// [a](b|) c +/// [a](b| "c") d +/// ``` +fn destination_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt(space_or_tab_one_line_ending(), |ok| { + Box::new(if ok { resource_between } else { resource_end }) + })(tokenizer, code) +} + +/// In a resource, after a destination, after whitespace. +/// +/// ```markdown +/// [a](b |) c +/// [a](b |"c") d +/// ``` +fn resource_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('"' | '\'' | '(') => tokenizer.go( + |t, c| { + title( + t, + c, + TitleOptions { + title: TokenType::ResourceTitle, + marker: TokenType::ResourceTitleMarker, + string: TokenType::ResourceTitleString, + }, + ) + }, + title_after, + )(tokenizer, code), + _ => resource_end(tokenizer, code), + } +} + +/// In a resource, after a title. +/// +/// ```markdown +/// [a](b "c"|) d +/// ``` +fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt_opt(space_or_tab_one_line_ending(), resource_end)(tokenizer, code) +} + +/// In a resource, at the `)`. 
+/// +/// ```markdown +/// [a](b|) c +/// [a](b |) c +/// [a](b "c"|) d +/// ``` +fn resource_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(')') => { + tokenizer.enter(TokenType::ResourceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ResourceMarker); + tokenizer.exit(TokenType::Resource); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} + +/// In a reference (full), at the `[`. +/// +/// ```markdown +/// [a]|[b] +/// ``` +fn full_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => tokenizer.go( + |t, c| { + label( + t, + c, + LabelOptions { + label: TokenType::Reference, + marker: TokenType::ReferenceMarker, + string: TokenType::ReferenceString, + }, + ) + }, + full_reference_after, + )(tokenizer, code), + _ => unreachable!("expected `[`"), + } +} + +/// In a reference (full), after `]`. +/// +/// ```markdown +/// [a][b]| +/// ``` +fn full_reference_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let events = &tokenizer.events; + let mut index = events.len() - 1; + let mut start: Option = None; + let mut end: Option = None; + + while index > 0 { + index -= 1; + let event = &events[index]; + if event.token_type == TokenType::ReferenceString { + if event.event_type == EventType::Exit { + end = Some(event.index); + } else { + start = Some(event.index); + break; + } + } + } + + // Always found, otherwise we don’t get here. + let start = start.unwrap(); + let end = end.unwrap(); + + let id = normalize_identifier(&serialize( + &tokenizer.parse_state.codes, + &Span { + start_index: start, + end_index: end, + }, + false, + )); + println!("to do: is `{:?}` defined?", id); + let defined = false; + + if defined { + (State::Ok, Some(vec![code])) + } else { + (State::Nok, None) + } +} + +/// In a reference (collapsed), at the `[`. +/// +/// > 👉 **Note**: we only get here if the label is defined. 
+/// +/// ```markdown +/// [a]|[] +/// ``` +fn collapsed_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => { + tokenizer.enter(TokenType::Reference); + tokenizer.enter(TokenType::ReferenceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ReferenceMarker); + (State::Fn(Box::new(collapsed_reference_open)), None) + } + _ => (State::Nok, None), + } +} + +/// In a reference (collapsed), at the `]`. +/// +/// > 👉 **Note**: we only get here if the label is defined. +/// +/// ```markdown +/// [a][|] +/// ``` +fn collapsed_reference_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(']') => { + tokenizer.enter(TokenType::ReferenceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::ReferenceMarker); + tokenizer.exit(TokenType::Reference); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} + +pub fn add( + map: &mut HashMap)>, + index: usize, + mut remove: usize, + mut add: Vec, +) { + let curr = map.remove(&index); + + if let Some((curr_rm, mut curr_add)) = curr { + remove += curr_rm; + curr_add.append(&mut add); + add = curr_add; + } + + map.insert(index, (remove, add)); +} diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs new file mode 100644 index 0000000..2e96977 --- /dev/null +++ b/src/construct/label_start_image.rs @@ -0,0 +1,47 @@ +//! To do + +use super::label_end::resolve_media; +use crate::tokenizer::{Code, LabelStart, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of label (image) start. 
+/// +/// ```markdown +/// a |![ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('!') => { + tokenizer.enter(TokenType::LabelImage); + tokenizer.enter(TokenType::LabelImageMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelImageMarker); + (State::Fn(Box::new(open)), None) + } + _ => (State::Nok, None), + } +} + +/// After `!`, before a `[`. +/// +/// ```markdown +/// a !|[ b +/// ``` +pub fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => { + tokenizer.enter(TokenType::LabelMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelMarker); + tokenizer.exit(TokenType::LabelImage); + let end = tokenizer.events.len() - 1; + tokenizer.label_start_stack.push(LabelStart { + start: (end - 5, end), + balanced: false, + inactive: false, + }); + tokenizer.register_resolver("media".to_string(), Box::new(resolve_media)); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs new file mode 100644 index 0000000..35c9dcd --- /dev/null +++ b/src/construct/label_start_link.rs @@ -0,0 +1,30 @@ +//! To do + +use super::label_end::resolve_media; +use crate::tokenizer::{Code, LabelStart, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of label (link) start. 
+/// +/// ```markdown +/// a |[ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('[') => { + let start = tokenizer.events.len(); + tokenizer.enter(TokenType::LabelLink); + tokenizer.enter(TokenType::LabelMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::LabelMarker); + tokenizer.exit(TokenType::LabelLink); + tokenizer.label_start_stack.push(LabelStart { + start: (start, tokenizer.events.len() - 1), + balanced: false, + inactive: false, + }); + tokenizer.register_resolver("media".to_string(), Box::new(resolve_media)); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 9e5da0e..8565b2f 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -30,9 +30,9 @@ //! * [heading (setext)][heading_setext] //! * [html (flow)][html_flow] //! * [html (text)][html_text] -//! * label end -//! * label start (image) -//! * label start (link) +//! * [label end][label_end] +//! * [label start (image)][label_start_image] +//! * [label start (link)][label_start_link] //! * list //! * [paragraph][] //! * [thematic break][thematic_break] @@ -59,8 +59,6 @@ //! They also contain references to character as defined by [char][], so for //! example `ascii_punctuation` refers to //! [`char::is_ascii_punctuation`][char::is_ascii_punctuation]. -//! -//! 
pub mod autolink; pub mod blank_line; @@ -76,6 +74,9 @@ pub mod heading_atx; pub mod heading_setext; pub mod html_flow; pub mod html_text; +pub mod label_end; +pub mod label_start_image; +pub mod label_start_link; pub mod paragraph; pub mod partial_data; pub mod partial_destination; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 03dcbee..7887a44 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -267,11 +267,10 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { /// ```markdown /// a\|)b /// ``` -fn raw_escape(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { +fn raw_escape(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { match code { Code::Char('(' | ')' | '\\') => { tokenizer.consume(code); - info.balance += 1; (State::Fn(Box::new(move |t, c| raw(t, c, info))), None) } _ => raw(tokenizer, code, info), diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index 024a4b2..43bdc53 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -35,6 +35,45 @@ pub fn space_or_tab() -> Box { space_or_tab_min_max(1, usize::MAX) } +pub fn space_or_tab_one_line_ending() -> Box { + Box::new(|tokenizer, code| { + tokenizer.attempt(space_or_tab(), move |ok| { + Box::new(move |tokenizer, code| match code { + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(tokenizer.attempt_opt( + space_or_tab(), + move |_t, code| { + if !matches!( + code, + Code::None + | Code::CarriageReturnLineFeed + | Code::Char('\r' | '\n') + ) { + (State::Ok, Some(vec![code])) + } else { + (State::Nok, None) + } + }, + ))), + None, + ) + } + _ => { + if ok { + (State::Ok, Some(vec![code])) + } else { + (State::Nok, None) + 
} + } + }) + })(tokenizer, code) + }) +} + /// Between `x` and `y` `space_or_tab` /// /// ```bnf diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 3e61788..78ae311 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -32,7 +32,7 @@ //! use crate::construct::partial_space_or_tab::space_or_tab; -use crate::subtokenize::link; +use crate::subtokenize::link_to; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Configuration. @@ -109,7 +109,7 @@ impl Kind { #[derive(Debug)] struct Info { /// Whether we’ve seen our first `ChunkString`. - connect: bool, + connect_index: Option, /// Kind of title. kind: Kind, /// Configuration. @@ -125,9 +125,9 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code, options: Options) -> StateFnResult { match code { - Code::Char(char) if char == '(' || char == '"' || char == '\'' => { + Code::Char(char) if char == '"' || char == '\'' || char == '(' => { let info = Info { - connect: false, + connect_index: None, kind: Kind::from_char(char), options, }; @@ -184,11 +184,11 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes _ => { tokenizer.enter(TokenType::ChunkString); - if info.connect { + if let Some(connect_index) = info.connect_index { let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); + link_to(&mut tokenizer.events, connect_index, index); } else { - info.connect = true; + info.connect_index = Some(tokenizer.events.len() - 1); } title(tokenizer, code, info) diff --git a/src/content/flow.rs b/src/content/flow.rs index e71d25a..546712f 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -26,6 +26,7 @@ use crate::construct::{ html_flow::start as html_flow, paragraph::start as paragraph, thematic_break::start as thematic_break, }; +use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::tokenizer::{Code, Event, EventType, Point, State, 
StateFnResult, TokenType, Tokenizer}; use crate::util::{ @@ -34,9 +35,10 @@ use crate::util::{ }; /// Turn `codes` as the flow content type into events. -pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec { - let mut tokenizer = Tokenizer::new(point, index); - tokenizer.feed(codes, Box::new(start), true); +pub fn flow(parse_state: &ParseState, point: Point, index: usize) -> Vec { + let mut tokenizer = Tokenizer::new(point, index, parse_state); + + tokenizer.push(&parse_state.codes, Box::new(start), true); let mut index = 0; @@ -47,9 +49,14 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec { && event.token_type == TokenType::DefinitionLabelString { let id = normalize_identifier( - serialize(codes, &from_exit_event(&tokenizer.events, index), false).as_str(), + serialize( + &parse_state.codes, + &from_exit_event(&tokenizer.events, index), + false, + ) + .as_str(), ); - println!("to do: use identifier {:?}", id); + println!("to do: use definition identifier {:?}", id); } index += 1; @@ -58,8 +65,9 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec { let mut result = (tokenizer.events, false); while !result.1 { - result = subtokenize(result.0, codes); + result = subtokenize(result.0, parse_state); } + result.0 } diff --git a/src/content/text.rs b/src/content/text.rs index 1224064..5718617 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -21,15 +21,19 @@ use crate::construct::{ character_reference::start as character_reference, code_text::start as code_text, hard_break_escape::start as hard_break_escape, hard_break_trailing::start as hard_break_trailing, html_text::start as html_text, - partial_data::start as data, + label_end::start as label_end, label_start_image::start as label_start_image, + label_start_link::start as label_start_link, partial_data::start as data, }; use crate::tokenizer::{Code, State, StateFnResult, Tokenizer}; -const MARKERS: [Code; 5] = [ +const MARKERS: [Code; 8] = [ Code::Char(' '), // 
`hard_break_trailing` + Code::Char('!'), // `label_start_image` Code::Char('&'), // `character_reference` Code::Char('<'), // `autolink`, `html_text` + Code::Char('['), // `label_start_link` Code::Char('\\'), // `character_escape`, `hard_break_escape` + Code::Char(']'), // `label_end` Code::Char('`'), // `code_text` ]; @@ -47,13 +51,16 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { Code::None => (State::Ok, None), _ => tokenizer.attempt_n( vec![ - Box::new(character_reference), + Box::new(autolink), Box::new(character_escape), + Box::new(character_reference), + Box::new(code_text), Box::new(hard_break_escape), Box::new(hard_break_trailing), - Box::new(autolink), Box::new(html_text), - Box::new(code_text), + Box::new(label_end), + Box::new(label_start_image), + Box::new(label_start_link), ], |ok| Box::new(if ok { start } else { before_data }), )(tokenizer, code), diff --git a/src/parser.rs b/src/parser.rs index 49d99d3..32b7f36 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -4,14 +4,24 @@ use crate::content::flow::flow; use crate::tokenizer::{as_codes, Code, Event, Point}; +pub struct ParseState { + /// To do. + pub codes: Vec, + /// To do. + pub definitions: Vec, +} + /// Turn a string of markdown into events. /// /// Passes the codes back so the compiler can access the source. pub fn parse(value: &str) -> (Vec, Vec) { - let codes = as_codes(value); - // To do: pass a reference to this around, and slices in the (back)feeding. Might be tough. + let parse_state = ParseState { + codes: as_codes(value), + definitions: vec![], + }; + let events = flow( - &codes, + &parse_state, Point { line: 1, column: 1, @@ -19,5 +29,7 @@ pub fn parse(value: &str) -> (Vec, Vec) { }, 0, ); - (events, codes) + + // To do: pass whole `parse_state` back? 
+ (events, parse_state.codes) } diff --git a/src/subtokenize.rs b/src/subtokenize.rs index 4ee2242..58db3c6 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -28,9 +28,8 @@ use std::collections::HashMap; use crate::content::{string::start as string, text::start as text}; -use crate::tokenizer::{ - Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer, -}; +use crate::parser::ParseState; +use crate::tokenizer::{Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer}; use crate::util::span; /// Create a link between two [`Event`][]s. @@ -39,25 +38,36 @@ use crate::util::span; /// This optimizes for the common case where the token at `index` is connected /// to the previous void token. pub fn link(events: &mut [Event], index: usize) { - let prev = &mut events[index - 2]; + link_to(events, index - 2, index); +} + +/// To do +pub fn link_to(events: &mut [Event], pevious: usize, next: usize) { + let prev = &mut events[pevious]; + // To do: force chunks? + // assert!( + // prev.token_type == TokenType::ChunkString || prev.token_type == TokenType::ChunkText, + // "{:?}", + // prev.token_type.to_owned() + // ); assert_eq!(prev.event_type, EventType::Enter); - prev.next = Some(index); + prev.next = Some(next); - let prev_ref = &events[index - 2]; - let prev_exit_ref = &events[index - 1]; + let prev_ref = &events[pevious]; + let prev_exit_ref = &events[pevious + 1]; assert_eq!(prev_exit_ref.event_type, EventType::Exit); assert_eq!(prev_exit_ref.token_type, prev_ref.token_type); - let curr = &mut events[index]; + let curr = &mut events[next]; assert_eq!(curr.event_type, EventType::Enter); - curr.previous = Some(index - 2); + curr.previous = Some(pevious); // Note: the exit of this event may not exist, so don’t check for that. } /// Parse linked events. /// /// Supposed to be called repeatedly, returns `1: true` when done. 
-pub fn subtokenize(mut events: Vec, codes: &[Code]) -> (Vec, bool) { +pub fn subtokenize(mut events: Vec, parse_state: &ParseState) -> (Vec, bool) { let mut index = 0; // Map of first chunks to their tokenizer. let mut head_to_tokenizer: HashMap = HashMap::new(); @@ -83,7 +93,7 @@ pub fn subtokenize(mut events: Vec, codes: &[Code]) -> (Vec, bool) // Index into `events` pointing to a chunk. let mut index_opt: Option = Some(index); // Subtokenizer. - let mut tokenizer = Tokenizer::new(event.point.clone(), event.index); + let mut tokenizer = Tokenizer::new(event.point.clone(), event.index, parse_state); // Substate. let mut result: StateFnResult = ( State::Fn(Box::new(if event.token_type == TokenType::ChunkString { @@ -115,7 +125,11 @@ pub fn subtokenize(mut events: Vec, codes: &[Code]) -> (Vec, bool) _ => unreachable!("cannot be ok/nok"), }; - result = tokenizer.feed(span::codes(codes, &span), func, enter.next == None); + result = tokenizer.push( + span::codes(&parse_state.codes, &span), + func, + enter.next == None, + ); assert!(result.1.is_none(), "expected no remainder"); index_opt = enter.next; } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7b71308..a692a4d 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -15,6 +15,7 @@ use std::collections::HashMap; use crate::constant::TAB_SIZE; +use crate::parser::ParseState; /// Semantic label of a span. 
// To do: figure out how to share this so extensions can add their own stuff, @@ -1073,6 +1074,32 @@ pub enum TokenType { /// ^^^ /// ``` HtmlTextData, + /// To do, + LabelImage, + /// To do, + LabelImageMarker, + /// To do, + LabelLink, + /// To do, + LabelMarker, + LabelEnd, + Resource, + ResourceMarker, + ResourceDestination, + ResourceDestinationLiteral, + ResourceDestinationLiteralMarker, + ResourceDestinationRaw, + ResourceDestinationString, + ResourceTitle, + ResourceTitleMarker, + ResourceTitleString, + Reference, + ReferenceMarker, + ReferenceString, + Link, + Image, + Label, + LabelText, /// Line ending. /// /// ## Info @@ -1243,6 +1270,9 @@ pub type StateFn = dyn FnOnce(&mut Tokenizer, Code) -> StateFnResult; /// In certain cases, it can also yield back up parsed codes that were passed down. pub type StateFnResult = (State, Option>); +/// To do. +pub type Resolver = dyn FnOnce(&mut Tokenizer) -> Vec; + /// The result of a state. pub enum State { /// There is a future state: a boxed [`StateFn`][] to pass the next code to. @@ -1253,6 +1283,30 @@ pub enum State { Nok, } +/// To do. +#[derive(Debug)] +pub struct LabelStart { + /// To do. + pub start: (usize, usize), + /// A boolean used internally to figure out if a label start link can’t be + /// used (because links in links are incorrect). + pub inactive: bool, + /// A boolean used internally to figure out if a label is balanced: they’re + /// not media, it’s just balanced braces. + pub balanced: bool, +} + +/// To do. +#[derive(Debug)] +pub struct Media { + /// To do. + pub start: (usize, usize), + /// To do. + pub end: (usize, usize), + /// To do. + pub id: String, +} + /// The internal state of a tokenizer, not to be confused with states from the /// state machine, this instead is all the information about where we currently /// are and what’s going on. @@ -1272,9 +1326,10 @@ struct InternalState { point: Point, } +// #[derive(Debug)] + /// A tokenizer itself. 
-#[derive(Debug)] -pub struct Tokenizer { +pub struct Tokenizer<'a> { column_start: HashMap, /// Track whether a character is expected to be consumed, and whether it’s /// actually consumed @@ -1295,11 +1350,22 @@ pub struct Tokenizer { index: usize, /// Current relative and absolute place in the file. point: Point, + /// To do. + pub parse_state: &'a ParseState, + /// To do. + pub label_start_stack: Vec, + /// To do. + pub label_start_list_loose: Vec, + /// To do. + pub media_list: Vec, + /// To do. + resolvers: Vec>, + resolver_ids: Vec, } -impl Tokenizer { +impl<'a> Tokenizer<'a> { /// Create a new tokenizer. - pub fn new(point: Point, index: usize) -> Tokenizer { + pub fn new(point: Point, index: usize, parse_state: &'a ParseState) -> Tokenizer { Tokenizer { previous: Code::None, current: Code::None, @@ -1309,6 +1375,20 @@ impl Tokenizer { point, stack: vec![], events: vec![], + parse_state, + label_start_stack: vec![], + label_start_list_loose: vec![], + media_list: vec![], + resolvers: vec![], + resolver_ids: vec![], + } + } + + /// To do. + pub fn register_resolver(&mut self, id: String, resolver: Box) { + if !self.resolver_ids.contains(&id) { + self.resolver_ids.push(id); + self.resolvers.push(resolver); } } @@ -1582,7 +1662,8 @@ impl Tokenizer { /// This is set up to support repeatedly calling `feed`, and thus streaming /// markdown into the state machine, and normally pauses after feeding. /// When `done: true` is passed, the EOF is fed. - pub fn feed( + // To do: call this `feed_impl`, and rename `push` to `feed`? + fn feed( &mut self, codes: &[Code], start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, @@ -1643,6 +1724,26 @@ impl Tokenizer { check_statefn_result((state, None)) } + + /// To do. + // To do: set a `drained` to prevent passing after draining? 
+ pub fn push( + &mut self, + codes: &[Code], + start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + drain: bool, + ) -> StateFnResult { + let result = self.feed(codes, start, drain); + + if drain { + while !self.resolvers.is_empty() { + let resolver = self.resolvers.remove(0); + self.events = resolver(self); + } + } + + result + } } /// Internal utility to wrap states to also capture codes. diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index d66978e..55b15e4 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -115,7 +115,7 @@ fn normalize_uri(value: &str) -> String { result.push( buff[0..char.len_utf8()] .iter() - .map(|&byte| format!("%{:X}", byte)) + .map(|&byte| format!("%{:>02X}", byte)) .collect::(), ); diff --git a/tests/character_escape.rs b/tests/character_escape.rs index e4f23d2..3e3e839 100644 --- a/tests/character_escape.rs +++ b/tests/character_escape.rs @@ -61,12 +61,11 @@ fn character_escape() { "should not escape in flow html" ); - // To do: link (reference). - // assert_eq!( - // micromark("[foo](/bar\\* \"ti\\*tle\")"), - // "

foo

", - // "should escape in resource and title" - // ); + assert_eq!( + micromark("[foo](/bar\\* \"ti\\*tle\")"), + "
<p><a href=\"/bar*\" title=\"ti*tle\">foo</a></p>
", + "should escape in resource and title" + ); // To do: link (reference). // assert_eq!( diff --git a/tests/character_reference.rs b/tests/character_reference.rs index 136ce17..3d2111e 100644 --- a/tests/character_reference.rs +++ b/tests/character_reference.rs @@ -55,14 +55,13 @@ fn character_reference() { "should not care about character references in html" ); - // To do: link (resource). - // assert_eq!( - // micromark("[foo](/föö \"föö\")"), - // "

foo

", - // "should support character references in resource URLs and titles" - // ); + assert_eq!( + micromark("[foo](/föö \"föö\")"), + "
<p><a href=\"/f%C3%B6%C3%B6\" title=\"föö\">foo</a></p>
", + "should support character references in resource URLs and titles" + ); - // To do: link (resource). + // To do: link (reference). // assert_eq!( // micromark("[foo]: /föö \"föö\"\n\n[foo]"), // "

foo

", @@ -101,12 +100,11 @@ fn character_reference() { // "should not support character references as construct markers (2)" // ); - // To do: link (resource). - // assert_eq!( - // micromark("[a](url "tit")"), - // "

[a](url "tit")

", - // "should not support character references as construct markers (3)" - // ); + assert_eq!( + micromark("[a](url "tit")"), + "
<p>[a](url &quot;tit&quot;)</p>
", + "should not support character references as construct markers (3)" + ); assert_eq!( micromark("foo bar"), diff --git a/tests/image.rs b/tests/image.rs new file mode 100644 index 0000000..a54c8d2 --- /dev/null +++ b/tests/image.rs @@ -0,0 +1,229 @@ +extern crate micromark; +use micromark::micromark; + +#[test] +fn image() { + assert_eq!( + micromark("[link](/uri \"title\")"), + "
<p><a href=\"/uri\" title=\"title\">link</a></p>
", + "should support links" + ); + assert_eq!( + micromark("![foo](/url \"title\")"), + "
<p><img src=\"/url\" alt=\"foo\" title=\"title\" /></p>
", + "should support image w/ resource" + ); + + // To do: attention. + // assert_eq!( + // micromark("[foo *bar*]: train.jpg \"train & tracks\"\n\n![foo *bar*]"), + // "

\"foo

", + // "should support image as shortcut reference" + // ); + + // To do: tags in images. + // assert_eq!( + // micromark("![foo ![bar](/url)](/url2)"), + // "

\"foo

", + // "should “support” images in images" + // ); + + // To do: tags in images. + // assert_eq!( + // micromark("![foo [bar](/url)](/url2)"), + // "

\"foo

", + // "should “support” links in images" + // ); + + // To do: tags in images. + // assert_eq!( + // micromark("[foo *bar*]: train.jpg \"train & tracks\"\n\n![foo *bar*][]"), + // "

\"foo

", + // "should support “content” in images" + // ); + + // To do: tags in images, attention, references. + // assert_eq!( + // micromark("[FOOBAR]: train.jpg \"train & tracks\"\n\n![foo *bar*][foobar]"), + // "

\"foo

", + // "should support “content” in images" + // ); + + assert_eq!( + micromark("![foo](train.jpg)"), + "
<p><img src=\"train.jpg\" alt=\"foo\" /></p>
", + "should support images w/o title" + ); + + assert_eq!( + micromark("My ![foo bar](/path/to/train.jpg \"title\" )"), + "

My \"foo

", + "should support images w/ lots of whitespace" + ); + + assert_eq!( + micromark("![foo]()"), + "

\"foo\"

", + "should support images w/ enclosed destinations" + ); + + assert_eq!( + micromark("![](/url)"), + "
<p><img src=\"/url\" alt=\"\" /></p>
", + "should support images w/ empty labels" + ); + + // To do: references. + // assert_eq!( + // micromark("[bar]: /url\n\n![foo][bar]"), + // "

\"foo\"

", + // "should support full references (1)" + // ); + + // To do: references. + // assert_eq!( + // micromark("[BAR]: /url\n\n![foo][bar]"), + // "

\"foo\"

", + // "should support full references (2)" + // ); + + // To do: references. + // assert_eq!( + // micromark("[foo]: /url \"title\"\n\n![foo][]"), + // "

\"foo\"

", + // "should support collapsed references (1)" + // ); + + // To do: references, attention, tags in images. + // assert_eq!( + // micromark("[*foo* bar]: /url \"title\"\n\n![*foo* bar][]"), + // "

\"foo

", + // "should support collapsed references (2)" + // ); + + // To do: references. + // assert_eq!( + // micromark("[foo]: /url \"title\"\n\n![Foo][]"), + // "

\"Foo\"

", + // "should support case-insensitive labels" + // ); + + // To do: references. + // assert_eq!( + // micromark("[foo]: /url \"title\"\n\n![foo] \n[]"), + // "

\"foo\"\n[]

", + // "should not support whitespace between sets of brackets" + // ); + + // To do: references. + // assert_eq!( + // micromark("[foo]: /url \"title\"\n\n![foo]"), + // "

\"foo\"

", + // "should support shortcut references (1)" + // ); + + // To do: references, tags in images, attention. + // assert_eq!( + // micromark("[*foo* bar]: /url \"title\"\n\n![*foo* bar]"), + // "

\"foo

", + // "should support shortcut references (2)" + // ); + + assert_eq!( + micromark("[[foo]]: /url \"title\"\n\n![[foo]]"), + "

[[foo]]: /url "title"

\n

![[foo]]

", + "should not support link labels w/ unescaped brackets" + ); + + // To do: references. + // assert_eq!( + // micromark("[foo]: /url \"title\"\n\n![Foo]"), + // "

\"Foo\"

", + // "should support case-insensitive label matching" + // ); + + // To do: references. + // assert_eq!( + // micromark("[foo]: /url \"title\"\n\n!\\[foo]"), + // "

![foo]

", + // "should “support” an escaped bracket instead of an image" + // ); + + // To do: references. + // assert_eq!( + // micromark("[foo]: /url \"title\"\n\n\\![foo]"), + // "

!foo

", + // "should support an escaped bang instead of an image, but still have a link" + // ); + + // Extra + assert_eq!( + micromark("![foo]()"), + "

\"foo\"

", + "should support images w/o destination" + ); + + assert_eq!( + micromark("![foo](<>)"), + "

\"foo\"

", + "should support images w/ explicit empty destination" + ); + + assert_eq!( + micromark("![](example.png)"), + "

\"\"

", + "should support images w/o alt" + ); + + assert_eq!( + micromark("![alpha](bravo.png \"\")"), + "

\"alpha\"

", + "should support images w/ empty title (1)" + ); + + assert_eq!( + micromark("![alpha](bravo.png '')"), + "

\"alpha\"

", + "should support images w/ empty title (2)" + ); + + assert_eq!( + micromark("![alpha](bravo.png ())"), + "

\"alpha\"

", + "should support images w/ empty title (3)" + ); + + assert_eq!( + micromark("![&©&](example.com/&©& \"&©&\")"), + "

\"&©&\"

", + "should support character references in images" + ); + + // Extra + // See: + assert_eq!( + micromark("![](<> \"\")"), + "

\"\"

", + "should ignore an empty title" + ); + + // To do: extensions + // assert_eq!( + // micromark("![x]()", {extensions: [{disable: {null: ["labelStartImage"]}}]}), + // "

!x

", + // "should support turning off label start (image)" + // ); + + assert_eq!( + micromark("![](javascript:alert(1))"), + "

\"\"

", + "should ignore non-http protocols by default" + ); + + // To do: extensions + // assert_eq!( + // micromark("![](javascript:alert(1))", {allowDangerousProtocol: true}), + // "

\"\"

", + // "should allow non-http protocols w/ `allowDangerousProtocol`" + // ); +} diff --git a/tests/link_resource.rs b/tests/link_resource.rs new file mode 100644 index 0000000..b1e1905 --- /dev/null +++ b/tests/link_resource.rs @@ -0,0 +1,464 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, Options}; + +const DANGER: &Options = &Options { + allow_dangerous_html: true, + allow_dangerous_protocol: true, + default_line_ending: None, +}; + +#[test] +fn link_resource() { + assert_eq!( + micromark("[link](/uri \"title\")"), + "
<p><a href=\"/uri\" title=\"title\">link</a></p>
", + "should support links" + ); + + assert_eq!( + micromark("[link](/uri)"), + "
<p><a href=\"/uri\">link</a></p>
", + "should support links w/o title" + ); + + assert_eq!( + micromark("[link]()"), + "
<p><a href=\"\">link</a></p>
", + "should support links w/o destination" + ); + + assert_eq!( + micromark("[link](<>)"), + "
<p><a href=\"\">link</a></p>
", + "should support links w/ empty enclosed destination" + ); + + assert_eq!( + micromark("[link](/my uri)"), + "
<p>[link](/my uri)</p>
", + "should not support links w/ spaces in destination" + ); + + assert_eq!( + micromark("[link]()"), + "

link

", + "should support links w/ spaces in enclosed destination" + ); + + assert_eq!( + micromark("[link](foo\nbar)"), + "

[link](foo\nbar)

", + "should not support links w/ line endings in destination" + ); + + assert_eq!( + micromark_with_options("[link]()", DANGER), + "

[link]()

", + "should not support links w/ line endings in enclosed destination" + ); + + assert_eq!( + micromark("[a]()"), + "

a

", + "should support links w/ closing parens in destination" + ); + + assert_eq!( + micromark("[link]()"), + "

[link](<foo>)

", + "should not support links w/ enclosed destinations w/o end" + ); + + assert_eq!( + micromark_with_options("[a](\n[a](c)", DANGER), + "

[a](<b)c\n[a](<b)c>\n[a](c)

", + "should not support links w/ unmatched enclosed destinations" + ); + + assert_eq!( + micromark("[link](\\(foo\\))"), + "

link

", + "should support links w/ destinations w/ escaped parens" + ); + + assert_eq!( + micromark("[link](foo(and(bar)))"), + "

link

", + "should support links w/ destinations w/ balanced parens" + ); + + assert_eq!( + micromark("[link](foo\\(and\\(bar\\))"), + "

link

", + "should support links w/ destinations w/ escaped parens" + ); + + assert_eq!( + micromark("[link]()"), + "

link

", + "should support links w/ enclosed destinations w/ parens" + ); + + assert_eq!( + micromark_with_options("[link](foo\\)\\:)", DANGER), + "

link

", + "should support links w/ escapes in destinations" + ); + + assert_eq!( + micromark("[link](#fragment)"), + "

link

", + "should support links w/ destinations to fragments" + ); + + assert_eq!( + micromark("[link](http://example.com#fragment)"), + "

link

", + "should support links w/ destinations to URLs w/ fragments" + ); + + assert_eq!( + micromark("[link](http://example.com?foo=3#frag)"), + "

link

", + "should support links w/ destinations to URLs w/ search and fragments" + ); + + assert_eq!( + micromark("[link](foo\\bar)"), + "

link

", + "should not support non-punctuation character escapes in links" + ); + + assert_eq!( + micromark("[link](foo%20bä)"), + "

link

", + "should support character references in links" + ); + + assert_eq!( + micromark("[link](\"title\")"), + "

link

", + "should not support links w/ only a title" + ); + + assert_eq!( + micromark("[link](/url \"title\")"), + "

link

", + "should support titles w/ double quotes" + ); + + assert_eq!( + micromark("[link](/url 'title')"), + "

link

", + "should support titles w/ single quotes" + ); + + assert_eq!( + micromark("[link](/url (title))"), + "

link

", + "should support titles w/ parens" + ); + + assert_eq!( + micromark("[link](/url \"title \\\""\")"), + "

link

", + "should support character references and escapes in titles" + ); + + assert_eq!( + micromark("[link](/url \"title\")"), + "

link

", + "should not support unicode whitespace between destination and title" + ); + + assert_eq!( + micromark("[link](/url \"title \"and\" title\")"), + "

[link](/url "title "and" title")

", + "should not support nested balanced quotes in title" + ); + + assert_eq!( + micromark("[link](/url 'title \"and\" title')"), + "

link

", + "should support the other quotes in titles" + ); + + assert_eq!( + micromark("[link]( /uri\n \"title\" )"), + "

link

", + "should support whitespace around destination and title (1)" + ); + + assert_eq!( + micromark("[link](\t\n/uri \"title\")"), + "

link

", + "should support whitespace around destination and title (2)" + ); + + assert_eq!( + micromark("[link](/uri \"title\"\t\n)"), + "

link

", + "should support whitespace around destination and title (3)" + ); + + assert_eq!( + micromark("[link] (/uri)"), + "

[link] (/uri)

", + "should not support whitespace between label and resource" + ); + + assert_eq!( + micromark("[link [foo [bar]]](/uri)"), + "

link [foo [bar]]

", + "should support balanced brackets" + ); + + assert_eq!( + micromark("[link] bar](/uri)"), + "

[link] bar](/uri)

", + "should not support unbalanced brackets (1)" + ); + + assert_eq!( + micromark("[link [bar](/uri)"), + "

[link bar

", + "should not support unbalanced brackets (2)" + ); + + assert_eq!( + micromark("[link \\[bar](/uri)"), + "

link [bar

", + "should support characer escapes" + ); + + // To do: attention. + // assert_eq!( + // micromark("[link *foo **bar** `#`*](/uri)"), + // "

link foo bar #

", + // "should support content" + // ); + + assert_eq!( + micromark("[![moon](moon.jpg)](/uri)"), + "

\"moon\"

", + "should support an image as content" + ); + + assert_eq!( + micromark("[foo [bar](/uri)](/uri)"), + "

[foo bar](/uri)

", + "should not support links in links (1)" + ); + + // To do: attention. + // assert_eq!( + // micromark("[foo *[bar [baz](/uri)](/uri)*](/uri)"), + // "

[foo [bar baz](/uri)](/uri)

", + // "should not support links in links (2)" + // ); + + // To do: tags in images. + // assert_eq!( + // micromark("![[[foo](uri1)](uri2)](uri3)"), + // "

\"[foo](uri2)\"

", + // "should not support links in links (3)" + // ); + + assert_eq!( + micromark("*[foo*](/uri)"), + "

*foo*

", + "should prefer links over emphasis (1)" + ); + + assert_eq!( + micromark("[foo *bar](baz*)"), + "

foo *bar

", + "should prefer links over emphasis (2)" + ); + + assert_eq!( + micromark_with_options("[foo ", DANGER), + "

[foo

", + "should prefer HTML over links" + ); + + assert_eq!( + micromark("[foo`](/uri)`"), + "

[foo](/uri)

", + "should prefer code over links" + ); + + assert_eq!( + micromark("[foo"), + "

[foohttp://example.com/?search=](uri)

", + "should prefer autolinks over links" + ); + + assert_eq!( + micromark("[foo"), + "

[foohttp://example.com/?search=](uri)

", + "should prefer autolinks over links" + ); + + // Extra + assert_eq!( + micromark("[]()"), + "
<p><a href=\"\"></a></p>
", + "should support an empty link" + ); + + // See: + assert_eq!( + micromark("[](<> \"\")"), + "

", + "should ignore an empty title" + ); + + assert_eq!( + micromark_with_options("[a](\"c\")", DANGER), + "

[a]("c")

", + "should require whitespace between enclosed destination and title" + ); + + assert_eq!( + micromark("[](<"), + "

[](<

", + "should not support an unclosed enclosed destination" + ); + + assert_eq!( + micromark("[]("), + "

[](

", + "should not support an unclosed destination" + ); + + assert_eq!( + micromark("[](\\<)"), + "

", + "should support unenclosed link destination starting w/ escapes" + ); + + assert_eq!( + micromark("[](<\\<>)"), + "

", + "should support enclosed link destination starting w/ escapes" + ); + + assert_eq!( + micromark("[](\\"), + "

[](\\

", + "should not support unenclosed link destination starting w/ an incorrect escape" + ); + + assert_eq!( + micromark("[](<\\"), + "

[](<\\

", + "should not support enclosed link destination starting w/ an incorrect escape" + ); + + assert_eq!( + micromark("[](a \""), + "

[](a "

", + "should not support an eof in a link title (1)" + ); + + assert_eq!( + micromark("[](a '"), + "

[](a '

", + "should not support an eof in a link title (2)" + ); + + assert_eq!( + micromark("[](a ("), + "

[](a (

", + "should not support an eof in a link title (3)" + ); + + assert_eq!( + micromark("[](a \"\\"), + "

[](a "\\

", + "should not support an eof in a link title escape (1)" + ); + + assert_eq!( + micromark("[](a '\\"), + "

[](a '\\

", + "should not support an eof in a link title escape (2)" + ); + + assert_eq!( + micromark("[](a (\\"), + "

[](a (\\

", + "should not support an eof in a link title escape (3)" + ); + + assert_eq!( + micromark("[](a \"\\\"\")"), + "

", + "should support a character escape to start a link title (1)" + ); + + assert_eq!( + micromark("[](a '\\'')"), + "

", + "should support a character escape to start a link title (2)" + ); + + assert_eq!( + micromark("[](a (\\)))"), + "

", + "should support a character escape to start a link title (3)" + ); + + assert_eq!( + micromark("[&©&](example.com/&©& \"&©&\")"), + "

&©&

", + "should support character references in links" + ); + + assert_eq!( + micromark("[a](1())"), + "

a

", + "should support 1 set of parens" + ); + + assert_eq!( + micromark("[a](1(2()))"), + "

a

", + "should support 2 sets of parens" + ); + + assert_eq!( + micromark( + "[a](1(2(3(4(5(6(7(8(9(10(11(12(13(14(15(16(17(18(19(20(21(22(23(24(25(26(27(28(29(30(31(32()))))))))))))))))))))))))))))))))" + ), + "

a

", + "should support 32 sets of parens" + ); + + assert_eq!( + micromark( + "[a](1(2(3(4(5(6(7(8(9(10(11(12(13(14(15(16(17(18(19(20(21(22(23(24(25(26(27(28(29(30(31(32(33())))))))))))))))))))))))))))))))))" + ), + "

[a](1(2(3(4(5(6(7(8(9(10(11(12(13(14(15(16(17(18(19(20(21(22(23(24(25(26(27(28(29(30(31(32(33())))))))))))))))))))))))))))))))))

", + "should not support 33 or more sets of parens" + ); + + assert_eq!( + micromark("[a](b \"\n c\")"), + "

a

", + "should support an eol at the start of a title" + ); + + assert_eq!( + micromark("[a](b( \"c\")"), + "

[a](b( "c")

", + "should not support whitespace when unbalanced in a raw destination" + ); + + assert_eq!( + micromark("[a](\0)"), + "

a

", + "should support a single NUL character as a link resource" + ); +} diff --git a/tests/misc_dangerous_protocol.rs b/tests/misc_dangerous_protocol.rs index 6f759e3..3aa042a 100644 --- a/tests/misc_dangerous_protocol.rs +++ b/tests/misc_dangerous_protocol.rs @@ -34,166 +34,164 @@ fn dangerous_protocol_autolink() { ); } -// To do: image. -// #[test] -// fn dangerous_protocol_image() { -// assert_eq!( -// micromark("![](javascript:alert(1))"), -// "

\"\"

", -// "should be safe by default" -// ); - -// assert_eq!( -// micromark("![](http://a)"), -// "

\"\"

", -// "should allow `http:`" -// ); - -// assert_eq!( -// micromark("![](https://a)"), -// "

\"\"

", -// "should allow `https:`" -// ); - -// assert_eq!( -// micromark("![](irc:///help)"), -// "

\"\"

", -// "should not allow `irc:`" -// ); - -// assert_eq!( -// micromark("![](mailto:a)"), -// "

\"\"

", -// "should not allow `mailto:`" -// ); - -// assert_eq!( -// micromark("![](#a)"), -// "

\"\"

", -// "should allow a hash" -// ); - -// assert_eq!( -// micromark("![](?a)"), -// "

\"\"

", -// "should allow a search" -// ); - -// assert_eq!( -// micromark("![](/a)"), -// "

\"\"

", -// "should allow an absolute" -// ); - -// assert_eq!( -// micromark("![](./a)"), -// "

\"\"

", -// "should allow an relative" -// ); - -// assert_eq!( -// micromark("![](../a)"), -// "

\"\"

", -// "should allow an upwards relative" -// ); - -// assert_eq!( -// micromark("![](a#b:c)"), -// "

\"\"

", -// "should allow a colon in a hash" -// ); - -// assert_eq!( -// micromark("![](a?b:c)"), -// "

\"\"

", -// "should allow a colon in a search" -// ); - -// assert_eq!( -// micromark("![](a/b:c)"), -// "

\"\"

", -// "should allow a colon in a path" -// ); -// } - -// To do: link. -// #[test] -// fn dangerous_protocol_link() { -// assert_eq!( -// micromark("[](javascript:alert(1))"), -// "

", -// "should be safe by default" -// ); - -// assert_eq!( -// micromark("[](http://a)"), -// "

", -// "should allow `http:`" -// ); - -// assert_eq!( -// micromark("[](https://a)"), -// "

", -// "should allow `https:`" -// ); - -// assert_eq!( -// micromark("[](irc:///help)"), -// "

", -// "should allow `irc:`" -// ); - -// assert_eq!( -// micromark("[](mailto:a)"), -// "

", -// "should allow `mailto:`" -// ); - -// assert_eq!( -// micromark("[](#a)"), -// "

", -// "should allow a hash" -// ); - -// assert_eq!( -// micromark("[](?a)"), -// "

", -// "should allow a search" -// ); - -// assert_eq!( -// micromark("[](/a)"), -// "

", -// "should allow an absolute" -// ); - -// assert_eq!( -// micromark("[](./a)"), -// "

", -// "should allow an relative" -// ); - -// assert_eq!( -// micromark("[](../a)"), -// "

", -// "should allow an upwards relative" -// ); - -// assert_eq!( -// micromark("[](a#b:c)"), -// "

", -// "should allow a colon in a hash" -// ); - -// assert_eq!( -// micromark("[](a?b:c)"), -// "

", -// "should allow a colon in a search" -// ); - -// assert_eq!( -// micromark("[](a/b:c)"), -// "

", -// "should allow a colon in a path" -// ); -// } +#[test] +fn dangerous_protocol_image() { + assert_eq!( + micromark("![](javascript:alert(1))"), + "
<p><img src=\"\" alt=\"\" /></p>
", + "should be safe by default" + ); + + assert_eq!( + micromark("![](http://a)"), + "
<p><img src=\"http://a\" alt=\"\" /></p>
", + "should allow `http:`" + ); + + assert_eq!( + micromark("![](https://a)"), + "

\"\"

", + "should allow `https:`" + ); + + assert_eq!( + micromark("![](irc:///help)"), + "

\"\"

", + "should not allow `irc:`" + ); + + assert_eq!( + micromark("![](mailto:a)"), + "

\"\"

", + "should not allow `mailto:`" + ); + + assert_eq!( + micromark("![](#a)"), + "

\"\"

", + "should allow a hash" + ); + + assert_eq!( + micromark("![](?a)"), + "

\"\"

", + "should allow a search" + ); + + assert_eq!( + micromark("![](/a)"), + "

\"\"

", + "should allow an absolute" + ); + + assert_eq!( + micromark("![](./a)"), + "

\"\"

", + "should allow an relative" + ); + + assert_eq!( + micromark("![](../a)"), + "

\"\"

", + "should allow an upwards relative" + ); + + assert_eq!( + micromark("![](a#b:c)"), + "

\"\"

", + "should allow a colon in a hash" + ); + + assert_eq!( + micromark("![](a?b:c)"), + "

\"\"

", + "should allow a colon in a search" + ); + + assert_eq!( + micromark("![](a/b:c)"), + "

\"\"

", + "should allow a colon in a path" + ); +} + +#[test] +fn dangerous_protocol_link() { + assert_eq!( + micromark("[](javascript:alert(1))"), + "
<p><a href=\"\"></a></p>
", + "should be safe by default" + ); + + assert_eq!( + micromark("[](http://a)"), + "
<p><a href=\"http://a\"></a></p>
", + "should allow `http:`" + ); + + assert_eq!( + micromark("[](https://a)"), + "

", + "should allow `https:`" + ); + + assert_eq!( + micromark("[](irc:///help)"), + "

", + "should allow `irc:`" + ); + + assert_eq!( + micromark("[](mailto:a)"), + "

", + "should allow `mailto:`" + ); + + assert_eq!( + micromark("[](#a)"), + "

", + "should allow a hash" + ); + + assert_eq!( + micromark("[](?a)"), + "

", + "should allow a search" + ); + + assert_eq!( + micromark("[](/a)"), + "

", + "should allow an absolute" + ); + + assert_eq!( + micromark("[](./a)"), + "

", + "should allow an relative" + ); + + assert_eq!( + micromark("[](../a)"), + "

", + "should allow an upwards relative" + ); + + assert_eq!( + micromark("[](a#b:c)"), + "

", + "should allow a colon in a hash" + ); + + assert_eq!( + micromark("[](a?b:c)"), + "

", + "should allow a colon in a search" + ); + + assert_eq!( + micromark("[](a/b:c)"), + "

", + "should allow a colon in a path" + ); +} diff --git a/tests/misc_tabs.rs b/tests/misc_tabs.rs index e9a0b72..568172e 100644 --- a/tests/misc_tabs.rs +++ b/tests/misc_tabs.rs @@ -221,48 +221,42 @@ fn tabs_text() { // "should support an initial tab after a line ending in a paragraph" // ); - // To do: link (reference). - // assert_eq!( - // micromark("x[\ty](z)"), - // "

x\ty

", - // "should support an initial tab in a link label" - // ); + assert_eq!( + micromark("x[\ty](z)"), + "

x\ty

", + "should support an initial tab in a link label" + ); - // To do: link (reference). - // assert_eq!( - // micromark("x[y\t](z)"), - // "

xy\t

", - // "should support a final tab in a link label" - // ); + assert_eq!( + micromark("x[y\t](z)"), + "

xy\t

", + "should support a final tab in a link label" + ); - // To do: link (reference). - // assert_eq!( - // micromark("[x\ty](z)"), - // "

x\ty

", - // "should support a tab in a link label" - // ); + assert_eq!( + micromark("[x\ty](z)"), + "

x\ty

", + "should support a tab in a link label" + ); - // To do: link (resource). // Note: CM.js bug, see: - // assert_eq!( - // micromark("[x](\ty)"), - // "

x

", - // "should support a tab starting a link resource" - // ); + assert_eq!( + micromark("[x](\ty)"), + "

x

", + "should support a tab starting a link resource" + ); - // To do: link (resource). - // assert_eq!( - // micromark("[x](y\t)"), - // "

x

", - // "should support a tab ending a link resource" - // ); + assert_eq!( + micromark("[x](y\t)"), + "

x

", + "should support a tab ending a link resource" + ); - // To do: link (resource). - // assert_eq!( - // micromark("[x](y\t\"z\")"), - // "

x

", - // "should support a tab between a link destination and title" - // ); + assert_eq!( + micromark("[x](y\t\"z\")"), + "

x

", + "should support a tab between a link destination and title" + ); } #[test] diff --git a/tests/misc_url.rs b/tests/misc_url.rs index a6f8ead..5e94366 100644 --- a/tests/misc_url.rs +++ b/tests/misc_url.rs @@ -9,28 +9,25 @@ fn url() { "should support incorrect percentage encoded values (0)" ); - // To do: link. - // assert_eq!( - // micromark("[](<%>)"), - // "

", - // "should support incorrect percentage encoded values (1)" - // ); - - // To do: link. - // assert_eq!( - // micromark("[](<%%20>)"), - // "

", - // "should support incorrect percentage encoded values (2)" - // ); - - // To do: link. - // assert_eq!( - // micromark("[](<%a%20>)"), - // "

", - // "should support incorrect percentage encoded values (3)" - // ); + assert_eq!( + micromark("[](<%>)"), + "
<p><a href=\"%25\"></a></p>
", + "should support incorrect percentage encoded values (1)" + ); - // Surrogate handling not needed in Rust. + assert_eq!( + micromark("[](<%%20>)"), + "
<p><a href=\"%25%20\"></a></p>
", + "should support incorrect percentage encoded values (2)" + ); + + assert_eq!( + micromark("[](<%a%20>)"), + "
<p><a href=\"%25a%20\"></a></p>
", + "should support incorrect percentage encoded values (3)" + ); + + // Note: Surrogate handling not needed in Rust. // assert_eq!( // micromark("[]()"), // "

", @@ -114,39 +111,37 @@ fn url() { // "should support a lone low surrogate at the end (highest)" // ); - // To do: link. - // assert_eq!( - // micromark("[](<🤔>)"), - // "

", - // "should support an emoji" - // ); - - // To do: link. - // let mut ascii: Vec = vec![]; - // let mut code = 0; - - // while code < 128 { - // // LF and CR can’t be in resources. - // if code == 10 || code == 13 { - // code += 1; - // continue; - // } - - // // `<`, `>`, `\` need to be escaped. - // if code == 60 || code == 62 || code == 92 { - // ascii.push('\\'); - // } - - // ascii.push(char::from_u32(code).unwrap()); - - // code += 1; - // } - - // let ascii_in = ascii.into_iter().collect::(); - // let ascii_out = "%EF%BF%BD%01%02%03%04%05%06%07%08%09%0B%0C%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&\"()*+,-./0123456789:;%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F"; - // assert_eq!( - // micromark(&format!("[](<{}>)", ascii_in)), - // format!("

", ascii_out), - // "should support ascii characters" - // ); + assert_eq!( + micromark("[](<🤔>)"), + "
<p><a href=\"%F0%9F%A4%94\"></a></p>
", + "should support an emoji" + ); + + let mut ascii: Vec = vec![]; + let mut code = 0; + + while code < 128 { + // LF and CR can’t be in resources. + if code == 10 || code == 13 { + code += 1; + continue; + } + + // `<`, `>`, `\` need to be escaped. + if code == 60 || code == 62 || code == 92 { + ascii.push('\\'); + } + + ascii.push(char::from_u32(code).unwrap()); + + code += 1; + } + + let ascii_in = ascii.into_iter().collect::(); + let ascii_out = "%EF%BF%BD%01%02%03%04%05%06%07%08%09%0B%0C%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:;%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F"; + assert_eq!( + micromark(&format!("[](<{}>)", ascii_in)), + format!("
<p><a href=\"{}\"></a></p>
", ascii_out), + "should support ascii characters" + ); } -- cgit