From 25e267afbc0789ea36508d45c3ea3545b84223bb Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 18 Aug 2022 18:33:10 +0200 Subject: Add support for GFM autolink literals --- Untitled.txt | 38 ++-- examples/lib.rs | 14 +- readme.md | 2 +- src/compiler.rs | 107 ++++++---- src/construct/attention.rs | 70 +------ src/construct/gfm_autolink_literal.rs | 382 ++++++++++++++++++++++++++++++++++ src/construct/mod.rs | 9 +- src/construct/partial_data.rs | 2 - src/construct/string.rs | 7 +- src/construct/text.rs | 12 +- src/event.rs | 269 ++++++++++++++++-------- src/lib.rs | 21 ++ src/util/classify_character.rs | 72 +++++++ src/util/mod.rs | 1 + tests/gfm_autolink_literal.rs | 256 +++++++++++++++++++++++ 15 files changed, 1040 insertions(+), 222 deletions(-) create mode 100644 src/construct/gfm_autolink_literal.rs create mode 100644 src/util/classify_character.rs create mode 100644 tests/gfm_autolink_literal.rs diff --git a/Untitled.txt b/Untitled.txt index 8238cf7..ca56d67 100644 --- a/Untitled.txt +++ b/Untitled.txt @@ -7,26 +7,22 @@ micromark.js: `atLineEnding` in html (text) should always eat arbitrary whitespa // --------------------- // Useful helper: -let mut index = 0; -let mut balance = 0; -println!("before: {:?}", events.len()); -while index < events.len() { - let event = &events[index]; - if event.event_type == EventType::Exit { - balance -= 1; + let mut index = 0; + let mut balance = 0; + println!("before: {:?}", tokenizer.events.len()); + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + if event.kind == Kind::Exit { + balance -= 1; + } + let prefix = String::from_utf8(vec![b' '; balance * 2]).unwrap(); + println!( + "ev: {}{:?}:{:?} ({:?}): {:?}", + prefix, event.kind, event.name, index, event.link, + ); + if event.kind == Kind::Enter { + balance += 1; + } + index += 1; } - let prefix = String::from_utf8(vec![b' '; balance * 2]).unwrap(); - println!( - "ev: {}{:?}:{:?} ({:?}): {:?}", - prefix, - event.kind, - event.name, - index, - event.link, - ); - if event.event_type == EventType::Enter { - balance += 1; - } - index += 1; -} ``` diff --git a/examples/lib.rs b/examples/lib.rs index 94c2c58..62d7ee4 100644 --- a/examples/lib.rs +++ b/examples/lib.rs @@ -22,7 +22,19 @@ fn main() { ) ); - // Support extensions that are not in CommonMark. + // Support GFM extensions. + println!( + "{}", + micromark_with_options( + "Just a link! https://example.com.", + &Options { + constructs: Constructs::gfm(), + ..Options::default() + } + ) + ); + + // Support other extensions that are not in CommonMark. println!( "{:?}", micromark_with_options( diff --git a/readme.md b/readme.md index 4143e39..f1c33f8 100644 --- a/readme.md +++ b/readme.md @@ -46,7 +46,7 @@ important. 
- [x] (1) frontmatter (yaml, toml) (flow) — [`micromark-extension-frontmatter`](https://github.com/micromark/micromark-extension-frontmatter) -- [ ] (3) autolink literal (GFM) (text) +- [x] (3) autolink literal (GFM) (text) — [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal) - [ ] (3) footnote (GFM) (flow, text) — [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote) diff --git a/src/compiler.rs b/src/compiler.rs index bb08745..ac68504 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -369,6 +369,9 @@ fn exit(context: &mut CompileContext) { Name::DefinitionTitleString => on_exit_definition_title_string(context), Name::Emphasis => on_exit_emphasis(context), Name::Frontmatter => on_exit_frontmatter(context), + Name::GfmAutolinkLiteralProtocol => on_exit_gfm_autolink_literal_protocol(context), + Name::GfmAutolinkLiteralWww => on_exit_gfm_autolink_literal_www(context), + Name::GfmAutolinkLiteralEmail => on_exit_gfm_autolink_literal_email(context), Name::HardBreakEscape | Name::HardBreakTrailing => on_exit_break(context), Name::HeadingAtx => on_exit_heading_atx(context), Name::HeadingAtxSequence => on_exit_heading_atx_sequence(context), @@ -647,47 +650,28 @@ fn on_enter_strong(context: &mut CompileContext) { /// Handle [`Exit`][Kind::Exit]:[`AutolinkEmail`][Name::AutolinkEmail]. fn on_exit_autolink_email(context: &mut CompileContext) { - let slice = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), + generate_autolink( + context, + Some("mailto:"), + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), ); - let value = slice.as_str(); - - if !context.image_alt_inside { - context.push(""); - } - - context.push(&encode(value, context.encode_html)); - - if !context.image_alt_inside { - context.push(""); - } } /// Handle [`Exit`][Kind::Exit]:[`AutolinkProtocol`][Name::AutolinkProtocol]. fn on_exit_autolink_protocol(context: &mut CompileContext) { - let slice = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), + generate_autolink( + context, + None, + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), ); - let value = slice.as_str(); - - if !context.image_alt_inside { - context.push(""); - } - - context.push(&encode(value, context.encode_html)); - - if !context.image_alt_inside { - context.push(""); - } } /// Handle [`Exit`][Kind::Exit]:{[`HardBreakEscape`][Name::HardBreakEscape],[`HardBreakTrailing`][Name::HardBreakTrailing]}. @@ -927,6 +911,37 @@ fn on_exit_frontmatter(context: &mut CompileContext) { context.slurp_one_line_ending = true; } +/// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralProtocol`][Name::GfmAutolinkLiteralProtocol]. +fn on_exit_gfm_autolink_literal_protocol(context: &mut CompileContext) { + generate_autolink( + context, + None, + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + ); +} + +/// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralWww`][Name::GfmAutolinkLiteralWww]. 
+fn on_exit_gfm_autolink_literal_www(context: &mut CompileContext) { + generate_autolink( + context, + Some("http://"), + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + ); +} + +/// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail]. +fn on_exit_gfm_autolink_literal_email(context: &mut CompileContext) { + on_exit_autolink_email(context); +} + /// Handle [`Exit`][Kind::Exit]:[`HeadingAtx`][Name::HeadingAtx]. fn on_exit_heading_atx(context: &mut CompileContext) { let rank = context @@ -1244,3 +1259,25 @@ fn on_exit_thematic_break(context: &mut CompileContext) { context.line_ending_if_needed(); context.push("
"); } + +/// Generate an autolink (used by unicode autolinks and GFM autolink literals). +fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) { + if !context.image_alt_inside { + context.push(""); + } + + context.push(&encode(value, context.encode_html)); + + if !context.image_alt_inside { + context.push(""); + } +} diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 8df0f61..ef960d4 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -62,42 +62,10 @@ use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::unicode::PUNCTUATION; +use crate::util::classify_character::{classify_opt, Kind as CharacterKind}; use crate::util::slice::Slice; use alloc::{string::String, vec, vec::Vec}; -/// Character code kinds. -#[derive(Debug, PartialEq)] -enum CharacterKind { - /// Whitespace. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Whitespace, - /// Punctuation. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^^ ^ ^ ^ - /// ``` - Punctuation, - /// Everything else. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Other, -} - /// Attentention sequence that we can take markers from. #[derive(Debug)] struct Sequence { @@ -192,8 +160,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) .head() .unwrap(); - let before = classify_character(char_before); - let after = classify_character(char_after); + let before = classify_opt(char_before); + let after = classify_opt(char_after); let open = after == CharacterKind::Other || (after == CharacterKind::Punctuation && before != CharacterKind::Other); // To do: GFM strikethrough? @@ -429,35 +397,3 @@ fn match_sequences( next } - -/// Classify whether a character code represents whitespace, punctuation, or -/// something else. -/// -/// Used for attention (emphasis, strong), whose sequences can open or close -/// based on the class of surrounding characters. -/// -/// > 👉 **Note** that eof (`None`) is seen as whitespace. -/// -/// ## References -/// -/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) -fn classify_character(char: Option) -> CharacterKind { - if let Some(char) = char { - // Unicode whitespace. - if char.is_whitespace() { - CharacterKind::Whitespace - } - // Unicode punctuation. - else if PUNCTUATION.contains(&char) { - CharacterKind::Punctuation - } - // Everything else. - else { - CharacterKind::Other - } - } - // EOF. - else { - CharacterKind::Whitespace - } -} diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs new file mode 100644 index 0000000..7fdeb01 --- /dev/null +++ b/src/construct/gfm_autolink_literal.rs @@ -0,0 +1,382 @@ +//! To do. + +use crate::event::{Event, Kind, Name}; +use crate::tokenizer::Tokenizer; +use crate::util::classify_character::{classify, Kind as CharacterKind}; +use crate::util::slice::{Position, Slice}; +use alloc::vec::Vec; +extern crate std; +use core::str; + +// To do: doc al functions. 
+ +pub fn resolve(tokenizer: &mut Tokenizer) { + tokenizer.map.consume(&mut tokenizer.events); + + let mut index = 0; + let mut links = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.kind == Kind::Enter { + if event.name == Name::Link { + links += 1; + } + } else { + if event.name == Name::Data && links == 0 { + let slice = Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event(&tokenizer.events, index), + ); + let bytes = slice.bytes; + let mut byte_index = 0; + let mut replace = Vec::new(); + let mut point = tokenizer.events[index - 1].point.clone(); + let start_index = point.index; + let mut start = 0; + + while byte_index < bytes.len() { + if matches!(bytes[byte_index], b'H' | b'h' | b'W' | b'w' | b'@') { + if let Some(autolink) = peek(bytes, byte_index) { + byte_index = autolink.1; + + // If there is something between the last link + // (or the start) and this link. + if start != autolink.0 { + replace.push(Event { + kind: Kind::Enter, + name: Name::Data, + point: point.clone(), + link: None, + }); + point = point.shift_to( + tokenizer.parse_state.bytes, + start_index + autolink.0, + ); + replace.push(Event { + kind: Kind::Exit, + name: Name::Data, + point: point.clone(), + link: None, + }); + } + + // Add the link. + replace.push(Event { + kind: Kind::Enter, + name: autolink.2.clone(), + point: point.clone(), + link: None, + }); + point = point + .shift_to(tokenizer.parse_state.bytes, start_index + autolink.1); + replace.push(Event { + kind: Kind::Exit, + name: autolink.2.clone(), + point: point.clone(), + link: None, + }); + start = autolink.1; + } + } + + byte_index += 1; + } + + // If there was a link, and we have more bytes left. + if start != 0 && start < bytes.len() { + replace.push(Event { + kind: Kind::Enter, + name: Name::Data, + point: point.clone(), + link: None, + }); + replace.push(Event { + kind: Kind::Exit, + name: Name::Data, + point: event.point.clone(), + link: None, + }); + } + + // If there were links. + if !replace.is_empty() { + tokenizer.map.add(index - 1, 2, replace); + } + } + + if event.name == Name::Link { + links -= 1; + } + } + + index += 1; + } +} + +fn peek(bytes: &[u8], index: usize) -> Option<(usize, usize, Name)> { + // Protocol. + if let Some(protocol_end) = peek_protocol(bytes, index) { + if let Some(domain_end) = peek_domain(bytes, protocol_end, true) { + let end = truncate(bytes, protocol_end, domain_end); + + // Cannot be empty. + if end != protocol_end { + return Some((index, end, Name::GfmAutolinkLiteralProtocol)); + } + } + } + + // Www. + if peek_www(bytes, index).is_some() { + // Note: we discard the `www.` we parsed, we now try to parse it as a domain. + let domain_end = peek_domain(bytes, index, false).unwrap_or(index); + let end = truncate(bytes, index, domain_end); + return Some((index, end, Name::GfmAutolinkLiteralWww)); + } + + // Email. + if bytes[index] == b'@' { + if let Some(start) = peek_atext(bytes, index) { + if let Some(end) = peek_email_domain(bytes, index + 1) { + let end = truncate(bytes, start, end); + return Some((start, end, Name::GfmAutolinkLiteralEmail)); + } + } + } + + None +} + +/// Move past `http://`, `https://`, case-insensitive. 
+fn peek_protocol(bytes: &[u8], mut index: usize) -> Option { + // `http` + if index + 3 < bytes.len() + && matches!(bytes[index], b'H' | b'h') + && matches!(bytes[index + 1], b'T' | b't') + && matches!(bytes[index + 2], b'T' | b't') + && matches!(bytes[index + 3], b'P' | b'p') + { + index += 4; + + // `s`, optional. + if index + 1 < bytes.len() && matches!(bytes[index], b'S' | b's') { + index += 1; + } + + // `://` + if index + 3 < bytes.len() + && bytes[index] == b':' + && bytes[index + 1] == b'/' + && bytes[index + 2] == b'/' + { + return Some(index + 3); + } + } + + None +} + +/// Move past `www.`, case-insensitive. +fn peek_www(bytes: &[u8], index: usize) -> Option { + // `www.` + if index + 3 < bytes.len() + // Source: . + && (index == 0 || matches!(bytes[index - 1], b'\t' | b'\n' | b'\r' | b' ' | b'(' | b'*' | b'_' | b'~')) + && matches!(bytes[index], b'W' | b'w') + && matches!(bytes[index + 1], b'W' | b'w') + && matches!(bytes[index + 2], b'W' | b'w') + && bytes[index + 3] == b'.' + { + Some(index + 4) + } else { + None + } +} + +/// Move past `example.com`. +fn peek_domain(bytes: &[u8], start: usize, allow_short: bool) -> Option { + let mut dots = false; + let mut penultime = false; + let mut last = false; + // To do: expose this from slice? + // To do: do it ourselves? , , , . + let char_indices = str::from_utf8(&bytes[start..]) + .unwrap() + .char_indices() + .collect::>(); + let mut index = 0; + + while index < char_indices.len() { + match char_indices[index].1 { + '_' => last = true, + '.' => { + penultime = last; + last = false; + dots = true; + } + '-' => {} + // Source: . + char if classify(char) == CharacterKind::Other => {} + _ => break, + } + + index += 1; + } + + // No underscores allowed in last two parts. + // A valid domain needs to have at least a dot. + if penultime || last || (!allow_short && !dots) { + None + } else { + // Now peek past `/path?search#hash` (anything except whitespace). + while index < char_indices.len() { + if classify(char_indices[index].1) == CharacterKind::Whitespace { + break; + } + + index += 1; + } + + Some(if index == char_indices.len() { + bytes.len() + } else { + start + char_indices[index].0 + }) + } +} + +/// Move back past `contact`. +fn peek_atext(bytes: &[u8], end: usize) -> Option { + let mut index = end; + + // Take simplified atext. + // See `email_atext` in `autolink.rs` for a similar algorithm. + // Source: . + while index > 0 + && matches!(bytes[index - 1], b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z') + { + index -= 1; + } + + // Do not allow a slash “inside” atext. + // The reference code is a bit weird, but that’s what it results in. + // Source: . + // Other than slash, every preceding character is allowed. + if index == end || (index > 0 && bytes[index - 1] == b'/') { + None + } else { + Some(index) + } +} + +/// Move past `example.com`. +fn peek_email_domain(bytes: &[u8], start: usize) -> Option { + let mut index = start; + let mut dot = false; + + // Move past “domain”. + // The reference code is a bit overly complex as it handles the `@`, of which there may be just one. + // Source: + while index < bytes.len() { + match bytes[index] { + // Alphanumerical, `-`, and `_`. + b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z' => {} + // Dot followed by alphanumerical (not `-` or `_`). + b'.' 
if index + 1 < bytes.len() + && matches!(bytes[index + 1], b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => + { + dot = true; + } + _ => break, + } + + index += 1; + } + + // Domain must not be empty, must include a dot, and must end in alphabetical or `.`. + // Source: . + if index > start && dot && matches!(bytes[index - 1], b'.' | b'A'..=b'Z' | b'a'..=b'z') { + Some(index) + } else { + None + } +} + +/// Split trialing stuff from a URL. +fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { + let mut index = start; + + // Source: + while index < end { + if bytes[index] == b'<' { + end = index; + break; + } + index += 1; + } + + let mut split = end; + + // Move before trailing punctuation. + while split > start { + match bytes[split - 1] { + b'!' | b'"' | b'&' | b'\'' | b')' | b',' | b'.' | b':' | b'<' | b'>' | b'?' | b']' + | b'}' => {} + // Source: . + // Note: we can’t move across actual references, because those have been parsed already. + b';' => { + let mut new_split = split - 1; + // Move back past alphabeticals. + while new_split > start && matches!(bytes[new_split - 1], b'A'..=b'Z' | b'a'..=b'z') + { + new_split -= 1; + } + + // Nonempty character reference: + if new_split > start && bytes[new_split - 1] == b'&' && new_split < split - 1 { + split = new_split - 1; + continue; + } + + // Otherwise it’s just a `;`. + } + _ => break, + } + split -= 1; + } + + // If there was trailing punctuation, try to balance parens. + if split != end { + let mut open = 0; + let mut close = 0; + let mut paren_index = start; + + // Count parens in `url` (not in trail). + while paren_index < split { + match bytes[paren_index] { + b'(' => open += 1, + b')' => close += 1, + _ => {} + } + + paren_index += 1; + } + + let mut trail_index = split; + + // If there are more opening than closing parens, try to balance them + // from the trail. + while open > close && trail_index < end { + if bytes[trail_index] == b')' { + split = trail_index; + close += 1; + } + + trail_index += 1; + } + } + + split +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 1c1c6f7..ba1a0b3 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -28,7 +28,7 @@ //! For example, [code (fenced)][code_fenced] and //! [code (indented)][code_indented] are considered different constructs. //! -//! The following constructs are found in markdown: +//! The following constructs are found in markdown (CommonMark): //! //! * [attention (strong, emphasis)][attention] //! * [autolink][] @@ -40,7 +40,6 @@ //! * [code (indented)][code_indented] //! * [code (text)][code_text] //! * [definition][] -//! * [frontmatter][] //! * [hard break (escape)][hard_break_escape] //! * [heading (atx)][heading_atx] //! * [heading (setext)][heading_setext] @@ -56,6 +55,11 @@ //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][partial_whitespace]. //! +//! The following constructs are extensions found in markdown: +//! +//! * [frontmatter][] +//! * [gfm autolink literal][gfm_autolink_literal] +//! //! There are also several small subroutines typically used in different places: //! //! * [bom][partial_bom] @@ -141,6 +145,7 @@ pub mod definition; pub mod document; pub mod flow; pub mod frontmatter; +pub mod gfm_autolink_literal; pub mod hard_break_escape; pub mod heading_atx; pub mod heading_setext; diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index bc6d7f4..b6f1f47 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -7,7 +7,6 @@ //! 
[text]: crate::construct::text use crate::event::{Kind, Name}; -use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use alloc::vec; @@ -51,7 +50,6 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State { } } - tokenizer.register_resolver_before(ResolveName::Data); State::Ok } diff --git a/src/construct/string.rs b/src/construct/string.rs index 698a51d..dba1ac1 100644 --- a/src/construct/string.rs +++ b/src/construct/string.rs @@ -27,7 +27,6 @@ const MARKERS: [u8; 2] = [b'&', b'\\']; /// ^ /// ```` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.register_resolver(ResolveName::String); tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::StringBefore) } @@ -40,7 +39,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ```` pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Ok, + None => { + tokenizer.register_resolver(ResolveName::Data); + tokenizer.register_resolver(ResolveName::String); + State::Ok + } Some(b'&') => { tokenizer.attempt( State::Next(StateName::StringBefore), diff --git a/src/construct/text.rs b/src/construct/text.rs index 5c13dba..06ba378 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -20,6 +20,7 @@ //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][crate::construct::partial_whitespace]. +use crate::construct::gfm_autolink_literal::resolve as resolve_gfm_autolink_literal; use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; @@ -45,7 +46,6 @@ const MARKERS: [u8; 9] = [ /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.register_resolver(ResolveName::Text); tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::TextBefore) } @@ -58,7 +58,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Ok, + None => { + tokenizer.register_resolver(ResolveName::Data); + tokenizer.register_resolver(ResolveName::Text); + State::Ok + } Some(b'!') => { tokenizer.attempt( State::Next(StateName::TextBefore), @@ -170,4 +174,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { tokenizer.parse_state.constructs.hard_break_trailing, true, ); + + if tokenizer.parse_state.constructs.gfm_autolink_literal { + resolve_gfm_autolink_literal(tokenizer); + } } diff --git a/src/event.rs b/src/event.rs index f2f8ae1..169fdb5 100644 --- a/src/event.rs +++ b/src/event.rs @@ -878,6 +878,148 @@ pub enum Name { /// ^ /// ``` EmphasisText, + // To do: sort. + /// Whole frontmatter. + /// + /// ## Info + /// + /// * **Context**: + /// [document content][crate::construct::document] + /// * **Content model**: + /// [`FrontmatterFence`][Name::FrontmatterFence], + /// [`FrontmatterChunk`][Name::FrontmatterChunk], + /// [`LineEnding`][Name::LineEnding] + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// > | --- + /// ^^^ + /// > | title: Neptune + /// ^^^^^^^^^^^^^^ + /// > | --- + /// ^^^ + /// ``` + Frontmatter, + /// Frontmatter chunk. 
+ /// + /// ## Info + /// + /// * **Context**: + /// [`Frontmatter`][Name::Frontmatter] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// | --- + /// > | title: Neptune + /// ^^^^^^^^^^^^^^ + /// | --- + /// ``` + FrontmatterChunk, + /// Frontmatter fence. + /// + /// ## Info + /// + /// * **Context**: + /// [`Frontmatter`][Name::Frontmatter] + /// * **Content model**: + /// [`FrontmatterSequence`][Name::FrontmatterSequence], + /// [`SpaceOrTab`][Name::SpaceOrTab] + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// > | --- + /// ^^^ + /// | title: Neptune + /// > | --- + /// ^^^ + /// ``` + FrontmatterFence, + /// Frontmatter sequence. + /// + /// ## Info + /// + /// * **Context**: + /// [`FrontmatterFence`][Name::FrontmatterFence] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// > | --- + /// ^^^ + /// | title: Neptune + /// > | --- + /// ^^^ + /// ``` + FrontmatterSequence, + /// GFM extension: email autolink. + /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// void. + /// * **Construct**: + /// [`gfm_autolink_literal`][crate::construct::gfm_autolink_literal] + /// + /// ## Example + /// + /// ```markdown + /// > | context@example.com + /// ^^^^^^^^^^^^^^^^^^^ + /// ``` + GfmAutolinkLiteralEmail, + /// GFM extension: autolink w/ protocol. + /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// void. + /// * **Construct**: + /// [`gfm_autolink_literal`][crate::construct::gfm_autolink_literal] + /// + /// ## Example + /// + /// ```markdown + /// > | https://example.com + /// ^^^^^^^^^^^^^^^^^^^ + /// ``` + GfmAutolinkLiteralProtocol, + /// GFM extension: autolink w/ www. + /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// void. + /// * **Construct**: + /// [`gfm_autolink_literal`][crate::construct::gfm_autolink_literal] + /// + /// ## Example + /// + /// ```markdown + /// > | www.example.com + /// ^^^^^^^^^^^^^^^ + /// ``` + GfmAutolinkLiteralWww, /// Whole hard break (escape). /// /// ## Info @@ -1832,98 +1974,10 @@ pub enum Name { /// ^ ^ ^ /// ``` ThematicBreakSequence, - - /// Whole frontmatter. - /// - /// ## Info - /// - /// * **Context**: - /// [document content][crate::construct::document] - /// * **Content model**: - /// [`FrontmatterFence`][Name::FrontmatterFence], - /// [`FrontmatterChunk`][Name::FrontmatterChunk], - /// [`LineEnding`][Name::LineEnding] - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// > | --- - /// ^^^ - /// > | title: Neptune - /// ^^^^^^^^^^^^^^ - /// > | --- - /// ^^^ - /// ```` - Frontmatter, - /// Frontmatter chunk. - /// - /// ## Info - /// - /// * **Context**: - /// [`Frontmatter`][Name::Frontmatter] - /// * **Content model**: - /// void - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// | --- - /// > | title: Neptune - /// ^^^^^^^^^^^^^^ - /// | --- - /// ```` - FrontmatterChunk, - /// Frontmatter fence. 
- /// - /// ## Info - /// - /// * **Context**: - /// [`Frontmatter`][Name::Frontmatter] - /// * **Content model**: - /// [`FrontmatterSequence`][Name::FrontmatterSequence], - /// [`SpaceOrTab`][Name::SpaceOrTab] - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// > | --- - /// ^^^ - /// | title: Neptune - /// > | --- - /// ^^^ - /// ```` - FrontmatterFence, - /// Frontmatter sequence. - /// - /// ## Info - /// - /// * **Context**: - /// [`FrontmatterFence`][Name::FrontmatterFence] - /// * **Content model**: - /// void - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// > | --- - /// ^^^ - /// | title: Neptune - /// > | --- - /// ^^^ - /// ```` - FrontmatterSequence, } /// List of void events, used to make sure everything is working well. -pub const VOID_EVENTS: [Name; 43] = [ +pub const VOID_EVENTS: [Name; 46] = [ Name::AttentionSequence, Name::AutolinkEmail, Name::AutolinkMarker, @@ -1949,6 +2003,9 @@ pub const VOID_EVENTS: [Name; 43] = [ Name::DefinitionTitleMarker, Name::EmphasisSequence, Name::FrontmatterChunk, + Name::GfmAutolinkLiteralEmail, + Name::GfmAutolinkLiteralProtocol, + Name::GfmAutolinkLiteralWww, Name::FrontmatterSequence, Name::HardBreakEscape, Name::HardBreakTrailing, @@ -2013,6 +2070,40 @@ pub struct Point { pub vs: usize, } +impl Point { + /// Create a new point, that is shifted from the close earlier current + /// point, to `index.` + // To do: tabs. + pub fn shift_to(&self, bytes: &[u8], index: usize) -> Point { + let mut next = self.clone(); + debug_assert!(index > next.index, "expect"); + + while next.index < index { + match bytes[next.index] { + b'\n' | b'\r' => unreachable!("cannot move past line endings"), + b'\t' => { + unreachable!("to do: tab") + // let remainder = next.column % TAB_SIZE; + // let vs = if remainder == 0 { + // 0 + // } else { + // TAB_SIZE - remainder + // }; + + // next.index += 1; + // next.column += 1 + vs; + } + _ => { + next.index += 1; + next.column += 1; + } + } + } + + next + } +} + /// Event kinds. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Kind { diff --git a/src/lib.rs b/src/lib.rs index 00f1c5c..ba257db 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -166,6 +166,13 @@ pub struct Constructs { /// ^^^ /// ```` pub frontmatter: bool, + /// GFM: autolink literal. + /// + /// ```markdown + /// > | https://example.com + /// ^^^^^^^^^^^^^^^^^^^ + /// ``` + pub gfm_autolink_literal: bool, /// Hard break (escape). /// /// ```markdown @@ -263,6 +270,7 @@ impl Default for Constructs { code_text: true, definition: true, frontmatter: false, + gfm_autolink_literal: false, hard_break_escape: true, hard_break_trailing: true, heading_atx: true, @@ -278,6 +286,19 @@ impl Default for Constructs { } } +impl Constructs { + /// GFM. + /// + /// This turns on `CommonMark` + GFM. + #[must_use] + pub fn gfm() -> Self { + Self { + gfm_autolink_literal: true, + ..Self::default() + } + } +} + /// Configuration (optional). #[derive(Clone, Debug, Default)] pub struct Options { diff --git a/src/util/classify_character.rs b/src/util/classify_character.rs new file mode 100644 index 0000000..b938502 --- /dev/null +++ b/src/util/classify_character.rs @@ -0,0 +1,72 @@ +//! Utilities to classify characters as whitespace, punctuation, or rest. + +use crate::unicode::PUNCTUATION; + +/// Character kinds. +#[derive(Debug, PartialEq, Eq)] +pub enum Kind { + /// Whitespace. 
+ /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Whitespace, + /// Punctuation. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^^ ^ ^ ^ + /// ``` + Punctuation, + /// Everything else. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Other, +} + +/// Classify whether a character code represents whitespace, punctuation, or +/// something else. +/// +/// Used for attention (emphasis, strong), whose sequences can open or close +/// based on the class of surrounding characters. +/// +/// > 👉 **Note** that eof (`None`) is seen as whitespace. +/// +/// ## References +/// +/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) +pub fn classify(char: char) -> Kind { + // Unicode whitespace. + if char.is_whitespace() { + Kind::Whitespace + } + // Unicode punctuation. + else if PUNCTUATION.contains(&char) { + Kind::Punctuation + } + // Everything else. + else { + Kind::Other + } +} + +/// Like [`classify`], but supports eof as whitespace. +pub fn classify_opt(char_opt: Option) -> Kind { + if let Some(char) = char_opt { + classify(char) + } + // EOF. + else { + Kind::Whitespace + } +} diff --git a/src/util/mod.rs b/src/util/mod.rs index f51845c..022c7d6 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@ //! Utilities used when processing markdown. +pub mod classify_character; pub mod decode_character_reference; pub mod edit_map; pub mod encode; diff --git a/tests/gfm_autolink_literal.rs b/tests/gfm_autolink_literal.rs new file mode 100644 index 0000000..9551751 --- /dev/null +++ b/tests/gfm_autolink_literal.rs @@ -0,0 +1,256 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, Constructs, Options}; +use pretty_assertions::assert_eq; + +#[test] +fn gfm_autolink_literal() { + let gfm = Options { + constructs: Constructs::gfm(), + ..Options::default() + }; + + assert_eq!( + micromark("https://example.com"), + "

<p>https://example.com</p>",
+        "should ignore protocol urls by default"
+    );
+    assert_eq!(
+        micromark("www.example.com"),
+        "<p>www.example.com</p>",
+        "should ignore www urls by default"
+    );
+    assert_eq!(
+        micromark("user@example.com"),
+        "<p>user@example.com</p>",
+        "should ignore email urls by default"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://example.com", &gfm),
+        "<p><a href=\"https://example.com\">https://example.com</a></p>",
+        "should support protocol urls if enabled"
+    );
+    assert_eq!(
+        micromark_with_options("www.example.com", &gfm),
+        "<p><a href=\"http://www.example.com\">www.example.com</a></p>",
+        "should support www urls if enabled"
+    );
+    assert_eq!(
+        micromark_with_options("user@example.com", &gfm),
+        "<p><a href=\"mailto:user@example.com\">user@example.com</a></p>",
+        "should support email urls if enabled"
+    );
+
+    assert_eq!(
+        micromark_with_options("user@example.com", &gfm),

+        "<p><a href=\"mailto:user@example.com\">user@example.com</a></p>",
+        "should support a closing paren at TLD (email)"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.a.)", &gfm),
+        "<p><a href=\"http://www.a\">www.a</a>.)</p>",
+        "should support a closing paren at TLD (www)"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.a b", &gfm),
+        "<p><a href=\"http://www.a\">www.a</a> b</p>",
+        "should support no TLD"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.a/b c", &gfm),
+        "<p><a href=\"http://www.a/b\">www.a/b</a> c</p>",
+        "should support a path instead of TLD"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.�a", &gfm),
+        "<p><a href=\"http://www.%EF%BF%BDa\">www.�a</a></p>",
+        "should support a replacement character in a domain"
+    );
+
+    assert_eq!(
+        micromark_with_options("http://點看.com", &gfm),

+        "<p><a href=\"http://%E9%BB%9E%E7%9C%8B.com\">http://點看.com</a></p>",
+        "should support non-ascii characters in a domain (http)"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.點看.com", &gfm),
+        "<p><a href=\"http://www.%E9%BB%9E%E7%9C%8B.com\">www.點看.com</a></p>",
+        "should support non-ascii characters in a domain (www)"
+    );
+
+    assert_eq!(
+        micromark_with_options("點看@example.com", &gfm),
+        "<p>點看@example.com</p>",
+        "should *not* support non-ascii characters in atext (email)"
+    );
+
+    assert_eq!(
+        micromark_with_options("example@點看.com", &gfm),
+        "<p>example@點看.com</p>",
+        "should *not* support non-ascii characters in a domain (email)"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.a.com/點看", &gfm),
+        "<p><a href=\"http://www.a.com/%E9%BB%9E%E7%9C%8B\">www.a.com/點看</a></p>",
+        "should support non-ascii characters in a path"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.-a.b", &gfm),

+        "<p><a href=\"http://www.-a.b\">www.-a.b</a></p>",
+        "should support a dash to start a domain"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.$", &gfm),
+        "<p><a href=\"http://www.$\">www.$</a></p>",
+        "should support a dollar as a domain name"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.a..b.c", &gfm),
+        "<p><a href=\"http://www.a..b.c\">www.a..b.c</a></p>",
+        "should support adjacent dots in a domain name"
+    );
+
+    assert_eq!(
+        micromark_with_options("www.a&a;", &gfm),
+        "<p><a href=\"http://www.a\">www.a</a>&amp;a;</p>",
+        "should support named character references in domains"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://a.bc/d/e/).", &gfm),

+        "<p><a href=\"https://a.bc/d/e/\">https://a.bc/d/e/</a>).</p>",
+        "should support a closing paren and period after a path"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://a.bc/d/e/.)", &gfm),
+        "<p><a href=\"https://a.bc/d/e/\">https://a.bc/d/e/</a>.)</p>",
+        "should support a period and closing paren after a path"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://a.bc).", &gfm),
+        "<p><a href=\"https://a.bc\">https://a.bc</a>).</p>",
+        "should support a closing paren and period after a domain"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://a.bc.)", &gfm),
+        "<p><a href=\"https://a.bc\">https://a.bc</a>.)</p>",
+        "should support a period and closing paren after a domain"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://a.bc).d", &gfm),
+        "<p><a href=\"https://a.bc).d\">https://a.bc).d</a></p>",
+        "should support a closing paren and period in a path"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://a.bc.)d", &gfm),
+        "<p><a href=\"https://a.bc.)d\">https://a.bc.)d</a></p>",
+        "should support a period and closing paren in a path"
+    );
+
+    assert_eq!(
+        micromark_with_options("https://a.bc/))d", &gfm),
+        "<p><a href=\"https://a.bc/))d\">https://a.bc/))d</a></p>",
+        "should support two closing parens in a path"
+    );
+
+    assert_eq!(
+        micromark_with_options("ftp://a/b/c.txt", &gfm),

+        "<p>ftp://a/b/c.txt</p>",
+        "should not support ftp links"
+    );
+
+    // Note: GH comments/issues/PRs do not link this, but Gists/readmes do.
+    // Fixing it would mean deviating from `cmark-gfm`:
+    // Source: .
+    // assert_eq!(
+    //     micromark_with_options(",www.example.com", &gfm),
+    //     "<p>,<a href=\"http://www.example.com\">www.example.com</a></p>",
+    //     "should support www links after Unicode punctuation",
+    // );
+
+    assert_eq!(
+        micromark_with_options(",https://example.com", &gfm),
+        "<p>,<a href=\"https://example.com\">https://example.com</a></p>",
+        "should support http links after Unicode punctuation"
+    );
+
+    assert_eq!(
+        micromark_with_options(",example@example.com", &gfm),
+        "<p>,<a href=\"mailto:example@example.com\">example@example.com</a></p>",
+        "should support email links after Unicode punctuation"
+    );
+
+    assert_eq!(
+        micromark_with_options(
+            "http://user:password@host:port/path?key=value#fragment",
+            &gfm
+        ),
+        "<p><a href=\"http://user:password@host:port/path?key=value#fragment\">http://user:password@host:port/path?key=value#fragment</a></p>",
+        "should not link character reference for `:`"
+    );
+
+    assert_eq!(
+        micromark_with_options("http://example.com/ab<cd", &gfm),
+        "<p><a href=\"http://example.com/ab\">http://example.com/ab</a>&lt;cd</p>",
+        "should stop domains/paths at `<`"
+    );
+
+    assert_eq!(
+        micromark_with_options(
+            r###"
+[ www.example.com
+
+[ https://example.com
+
+[ contact@example.com
+
+[ www.example.com ]
+
+[ https://example.com ]
+
+[ contact@example.com ]
+
+[ www.example.com ](#)
+
+[ https://example.com ](#)
+
+[ contact@example.com ](#)
+
+![ www.example.com ](#)
+
+![ https://example.com ](#)
+
+![ contact@example.com ](#)
+"###,
+            &gfm
+        ),

+        r###"<p>[ <a href="http://www.example.com">www.example.com</a></p>
+<p>[ <a href="https://example.com">https://example.com</a></p>
+<p>[ <a href="mailto:contact@example.com">contact@example.com</a></p>
+<p>[ <a href="http://www.example.com">www.example.com</a> ]</p>
+<p>[ <a href="https://example.com">https://example.com</a> ]</p>
+<p>[ <a href="mailto:contact@example.com">contact@example.com</a> ]</p>
+<p><a href="#"> www.example.com </a></p>
+<p><a href="#"> https://example.com </a></p>
+<p><a href="#"> contact@example.com </a></p>
+<p><img src="#" alt=" www.example.com " /></p>
+<p><img src="#" alt=" https://example.com " /></p>
+<p><img src="#" alt=" contact@example.com " /></p>
+"###,
+        "should interplay with brackets, links, and images"
+    );
+}
-- 
cgit
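For quick reference, a minimal usage sketch of the option introduced by this patch, mirroring the `examples/lib.rs` hunk above; it only uses API that the patch itself adds (`Constructs::gfm()`, `micromark_with_options`), and the exact HTML produced is what `tests/gfm_autolink_literal.rs` asserts.

```rust
use micromark::{micromark_with_options, Constructs, Options};

fn main() {
    // Enable CommonMark plus the GFM constructs added in this patch
    // (currently only the autolink literal extension).
    let options = Options {
        constructs: Constructs::gfm(),
        ..Options::default()
    };

    // Bare protocol, `www.`, and email URLs become links when GFM is on.
    println!(
        "{}",
        micromark_with_options("Just a link! https://example.com.", &options)
    );
}
```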