diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-08-18 18:33:10 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-08-18 18:33:17 +0200 |
commit | 25e267afbc0789ea36508d45c3ea3545b84223bb (patch) | |
tree | 8dee2a78ad1df29e9df7cf151091a5d265fd7ecb | |
parent | 1dbf02d8c1955316c6cc43a427f506b91c87ef3a (diff) | |
download | markdown-rs-25e267afbc0789ea36508d45c3ea3545b84223bb.tar.gz markdown-rs-25e267afbc0789ea36508d45c3ea3545b84223bb.tar.bz2 markdown-rs-25e267afbc0789ea36508d45c3ea3545b84223bb.zip |
Add support for GFM autolink literals
-rw-r--r-- | Untitled.txt | 38 | ||||
-rw-r--r-- | examples/lib.rs | 14 | ||||
-rw-r--r-- | readme.md | 2 | ||||
-rw-r--r-- | src/compiler.rs | 107 | ||||
-rw-r--r-- | src/construct/attention.rs | 70 | ||||
-rw-r--r-- | src/construct/gfm_autolink_literal.rs | 382 | ||||
-rw-r--r-- | src/construct/mod.rs | 9 | ||||
-rw-r--r-- | src/construct/partial_data.rs | 2 | ||||
-rw-r--r-- | src/construct/string.rs | 7 | ||||
-rw-r--r-- | src/construct/text.rs | 12 | ||||
-rw-r--r-- | src/event.rs | 269 | ||||
-rw-r--r-- | src/lib.rs | 21 | ||||
-rw-r--r-- | src/util/classify_character.rs | 72 | ||||
-rw-r--r-- | src/util/mod.rs | 1 | ||||
-rw-r--r-- | tests/gfm_autolink_literal.rs | 256 |
15 files changed, 1040 insertions, 222 deletions
diff --git a/Untitled.txt b/Untitled.txt index 8238cf7..ca56d67 100644 --- a/Untitled.txt +++ b/Untitled.txt @@ -7,26 +7,22 @@ micromark.js: `atLineEnding` in html (text) should always eat arbitrary whitespa // --------------------- // Useful helper: -let mut index = 0; -let mut balance = 0; -println!("before: {:?}", events.len()); -while index < events.len() { - let event = &events[index]; - if event.event_type == EventType::Exit { - balance -= 1; + let mut index = 0; + let mut balance = 0; + println!("before: {:?}", tokenizer.events.len()); + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + if event.kind == Kind::Exit { + balance -= 1; + } + let prefix = String::from_utf8(vec![b' '; balance * 2]).unwrap(); + println!( + "ev: {}{:?}:{:?} ({:?}): {:?}", + prefix, event.kind, event.name, index, event.link, + ); + if event.kind == Kind::Enter { + balance += 1; + } + index += 1; } - let prefix = String::from_utf8(vec![b' '; balance * 2]).unwrap(); - println!( - "ev: {}{:?}:{:?} ({:?}): {:?}", - prefix, - event.kind, - event.name, - index, - event.link, - ); - if event.event_type == EventType::Enter { - balance += 1; - } - index += 1; -} ``` diff --git a/examples/lib.rs b/examples/lib.rs index 94c2c58..62d7ee4 100644 --- a/examples/lib.rs +++ b/examples/lib.rs @@ -22,7 +22,19 @@ fn main() { ) ); - // Support extensions that are not in CommonMark. + // Support GFM extensions. + println!( + "{}", + micromark_with_options( + "Just a link! https://example.com.", + &Options { + constructs: Constructs::gfm(), + ..Options::default() + } + ) + ); + + // Support other extensions that are not in CommonMark. println!( "{:?}", micromark_with_options( @@ -46,7 +46,7 @@ important. 
- [x] (1) frontmatter (yaml, toml) (flow) — [`micromark-extension-frontmatter`](https://github.com/micromark/micromark-extension-frontmatter) -- [ ] (3) autolink literal (GFM) (text) +- [x] (3) autolink literal (GFM) (text) — [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal) - [ ] (3) footnote (GFM) (flow, text) — [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote) diff --git a/src/compiler.rs b/src/compiler.rs index bb08745..ac68504 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -369,6 +369,9 @@ fn exit(context: &mut CompileContext) { Name::DefinitionTitleString => on_exit_definition_title_string(context), Name::Emphasis => on_exit_emphasis(context), Name::Frontmatter => on_exit_frontmatter(context), + Name::GfmAutolinkLiteralProtocol => on_exit_gfm_autolink_literal_protocol(context), + Name::GfmAutolinkLiteralWww => on_exit_gfm_autolink_literal_www(context), + Name::GfmAutolinkLiteralEmail => on_exit_gfm_autolink_literal_email(context), Name::HardBreakEscape | Name::HardBreakTrailing => on_exit_break(context), Name::HeadingAtx => on_exit_heading_atx(context), Name::HeadingAtxSequence => on_exit_heading_atx_sequence(context), @@ -647,47 +650,28 @@ fn on_enter_strong(context: &mut CompileContext) { /// Handle [`Exit`][Kind::Exit]:[`AutolinkEmail`][Name::AutolinkEmail]. 
fn on_exit_autolink_email(context: &mut CompileContext) { - let slice = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), + generate_autolink( + context, + Some("mailto:"), + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), ); - let value = slice.as_str(); - - if !context.image_alt_inside { - context.push("<a href=\""); - context.push(&sanitize_uri( - &format!("mailto:{}", value), - &context.protocol_href, - )); - context.push("\">"); - } - - context.push(&encode(value, context.encode_html)); - - if !context.image_alt_inside { - context.push("</a>"); - } } /// Handle [`Exit`][Kind::Exit]:[`AutolinkProtocol`][Name::AutolinkProtocol]. fn on_exit_autolink_protocol(context: &mut CompileContext) { - let slice = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), + generate_autolink( + context, + None, + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), ); - let value = slice.as_str(); - - if !context.image_alt_inside { - context.push("<a href=\""); - context.push(&sanitize_uri(value, &context.protocol_href)); - context.push("\">"); - } - - context.push(&encode(value, context.encode_html)); - - if !context.image_alt_inside { - context.push("</a>"); - } } /// Handle [`Exit`][Kind::Exit]:{[`HardBreakEscape`][Name::HardBreakEscape],[`HardBreakTrailing`][Name::HardBreakTrailing]}. @@ -927,6 +911,37 @@ fn on_exit_frontmatter(context: &mut CompileContext) { context.slurp_one_line_ending = true; } +/// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralProtocol`][Name::GfmAutolinkLiteralProtocol]. 
+fn on_exit_gfm_autolink_literal_protocol(context: &mut CompileContext) { + generate_autolink( + context, + None, + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + ); +} + +/// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralWww`][Name::GfmAutolinkLiteralWww]. +fn on_exit_gfm_autolink_literal_www(context: &mut CompileContext) { + generate_autolink( + context, + Some("http://"), + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + ); +} + +/// Handle [`Exit`][Kind::Exit]:[`GfmAutolinkLiteralEmail`][Name::GfmAutolinkLiteralEmail]. +fn on_exit_gfm_autolink_literal_email(context: &mut CompileContext) { + on_exit_autolink_email(context); +} + /// Handle [`Exit`][Kind::Exit]:[`HeadingAtx`][Name::HeadingAtx]. fn on_exit_heading_atx(context: &mut CompileContext) { let rank = context @@ -1244,3 +1259,25 @@ fn on_exit_thematic_break(context: &mut CompileContext) { context.line_ending_if_needed(); context.push("<hr />"); } + +/// Generate an autolink (used by unicode autolinks and GFM autolink literals). 
+fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) { + if !context.image_alt_inside { + context.push("<a href=\""); + if let Some(protocol) = protocol { + context.push(&sanitize_uri( + &format!("{}{}", protocol, value), + &context.protocol_href, + )); + } else { + context.push(&sanitize_uri(value, &context.protocol_href)); + }; + context.push("\">"); + } + + context.push(&encode(value, context.encode_html)); + + if !context.image_alt_inside { + context.push("</a>"); + } +} diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 8df0f61..ef960d4 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -62,42 +62,10 @@ use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::unicode::PUNCTUATION; +use crate::util::classify_character::{classify_opt, Kind as CharacterKind}; use crate::util::slice::Slice; use alloc::{string::String, vec, vec::Vec}; -/// Character code kinds. -#[derive(Debug, PartialEq)] -enum CharacterKind { - /// Whitespace. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Whitespace, - /// Punctuation. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^^ ^ ^ ^ - /// ``` - Punctuation, - /// Everything else. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Other, -} - /// Attentention sequence that we can take markers from. 
#[derive(Debug)] struct Sequence { @@ -192,8 +160,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) .head() .unwrap(); - let before = classify_character(char_before); - let after = classify_character(char_after); + let before = classify_opt(char_before); + let after = classify_opt(char_after); let open = after == CharacterKind::Other || (after == CharacterKind::Punctuation && before != CharacterKind::Other); // To do: GFM strikethrough? @@ -429,35 +397,3 @@ fn match_sequences( next } - -/// Classify whether a character code represents whitespace, punctuation, or -/// something else. -/// -/// Used for attention (emphasis, strong), whose sequences can open or close -/// based on the class of surrounding characters. -/// -/// > 👉 **Note** that eof (`None`) is seen as whitespace. -/// -/// ## References -/// -/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) -fn classify_character(char: Option<char>) -> CharacterKind { - if let Some(char) = char { - // Unicode whitespace. - if char.is_whitespace() { - CharacterKind::Whitespace - } - // Unicode punctuation. - else if PUNCTUATION.contains(&char) { - CharacterKind::Punctuation - } - // Everything else. - else { - CharacterKind::Other - } - } - // EOF. - else { - CharacterKind::Whitespace - } -} diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs new file mode 100644 index 0000000..7fdeb01 --- /dev/null +++ b/src/construct/gfm_autolink_literal.rs @@ -0,0 +1,382 @@ +//! To do. + +use crate::event::{Event, Kind, Name}; +use crate::tokenizer::Tokenizer; +use crate::util::classify_character::{classify, Kind as CharacterKind}; +use crate::util::slice::{Position, Slice}; +use alloc::vec::Vec; +extern crate std; +use core::str; + +// To do: doc al functions. 
+ +pub fn resolve(tokenizer: &mut Tokenizer) { + tokenizer.map.consume(&mut tokenizer.events); + + let mut index = 0; + let mut links = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.kind == Kind::Enter { + if event.name == Name::Link { + links += 1; + } + } else { + if event.name == Name::Data && links == 0 { + let slice = Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event(&tokenizer.events, index), + ); + let bytes = slice.bytes; + let mut byte_index = 0; + let mut replace = Vec::new(); + let mut point = tokenizer.events[index - 1].point.clone(); + let start_index = point.index; + let mut start = 0; + + while byte_index < bytes.len() { + if matches!(bytes[byte_index], b'H' | b'h' | b'W' | b'w' | b'@') { + if let Some(autolink) = peek(bytes, byte_index) { + byte_index = autolink.1; + + // If there is something between the last link + // (or the start) and this link. + if start != autolink.0 { + replace.push(Event { + kind: Kind::Enter, + name: Name::Data, + point: point.clone(), + link: None, + }); + point = point.shift_to( + tokenizer.parse_state.bytes, + start_index + autolink.0, + ); + replace.push(Event { + kind: Kind::Exit, + name: Name::Data, + point: point.clone(), + link: None, + }); + } + + // Add the link. + replace.push(Event { + kind: Kind::Enter, + name: autolink.2.clone(), + point: point.clone(), + link: None, + }); + point = point + .shift_to(tokenizer.parse_state.bytes, start_index + autolink.1); + replace.push(Event { + kind: Kind::Exit, + name: autolink.2.clone(), + point: point.clone(), + link: None, + }); + start = autolink.1; + } + } + + byte_index += 1; + } + + // If there was a link, and we have more bytes left. 
+ if start != 0 && start < bytes.len() { + replace.push(Event { + kind: Kind::Enter, + name: Name::Data, + point: point.clone(), + link: None, + }); + replace.push(Event { + kind: Kind::Exit, + name: Name::Data, + point: event.point.clone(), + link: None, + }); + } + + // If there were links. + if !replace.is_empty() { + tokenizer.map.add(index - 1, 2, replace); + } + } + + if event.name == Name::Link { + links -= 1; + } + } + + index += 1; + } +} + +fn peek(bytes: &[u8], index: usize) -> Option<(usize, usize, Name)> { + // Protocol. + if let Some(protocol_end) = peek_protocol(bytes, index) { + if let Some(domain_end) = peek_domain(bytes, protocol_end, true) { + let end = truncate(bytes, protocol_end, domain_end); + + // Cannot be empty. + if end != protocol_end { + return Some((index, end, Name::GfmAutolinkLiteralProtocol)); + } + } + } + + // Www. + if peek_www(bytes, index).is_some() { + // Note: we discard the `www.` we parsed, we now try to parse it as a domain. + let domain_end = peek_domain(bytes, index, false).unwrap_or(index); + let end = truncate(bytes, index, domain_end); + return Some((index, end, Name::GfmAutolinkLiteralWww)); + } + + // Email. + if bytes[index] == b'@' { + if let Some(start) = peek_atext(bytes, index) { + if let Some(end) = peek_email_domain(bytes, index + 1) { + let end = truncate(bytes, start, end); + return Some((start, end, Name::GfmAutolinkLiteralEmail)); + } + } + } + + None +} + +/// Move past `http://`, `https://`, case-insensitive. +fn peek_protocol(bytes: &[u8], mut index: usize) -> Option<usize> { + // `http` + if index + 3 < bytes.len() + && matches!(bytes[index], b'H' | b'h') + && matches!(bytes[index + 1], b'T' | b't') + && matches!(bytes[index + 2], b'T' | b't') + && matches!(bytes[index + 3], b'P' | b'p') + { + index += 4; + + // `s`, optional. 
+ if index + 1 < bytes.len() && matches!(bytes[index], b'S' | b's') { + index += 1; + } + + // `://` + if index + 3 < bytes.len() + && bytes[index] == b':' + && bytes[index + 1] == b'/' + && bytes[index + 2] == b'/' + { + return Some(index + 3); + } + } + + None +} + +/// Move past `www.`, case-insensitive. +fn peek_www(bytes: &[u8], index: usize) -> Option<usize> { + // `www.` + if index + 3 < bytes.len() + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. + && (index == 0 || matches!(bytes[index - 1], b'\t' | b'\n' | b'\r' | b' ' | b'(' | b'*' | b'_' | b'~')) + && matches!(bytes[index], b'W' | b'w') + && matches!(bytes[index + 1], b'W' | b'w') + && matches!(bytes[index + 2], b'W' | b'w') + && bytes[index + 3] == b'.' + { + Some(index + 4) + } else { + None + } +} + +/// Move past `example.com`. +fn peek_domain(bytes: &[u8], start: usize, allow_short: bool) -> Option<usize> { + let mut dots = false; + let mut penultime = false; + let mut last = false; + // To do: expose this from slice? + // To do: do it ourselves? <https://github.com/commonmark/cmark/blob/8a023286198a7e408398e282f293e3b0baebb644/src/utf8.c#L150>, <https://doc.rust-lang.org/core/str/fn.next_code_point.html>, <https://www.reddit.com/r/rust/comments/4g2zu0/lazy_unicode_iterator_from_byte_iteratorslice/>, <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>. + let char_indices = str::from_utf8(&bytes[start..]) + .unwrap() + .char_indices() + .collect::<Vec<_>>(); + let mut index = 0; + + while index < char_indices.len() { + match char_indices[index].1 { + '_' => last = true, + '.' => { + penultime = last; + last = false; + dots = true; + } + '-' => {} + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. + char if classify(char) == CharacterKind::Other => {} + _ => break, + } + + index += 1; + } + + // No underscores allowed in last two parts. + // A valid domain needs to have at least a dot. 
+ if penultime || last || (!allow_short && !dots) { + None + } else { + // Now peek past `/path?search#hash` (anything except whitespace). + while index < char_indices.len() { + if classify(char_indices[index].1) == CharacterKind::Whitespace { + break; + } + + index += 1; + } + + Some(if index == char_indices.len() { + bytes.len() + } else { + start + char_indices[index].0 + }) + } +} + +/// Move back past `contact`. +fn peek_atext(bytes: &[u8], end: usize) -> Option<usize> { + let mut index = end; + + // Take simplified atext. + // See `email_atext` in `autolink.rs` for a similar algorithm. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L301>. + while index > 0 + && matches!(bytes[index - 1], b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z') + { + index -= 1; + } + + // Do not allow a slash “inside” atext. + // The reference code is a bit weird, but that’s what it results in. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L307>. + // Other than slash, every preceding character is allowed. + if index == end || (index > 0 && bytes[index - 1] == b'/') { + None + } else { + Some(index) + } +} + +/// Move past `example.com`. +fn peek_email_domain(bytes: &[u8], start: usize) -> Option<usize> { + let mut index = start; + let mut dot = false; + + // Move past “domain”. + // The reference code is a bit overly complex as it handles the `@`, of which there may be just one. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L318> + while index < bytes.len() { + match bytes[index] { + // Alphanumerical, `-`, and `_`. + b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z' => {} + // Dot followed by alphanumerical (not `-` or `_`). + b'.' 
if index + 1 < bytes.len() + && matches!(bytes[index + 1], b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => + { + dot = true; + } + _ => break, + } + + index += 1; + } + + // Domain must not be empty, must include a dot, and must end in alphabetical or `.`. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L332>. + if index > start && dot && matches!(bytes[index - 1], b'.' | b'A'..=b'Z' | b'a'..=b'z') { + Some(index) + } else { + None + } +} + +/// Split trialing stuff from a URL. +fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { + let mut index = start; + + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L42> + while index < end { + if bytes[index] == b'<' { + end = index; + break; + } + index += 1; + } + + let mut split = end; + + // Move before trailing punctuation. + while split > start { + match bytes[split - 1] { + b'!' | b'"' | b'&' | b'\'' | b')' | b',' | b'.' | b':' | b'<' | b'>' | b'?' | b']' + | b'}' => {} + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L61>. + // Note: we can’t move across actual references, because those have been parsed already. + b';' => { + let mut new_split = split - 1; + // Move back past alphabeticals. + while new_split > start && matches!(bytes[new_split - 1], b'A'..=b'Z' | b'a'..=b'z') + { + new_split -= 1; + } + + // Nonempty character reference: + if new_split > start && bytes[new_split - 1] == b'&' && new_split < split - 1 { + split = new_split - 1; + continue; + } + + // Otherwise it’s just a `;`. + } + _ => break, + } + split -= 1; + } + + // If there was trailing punctuation, try to balance parens. + if split != end { + let mut open = 0; + let mut close = 0; + let mut paren_index = start; + + // Count parens in `url` (not in trail). 
+ while paren_index < split { + match bytes[paren_index] { + b'(' => open += 1, + b')' => close += 1, + _ => {} + } + + paren_index += 1; + } + + let mut trail_index = split; + + // If there are more opening than closing parens, try to balance them + // from the trail. + while open > close && trail_index < end { + if bytes[trail_index] == b')' { + split = trail_index; + close += 1; + } + + trail_index += 1; + } + } + + split +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 1c1c6f7..ba1a0b3 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -28,7 +28,7 @@ //! For example, [code (fenced)][code_fenced] and //! [code (indented)][code_indented] are considered different constructs. //! -//! The following constructs are found in markdown: +//! The following constructs are found in markdown (CommonMark): //! //! * [attention (strong, emphasis)][attention] //! * [autolink][] @@ -40,7 +40,6 @@ //! * [code (indented)][code_indented] //! * [code (text)][code_text] //! * [definition][] -//! * [frontmatter][] //! * [hard break (escape)][hard_break_escape] //! * [heading (atx)][heading_atx] //! * [heading (setext)][heading_setext] @@ -56,6 +55,11 @@ //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][partial_whitespace]. //! +//! The following constructs are extensions found in markdown: +//! +//! * [frontmatter][] +//! * [gfm autolink literal][gfm_autolink_literal] +//! //! There are also several small subroutines typically used in different places: //! //! * [bom][partial_bom] @@ -141,6 +145,7 @@ pub mod definition; pub mod document; pub mod flow; pub mod frontmatter; +pub mod gfm_autolink_literal; pub mod hard_break_escape; pub mod heading_atx; pub mod heading_setext; diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index bc6d7f4..b6f1f47 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -7,7 +7,6 @@ //! 
[text]: crate::construct::text use crate::event::{Kind, Name}; -use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use alloc::vec; @@ -51,7 +50,6 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State { } } - tokenizer.register_resolver_before(ResolveName::Data); State::Ok } diff --git a/src/construct/string.rs b/src/construct/string.rs index 698a51d..dba1ac1 100644 --- a/src/construct/string.rs +++ b/src/construct/string.rs @@ -27,7 +27,6 @@ const MARKERS: [u8; 2] = [b'&', b'\\']; /// ^ /// ```` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.register_resolver(ResolveName::String); tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::StringBefore) } @@ -40,7 +39,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ```` pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Ok, + None => { + tokenizer.register_resolver(ResolveName::Data); + tokenizer.register_resolver(ResolveName::String); + State::Ok + } Some(b'&') => { tokenizer.attempt( State::Next(StateName::StringBefore), diff --git a/src/construct/text.rs b/src/construct/text.rs index 5c13dba..06ba378 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -20,6 +20,7 @@ //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][crate::construct::partial_whitespace]. 
+use crate::construct::gfm_autolink_literal::resolve as resolve_gfm_autolink_literal; use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; @@ -45,7 +46,6 @@ const MARKERS: [u8; 9] = [ /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.register_resolver(ResolveName::Text); tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::TextBefore) } @@ -58,7 +58,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Ok, + None => { + tokenizer.register_resolver(ResolveName::Data); + tokenizer.register_resolver(ResolveName::Text); + State::Ok + } Some(b'!') => { tokenizer.attempt( State::Next(StateName::TextBefore), @@ -170,4 +174,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { tokenizer.parse_state.constructs.hard_break_trailing, true, ); + + if tokenizer.parse_state.constructs.gfm_autolink_literal { + resolve_gfm_autolink_literal(tokenizer); + } } diff --git a/src/event.rs b/src/event.rs index f2f8ae1..169fdb5 100644 --- a/src/event.rs +++ b/src/event.rs @@ -878,6 +878,148 @@ pub enum Name { /// ^ /// ``` EmphasisText, + // To do: sort. + /// Whole frontmatter. + /// + /// ## Info + /// + /// * **Context**: + /// [document content][crate::construct::document] + /// * **Content model**: + /// [`FrontmatterFence`][Name::FrontmatterFence], + /// [`FrontmatterChunk`][Name::FrontmatterChunk], + /// [`LineEnding`][Name::LineEnding] + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// > | --- + /// ^^^ + /// > | title: Neptune + /// ^^^^^^^^^^^^^^ + /// > | --- + /// ^^^ + /// ``` + Frontmatter, + /// Frontmatter chunk. 
+ /// + /// ## Info + /// + /// * **Context**: + /// [`Frontmatter`][Name::Frontmatter] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// | --- + /// > | title: Neptune + /// ^^^^^^^^^^^^^^ + /// | --- + /// ``` + FrontmatterChunk, + /// Frontmatter fence. + /// + /// ## Info + /// + /// * **Context**: + /// [`Frontmatter`][Name::Frontmatter] + /// * **Content model**: + /// [`FrontmatterSequence`][Name::FrontmatterSequence], + /// [`SpaceOrTab`][Name::SpaceOrTab] + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// > | --- + /// ^^^ + /// | title: Neptune + /// > | --- + /// ^^^ + /// ``` + FrontmatterFence, + /// Frontmatter sequence. + /// + /// ## Info + /// + /// * **Context**: + /// [`FrontmatterFence`][Name::FrontmatterFence] + /// * **Content model**: + /// void + /// * **Construct**: + /// [`frontmatter`][crate::construct::frontmatter] + /// + /// ## Example + /// + /// ```markdown + /// > | --- + /// ^^^ + /// | title: Neptune + /// > | --- + /// ^^^ + /// ``` + FrontmatterSequence, + /// GFM extension: email autolink. + /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// void. + /// * **Construct**: + /// [`gfm_autolink_literal`][crate::construct::gfm_autolink_literal] + /// + /// ## Example + /// + /// ```markdown + /// > | context@example.com + /// ^^^^^^^^^^^^^^^^^^^ + /// ``` + GfmAutolinkLiteralEmail, + /// GFM extension: autolink w/ protocol. + /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// void. 
+ /// * **Construct**: + /// [`gfm_autolink_literal`][crate::construct::gfm_autolink_literal] + /// + /// ## Example + /// + /// ```markdown + /// > | https://example.com + /// ^^^^^^^^^^^^^^^^^^^ + /// ``` + GfmAutolinkLiteralProtocol, + /// GFM extension: autolink w/ www. + /// + /// ## Info + /// + /// * **Context**: + /// [text content][crate::construct::text] + /// * **Content model**: + /// void. + /// * **Construct**: + /// [`gfm_autolink_literal`][crate::construct::gfm_autolink_literal] + /// + /// ## Example + /// + /// ```markdown + /// > | www.example.com + /// ^^^^^^^^^^^^^^^ + /// ``` + GfmAutolinkLiteralWww, /// Whole hard break (escape). /// /// ## Info @@ -1832,98 +1974,10 @@ pub enum Name { /// ^ ^ ^ /// ``` ThematicBreakSequence, - - /// Whole frontmatter. - /// - /// ## Info - /// - /// * **Context**: - /// [document content][crate::construct::document] - /// * **Content model**: - /// [`FrontmatterFence`][Name::FrontmatterFence], - /// [`FrontmatterChunk`][Name::FrontmatterChunk], - /// [`LineEnding`][Name::LineEnding] - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// > | --- - /// ^^^ - /// > | title: Neptune - /// ^^^^^^^^^^^^^^ - /// > | --- - /// ^^^ - /// ```` - Frontmatter, - /// Frontmatter chunk. - /// - /// ## Info - /// - /// * **Context**: - /// [`Frontmatter`][Name::Frontmatter] - /// * **Content model**: - /// void - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// | --- - /// > | title: Neptune - /// ^^^^^^^^^^^^^^ - /// | --- - /// ```` - FrontmatterChunk, - /// Frontmatter fence. 
- /// - /// ## Info - /// - /// * **Context**: - /// [`Frontmatter`][Name::Frontmatter] - /// * **Content model**: - /// [`FrontmatterSequence`][Name::FrontmatterSequence], - /// [`SpaceOrTab`][Name::SpaceOrTab] - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// > | --- - /// ^^^ - /// | title: Neptune - /// > | --- - /// ^^^ - /// ```` - FrontmatterFence, - /// Frontmatter sequence. - /// - /// ## Info - /// - /// * **Context**: - /// [`FrontmatterFence`][Name::FrontmatterFence] - /// * **Content model**: - /// void - /// * **Construct**: - /// [`frontmatter`][crate::construct::frontmatter] - /// - /// ## Example - /// - /// ````markdown - /// > | --- - /// ^^^ - /// | title: Neptune - /// > | --- - /// ^^^ - /// ```` - FrontmatterSequence, } /// List of void events, used to make sure everything is working well. -pub const VOID_EVENTS: [Name; 43] = [ +pub const VOID_EVENTS: [Name; 46] = [ Name::AttentionSequence, Name::AutolinkEmail, Name::AutolinkMarker, @@ -1949,6 +2003,9 @@ pub const VOID_EVENTS: [Name; 43] = [ Name::DefinitionTitleMarker, Name::EmphasisSequence, Name::FrontmatterChunk, + Name::GfmAutolinkLiteralEmail, + Name::GfmAutolinkLiteralProtocol, + Name::GfmAutolinkLiteralWww, Name::FrontmatterSequence, Name::HardBreakEscape, Name::HardBreakTrailing, @@ -2013,6 +2070,40 @@ pub struct Point { pub vs: usize, } +impl Point { + /// Create a new point, that is shifted from the close earlier current + /// point, to `index.` + // To do: tabs. 
+ pub fn shift_to(&self, bytes: &[u8], index: usize) -> Point { + let mut next = self.clone(); + debug_assert!(index > next.index, "expect"); + + while next.index < index { + match bytes[next.index] { + b'\n' | b'\r' => unreachable!("cannot move past line endings"), + b'\t' => { + unreachable!("to do: tab") + // let remainder = next.column % TAB_SIZE; + // let vs = if remainder == 0 { + // 0 + // } else { + // TAB_SIZE - remainder + // }; + + // next.index += 1; + // next.column += 1 + vs; + } + _ => { + next.index += 1; + next.column += 1; + } + } + } + + next + } +} + /// Event kinds. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Kind { @@ -166,6 +166,13 @@ pub struct Constructs { /// ^^^ /// ```` pub frontmatter: bool, + /// GFM: autolink literal. + /// + /// ```markdown + /// > | https://example.com + /// ^^^^^^^^^^^^^^^^^^^ + /// ``` + pub gfm_autolink_literal: bool, /// Hard break (escape). /// /// ```markdown @@ -263,6 +270,7 @@ impl Default for Constructs { code_text: true, definition: true, frontmatter: false, + gfm_autolink_literal: false, hard_break_escape: true, hard_break_trailing: true, heading_atx: true, @@ -278,6 +286,19 @@ impl Default for Constructs { } } +impl Constructs { + /// GFM. + /// + /// This turns on `CommonMark` + GFM. + #[must_use] + pub fn gfm() -> Self { + Self { + gfm_autolink_literal: true, + ..Self::default() + } + } +} + /// Configuration (optional). #[derive(Clone, Debug, Default)] pub struct Options { diff --git a/src/util/classify_character.rs b/src/util/classify_character.rs new file mode 100644 index 0000000..b938502 --- /dev/null +++ b/src/util/classify_character.rs @@ -0,0 +1,72 @@ +//! Utilities to classify characters as whitespace, punctuation, or rest. + +use crate::unicode::PUNCTUATION; + +/// Character kinds. +#[derive(Debug, PartialEq, Eq)] +pub enum Kind { + /// Whitespace. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Whitespace, + /// Punctuation. 
+ /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^^ ^ ^ ^ + /// ``` + Punctuation, + /// Everything else. + /// + /// ## Example + /// + /// ```markdown + /// > | **a_b_ c**. + /// ^ ^ ^ + /// ``` + Other, +} + +/// Classify whether a character code represents whitespace, punctuation, or +/// something else. +/// +/// Used for attention (emphasis, strong), whose sequences can open or close +/// based on the class of surrounding characters. +/// +/// > 👉 **Note** that eof (`None`) is seen as whitespace. +/// +/// ## References +/// +/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) +pub fn classify(char: char) -> Kind { + // Unicode whitespace. + if char.is_whitespace() { + Kind::Whitespace + } + // Unicode punctuation. + else if PUNCTUATION.contains(&char) { + Kind::Punctuation + } + // Everything else. + else { + Kind::Other + } +} + +/// Like [`classify`], but supports eof as whitespace. +pub fn classify_opt(char_opt: Option<char>) -> Kind { + if let Some(char) = char_opt { + classify(char) + } + // EOF. + else { + Kind::Whitespace + } +} diff --git a/src/util/mod.rs b/src/util/mod.rs index f51845c..022c7d6 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@ //! Utilities used when processing markdown. 
+pub mod classify_character; pub mod decode_character_reference; pub mod edit_map; pub mod encode; diff --git a/tests/gfm_autolink_literal.rs b/tests/gfm_autolink_literal.rs new file mode 100644 index 0000000..9551751 --- /dev/null +++ b/tests/gfm_autolink_literal.rs @@ -0,0 +1,256 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, Constructs, Options}; +use pretty_assertions::assert_eq; + +#[test] +fn gfm_autolink_literal() { + let gfm = Options { + constructs: Constructs::gfm(), + ..Options::default() + }; + + assert_eq!( + micromark("https://example.com"), + "<p>https://example.com</p>", + "should ignore protocol urls by default" + ); + assert_eq!( + micromark("www.example.com"), + "<p>www.example.com</p>", + "should ignore www urls by default" + ); + assert_eq!( + micromark("user@example.com"), + "<p>user@example.com</p>", + "should ignore email urls by default" + ); + + assert_eq!( + micromark_with_options("https://example.com", &gfm), + "<p><a href=\"https://example.com\">https://example.com</a></p>", + "should support protocol urls if enabled" + ); + assert_eq!( + micromark_with_options("www.example.com", &gfm), + "<p><a href=\"http://www.example.com\">www.example.com</a></p>", + "should support www urls if enabled" + ); + assert_eq!( + micromark_with_options("user@example.com", &gfm), + "<p><a href=\"mailto:user@example.com\">user@example.com</a></p>", + "should support email urls if enabled" + ); + + assert_eq!( + micromark_with_options("user@example.com", &gfm), + "<p><a href=\"mailto:user@example.com\">user@example.com</a></p>", + "should support a closing paren at TLD (email)" + ); + + assert_eq!( + micromark_with_options("www.a.)", &gfm), + "<p><a href=\"http://www.a\">www.a</a>.)</p>", + "should support a closing paren at TLD (www)" + ); + + assert_eq!( + micromark_with_options("www.a b", &gfm), + "<p><a href=\"http://www.a\">www.a</a> b</p>", + "should support no TLD" + ); + + assert_eq!( + micromark_with_options("www.a/b 
c", &gfm), + "<p><a href=\"http://www.a/b\">www.a/b</a> c</p>", + "should support a path instead of TLD" + ); + + assert_eq!( + micromark_with_options("www.�a", &gfm), + "<p><a href=\"http://www.%EF%BF%BDa\">www.�a</a></p>", + "should support a replacement character in a domain" + ); + + assert_eq!( + micromark_with_options("http://點看.com", &gfm), + "<p><a href=\"http://%E9%BB%9E%E7%9C%8B.com\">http://點看.com</a></p>", + "should support non-ascii characters in a domain (http)" + ); + + assert_eq!( + micromark_with_options("www.點看.com", &gfm), + "<p><a href=\"http://www.%E9%BB%9E%E7%9C%8B.com\">www.點看.com</a></p>", + "should support non-ascii characters in a domain (www)" + ); + + assert_eq!( + micromark_with_options("點看@example.com", &gfm), + "<p>點看@example.com</p>", + "should *not* support non-ascii characters in atext (email)" + ); + + assert_eq!( + micromark_with_options("example@點看.com", &gfm), + "<p>example@點看.com</p>", + "should *not* support non-ascii characters in a domain (email)" + ); + + assert_eq!( + micromark_with_options("www.a.com/點看", &gfm), + "<p><a href=\"http://www.a.com/%E9%BB%9E%E7%9C%8B\">www.a.com/點看</a></p>", + "should support non-ascii characters in a path" + ); + + assert_eq!( + micromark_with_options("www.-a.b", &gfm), + "<p><a href=\"http://www.-a.b\">www.-a.b</a></p>", + "should support a dash to start a domain" + ); + + assert_eq!( + micromark_with_options("www.$", &gfm), + "<p><a href=\"http://www.$\">www.$</a></p>", + "should support a dollar as a domain name" + ); + + assert_eq!( + micromark_with_options("www.a..b.c", &gfm), + "<p><a href=\"http://www.a..b.c\">www.a..b.c</a></p>", + "should support adjacent dots in a domain name" + ); + + assert_eq!( + micromark_with_options("www.a&a;", &gfm), + "<p><a href=\"http://www.a\">www.a</a>&a;</p>", + "should support named character references in domains" + ); + + assert_eq!( + micromark_with_options("https://a.bc/d/e/).", &gfm), + "<p><a 
href=\"https://a.bc/d/e/\">https://a.bc/d/e/</a>).</p>", + "should support a closing paren and period after a path" + ); + + assert_eq!( + micromark_with_options("https://a.bc/d/e/.)", &gfm), + "<p><a href=\"https://a.bc/d/e/\">https://a.bc/d/e/</a>.)</p>", + "should support a period and closing paren after a path" + ); + + assert_eq!( + micromark_with_options("https://a.bc).", &gfm), + "<p><a href=\"https://a.bc\">https://a.bc</a>).</p>", + "should support a closing paren and period after a domain" + ); + + assert_eq!( + micromark_with_options("https://a.bc.)", &gfm), + "<p><a href=\"https://a.bc\">https://a.bc</a>.)</p>", + "should support a period and closing paren after a domain" + ); + + assert_eq!( + micromark_with_options("https://a.bc).d", &gfm), + "<p><a href=\"https://a.bc).d\">https://a.bc).d</a></p>", + "should support a closing paren and period in a path" + ); + + assert_eq!( + micromark_with_options("https://a.bc.)d", &gfm), + "<p><a href=\"https://a.bc.)d\">https://a.bc.)d</a></p>", + "should support a period and closing paren in a path" + ); + + assert_eq!( + micromark_with_options("https://a.bc/))d", &gfm), + "<p><a href=\"https://a.bc/))d\">https://a.bc/))d</a></p>", + "should support two closing parens in a path" + ); + + assert_eq!( + micromark_with_options("ftp://a/b/c.txt", &gfm), + "<p>ftp://a/b/c.txt</p>", + "should not support ftp links" + ); + + // Note: GH comments/issues/PRs do not link this, but Gists/readmes do. + // Fixing it would mean deviating from `cmark-gfm`: + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. 
+ // assert_eq!( + // micromark_with_options(",www.example.com", &gfm), + // "<p>,<a href=\"http://www.example.com\">www.example.com</a></p>", + // "should support www links after Unicode punctuation", + // ); + + assert_eq!( + micromark_with_options(",https://example.com", &gfm), + "<p>,<a href=\"https://example.com\">https://example.com</a></p>", + "should support http links after Unicode punctuation" + ); + + assert_eq!( + micromark_with_options(",example@example.com", &gfm), + "<p>,<a href=\"mailto:example@example.com\">example@example.com</a></p>", + "should support email links after Unicode punctuation" + ); + + assert_eq!( + micromark_with_options( + "http://user:password@host:port/path?key=value#fragment", + &gfm + ), + "<p>http://user:password@host:port/path?key=value#fragment</p>", + "should not link character reference for `:`" + ); + + assert_eq!( + micromark_with_options("http://example.com/ab<cd", &gfm), + "<p><a href=\"http://example.com/ab\">http://example.com/ab</a><cd</p>", + "should stop domains/paths at `<`" + ); + + assert_eq!( + micromark_with_options( + r###" +[ www.example.com + +[ https://example.com + +[ contact@example.com + +[ www.example.com ] + +[ https://example.com ] + +[ contact@example.com ] + +[ www.example.com ](#) + +[ https://example.com ](#) + +[ contact@example.com ](#) + +![ www.example.com ](#) + +![ https://example.com ](#) + +![ contact@example.com ](#) +"###, + &gfm + ), + r###"<p>[ <a href="http://www.example.com">www.example.com</a></p> +<p>[ <a href="https://example.com">https://example.com</a></p> +<p>[ <a href="mailto:contact@example.com">contact@example.com</a></p> +<p>[ <a href="http://www.example.com">www.example.com</a> ]</p> +<p>[ <a href="https://example.com">https://example.com</a> ]</p> +<p>[ <a href="mailto:contact@example.com">contact@example.com</a> ]</p> +<p><a href="#"> www.example.com </a></p> +<p><a href="#"> https://example.com </a></p> +<p><a href="#"> contact@example.com </a></p> +<p><img src="#" 
alt=" www.example.com " /></p> +<p><img src="#" alt=" https://example.com " /></p> +<p><img src="#" alt=" contact@example.com " /></p> +"###, + "should interplay with brackets, links, and images" + ); +} |