diff options
Diffstat (limited to 'src/construct')
-rw-r--r-- | src/construct/attention.rs | 70 | ||||
-rw-r--r-- | src/construct/gfm_autolink_literal.rs | 382 | ||||
-rw-r--r-- | src/construct/mod.rs | 9 | ||||
-rw-r--r-- | src/construct/partial_data.rs | 2 | ||||
-rw-r--r-- | src/construct/string.rs | 7 | ||||
-rw-r--r-- | src/construct/text.rs | 12 |
6 files changed, 407 insertions, 75 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 8df0f61..ef960d4 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -62,42 +62,10 @@ use crate::event::{Event, Kind, Name, Point}; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; -use crate::unicode::PUNCTUATION; +use crate::util::classify_character::{classify_opt, Kind as CharacterKind}; use crate::util::slice::Slice; use alloc::{string::String, vec, vec::Vec}; -/// Character code kinds. -#[derive(Debug, PartialEq)] -enum CharacterKind { - /// Whitespace. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Whitespace, - /// Punctuation. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^^ ^ ^ ^ - /// ``` - Punctuation, - /// Everything else. - /// - /// ## Example - /// - /// ```markdown - /// > | **a_b_ c**. - /// ^ ^ ^ - /// ``` - Other, -} - /// Attentention sequence that we can take markers from. #[derive(Debug)] struct Sequence { @@ -192,8 +160,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) .head() .unwrap(); - let before = classify_character(char_before); - let after = classify_character(char_after); + let before = classify_opt(char_before); + let after = classify_opt(char_after); let open = after == CharacterKind::Other || (after == CharacterKind::Punctuation && before != CharacterKind::Other); // To do: GFM strikethrough? @@ -429,35 +397,3 @@ fn match_sequences( next } - -/// Classify whether a character code represents whitespace, punctuation, or -/// something else. -/// -/// Used for attention (emphasis, strong), whose sequences can open or close -/// based on the class of surrounding characters. -/// -/// > 👉 **Note** that eof (`None`) is seen as whitespace. -/// -/// ## References -/// -/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) -fn classify_character(char: Option<char>) -> CharacterKind { - if let Some(char) = char { - // Unicode whitespace. - if char.is_whitespace() { - CharacterKind::Whitespace - } - // Unicode punctuation. - else if PUNCTUATION.contains(&char) { - CharacterKind::Punctuation - } - // Everything else. - else { - CharacterKind::Other - } - } - // EOF. - else { - CharacterKind::Whitespace - } -} diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs new file mode 100644 index 0000000..7fdeb01 --- /dev/null +++ b/src/construct/gfm_autolink_literal.rs @@ -0,0 +1,382 @@ +//! To do. + +use crate::event::{Event, Kind, Name}; +use crate::tokenizer::Tokenizer; +use crate::util::classify_character::{classify, Kind as CharacterKind}; +use crate::util::slice::{Position, Slice}; +use alloc::vec::Vec; +extern crate std; +use core::str; + +// To do: doc al functions. + +pub fn resolve(tokenizer: &mut Tokenizer) { + tokenizer.map.consume(&mut tokenizer.events); + + let mut index = 0; + let mut links = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.kind == Kind::Enter { + if event.name == Name::Link { + links += 1; + } + } else { + if event.name == Name::Data && links == 0 { + let slice = Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event(&tokenizer.events, index), + ); + let bytes = slice.bytes; + let mut byte_index = 0; + let mut replace = Vec::new(); + let mut point = tokenizer.events[index - 1].point.clone(); + let start_index = point.index; + let mut start = 0; + + while byte_index < bytes.len() { + if matches!(bytes[byte_index], b'H' | b'h' | b'W' | b'w' | b'@') { + if let Some(autolink) = peek(bytes, byte_index) { + byte_index = autolink.1; + + // If there is something between the last link + // (or the start) and this link. + if start != autolink.0 { + replace.push(Event { + kind: Kind::Enter, + name: Name::Data, + point: point.clone(), + link: None, + }); + point = point.shift_to( + tokenizer.parse_state.bytes, + start_index + autolink.0, + ); + replace.push(Event { + kind: Kind::Exit, + name: Name::Data, + point: point.clone(), + link: None, + }); + } + + // Add the link. + replace.push(Event { + kind: Kind::Enter, + name: autolink.2.clone(), + point: point.clone(), + link: None, + }); + point = point + .shift_to(tokenizer.parse_state.bytes, start_index + autolink.1); + replace.push(Event { + kind: Kind::Exit, + name: autolink.2.clone(), + point: point.clone(), + link: None, + }); + start = autolink.1; + } + } + + byte_index += 1; + } + + // If there was a link, and we have more bytes left. + if start != 0 && start < bytes.len() { + replace.push(Event { + kind: Kind::Enter, + name: Name::Data, + point: point.clone(), + link: None, + }); + replace.push(Event { + kind: Kind::Exit, + name: Name::Data, + point: event.point.clone(), + link: None, + }); + } + + // If there were links. + if !replace.is_empty() { + tokenizer.map.add(index - 1, 2, replace); + } + } + + if event.name == Name::Link { + links -= 1; + } + } + + index += 1; + } +} + +fn peek(bytes: &[u8], index: usize) -> Option<(usize, usize, Name)> { + // Protocol. + if let Some(protocol_end) = peek_protocol(bytes, index) { + if let Some(domain_end) = peek_domain(bytes, protocol_end, true) { + let end = truncate(bytes, protocol_end, domain_end); + + // Cannot be empty. + if end != protocol_end { + return Some((index, end, Name::GfmAutolinkLiteralProtocol)); + } + } + } + + // Www. + if peek_www(bytes, index).is_some() { + // Note: we discard the `www.` we parsed, we now try to parse it as a domain. + let domain_end = peek_domain(bytes, index, false).unwrap_or(index); + let end = truncate(bytes, index, domain_end); + return Some((index, end, Name::GfmAutolinkLiteralWww)); + } + + // Email. + if bytes[index] == b'@' { + if let Some(start) = peek_atext(bytes, index) { + if let Some(end) = peek_email_domain(bytes, index + 1) { + let end = truncate(bytes, start, end); + return Some((start, end, Name::GfmAutolinkLiteralEmail)); + } + } + } + + None +} + +/// Move past `http://`, `https://`, case-insensitive. +fn peek_protocol(bytes: &[u8], mut index: usize) -> Option<usize> { + // `http` + if index + 3 < bytes.len() + && matches!(bytes[index], b'H' | b'h') + && matches!(bytes[index + 1], b'T' | b't') + && matches!(bytes[index + 2], b'T' | b't') + && matches!(bytes[index + 3], b'P' | b'p') + { + index += 4; + + // `s`, optional. + if index + 1 < bytes.len() && matches!(bytes[index], b'S' | b's') { + index += 1; + } + + // `://` + if index + 3 < bytes.len() + && bytes[index] == b':' + && bytes[index + 1] == b'/' + && bytes[index + 2] == b'/' + { + return Some(index + 3); + } + } + + None +} + +/// Move past `www.`, case-insensitive. +fn peek_www(bytes: &[u8], index: usize) -> Option<usize> { + // `www.` + if index + 3 < bytes.len() + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>. + && (index == 0 || matches!(bytes[index - 1], b'\t' | b'\n' | b'\r' | b' ' | b'(' | b'*' | b'_' | b'~')) + && matches!(bytes[index], b'W' | b'w') + && matches!(bytes[index + 1], b'W' | b'w') + && matches!(bytes[index + 2], b'W' | b'w') + && bytes[index + 3] == b'.' + { + Some(index + 4) + } else { + None + } +} + +/// Move past `example.com`. +fn peek_domain(bytes: &[u8], start: usize, allow_short: bool) -> Option<usize> { + let mut dots = false; + let mut penultime = false; + let mut last = false; + // To do: expose this from slice? + // To do: do it ourselves? <https://github.com/commonmark/cmark/blob/8a023286198a7e408398e282f293e3b0baebb644/src/utf8.c#L150>, <https://doc.rust-lang.org/core/str/fn.next_code_point.html>, <https://www.reddit.com/r/rust/comments/4g2zu0/lazy_unicode_iterator_from_byte_iteratorslice/>, <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>. + let char_indices = str::from_utf8(&bytes[start..]) + .unwrap() + .char_indices() + .collect::<Vec<_>>(); + let mut index = 0; + + while index < char_indices.len() { + match char_indices[index].1 { + '_' => last = true, + '.' => { + penultime = last; + last = false; + dots = true; + } + '-' => {} + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. + char if classify(char) == CharacterKind::Other => {} + _ => break, + } + + index += 1; + } + + // No underscores allowed in last two parts. + // A valid domain needs to have at least a dot. + if penultime || last || (!allow_short && !dots) { + None + } else { + // Now peek past `/path?search#hash` (anything except whitespace). + while index < char_indices.len() { + if classify(char_indices[index].1) == CharacterKind::Whitespace { + break; + } + + index += 1; + } + + Some(if index == char_indices.len() { + bytes.len() + } else { + start + char_indices[index].0 + }) + } +} + +/// Move back past `contact`. +fn peek_atext(bytes: &[u8], end: usize) -> Option<usize> { + let mut index = end; + + // Take simplified atext. + // See `email_atext` in `autolink.rs` for a similar algorithm. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L301>. + while index > 0 + && matches!(bytes[index - 1], b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z') + { + index -= 1; + } + + // Do not allow a slash “inside” atext. + // The reference code is a bit weird, but that’s what it results in. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L307>. + // Other than slash, every preceding character is allowed. + if index == end || (index > 0 && bytes[index - 1] == b'/') { + None + } else { + Some(index) + } +} + +/// Move past `example.com`. +fn peek_email_domain(bytes: &[u8], start: usize) -> Option<usize> { + let mut index = start; + let mut dot = false; + + // Move past “domain”. + // The reference code is a bit overly complex as it handles the `@`, of which there may be just one. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L318> + while index < bytes.len() { + match bytes[index] { + // Alphanumerical, `-`, and `_`. + b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z' => {} + // Dot followed by alphanumerical (not `-` or `_`). + b'.' if index + 1 < bytes.len() + && matches!(bytes[index + 1], b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => + { + dot = true; + } + _ => break, + } + + index += 1; + } + + // Domain must not be empty, must include a dot, and must end in alphabetical or `.`. + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L332>. + if index > start && dot && matches!(bytes[index - 1], b'.' | b'A'..=b'Z' | b'a'..=b'z') { + Some(index) + } else { + None + } +} + +/// Split trialing stuff from a URL. +fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { + let mut index = start; + + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L42> + while index < end { + if bytes[index] == b'<' { + end = index; + break; + } + index += 1; + } + + let mut split = end; + + // Move before trailing punctuation. + while split > start { + match bytes[split - 1] { + b'!' | b'"' | b'&' | b'\'' | b')' | b',' | b'.' | b':' | b'<' | b'>' | b'?' | b']' + | b'}' => {} + // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L61>. + // Note: we can’t move across actual references, because those have been parsed already. + b';' => { + let mut new_split = split - 1; + // Move back past alphabeticals. + while new_split > start && matches!(bytes[new_split - 1], b'A'..=b'Z' | b'a'..=b'z') + { + new_split -= 1; + } + + // Nonempty character reference: + if new_split > start && bytes[new_split - 1] == b'&' && new_split < split - 1 { + split = new_split - 1; + continue; + } + + // Otherwise it’s just a `;`. + } + _ => break, + } + split -= 1; + } + + // If there was trailing punctuation, try to balance parens. + if split != end { + let mut open = 0; + let mut close = 0; + let mut paren_index = start; + + // Count parens in `url` (not in trail). + while paren_index < split { + match bytes[paren_index] { + b'(' => open += 1, + b')' => close += 1, + _ => {} + } + + paren_index += 1; + } + + let mut trail_index = split; + + // If there are more opening than closing parens, try to balance them + // from the trail. + while open > close && trail_index < end { + if bytes[trail_index] == b')' { + split = trail_index; + close += 1; + } + + trail_index += 1; + } + } + + split +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 1c1c6f7..ba1a0b3 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -28,7 +28,7 @@ //! For example, [code (fenced)][code_fenced] and //! [code (indented)][code_indented] are considered different constructs. //! -//! The following constructs are found in markdown: +//! The following constructs are found in markdown (CommonMark): //! //! * [attention (strong, emphasis)][attention] //! * [autolink][] @@ -40,7 +40,6 @@ //! * [code (indented)][code_indented] //! * [code (text)][code_text] //! * [definition][] -//! * [frontmatter][] //! * [hard break (escape)][hard_break_escape] //! * [heading (atx)][heading_atx] //! * [heading (setext)][heading_setext] @@ -56,6 +55,11 @@ //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][partial_whitespace]. //! +//! The following constructs are extensions found in markdown: +//! +//! * [frontmatter][] +//! * [gfm autolink literal][gfm_autolink_literal] +//! //! There are also several small subroutines typically used in different places: //! //! * [bom][partial_bom] @@ -141,6 +145,7 @@ pub mod definition; pub mod document; pub mod flow; pub mod frontmatter; +pub mod gfm_autolink_literal; pub mod hard_break_escape; pub mod heading_atx; pub mod heading_setext; diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index bc6d7f4..b6f1f47 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -7,7 +7,6 @@ //! [text]: crate::construct::text use crate::event::{Kind, Name}; -use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use alloc::vec; @@ -51,7 +50,6 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State { } } - tokenizer.register_resolver_before(ResolveName::Data); State::Ok } diff --git a/src/construct/string.rs b/src/construct/string.rs index 698a51d..dba1ac1 100644 --- a/src/construct/string.rs +++ b/src/construct/string.rs @@ -27,7 +27,6 @@ const MARKERS: [u8; 2] = [b'&', b'\\']; /// ^ /// ```` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.register_resolver(ResolveName::String); tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::StringBefore) } @@ -40,7 +39,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ```` pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Ok, + None => { + tokenizer.register_resolver(ResolveName::Data); + tokenizer.register_resolver(ResolveName::String); + State::Ok + } Some(b'&') => { tokenizer.attempt( State::Next(StateName::StringBefore), diff --git a/src/construct/text.rs b/src/construct/text.rs index 5c13dba..06ba378 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -20,6 +20,7 @@ //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][crate::construct::partial_whitespace]. +use crate::construct::gfm_autolink_literal::resolve as resolve_gfm_autolink_literal; use crate::construct::partial_whitespace::resolve_whitespace; use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; @@ -45,7 +46,6 @@ const MARKERS: [u8; 9] = [ /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - tokenizer.register_resolver(ResolveName::Text); tokenizer.tokenize_state.markers = &MARKERS; State::Retry(StateName::TextBefore) } @@ -58,7 +58,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None => State::Ok, + None => { + tokenizer.register_resolver(ResolveName::Data); + tokenizer.register_resolver(ResolveName::Text); + State::Ok + } Some(b'!') => { tokenizer.attempt( State::Next(StateName::TextBefore), @@ -170,4 +174,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) { tokenizer.parse_state.constructs.hard_break_trailing, true, ); + + if tokenizer.parse_state.constructs.gfm_autolink_literal { + resolve_gfm_autolink_literal(tokenizer); + } } |