aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-08-18 18:33:10 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-08-18 18:33:17 +0200
commit25e267afbc0789ea36508d45c3ea3545b84223bb (patch)
tree8dee2a78ad1df29e9df7cf151091a5d265fd7ecb /src/construct
parent1dbf02d8c1955316c6cc43a427f506b91c87ef3a (diff)
downloadmarkdown-rs-25e267afbc0789ea36508d45c3ea3545b84223bb.tar.gz
markdown-rs-25e267afbc0789ea36508d45c3ea3545b84223bb.tar.bz2
markdown-rs-25e267afbc0789ea36508d45c3ea3545b84223bb.zip
Add support for GFM autolink literals
Diffstat (limited to 'src/construct')
-rw-r--r--src/construct/attention.rs70
-rw-r--r--src/construct/gfm_autolink_literal.rs382
-rw-r--r--src/construct/mod.rs9
-rw-r--r--src/construct/partial_data.rs2
-rw-r--r--src/construct/string.rs7
-rw-r--r--src/construct/text.rs12
6 files changed, 407 insertions, 75 deletions
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index 8df0f61..ef960d4 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -62,42 +62,10 @@ use crate::event::{Event, Kind, Name, Point};
use crate::resolve::Name as ResolveName;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
-use crate::unicode::PUNCTUATION;
+use crate::util::classify_character::{classify_opt, Kind as CharacterKind};
use crate::util::slice::Slice;
use alloc::{string::String, vec, vec::Vec};
-/// Character code kinds.
-#[derive(Debug, PartialEq)]
-enum CharacterKind {
- /// Whitespace.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// > | **a_b_ c**.
- /// ^ ^ ^
- /// ```
- Whitespace,
- /// Punctuation.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// > | **a_b_ c**.
- /// ^^ ^ ^ ^
- /// ```
- Punctuation,
- /// Everything else.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// > | **a_b_ c**.
- /// ^ ^ ^
- /// ```
- Other,
-}
-
/// Attentention sequence that we can take markers from.
#[derive(Debug)]
struct Sequence {
@@ -192,8 +160,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) {
let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
.head()
.unwrap();
- let before = classify_character(char_before);
- let after = classify_character(char_after);
+ let before = classify_opt(char_before);
+ let after = classify_opt(char_after);
let open = after == CharacterKind::Other
|| (after == CharacterKind::Punctuation && before != CharacterKind::Other);
// To do: GFM strikethrough?
@@ -429,35 +397,3 @@ fn match_sequences(
next
}
-
-/// Classify whether a character code represents whitespace, punctuation, or
-/// something else.
-///
-/// Used for attention (emphasis, strong), whose sequences can open or close
-/// based on the class of surrounding characters.
-///
-/// > 👉 **Note** that eof (`None`) is seen as whitespace.
-///
-/// ## References
-///
-/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
-fn classify_character(char: Option<char>) -> CharacterKind {
- if let Some(char) = char {
- // Unicode whitespace.
- if char.is_whitespace() {
- CharacterKind::Whitespace
- }
- // Unicode punctuation.
- else if PUNCTUATION.contains(&char) {
- CharacterKind::Punctuation
- }
- // Everything else.
- else {
- CharacterKind::Other
- }
- }
- // EOF.
- else {
- CharacterKind::Whitespace
- }
-}
diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs
new file mode 100644
index 0000000..7fdeb01
--- /dev/null
+++ b/src/construct/gfm_autolink_literal.rs
@@ -0,0 +1,382 @@
+//! To do.
+
+use crate::event::{Event, Kind, Name};
+use crate::tokenizer::Tokenizer;
+use crate::util::classify_character::{classify, Kind as CharacterKind};
+use crate::util::slice::{Position, Slice};
+use alloc::vec::Vec;
+extern crate std;
+use core::str;
+
+// To do: doc all functions.
+
+pub fn resolve(tokenizer: &mut Tokenizer) {
+ tokenizer.map.consume(&mut tokenizer.events);
+
+ let mut index = 0;
+ let mut links = 0;
+
+ while index < tokenizer.events.len() {
+ let event = &tokenizer.events[index];
+
+ if event.kind == Kind::Enter {
+ if event.name == Name::Link {
+ links += 1;
+ }
+ } else {
+ if event.name == Name::Data && links == 0 {
+ let slice = Slice::from_position(
+ tokenizer.parse_state.bytes,
+ &Position::from_exit_event(&tokenizer.events, index),
+ );
+ let bytes = slice.bytes;
+ let mut byte_index = 0;
+ let mut replace = Vec::new();
+ let mut point = tokenizer.events[index - 1].point.clone();
+ let start_index = point.index;
+ let mut start = 0;
+
+ while byte_index < bytes.len() {
+ if matches!(bytes[byte_index], b'H' | b'h' | b'W' | b'w' | b'@') {
+ if let Some(autolink) = peek(bytes, byte_index) {
+ byte_index = autolink.1;
+
+ // If there is something between the last link
+ // (or the start) and this link.
+ if start != autolink.0 {
+ replace.push(Event {
+ kind: Kind::Enter,
+ name: Name::Data,
+ point: point.clone(),
+ link: None,
+ });
+ point = point.shift_to(
+ tokenizer.parse_state.bytes,
+ start_index + autolink.0,
+ );
+ replace.push(Event {
+ kind: Kind::Exit,
+ name: Name::Data,
+ point: point.clone(),
+ link: None,
+ });
+ }
+
+ // Add the link.
+ replace.push(Event {
+ kind: Kind::Enter,
+ name: autolink.2.clone(),
+ point: point.clone(),
+ link: None,
+ });
+ point = point
+ .shift_to(tokenizer.parse_state.bytes, start_index + autolink.1);
+ replace.push(Event {
+ kind: Kind::Exit,
+ name: autolink.2.clone(),
+ point: point.clone(),
+ link: None,
+ });
+ start = autolink.1;
+ }
+ }
+
+ byte_index += 1;
+ }
+
+ // If there was a link, and we have more bytes left.
+ if start != 0 && start < bytes.len() {
+ replace.push(Event {
+ kind: Kind::Enter,
+ name: Name::Data,
+ point: point.clone(),
+ link: None,
+ });
+ replace.push(Event {
+ kind: Kind::Exit,
+ name: Name::Data,
+ point: event.point.clone(),
+ link: None,
+ });
+ }
+
+ // If there were links.
+ if !replace.is_empty() {
+ tokenizer.map.add(index - 1, 2, replace);
+ }
+ }
+
+ if event.name == Name::Link {
+ links -= 1;
+ }
+ }
+
+ index += 1;
+ }
+}
+
+fn peek(bytes: &[u8], index: usize) -> Option<(usize, usize, Name)> {
+ // Protocol.
+ if let Some(protocol_end) = peek_protocol(bytes, index) {
+ if let Some(domain_end) = peek_domain(bytes, protocol_end, true) {
+ let end = truncate(bytes, protocol_end, domain_end);
+
+ // Cannot be empty.
+ if end != protocol_end {
+ return Some((index, end, Name::GfmAutolinkLiteralProtocol));
+ }
+ }
+ }
+
+ // Www.
+ if peek_www(bytes, index).is_some() {
+ // Note: we discard the `www.` we parsed, we now try to parse it as a domain.
+ let domain_end = peek_domain(bytes, index, false).unwrap_or(index);
+ let end = truncate(bytes, index, domain_end);
+ return Some((index, end, Name::GfmAutolinkLiteralWww));
+ }
+
+ // Email.
+ if bytes[index] == b'@' {
+ if let Some(start) = peek_atext(bytes, index) {
+ if let Some(end) = peek_email_domain(bytes, index + 1) {
+ let end = truncate(bytes, start, end);
+ return Some((start, end, Name::GfmAutolinkLiteralEmail));
+ }
+ }
+ }
+
+ None
+}
+
+/// Move past `http://`, `https://`, case-insensitive.
+fn peek_protocol(bytes: &[u8], mut index: usize) -> Option<usize> {
+ // `http`
+ if index + 3 < bytes.len()
+ && matches!(bytes[index], b'H' | b'h')
+ && matches!(bytes[index + 1], b'T' | b't')
+ && matches!(bytes[index + 2], b'T' | b't')
+ && matches!(bytes[index + 3], b'P' | b'p')
+ {
+ index += 4;
+
+ // `s`, optional.
+ if index + 1 < bytes.len() && matches!(bytes[index], b'S' | b's') {
+ index += 1;
+ }
+
+ // `://`
+ if index + 3 < bytes.len()
+ && bytes[index] == b':'
+ && bytes[index + 1] == b'/'
+ && bytes[index + 2] == b'/'
+ {
+ return Some(index + 3);
+ }
+ }
+
+ None
+}
+
+/// Move past `www.`, case-insensitive.
+fn peek_www(bytes: &[u8], index: usize) -> Option<usize> {
+ // `www.`
+ if index + 3 < bytes.len()
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>.
+ && (index == 0 || matches!(bytes[index - 1], b'\t' | b'\n' | b'\r' | b' ' | b'(' | b'*' | b'_' | b'~'))
+ && matches!(bytes[index], b'W' | b'w')
+ && matches!(bytes[index + 1], b'W' | b'w')
+ && matches!(bytes[index + 2], b'W' | b'w')
+ && bytes[index + 3] == b'.'
+ {
+ Some(index + 4)
+ } else {
+ None
+ }
+}
+
+/// Move past `example.com`.
+fn peek_domain(bytes: &[u8], start: usize, allow_short: bool) -> Option<usize> {
+ let mut dots = false;
+ let mut penultime = false;
+ let mut last = false;
+ // To do: expose this from slice?
+ // To do: do it ourselves? <https://github.com/commonmark/cmark/blob/8a023286198a7e408398e282f293e3b0baebb644/src/utf8.c#L150>, <https://doc.rust-lang.org/core/str/fn.next_code_point.html>, <https://www.reddit.com/r/rust/comments/4g2zu0/lazy_unicode_iterator_from_byte_iteratorslice/>, <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>.
+ let char_indices = str::from_utf8(&bytes[start..])
+ .unwrap()
+ .char_indices()
+ .collect::<Vec<_>>();
+ let mut index = 0;
+
+ while index < char_indices.len() {
+ match char_indices[index].1 {
+ '_' => last = true,
+ '.' => {
+ penultime = last;
+ last = false;
+ dots = true;
+ }
+ '-' => {}
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
+ char if classify(char) == CharacterKind::Other => {}
+ _ => break,
+ }
+
+ index += 1;
+ }
+
+ // No underscores allowed in last two parts.
+ // A valid domain needs to have at least a dot.
+ if penultime || last || (!allow_short && !dots) {
+ None
+ } else {
+ // Now peek past `/path?search#hash` (anything except whitespace).
+ while index < char_indices.len() {
+ if classify(char_indices[index].1) == CharacterKind::Whitespace {
+ break;
+ }
+
+ index += 1;
+ }
+
+ Some(if index == char_indices.len() {
+ bytes.len()
+ } else {
+ start + char_indices[index].0
+ })
+ }
+}
+
+/// Move back past `contact`.
+fn peek_atext(bytes: &[u8], end: usize) -> Option<usize> {
+ let mut index = end;
+
+ // Take simplified atext.
+ // See `email_atext` in `autolink.rs` for a similar algorithm.
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L301>.
+ while index > 0
+ && matches!(bytes[index - 1], b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z')
+ {
+ index -= 1;
+ }
+
+ // Do not allow a slash “inside” atext.
+ // The reference code is a bit weird, but that’s what it results in.
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L307>.
+ // Other than slash, every preceding character is allowed.
+ if index == end || (index > 0 && bytes[index - 1] == b'/') {
+ None
+ } else {
+ Some(index)
+ }
+}
+
+/// Move past `example.com`.
+fn peek_email_domain(bytes: &[u8], start: usize) -> Option<usize> {
+ let mut index = start;
+ let mut dot = false;
+
+ // Move past “domain”.
+ // The reference code is a bit overly complex as it handles the `@`, of which there may be just one.
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L318>
+ while index < bytes.len() {
+ match bytes[index] {
+ // Alphanumerical, `-`, and `_`.
+ b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'_' | b'a'..=b'z' => {}
+ // Dot followed by alphanumerical (not `-` or `_`).
+ b'.' if index + 1 < bytes.len()
+ && matches!(bytes[index + 1], b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') =>
+ {
+ dot = true;
+ }
+ _ => break,
+ }
+
+ index += 1;
+ }
+
+ // Domain must not be empty, must include a dot, and must end in alphabetical or `.`.
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L332>.
+ if index > start && dot && matches!(bytes[index - 1], b'.' | b'A'..=b'Z' | b'a'..=b'z') {
+ Some(index)
+ } else {
+ None
+ }
+}
+
+/// Split trailing stuff from a URL.
+fn truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {
+ let mut index = start;
+
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L42>
+ while index < end {
+ if bytes[index] == b'<' {
+ end = index;
+ break;
+ }
+ index += 1;
+ }
+
+ let mut split = end;
+
+ // Move before trailing punctuation.
+ while split > start {
+ match bytes[split - 1] {
+ b'!' | b'"' | b'&' | b'\'' | b')' | b',' | b'.' | b':' | b'<' | b'>' | b'?' | b']'
+ | b'}' => {}
+ // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L61>.
+ // Note: we can’t move across actual references, because those have been parsed already.
+ b';' => {
+ let mut new_split = split - 1;
+ // Move back past alphabeticals.
+ while new_split > start && matches!(bytes[new_split - 1], b'A'..=b'Z' | b'a'..=b'z')
+ {
+ new_split -= 1;
+ }
+
+ // Nonempty character reference:
+ if new_split > start && bytes[new_split - 1] == b'&' && new_split < split - 1 {
+ split = new_split - 1;
+ continue;
+ }
+
+ // Otherwise it’s just a `;`.
+ }
+ _ => break,
+ }
+ split -= 1;
+ }
+
+ // If there was trailing punctuation, try to balance parens.
+ if split != end {
+ let mut open = 0;
+ let mut close = 0;
+ let mut paren_index = start;
+
+ // Count parens in `url` (not in trail).
+ while paren_index < split {
+ match bytes[paren_index] {
+ b'(' => open += 1,
+ b')' => close += 1,
+ _ => {}
+ }
+
+ paren_index += 1;
+ }
+
+ let mut trail_index = split;
+
+ // If there are more opening than closing parens, try to balance them
+ // from the trail.
+ while open > close && trail_index < end {
+ if bytes[trail_index] == b')' {
+ split = trail_index;
+ close += 1;
+ }
+
+ trail_index += 1;
+ }
+ }
+
+ split
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 1c1c6f7..ba1a0b3 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -28,7 +28,7 @@
//! For example, [code (fenced)][code_fenced] and
//! [code (indented)][code_indented] are considered different constructs.
//!
-//! The following constructs are found in markdown:
+//! The following constructs are found in markdown (CommonMark):
//!
//! * [attention (strong, emphasis)][attention]
//! * [autolink][]
@@ -40,7 +40,6 @@
//! * [code (indented)][code_indented]
//! * [code (text)][code_text]
//! * [definition][]
-//! * [frontmatter][]
//! * [hard break (escape)][hard_break_escape]
//! * [heading (atx)][heading_atx]
//! * [heading (setext)][heading_setext]
@@ -56,6 +55,11 @@
//! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
//! > [whitespace][partial_whitespace].
//!
+//! The following constructs are extensions found in markdown:
+//!
+//! * [frontmatter][]
+//! * [gfm autolink literal][gfm_autolink_literal]
+//!
//! There are also several small subroutines typically used in different places:
//!
//! * [bom][partial_bom]
@@ -141,6 +145,7 @@ pub mod definition;
pub mod document;
pub mod flow;
pub mod frontmatter;
+pub mod gfm_autolink_literal;
pub mod hard_break_escape;
pub mod heading_atx;
pub mod heading_setext;
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index bc6d7f4..b6f1f47 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -7,7 +7,6 @@
//! [text]: crate::construct::text
use crate::event::{Kind, Name};
-use crate::resolve::Name as ResolveName;
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use alloc::vec;
@@ -51,7 +50,6 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
}
}
- tokenizer.register_resolver_before(ResolveName::Data);
State::Ok
}
diff --git a/src/construct/string.rs b/src/construct/string.rs
index 698a51d..dba1ac1 100644
--- a/src/construct/string.rs
+++ b/src/construct/string.rs
@@ -27,7 +27,6 @@ const MARKERS: [u8; 2] = [b'&', b'\\'];
/// ^
/// ````
pub fn start(tokenizer: &mut Tokenizer) -> State {
- tokenizer.register_resolver(ResolveName::String);
tokenizer.tokenize_state.markers = &MARKERS;
State::Retry(StateName::StringBefore)
}
@@ -40,7 +39,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ````
pub fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- None => State::Ok,
+ None => {
+ tokenizer.register_resolver(ResolveName::Data);
+ tokenizer.register_resolver(ResolveName::String);
+ State::Ok
+ }
Some(b'&') => {
tokenizer.attempt(
State::Next(StateName::StringBefore),
diff --git a/src/construct/text.rs b/src/construct/text.rs
index 5c13dba..06ba378 100644
--- a/src/construct/text.rs
+++ b/src/construct/text.rs
@@ -20,6 +20,7 @@
//! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
//! > [whitespace][crate::construct::partial_whitespace].
+use crate::construct::gfm_autolink_literal::resolve as resolve_gfm_autolink_literal;
use crate::construct::partial_whitespace::resolve_whitespace;
use crate::resolve::Name as ResolveName;
use crate::state::{Name as StateName, State};
@@ -45,7 +46,6 @@ const MARKERS: [u8; 9] = [
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- tokenizer.register_resolver(ResolveName::Text);
tokenizer.tokenize_state.markers = &MARKERS;
State::Retry(StateName::TextBefore)
}
@@ -58,7 +58,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- None => State::Ok,
+ None => {
+ tokenizer.register_resolver(ResolveName::Data);
+ tokenizer.register_resolver(ResolveName::Text);
+ State::Ok
+ }
Some(b'!') => {
tokenizer.attempt(
State::Next(StateName::TextBefore),
@@ -170,4 +174,8 @@ pub fn resolve(tokenizer: &mut Tokenizer) {
tokenizer.parse_state.constructs.hard_break_trailing,
true,
);
+
+ if tokenizer.parse_state.constructs.gfm_autolink_literal {
+ resolve_gfm_autolink_literal(tokenizer);
+ }
}