Add support for GFM tagfilter

author: Titus Wormer <tituswormer@gmail.com> 2022-09-01 12:18:43 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-09-01 12:18:43 +0200
commit: 6fd5d61ed9b8cb66c13f44893d50025c9a87b217 (patch)
tree: dc471f61a8b4cec968a98ad61b3d4f14745d6c3b /src/util
parent: fa363dbba79f50001a22d1c90b8fb2009101d48c (diff)
download: markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.gz
markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.bz2
markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.zip
4 files changed, 154 insertions, 29 deletions
diff --git a/src/util/constant.rs b/src/util/constant.rs
index f397f38..d6a6651 100644
--- a/src/util/constant.rs
+++ b/src/util/constant.rs
@@ -74,6 +74,31 @@ pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
 /// [frontmatter]: crate::construct::frontmatter
 pub const FRONTMATTER_SEQUENCE_SIZE: usize = 3;
 
+/// The size of the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][].
+///
+/// This is currently the size of `plaintext`.
+pub const GFM_HTML_TAGFILTER_SIZE_MAX: usize = 9;
+
+/// List of HTML tag names that are escaped by GFM’s tag filter.
+///
+/// Tag name matching must be case-insensitive: this list holds lowercase names,
+/// so a candidate tag name has to be lowercased before it is looked up here.
+///
+/// ## References
+///
+/// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+pub const GFM_HTML_TAGFILTER_NAMES: [&str; 9] = [
+    "iframe",
+    "noembed",
+    "noframes",
+    "plaintext",
+    "script",
+    "style",
+    "textarea",
+    "title",
+    "xmp",
+];
+
 /// The number of preceding spaces needed for a [hard break
 /// (trailing)][whitespace] to form.
 ///
@@ -2427,6 +2452,12 @@ mod tests {
         );
 
         assert_eq!(
+            GFM_HTML_TAGFILTER_SIZE_MAX,
+            longest(&GFM_HTML_TAGFILTER_NAMES).unwrap().len(),
+            "`GFM_HTML_TAGFILTER_SIZE_MAX`"
+        );
+
+        assert_eq!(
             HTML_RAW_SIZE_MAX,
             longest(&HTML_RAW_NAMES).unwrap().len(),
             "`HTML_RAW_SIZE_MAX`"
diff --git a/src/util/gfm_tagfilter.rs b/src/util/gfm_tagfilter.rs
new file mode 100644
index 0000000..8023c66
--- /dev/null
+++ b/src/util/gfm_tagfilter.rs
@@ -0,0 +1,77 @@
+//! Make dangerous HTML a tiny bit safer.
+
+use crate::util::constant::{GFM_HTML_TAGFILTER_NAMES, GFM_HTML_TAGFILTER_SIZE_MAX};
+use alloc::string::String;
+use core::str;
+
+/// Make dangerous HTML a tiny bit safer.
+///
+/// The tagfilter is kinda weird and kinda useless.
+/// The tag filter is a naïve attempt at XSS protection.
+/// You should use a proper HTML sanitizing algorithm.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::gfm_tagfilter::gfm_tagfilter;
+///
+/// assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>");
+/// ```
+///
+/// ## References
+///
+/// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+/// *   [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c)
+pub fn gfm_tagfilter(value: &str) -> String {
+    let bytes = value.as_bytes();
+    // It’ll grow a bit bigger for each encoded `<`.
+    let mut result = String::with_capacity(bytes.len());
+    let mut index = 0;
+    let mut start = 0;
+    let len = bytes.len();
+
+    while index < len {
+        if bytes[index] == b'<' {
+            let mut name_start = index + 1;
+
+            // Optional `/`.
+            if name_start < len && bytes[name_start] == b'/' {
+                name_start += 1;
+            }
+
+            // Tag name.
+            let mut name_end = name_start;
+
+            while name_end < len
+                && name_end - name_start < GFM_HTML_TAGFILTER_SIZE_MAX
+                && bytes[name_end].is_ascii_alphabetic()
+            {
+                name_end += 1;
+            }
+
+            // Non-empty.
+            if name_end != name_start &&
+                // End of input (checked `get`, as indexing would panic there), HTML whitespace, closing slash, or closing angle bracket.
+                matches!(bytes.get(name_end).copied(), None | Some(b'\t' | b'\n' | 12 /* `\f` */ | b'\r' | b' ' | b'/' | b'>')) &&
+                // Known name.
+                GFM_HTML_TAGFILTER_NAMES.contains(&str::from_utf8(&bytes[name_start..name_end])
+                .unwrap()
+                .to_ascii_lowercase().as_str())
+            {
+                result.push_str(&value[start..index]);
+                result.push_str("&lt;");
+                start = index + 1;
+            }
+
+            // There was no `<` before `name_end`, so move to that next.
+            index = name_end;
+            continue;
+        }
+
+        index += 1;
+    }
+
+    result.push_str(&value[start..]);
+
+    result
+}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index d2ec0ed..e5823cf 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -5,6 +5,7 @@ pub mod constant;
 pub mod decode_character_reference;
 pub mod edit_map;
 pub mod encode;
+pub mod gfm_tagfilter;
 pub mod normalize_identifier;
 pub mod sanitize_uri;
 pub mod skip;
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 969a4d8..0099347 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -10,55 +10,71 @@ use alloc::{
 /// Make a value safe for injection as a URL.
 ///
 /// This encodes unsafe characters with percent-encoding and skips already
-/// encoded sequences (see [`normalize_uri`][] below).
+/// encoded sequences (see [`normalize`][] below).
 /// Further unsafe characters are encoded as character references (see
 /// [`encode`][]).
 ///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::sanitize_uri::sanitize;
+///
+/// assert_eq!(sanitize("javascript:alert(1)"), "javascript:alert(1)");
+/// assert_eq!(sanitize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// ```
+///
+/// ## References
+///
+/// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
+pub fn sanitize(value: &str) -> String {
+    encode(&normalize(value), true)
+}
+
+/// Make a value safe for injection as a URL, and check protocols.
+///
+/// This first uses [`sanitize`][sanitize].
 /// Then, a vec of (lowercase) allowed protocols can be given, in which case
-/// the URL is sanitized.
+/// the URL is ignored or kept.
 ///
-/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])`
-/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`.
+/// For example, `&["http", "https", "irc", "ircs", "mailto", "xmpp"]`
+/// can be used for `a[href]`, or `&["http", "https"]` for `img[src]`.
 /// If the URL includes an unknown protocol (one not matched by `protocol`, such
 /// as a dangerous example, `javascript:`), the value is ignored.
 ///
 /// ## Examples
 ///
 /// ```rust ignore
-/// use micromark::util::sanitize_url::sanitize_url;
+/// use micromark::util::sanitize_uri::sanitize_with_protocols;
 ///
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &None), "javascript:alert(1)");
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &Some(vec!["http", "https"])), "");
-/// assert_eq!(sanitize_uri("https://example.com", &Some(vec!["http", "https"])), "https://example.com");
-/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%", &Some(vec!["http", "https"])), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// assert_eq!(sanitize_with_protocols("javascript:alert(1)", &["http", "https"]), "");
+/// assert_eq!(sanitize_with_protocols("https://example.com", &["http", "https"]), "https://example.com");
+/// assert_eq!(sanitize_with_protocols("https://a👍b.c/%20/%", &["http", "https"]), "https://a%F0%9F%91%8Db.c/%20/%25");
 /// ```
 ///
 /// ## References
 ///
 /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
-pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
-    let value = encode(&*normalize_uri(value), true);
+pub fn sanitize_with_protocols(value: &str, protocols: &[&str]) -> String {
+    let value = sanitize(value);
 
-    if let Some(protocols) = protocols {
-        let end = value.find(|c| matches!(c, '?' | '#' | '/'));
-        let mut colon = value.find(|c| matches!(c, ':'));
+    let end = value.find(|c| matches!(c, '?' | '#' | '/'));
+    let mut colon = value.find(|c| matches!(c, ':'));
 
-        // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
-        if let Some(end) = end {
-            if let Some(index) = colon {
-                if index > end {
-                    colon = None;
-                }
+    // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
+    if let Some(end) = end {
+        if let Some(index) = colon {
+            if index > end {
+                colon = None;
             }
         }
+    }
 
-        // If there is no protocol, it’s relative, and fine.
-        if let Some(colon) = colon {
-            // If it is a protocol, it should be allowed.
-            let protocol = value[0..colon].to_lowercase();
-            if !protocols.contains(&protocol.as_str()) {
-                return "".to_string();
-            }
+    // If there is no protocol, it’s relative, and fine.
+    if let Some(colon) = colon {
+        // If it is a protocol, it should be allowed.
+        let protocol = value[0..colon].to_lowercase();
+        if !protocols.contains(&protocol.as_str()) {
+            return "".to_string();
         }
     }
 
@@ -74,7 +90,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
 /// ## Examples
 ///
 /// ```rust ignore
-/// use micromark::util::sanitize_url::normalize_uri;
+/// use micromark::util::sanitize_uri::normalize;
 ///
 /// assert_eq!(sanitize_uri("https://example.com"), "https://example.com");
 /// assert_eq!(sanitize_uri("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
@@ -86,7 +102,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
 ///
 /// [definition]: crate::construct::definition
 /// [label_end]: crate::construct::label_end
-fn normalize_uri(value: &str) -> String {
+fn normalize(value: &str) -> String {
     let chars = value.chars().collect::<Vec<_>>();
     // Note: it’ll grow bigger for each non-ascii or non-safe character.
     let mut result = String::with_capacity(value.len());
author	Titus Wormer <tituswormer@gmail.com>	2022-09-01 12:18:43 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-09-01 12:18:43 +0200
commit	6fd5d61ed9b8cb66c13f44893d50025c9a87b217 (patch)
tree	dc471f61a8b4cec968a98ad61b3d4f14745d6c3b /src/util
parent	fa363dbba79f50001a22d1c90b8fb2009101d48c (diff)
download	markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.gz markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.bz2 markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.zip