Add support for GFM tagfilter

author: Titus Wormer <tituswormer@gmail.com> 2022-09-01 12:18:43 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-09-01 12:18:43 +0200
commit: 6fd5d61ed9b8cb66c13f44893d50025c9a87b217 (patch)
tree: dc471f61a8b4cec968a98ad61b3d4f14745d6c3b /src
parent: fa363dbba79f50001a22d1c90b8fb2009101d48c (diff)
download: markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.gz
markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.bz2
markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.zip
8 files changed, 246 insertions, 77 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 5626f8a..681ec00 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -4,8 +4,9 @@ use crate::util::{
     constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC},
     decode_character_reference::{decode_named, decode_numeric},
     encode::encode,
+    gfm_tagfilter::gfm_tagfilter,
     normalize_identifier::normalize_identifier,
-    sanitize_uri::sanitize_uri,
+    sanitize_uri::{sanitize, sanitize_with_protocols},
     skip,
     slice::{Position, Slice},
 };
@@ -156,16 +157,8 @@ struct CompileContext<'a> {
     /// Whether to encode HTML.
     pub encode_html: bool,
     // Configuration
-    /// Whether to sanitize `href`s, and in which case, which protocols to
-    /// allow.
-    pub protocol_href: Option<Vec<&'static str>>,
-    /// Whether to sanitize `src`s, and in which case, which protocols to
-    /// allow.
-    pub protocol_src: Option<Vec<&'static str>>,
     /// Line ending to use.
     pub line_ending_default: LineEnding,
-    /// Whether to allow HTML.
-    pub allow_dangerous_html: bool,
     // Intermediate results.
     /// Stack of buffers.
     pub buffers: Vec<String>,
@@ -203,18 +196,7 @@ impl<'a> CompileContext<'a> {
             slurp_one_line_ending: false,
             image_alt_inside: false,
             encode_html: true,
-            protocol_href: if options.allow_dangerous_protocol {
-                None
-            } else {
-                Some(SAFE_PROTOCOL_HREF.to_vec())
-            },
-            protocol_src: if options.allow_dangerous_protocol {
-                None
-            } else {
-                Some(SAFE_PROTOCOL_SRC.to_vec())
-            },
             line_ending_default: line_ending,
-            allow_dangerous_html: options.allow_dangerous_html,
             buffers: vec![String::new()],
             index: 0,
             options,
@@ -701,14 +683,14 @@ fn on_enter_gfm_task_list_item_check(context: &mut CompileContext) {
 /// Handle [`Enter`][Kind::Enter]:[`HtmlFlow`][Name::HtmlFlow].
 fn on_enter_html_flow(context: &mut CompileContext) {
     context.line_ending_if_needed();
-    if context.allow_dangerous_html {
+    if context.options.allow_dangerous_html {
         context.encode_html = false;
     }
 }
 
 /// Handle [`Enter`][Kind::Enter]:[`HtmlText`][Name::HtmlText].
 fn on_enter_html_text(context: &mut CompileContext) {
-    if context.allow_dangerous_html {
+    if context.options.allow_dangerous_html {
         context.encode_html = false;
     }
 }
@@ -1198,7 +1180,7 @@ fn on_exit_gfm_footnote_call(context: &mut CompileContext) {
     let indices = context.media_stack.pop().unwrap().label_id.unwrap();
     let id =
         normalize_identifier(Slice::from_indices(context.bytes, indices.0, indices.1).as_str());
-    let safe_id = sanitize_uri(&id.to_lowercase(), &None);
+    let safe_id = sanitize(&id.to_lowercase());
     let mut call_index = 0;
 
     // See if this has been called before.
@@ -1428,14 +1410,19 @@ fn on_exit_html(context: &mut CompileContext) {
 
 /// Handle [`Exit`][Kind::Exit]:{[`HtmlFlowData`][Name::HtmlFlowData],[`HtmlTextData`][Name::HtmlTextData]}.
 fn on_exit_html_data(context: &mut CompileContext) {
-    context.push(&encode(
-        Slice::from_position(
-            context.bytes,
-            &Position::from_exit_event(context.events, context.index),
-        )
-        .as_str(),
-        context.encode_html,
-    ));
+    let slice = Slice::from_position(
+        context.bytes,
+        &Position::from_exit_event(context.events, context.index),
+    );
+    let value = slice.as_str();
+
+    let encoded = if context.options.gfm_tagfilter && context.options.allow_dangerous_html {
+        encode(&gfm_tagfilter(value), context.encode_html)
+    } else {
+        encode(value, context.encode_html)
+    };
+
+    context.push(&encoded);
 }
 
 /// Handle [`Exit`][Kind::Exit]:[`Label`][Name::Label].
@@ -1585,14 +1572,19 @@ fn on_exit_media(context: &mut CompileContext) {
         };
 
         if let Some(destination) = destination {
-            context.push(&sanitize_uri(
-                destination,
-                if media.image {
-                    &context.protocol_src
-                } else {
-                    &context.protocol_href
-                },
-            ));
+            let url = if context.options.allow_dangerous_protocol {
+                sanitize(destination)
+            } else {
+                sanitize_with_protocols(
+                    destination,
+                    if media.image {
+                        &SAFE_PROTOCOL_SRC
+                    } else {
+                        &SAFE_PROTOCOL_HREF
+                    },
+                )
+            };
+            context.push(&url);
         }
 
         if media.image {
@@ -1728,7 +1720,7 @@ fn generate_footnote_section(context: &mut CompileContext) {
 /// Generate a footnote item from a call.
 fn generate_footnote_item(context: &mut CompileContext, index: usize) {
     let id = &context.gfm_footnote_definition_calls[index].0;
-    let safe_id = sanitize_uri(&id.to_lowercase(), &None);
+    let safe_id = sanitize(&id.to_lowercase());
 
     // Find definition: we’ll always find it.
     let mut definition_index = 0;
@@ -1833,14 +1825,19 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) {
 fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) {
     if !context.image_alt_inside {
         context.push("<a href=\"");
-        if let Some(protocol) = protocol {
-            context.push(&sanitize_uri(
-                &format!("{}{}", protocol, value),
-                &context.protocol_href,
-            ));
+        let url = if let Some(protocol) = protocol {
+            format!("{}{}", protocol, value)
+        } else {
+            value.to_string()
+        };
+
+        let url = if context.options.allow_dangerous_protocol {
+            sanitize(&url)
         } else {
-            context.push(&sanitize_uri(value, &context.protocol_href));
+            sanitize_with_protocols(&url, &SAFE_PROTOCOL_HREF)
         };
+
+        context.push(&url);
         context.push("\">");
     }
 
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 1d67635..1071489 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -96,7 +96,7 @@
 //! [label]: crate::construct::partial_label
 //! [label_end]: crate::construct::label_end
 //! [title]: crate::construct::partial_title
-//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri
+//! [sanitize_uri]: crate::util::sanitize_uri::sanitize
 //! [normalize_identifier]: crate::util::normalize_identifier
 //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
 //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 8a9edfb..ce1c295 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -173,7 +173,7 @@
 //! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote
 //! [definition]: crate::construct::definition
 //! [autolink]: crate::construct::autolink
-//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri
+//! [sanitize_uri]: crate::util::sanitize_uri::sanitize
 //! [normalize_identifier]: crate::util::normalize_identifier::normalize_identifier
 //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
 //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element
diff --git a/src/lib.rs b/src/lib.rs
index 0cf4f49..e3fdfcb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -732,6 +732,52 @@ pub struct Options {
     /// ```
     pub gfm_strikethrough_single_tilde: bool,
 
+    /// Whether to support the GFM tagfilter, when `allow_dangerous_html` is on
+    /// (default: `false`).
+    ///
+    /// The tagfilter is a naïve attempt at XSS protection: it only escapes the
+    /// `<` of a few tag names (such as `iframe`) and is of little use by itself.
+    /// You should use a proper HTML sanitizing algorithm.
+    ///
+    /// ## Examples
+    ///
+    /// ```
+    /// use micromark::{micromark_with_options, Options, Constructs};
+    ///
+    /// // With `allow_dangerous_html`, micromark passes HTML through untouched:
+    /// assert_eq!(
+    ///     micromark_with_options(
+    ///         "<iframe>",
+    ///         &Options {
+    ///             allow_dangerous_html: true,
+    ///             constructs: Constructs::gfm(),
+    ///             ..Options::default()
+    ///         }
+    ///     ),
+    ///     "<iframe>"
+    /// );
+    ///
+    /// // Pass `gfm_tagfilter: true` to make some of that safe:
+    /// assert_eq!(
+    ///     micromark_with_options(
+    ///         "<iframe>",
+    ///         &Options {
+    ///             allow_dangerous_html: true,
+    ///             constructs: Constructs::gfm(),
+    ///             gfm_tagfilter: true,
+    ///             ..Options::default()
+    ///         }
+    ///     ),
+    ///     "&lt;iframe>"
+    /// );
+    /// ```
+    ///
+    /// ## References
+    ///
+    /// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+    /// *   [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c)
+    pub gfm_tagfilter: bool,
+
     /// Whether to support math (text) (if enabled in `constructs`) with a
     /// single dollar (default: `true`).
     ///
@@ -791,6 +837,7 @@ impl Default for Options {
             gfm_footnote_back_label: None,
             gfm_footnote_clobber_prefix: None,
             gfm_strikethrough_single_tilde: true,
+            gfm_tagfilter: false,
             math_text_single_dollar: true,
         }
     }
diff --git a/src/util/constant.rs b/src/util/constant.rs
index f397f38..d6a6651 100644
--- a/src/util/constant.rs
+++ b/src/util/constant.rs
@@ -74,6 +74,31 @@ pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
 /// [frontmatter]: crate::construct::frontmatter
 pub const FRONTMATTER_SEQUENCE_SIZE: usize = 3;
 
+/// The length of the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][].
+///
+/// This is currently the length of `plaintext` (9 bytes).
+pub const GFM_HTML_TAGFILTER_SIZE_MAX: usize = 9;
+
+/// List of HTML tag names that are escaped by GFM’s tag filter.
+///
+/// Tag name matching must be performed case-insensitively, and thus this list
+/// contains only lowercase tag names.
+///
+/// ## References
+///
+/// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+pub const GFM_HTML_TAGFILTER_NAMES: [&str; 9] = [
+    "iframe",
+    "noembed",
+    "noframes",
+    "plaintext",
+    "script",
+    "style",
+    "textarea",
+    "title",
+    "xmp",
+];
+
 /// The number of preceding spaces needed for a [hard break
 /// (trailing)][whitespace] to form.
 ///
@@ -2427,6 +2452,12 @@ mod tests {
         );
 
         assert_eq!(
+            GFM_HTML_TAGFILTER_SIZE_MAX,
+            longest(&GFM_HTML_TAGFILTER_NAMES).unwrap().len(),
+            "`GFM_HTML_TAGFILTER_SIZE_MAX`"
+        );
+
+        assert_eq!(
             HTML_RAW_SIZE_MAX,
             longest(&HTML_RAW_NAMES).unwrap().len(),
             "`HTML_RAW_SIZE_MAX`"
diff --git a/src/util/gfm_tagfilter.rs b/src/util/gfm_tagfilter.rs
new file mode 100644
index 0000000..8023c66
--- /dev/null
+++ b/src/util/gfm_tagfilter.rs
@@ -0,0 +1,77 @@
+//! Make dangerous HTML a tiny bit safer.
+
+use crate::util::constant::{GFM_HTML_TAGFILTER_NAMES, GFM_HTML_TAGFILTER_SIZE_MAX};
+use alloc::string::String;
+use core::str;
+
+/// Make dangerous HTML a tiny bit safer.
+///
+/// Escapes the `<` of tags whose name is in [`GFM_HTML_TAGFILTER_NAMES`][].
+/// This is a naïve attempt at XSS protection that is of little use by itself:
+/// you should use a proper HTML sanitizing algorithm.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::gfm_tagfilter::gfm_tagfilter;
+///
+/// assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>");
+/// ```
+///
+/// ## References
+///
+/// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+/// *   [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c)
+pub fn gfm_tagfilter(value: &str) -> String {
+    let bytes = value.as_bytes();
+    let len = bytes.len();
+    // It’ll grow a bit bigger for each encoded `<`.
+    let mut result = String::with_capacity(len);
+    // Where the part of `value` not yet copied into `result` starts.
+    let mut start = 0;
+    let mut index = 0;
+
+    while index < len {
+        if bytes[index] != b'<' {
+            index += 1;
+            continue;
+        }
+
+        // Optional `/`.
+        let mut name_start = index + 1;
+        if name_start < len && bytes[name_start] == b'/' {
+            name_start += 1;
+        }
+
+        // Tag name: ASCII letters, at most as many as the longest known name.
+        let mut name_end = name_start;
+        while name_end < len
+            && name_end - name_start < GFM_HTML_TAGFILTER_SIZE_MAX
+            && bytes[name_end].is_ascii_alphabetic()
+        {
+            name_end += 1;
+        }
+
+        // The name must be followed by the end of `value` (checked first, so
+        // `bytes[name_end]` is never out of bounds), HTML whitespace
+        // (`\x0C` is `\f`), a closing slash, or a closing angle bracket.
+        let terminated = name_end == len
+            || matches!(bytes[name_end], b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'/' | b'>');
+        // Only ASCII letters were accepted, so this is always valid UTF-8.
+        let name = str::from_utf8(&bytes[name_start..name_end]).unwrap();
+        let known = GFM_HTML_TAGFILTER_NAMES.iter().any(|tag| tag.eq_ignore_ascii_case(name));
+
+        // Non-empty, terminated, and a known name (in any ASCII case).
+        if name_end != name_start && terminated && known {
+            result.push_str(&value[start..index]);
+            result.push_str("&lt;");
+            start = index + 1;
+        }
+
+        // There was no `<` before `name_end`, so move to that next.
+        index = name_end;
+    }
+
+    result.push_str(&value[start..]);
+    result
+}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index d2ec0ed..e5823cf 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -5,6 +5,7 @@ pub mod constant;
 pub mod decode_character_reference;
 pub mod edit_map;
 pub mod encode;
+pub mod gfm_tagfilter;
 pub mod normalize_identifier;
 pub mod sanitize_uri;
 pub mod skip;
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 969a4d8..0099347 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -10,55 +10,71 @@ use alloc::{
 /// Make a value safe for injection as a URL.
 ///
 /// This encodes unsafe characters with percent-encoding and skips already
-/// encoded sequences (see [`normalize_uri`][] below).
+/// encoded sequences (see [`normalize`][] below).
 /// Further unsafe characters are encoded as character references (see
 /// [`encode`][]).
 ///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::sanitize_uri::sanitize;
+///
+/// assert_eq!(sanitize("javascript:alert(1)"), "javascript:alert(1)");
+/// assert_eq!(sanitize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// ```
+///
+/// ## References
+///
+/// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
+pub fn sanitize(value: &str) -> String {
+    encode(normalize(value).as_str(), true)
+}
+
+/// Make a value safe for injection as a URL, and check protocols.
+///
+/// This first uses [`sanitize`][sanitize].
 /// Then, a vec of (lowercase) allowed protocols can be given, in which case
-/// the URL is sanitized.
+/// the URL is ignored or kept.
 ///
-/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])`
-/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`.
+/// For example, `&["http", "https", "irc", "ircs", "mailto", "xmpp"]`
+/// can be used for `a[href]`, or `&["http", "https"]` for `img[src]`.
 /// If the URL includes an unknown protocol (one not matched by `protocol`, such
 /// as a dangerous example, `javascript:`), the value is ignored.
 ///
 /// ## Examples
 ///
 /// ```rust ignore
-/// use micromark::util::sanitize_url::sanitize_url;
+/// use micromark::util::sanitize_uri::sanitize_with_protocols;
 ///
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &None), "javascript:alert(1)");
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &Some(vec!["http", "https"])), "");
-/// assert_eq!(sanitize_uri("https://example.com", &Some(vec!["http", "https"])), "https://example.com");
-/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%", &Some(vec!["http", "https"])), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// assert_eq!(sanitize_with_protocols("javascript:alert(1)", &["http", "https"]), "");
+/// assert_eq!(sanitize_with_protocols("https://example.com", &["http", "https"]), "https://example.com");
+/// assert_eq!(sanitize_with_protocols("https://a👍b.c/%20/%", &["http", "https"]), "https://a%F0%9F%91%8Db.c/%20/%25");
 /// ```
 ///
 /// ## References
 ///
 /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
-pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
-    let value = encode(&*normalize_uri(value), true);
+pub fn sanitize_with_protocols(value: &str, protocols: &[&str]) -> String {
+    let value = sanitize(value);
 
-    if let Some(protocols) = protocols {
-        let end = value.find(|c| matches!(c, '?' | '#' | '/'));
-        let mut colon = value.find(|c| matches!(c, ':'));
+    let end = value.find(|c| matches!(c, '?' | '#' | '/'));
+    let mut colon = value.find(|c| matches!(c, ':'));
 
-        // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
-        if let Some(end) = end {
-            if let Some(index) = colon {
-                if index > end {
-                    colon = None;
-                }
+    // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
+    if let Some(end) = end {
+        if let Some(index) = colon {
+            if index > end {
+                colon = None;
             }
         }
+    }
 
-        // If there is no protocol, it’s relative, and fine.
-        if let Some(colon) = colon {
-            // If it is a protocol, it should be allowed.
-            let protocol = value[0..colon].to_lowercase();
-            if !protocols.contains(&protocol.as_str()) {
-                return "".to_string();
-            }
+    // If there is no protocol, it’s relative, and fine.
+    if let Some(colon) = colon {
+        // If it is a protocol, it should be allowed.
+        let protocol = value[0..colon].to_lowercase();
+        if !protocols.contains(&protocol.as_str()) {
+            return "".to_string();
         }
     }
 
@@ -74,7 +90,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
 /// ## Examples
 ///
 /// ```rust ignore
-/// use micromark::util::sanitize_url::normalize_uri;
+/// use micromark::util::sanitize_uri::normalize;
 ///
 /// assert_eq!(sanitize_uri("https://example.com"), "https://example.com");
 /// assert_eq!(sanitize_uri("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
@@ -86,7 +102,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
 ///
 /// [definition]: crate::construct::definition
 /// [label_end]: crate::construct::label_end
-fn normalize_uri(value: &str) -> String {
+fn normalize(value: &str) -> String {
     let chars = value.chars().collect::<Vec<_>>();
     // Note: it’ll grow bigger for each non-ascii or non-safe character.
     let mut result = String::with_capacity(value.len());
author	Titus Wormer <tituswormer@gmail.com>	2022-09-01 12:18:43 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-09-01 12:18:43 +0200
commit	6fd5d61ed9b8cb66c13f44893d50025c9a87b217 (patch)
tree	dc471f61a8b4cec968a98ad61b3d4f14745d6c3b /src
parent	fa363dbba79f50001a22d1c90b8fb2009101d48c (diff)
download	markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.gz markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.bz2 markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.zip