diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-09-01 12:18:43 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-09-01 12:18:43 +0200 |
commit | 6fd5d61ed9b8cb66c13f44893d50025c9a87b217 (patch) | |
tree | dc471f61a8b4cec968a98ad61b3d4f14745d6c3b /src/util | |
parent | fa363dbba79f50001a22d1c90b8fb2009101d48c (diff) | |
download | markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.gz markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.bz2 markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.zip |
Add support for GFM tagfilter
Diffstat (limited to 'src/util')
-rw-r--r-- | src/util/constant.rs | 31 | ||||
-rw-r--r-- | src/util/gfm_tagfilter.rs | 77 | ||||
-rw-r--r-- | src/util/mod.rs | 1 | ||||
-rw-r--r-- | src/util/sanitize_uri.rs | 74 |
4 files changed, 154 insertions, 29 deletions
diff --git a/src/util/constant.rs b/src/util/constant.rs index f397f38..d6a6651 100644 --- a/src/util/constant.rs +++ b/src/util/constant.rs @@ -74,6 +74,31 @@ pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3; /// [frontmatter]: crate::construct::frontmatter pub const FRONTMATTER_SEQUENCE_SIZE: usize = 3; +/// The number of the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][]. +/// +/// This is currently the size of `plaintext`. +pub const GFM_HTML_TAGFILTER_SIZE_MAX: usize = 9; + +/// List of HTML tag names that are escaped by GFMs tag filter. +/// +/// Tag name matching must be performed insensitive to case, and thus this list +/// includes lowercase tag names. +/// +/// ## References +/// +/// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) +pub const GFM_HTML_TAGFILTER_NAMES: [&str; 9] = [ + "iframe", + "noembed", + "noframes", + "plaintext", + "script", + "style", + "textarea", + "title", + "xmp", +]; + /// The number of preceding spaces needed for a [hard break /// (trailing)][whitespace] to form. /// @@ -2427,6 +2452,12 @@ mod tests { ); assert_eq!( + GFM_HTML_TAGFILTER_SIZE_MAX, + longest(&GFM_HTML_TAGFILTER_NAMES).unwrap().len(), + "`GFM_HTML_TAGFILTER_SIZE_MAX`" + ); + + assert_eq!( HTML_RAW_SIZE_MAX, longest(&HTML_RAW_NAMES).unwrap().len(), "`HTML_RAW_SIZE_MAX`" diff --git a/src/util/gfm_tagfilter.rs b/src/util/gfm_tagfilter.rs new file mode 100644 index 0000000..8023c66 --- /dev/null +++ b/src/util/gfm_tagfilter.rs @@ -0,0 +1,77 @@ +//! Make dangerous HTML a tiny bit safer. + +use crate::util::constant::{GFM_HTML_TAGFILTER_NAMES, GFM_HTML_TAGFILTER_SIZE_MAX}; +use alloc::string::String; +use core::str; + +/// Make dangerous HTML a tiny bit safer. +/// +/// The tagfilter is kinda weird and kinda useless. +/// The tag filter is a naïve attempt at XSS protection. +/// You should use a proper HTML sanitizing algorithm. 
+/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::gfm_tagfilter::gfm_tagfilter; +/// +/// assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>"); +/// ``` +/// +/// ## References +/// +/// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) +/// * [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c) +pub fn gfm_tagfilter(value: &str) -> String { + let bytes = value.as_bytes(); + // It’ll grow a bit bigger for each encoded `<`. + let mut result = String::with_capacity(bytes.len()); + let mut index = 0; + let mut start = 0; + let len = bytes.len(); + + while index < len { + if bytes[index] == b'<' { + let mut name_start = index + 1; + + // Optional `/`. + if name_start < len && bytes[name_start] == b'/' { + name_start += 1; + } + + // Tag name. + let mut name_end = name_start; + + while name_end < len + && name_end - name_start < GFM_HTML_TAGFILTER_SIZE_MAX + && bytes[name_end].is_ascii_alphabetic() + { + name_end += 1; + } + + // Non-empty. + if name_end != name_start && + // HTML whitespace, closing slash, or closing angle bracket. + matches!(bytes[name_end], b'\t' | b'\n' | 12 /* `\f` */ | b'\r' | b' ' | b'/' | b'>') && + // Known name. + GFM_HTML_TAGFILTER_NAMES.contains(&str::from_utf8(&bytes[name_start..name_end]) + .unwrap() + .to_ascii_lowercase().as_str()) + { + result.push_str(&value[start..index]); + result.push_str("&lt;"); + start = index + 1; + } + + // There was no `<` before `name_end`, so move to that next. 
+ index = name_end; + continue; + } + + index += 1; + } + + result.push_str(&value[start..]); + + result +} diff --git a/src/util/mod.rs b/src/util/mod.rs index d2ec0ed..e5823cf 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -5,6 +5,7 @@ pub mod constant; pub mod decode_character_reference; pub mod edit_map; pub mod encode; +pub mod gfm_tagfilter; pub mod normalize_identifier; pub mod sanitize_uri; pub mod skip; diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 969a4d8..0099347 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -10,55 +10,71 @@ use alloc::{ /// Make a value safe for injection as a URL. /// /// This encodes unsafe characters with percent-encoding and skips already -/// encoded sequences (see [`normalize_uri`][] below). +/// encoded sequences (see [`normalize`][] below). /// Further unsafe characters are encoded as character references (see /// [`encode`][]). /// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::sanitize_uri::sanitize; +/// +/// assert_eq!(sanitize("javascript:alert(1)"), "javascript:alert(1)"); +/// assert_eq!(sanitize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25"); +/// ``` +/// +/// ## References +/// +/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) +pub fn sanitize(value: &str) -> String { + encode(&*normalize(value), true) +} + +/// Make a value safe for injection as a URL, and check protocols. +/// +/// This first uses [`sanitize`][sanitize]. /// Then, a vec of (lowercase) allowed protocols can be given, in which case -/// the URL is sanitized. +/// the URL is ignored or kept. /// -/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])` -/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`. 
+/// For example, `&["http", "https", "irc", "ircs", "mailto", "xmpp"]` +/// can be used for `a[href]`, or `&["http", "https"]` for `img[src]`. /// If the URL includes an unknown protocol (one not matched by `protocol`, such /// as a dangerous example, `javascript:`), the value is ignored. /// /// ## Examples /// /// ```rust ignore -/// use micromark::util::sanitize_url::sanitize_url; +/// use micromark::util::sanitize_uri::sanitize_with_protocols; /// -/// assert_eq!(sanitize_uri("javascript:alert(1)", &None), "javascript:alert(1)"); -/// assert_eq!(sanitize_uri("javascript:alert(1)", &Some(vec!["http", "https"])), ""); -/// assert_eq!(sanitize_uri("https://example.com", &Some(vec!["http", "https"])), "https://example.com"); -/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%", &Some(vec!["http", "https"])), "https://a%F0%9F%91%8Db.c/%20/%25"); +/// assert_eq!(sanitize_with_protocols("javascript:alert(1)", &["http", "https"]), ""); +/// assert_eq!(sanitize_with_protocols("https://example.com", &["http", "https"]), "https://example.com"); +/// assert_eq!(sanitize_with_protocols("https://a👍b.c/%20/%", &["http", "https"]), "https://a%F0%9F%91%8Db.c/%20/%25"); /// ``` /// /// ## References /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) -pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { - let value = encode(&*normalize_uri(value), true); +pub fn sanitize_with_protocols(value: &str, protocols: &[&str]) -> String { + let value = sanitize(value); - if let Some(protocols) = protocols { - let end = value.find(|c| matches!(c, '?' | '#' | '/')); - let mut colon = value.find(|c| matches!(c, ':')); + let end = value.find(|c| matches!(c, '?' | '#' | '/')); + let mut colon = value.find(|c| matches!(c, ':')); - // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. 
- if let Some(end) = end { - if let Some(index) = colon { - if index > end { - colon = None; - } + // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. + if let Some(end) = end { + if let Some(index) = colon { + if index > end { + colon = None; } } + } - // If there is no protocol, it’s relative, and fine. - if let Some(colon) = colon { - // If it is a protocol, it should be allowed. - let protocol = value[0..colon].to_lowercase(); - if !protocols.contains(&protocol.as_str()) { - return "".to_string(); - } + // If there is no protocol, it’s relative, and fine. + if let Some(colon) = colon { + // If it is a protocol, it should be allowed. + let protocol = value[0..colon].to_lowercase(); + if !protocols.contains(&protocol.as_str()) { + return "".to_string(); } } @@ -74,7 +90,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { /// ## Examples /// /// ```rust ignore -/// use micromark::util::sanitize_url::normalize_uri; +/// use micromark::util::sanitize_uri::normalize; /// /// assert_eq!(sanitize_uri("https://example.com"), "https://example.com"); /// assert_eq!(sanitize_uri("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25"); @@ -86,7 +102,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { /// /// [definition]: crate::construct::definition /// [label_end]: crate::construct::label_end -fn normalize_uri(value: &str) -> String { +fn normalize(value: &str) -> String { let chars = value.chars().collect::<Vec<_>>(); // Note: it’ll grow bigger for each non-ascii or non-safe character. let mut result = String::with_capacity(value.len()); |