aboutsummaryrefslogtreecommitdiffstats
path: root/src/util
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/util/constant.rs31
-rw-r--r--src/util/gfm_tagfilter.rs77
-rw-r--r--src/util/mod.rs1
-rw-r--r--src/util/sanitize_uri.rs74
4 files changed, 154 insertions, 29 deletions
diff --git a/src/util/constant.rs b/src/util/constant.rs
index f397f38..d6a6651 100644
--- a/src/util/constant.rs
+++ b/src/util/constant.rs
@@ -74,6 +74,31 @@ pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
/// [frontmatter]: crate::construct::frontmatter
pub const FRONTMATTER_SEQUENCE_SIZE: usize = 3;
+/// The length of the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][].
+///
+/// This is currently the size of `plaintext`.
+pub const GFM_HTML_TAGFILTER_SIZE_MAX: usize = 9;
+
+/// List of HTML tag names that are escaped by GFM’s tag filter.
+///
+/// Tag name matching must be performed insensitive to case: this list contains
+/// only lowercase tag names, so candidates must be compared ignoring case.
+///
+/// ## References
+///
+/// * [*§ 6.11 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+pub const GFM_HTML_TAGFILTER_NAMES: [&str; 9] = [
+ "iframe",
+ "noembed",
+ "noframes",
+ "plaintext",
+ "script",
+ "style",
+ "textarea",
+ "title",
+ "xmp",
+];
+
/// The number of preceding spaces needed for a [hard break
/// (trailing)][whitespace] to form.
///
@@ -2427,6 +2452,12 @@ mod tests {
);
assert_eq!(
+ GFM_HTML_TAGFILTER_SIZE_MAX,
+ longest(&GFM_HTML_TAGFILTER_NAMES).unwrap().len(),
+ "`GFM_HTML_TAGFILTER_SIZE_MAX`"
+ );
+
+ assert_eq!(
HTML_RAW_SIZE_MAX,
longest(&HTML_RAW_NAMES).unwrap().len(),
"`HTML_RAW_SIZE_MAX`"
diff --git a/src/util/gfm_tagfilter.rs b/src/util/gfm_tagfilter.rs
new file mode 100644
index 0000000..8023c66
--- /dev/null
+++ b/src/util/gfm_tagfilter.rs
@@ -0,0 +1,77 @@
+//! Make dangerous HTML a tiny bit safer.
+
+use crate::util::constant::{GFM_HTML_TAGFILTER_NAMES, GFM_HTML_TAGFILTER_SIZE_MAX};
+use alloc::string::String;
+use core::str;
+
+/// Make dangerous HTML a tiny bit safer.
+///
+/// The tagfilter is kinda weird and kinda useless.
+/// The tag filter is a naïve attempt at XSS protection.
+/// You should use a proper HTML sanitizing algorithm.
+///
+/// The `<` of an opening or closing tag is replaced with `&lt;` when the tag
+/// name is one of [`GFM_HTML_TAGFILTER_NAMES`][] (compared ignoring ASCII
+/// case) and the name is directly followed by HTML whitespace, `/`, `>`, or
+/// the end of `value`.
+/// Everything else is copied through unchanged.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::gfm_tagfilter::gfm_tagfilter;
+///
+/// assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>");
+/// assert_eq!(gfm_tagfilter("</SCRIPT >"), "&lt;/SCRIPT >");
+/// assert_eq!(gfm_tagfilter("<b>"), "<b>");
+/// // A filtered name that runs up to the end of the input is escaped too.
+/// assert_eq!(gfm_tagfilter("<script"), "&lt;script");
+/// ```
+///
+/// ## References
+///
+/// * [*§ 6.11 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+/// * [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c)
+pub fn gfm_tagfilter(value: &str) -> String {
+    let bytes = value.as_bytes();
+    let len = bytes.len();
+    // It’ll grow a bit bigger for each encoded `<`.
+    let mut result = String::with_capacity(len);
+    let mut index = 0;
+    // Everything in `value[..start]` has already been written to `result`.
+    let mut start = 0;
+
+    while index < len {
+        if bytes[index] != b'<' {
+            index += 1;
+            continue;
+        }
+
+        let mut name_start = index + 1;
+
+        // Optional `/`.
+        if name_start < len && bytes[name_start] == b'/' {
+            name_start += 1;
+        }
+
+        // Tag name: ASCII letters, at most as many as the longest filtered
+        // name.
+        let mut name_end = name_start;
+
+        while name_end < len
+            && name_end - name_start < GFM_HTML_TAGFILTER_SIZE_MAX
+            && bytes[name_end].is_ascii_alphabetic()
+        {
+            name_end += 1;
+        }
+
+        // HTML whitespace, closing slash, closing angle bracket, or the end
+        // of the input.
+        // The checked `get` matters: `name_end` equals `len` when the name
+        // runs up to the end (such as `<script`), and indexing there panics.
+        let terminated = bytes.get(name_end).map_or(true, |&byte| {
+            matches!(byte, b'\t' | b'\n' | 12 /* `\f` */ | b'\r' | b' ' | b'/' | b'>')
+        });
+
+        // Non-empty and properly terminated.
+        if name_end != name_start && terminated {
+            let name = str::from_utf8(&bytes[name_start..name_end])
+                .expect("a run of ASCII letters is valid UTF-8");
+
+            // Known name, compared without allocating a lowercased copy.
+            if GFM_HTML_TAGFILTER_NAMES
+                .iter()
+                .any(|known| known.eq_ignore_ascii_case(name))
+            {
+                result.push_str(&value[start..index]);
+                result.push_str("&lt;");
+                start = index + 1;
+            }
+        }
+
+        // There was no `<` before `name_end`, so move to that next.
+        index = name_end;
+    }
+
+    result.push_str(&value[start..]);
+
+    result
+}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index d2ec0ed..e5823cf 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -5,6 +5,7 @@ pub mod constant;
pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
+pub mod gfm_tagfilter;
pub mod normalize_identifier;
pub mod sanitize_uri;
pub mod skip;
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 969a4d8..0099347 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -10,55 +10,71 @@ use alloc::{
/// Make a value safe for injection as a URL.
///
/// This encodes unsafe characters with percent-encoding and skips already
-/// encoded sequences (see [`normalize_uri`][] below).
+/// encoded sequences (see [`normalize`][] below).
/// Further unsafe characters are encoded as character references (see
/// [`encode`][]).
///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::sanitize_uri::sanitize;
+///
+/// assert_eq!(sanitize("javascript:alert(1)"), "javascript:alert(1)");
+/// assert_eq!(sanitize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// ```
+///
+/// ## References
+///
+/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
+pub fn sanitize(value: &str) -> String {
+    // First percent-encode, then encode what is left as character references.
+    let normalized = normalize(value);
+    encode(normalized.as_str(), true)
+}
+
+/// Make a value safe for injection as a URL, and check protocols.
+///
+/// This first uses [`sanitize`][sanitize].
/// Then, a vec of (lowercase) allowed protocols can be given, in which case
-/// the URL is sanitized.
+/// the URL is ignored or kept.
///
-/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])`
-/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`.
+/// For example, `&["http", "https", "irc", "ircs", "mailto", "xmpp"]`
+/// can be used for `a[href]`, or `&["http", "https"]` for `img[src]`.
/// If the URL includes an unknown protocol (one not matched by `protocol`, such
/// as a dangerous example, `javascript:`), the value is ignored.
///
/// ## Examples
///
/// ```rust ignore
-/// use micromark::util::sanitize_url::sanitize_url;
+/// use micromark::util::sanitize_uri::sanitize_with_protocols;
///
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &None), "javascript:alert(1)");
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &Some(vec!["http", "https"])), "");
-/// assert_eq!(sanitize_uri("https://example.com", &Some(vec!["http", "https"])), "https://example.com");
-/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%", &Some(vec!["http", "https"])), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// assert_eq!(sanitize_with_protocols("javascript:alert(1)", &["http", "https"]), "");
+/// assert_eq!(sanitize_with_protocols("https://example.com", &["http", "https"]), "https://example.com");
+/// assert_eq!(sanitize_with_protocols("https://a👍b.c/%20/%", &["http", "https"]), "https://a%F0%9F%91%8Db.c/%20/%25");
/// ```
///
/// ## References
///
/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
-pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
- let value = encode(&*normalize_uri(value), true);
+pub fn sanitize_with_protocols(value: &str, protocols: &[&str]) -> String {
+ let value = sanitize(value);
- if let Some(protocols) = protocols {
- let end = value.find(|c| matches!(c, '?' | '#' | '/'));
- let mut colon = value.find(|c| matches!(c, ':'));
+ let end = value.find(|c| matches!(c, '?' | '#' | '/'));
+ let mut colon = value.find(|c| matches!(c, ':'));
- // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
- if let Some(end) = end {
- if let Some(index) = colon {
- if index > end {
- colon = None;
- }
+ // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
+ if let Some(end) = end {
+ if let Some(index) = colon {
+ if index > end {
+ colon = None;
}
}
+ }
- // If there is no protocol, it’s relative, and fine.
- if let Some(colon) = colon {
- // If it is a protocol, it should be allowed.
- let protocol = value[0..colon].to_lowercase();
- if !protocols.contains(&protocol.as_str()) {
- return "".to_string();
- }
+ // If there is no protocol, it’s relative, and fine.
+ if let Some(colon) = colon {
+ // If it is a protocol, it should be allowed.
+ let protocol = value[0..colon].to_lowercase();
+ if !protocols.contains(&protocol.as_str()) {
+ return "".to_string();
}
}
@@ -74,7 +90,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
/// ## Examples
///
/// ```rust ignore
-/// use micromark::util::sanitize_url::normalize_uri;
+/// use micromark::util::sanitize_uri::normalize;
///
/// assert_eq!(normalize("https://example.com"), "https://example.com");
/// assert_eq!(normalize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
@@ -86,7 +102,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
///
/// [definition]: crate::construct::definition
/// [label_end]: crate::construct::label_end
-fn normalize_uri(value: &str) -> String {
+fn normalize(value: &str) -> String {
let chars = value.chars().collect::<Vec<_>>();
// Note: it’ll grow bigger for each non-ascii or non-safe character.
let mut result = String::with_capacity(value.len());