diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-09-01 12:18:43 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-09-01 12:18:43 +0200 |
commit | 6fd5d61ed9b8cb66c13f44893d50025c9a87b217 (patch) | |
tree | dc471f61a8b4cec968a98ad61b3d4f14745d6c3b /src | |
parent | fa363dbba79f50001a22d1c90b8fb2009101d48c (diff) | |
download | markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.gz markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.tar.bz2 markdown-rs-6fd5d61ed9b8cb66c13f44893d50025c9a87b217.zip |
Add support for GFM tagfilter
Diffstat (limited to 'src')
-rw-r--r-- | src/compiler.rs | 89 | ||||
-rw-r--r-- | src/construct/definition.rs | 2 | ||||
-rw-r--r-- | src/construct/label_end.rs | 2 | ||||
-rw-r--r-- | src/lib.rs | 47 | ||||
-rw-r--r-- | src/util/constant.rs | 31 | ||||
-rw-r--r-- | src/util/gfm_tagfilter.rs | 77 | ||||
-rw-r--r-- | src/util/mod.rs | 1 | ||||
-rw-r--r-- | src/util/sanitize_uri.rs | 74 |
8 files changed, 246 insertions, 77 deletions
diff --git a/src/compiler.rs b/src/compiler.rs index 5626f8a..681ec00 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -4,8 +4,9 @@ use crate::util::{ constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}, decode_character_reference::{decode_named, decode_numeric}, encode::encode, + gfm_tagfilter::gfm_tagfilter, normalize_identifier::normalize_identifier, - sanitize_uri::sanitize_uri, + sanitize_uri::{sanitize, sanitize_with_protocols}, skip, slice::{Position, Slice}, }; @@ -156,16 +157,8 @@ struct CompileContext<'a> { /// Whether to encode HTML. pub encode_html: bool, // Configuration - /// Whether to sanitize `href`s, and in which case, which protocols to - /// allow. - pub protocol_href: Option<Vec<&'static str>>, - /// Whether to sanitize `src`s, and in which case, which protocols to - /// allow. - pub protocol_src: Option<Vec<&'static str>>, /// Line ending to use. pub line_ending_default: LineEnding, - /// Whether to allow HTML. - pub allow_dangerous_html: bool, // Intermediate results. /// Stack of buffers. pub buffers: Vec<String>, @@ -203,18 +196,7 @@ impl<'a> CompileContext<'a> { slurp_one_line_ending: false, image_alt_inside: false, encode_html: true, - protocol_href: if options.allow_dangerous_protocol { - None - } else { - Some(SAFE_PROTOCOL_HREF.to_vec()) - }, - protocol_src: if options.allow_dangerous_protocol { - None - } else { - Some(SAFE_PROTOCOL_SRC.to_vec()) - }, line_ending_default: line_ending, - allow_dangerous_html: options.allow_dangerous_html, buffers: vec![String::new()], index: 0, options, @@ -701,14 +683,14 @@ fn on_enter_gfm_task_list_item_check(context: &mut CompileContext) { /// Handle [`Enter`][Kind::Enter]:[`HtmlFlow`][Name::HtmlFlow]. fn on_enter_html_flow(context: &mut CompileContext) { context.line_ending_if_needed(); - if context.allow_dangerous_html { + if context.options.allow_dangerous_html { context.encode_html = false; } } /// Handle [`Enter`][Kind::Enter]:[`HtmlText`][Name::HtmlText]. 
fn on_enter_html_text(context: &mut CompileContext) { - if context.allow_dangerous_html { + if context.options.allow_dangerous_html { context.encode_html = false; } } @@ -1198,7 +1180,7 @@ fn on_exit_gfm_footnote_call(context: &mut CompileContext) { let indices = context.media_stack.pop().unwrap().label_id.unwrap(); let id = normalize_identifier(Slice::from_indices(context.bytes, indices.0, indices.1).as_str()); - let safe_id = sanitize_uri(&id.to_lowercase(), &None); + let safe_id = sanitize(&id.to_lowercase()); let mut call_index = 0; // See if this has been called before. @@ -1428,14 +1410,19 @@ fn on_exit_html(context: &mut CompileContext) { /// Handle [`Exit`][Kind::Exit]:{[`HtmlFlowData`][Name::HtmlFlowData],[`HtmlTextData`][Name::HtmlTextData]}. fn on_exit_html_data(context: &mut CompileContext) { - context.push(&encode( - Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .as_str(), - context.encode_html, - )); + let slice = Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ); + let value = slice.as_str(); + + let encoded = if context.options.gfm_tagfilter && context.options.allow_dangerous_html { + encode(&gfm_tagfilter(value), context.encode_html) + } else { + encode(value, context.encode_html) + }; + + context.push(&encoded); } /// Handle [`Exit`][Kind::Exit]:[`Label`][Name::Label]. 
@@ -1585,14 +1572,19 @@ fn on_exit_media(context: &mut CompileContext) { }; if let Some(destination) = destination { - context.push(&sanitize_uri( - destination, - if media.image { - &context.protocol_src - } else { - &context.protocol_href - }, - )); + let url = if context.options.allow_dangerous_protocol { + sanitize(destination) + } else { + sanitize_with_protocols( + destination, + if media.image { + &SAFE_PROTOCOL_SRC + } else { + &SAFE_PROTOCOL_HREF + }, + ) + }; + context.push(&url); } if media.image { @@ -1728,7 +1720,7 @@ fn generate_footnote_section(context: &mut CompileContext) { /// Generate a footnote item from a call. fn generate_footnote_item(context: &mut CompileContext, index: usize) { let id = &context.gfm_footnote_definition_calls[index].0; - let safe_id = sanitize_uri(&id.to_lowercase(), &None); + let safe_id = sanitize(&id.to_lowercase()); // Find definition: we’ll always find it. let mut definition_index = 0; @@ -1833,14 +1825,19 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) { fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) { if !context.image_alt_inside { context.push("<a href=\""); - if let Some(protocol) = protocol { - context.push(&sanitize_uri( - &format!("{}{}", protocol, value), - &context.protocol_href, - )); + let url = if let Some(protocol) = protocol { + format!("{}{}", protocol, value) + } else { + value.to_string() + }; + + let url = if context.options.allow_dangerous_protocol { + sanitize(&url) } else { - context.push(&sanitize_uri(value, &context.protocol_href)); + sanitize_with_protocols(&url, &SAFE_PROTOCOL_HREF) }; + + context.push(&url); context.push("\">"); } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 1d67635..1071489 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -96,7 +96,7 @@ //! [label]: crate::construct::partial_label //! [label_end]: crate::construct::label_end //! 
[title]: crate::construct::partial_title -//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri +//! [sanitize_uri]: crate::util::sanitize_uri::sanitize //! [normalize_identifier]: crate::util::normalize_identifier //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 8a9edfb..ce1c295 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -173,7 +173,7 @@ //! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote //! [definition]: crate::construct::definition //! [autolink]: crate::construct::autolink -//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri +//! [sanitize_uri]: crate::util::sanitize_uri::sanitize //! [normalize_identifier]: crate::util::normalize_identifier::normalize_identifier //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element @@ -732,6 +732,52 @@ pub struct Options { /// ``` pub gfm_strikethrough_single_tilde: bool, + /// Whether to support the GFM tagfilter, when `allow_dangerous_html` is on + /// (default: `false`). + /// + /// The tagfilter is kinda weird and kinda useless. + /// The tag filter is a naïve attempt at XSS protection. + /// You should use a proper HTML sanitizing algorithm. 
+ /// + /// ## Examples + /// + /// ``` + /// use micromark::{micromark_with_options, Options, Constructs}; + /// + /// // With `allow_dangerous_html`, micromark passes HTML through untouched: + /// assert_eq!( + /// micromark_with_options( + /// "<iframe>", + /// &Options { + /// allow_dangerous_html: true, + /// constructs: Constructs::gfm(), + /// ..Options::default() + /// } + /// ), + /// "<iframe>" + /// ); + /// + /// // Pass `gfm_tagfilter: true` to make some of that safe: + /// assert_eq!( + /// micromark_with_options( + /// "<iframe>", + /// &Options { + /// allow_dangerous_html: true, + /// constructs: Constructs::gfm(), + /// gfm_tagfilter: true, + /// ..Options::default() + /// } + /// ), + /// "&lt;iframe>" + /// ); + /// ``` + /// + /// ## References + /// + /// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) + /// * [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c) + pub gfm_tagfilter: bool, + /// Whether to support math (text) (if enabled in `constructs`) with a /// single dollar (default: `true`). /// @@ -791,6 +837,7 @@ impl Default for Options { gfm_footnote_back_label: None, gfm_footnote_clobber_prefix: None, gfm_strikethrough_single_tilde: true, + gfm_tagfilter: false, math_text_single_dollar: true, } } diff --git a/src/util/constant.rs b/src/util/constant.rs index f397f38..d6a6651 100644 --- a/src/util/constant.rs +++ b/src/util/constant.rs @@ -74,6 +74,31 @@ pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3; /// [frontmatter]: crate::construct::frontmatter pub const FRONTMATTER_SEQUENCE_SIZE: usize = 3; +/// The number of the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][]. +/// +/// This is currently the size of `plaintext`. +pub const GFM_HTML_TAGFILTER_SIZE_MAX: usize = 9; + +/// List of HTML tag names that are escaped by GFMs tag filter. 
+/// +/// Tag name matching must be performed insensitive to case, and thus this list +/// includes lowercase tag names. +/// +/// ## References +/// +/// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) +pub const GFM_HTML_TAGFILTER_NAMES: [&str; 9] = [ + "iframe", + "noembed", + "noframes", + "plaintext", + "script", + "style", + "textarea", + "title", + "xmp", +]; + /// The number of preceding spaces needed for a [hard break /// (trailing)][whitespace] to form. /// @@ -2427,6 +2452,12 @@ mod tests { ); assert_eq!( + GFM_HTML_TAGFILTER_SIZE_MAX, + longest(&GFM_HTML_TAGFILTER_NAMES).unwrap().len(), + "`GFM_HTML_TAGFILTER_SIZE_MAX`" + ); + + assert_eq!( HTML_RAW_SIZE_MAX, longest(&HTML_RAW_NAMES).unwrap().len(), "`HTML_RAW_SIZE_MAX`" diff --git a/src/util/gfm_tagfilter.rs b/src/util/gfm_tagfilter.rs new file mode 100644 index 0000000..8023c66 --- /dev/null +++ b/src/util/gfm_tagfilter.rs @@ -0,0 +1,77 @@ +//! Make dangerous HTML a tiny bit safer. + +use crate::util::constant::{GFM_HTML_TAGFILTER_NAMES, GFM_HTML_TAGFILTER_SIZE_MAX}; +use alloc::string::String; +use core::str; + +/// Make dangerous HTML a tiny bit safer. +/// +/// The tagfilter is kinda weird and kinda useless. +/// The tag filter is a naïve attempt at XSS protection. +/// You should use a proper HTML sanitizing algorithm. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::gfm_tagfilter::gfm_tagfilter; +/// +/// assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>"); +/// ``` +/// +/// ## References +/// +/// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) +/// * [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c) +pub fn gfm_tagfilter(value: &str) -> String { + let bytes = value.as_bytes(); + // It’ll grow a bit bigger for each encoded `<`. 
+ let mut result = String::with_capacity(bytes.len()); + let mut index = 0; + let mut start = 0; + let len = bytes.len(); + + while index < len { + if bytes[index] == b'<' { + let mut name_start = index + 1; + + // Optional `/`. + if name_start < len && bytes[name_start] == b'/' { + name_start += 1; + } + + // Tag name. + let mut name_end = name_start; + + while name_end < len + && name_end - name_start < GFM_HTML_TAGFILTER_SIZE_MAX + && bytes[name_end].is_ascii_alphabetic() + { + name_end += 1; + } + + // Non-empty. + if name_end != name_start && + // HTML whitespace, closing slash, or closing angle bracket. + matches!(bytes[name_end], b'\t' | b'\n' | 12 /* `\f` */ | b'\r' | b' ' | b'/' | b'>') && + // Known name. + GFM_HTML_TAGFILTER_NAMES.contains(&str::from_utf8(&bytes[name_start..name_end]) + .unwrap() + .to_ascii_lowercase().as_str()) + { + result.push_str(&value[start..index]); + result.push_str("&lt;"); + start = index + 1; + } + + // There was no `<` before `name_end`, so move to that next. + index = name_end; + continue; + } + + index += 1; + } + + result.push_str(&value[start..]); + + result +} diff --git a/src/util/mod.rs b/src/util/mod.rs index d2ec0ed..e5823cf 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -5,6 +5,7 @@ pub mod constant; pub mod decode_character_reference; pub mod edit_map; pub mod encode; +pub mod gfm_tagfilter; pub mod normalize_identifier; pub mod sanitize_uri; pub mod skip; diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 969a4d8..0099347 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -10,55 +10,71 @@ use alloc::{ /// Make a value safe for injection as a URL. /// /// This encodes unsafe characters with percent-encoding and skips already -/// encoded sequences (see [`normalize_uri`][] below). +/// encoded sequences (see [`normalize`][] below). /// Further unsafe characters are encoded as character references (see /// [`encode`][]). 
/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::sanitize_uri::sanitize; +/// +/// assert_eq!(sanitize("javascript:alert(1)"), "javascript:alert(1)"); +/// assert_eq!(sanitize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25"); +/// ``` +/// +/// ## References +/// +/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) +pub fn sanitize(value: &str) -> String { + encode(&*normalize(value), true) +} + +/// Make a value safe for injection as a URL, and check protocols. +/// +/// This first uses [`sanitize`][sanitize]. /// Then, a vec of (lowercase) allowed protocols can be given, in which case -/// the URL is sanitized. +/// the URL is ignored or kept. /// -/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])` -/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`. +/// For example, `&["http", "https", "irc", "ircs", "mailto", "xmpp"]` +/// can be used for `a[href]`, or `&["http", "https"]` for `img[src]`. /// If the URL includes an unknown protocol (one not matched by `protocol`, such /// as a dangerous example, `javascript:`), the value is ignored. 
/// /// ## Examples /// /// ```rust ignore -/// use micromark::util::sanitize_url::sanitize_url; +/// use micromark::util::sanitize_uri::sanitize_with_protocols; /// -/// assert_eq!(sanitize_uri("javascript:alert(1)", &None), "javascript:alert(1)"); -/// assert_eq!(sanitize_uri("javascript:alert(1)", &Some(vec!["http", "https"])), ""); -/// assert_eq!(sanitize_uri("https://example.com", &Some(vec!["http", "https"])), "https://example.com"); -/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%", &Some(vec!["http", "https"])), "https://a%F0%9F%91%8Db.c/%20/%25"); +/// assert_eq!(sanitize_with_protocols("javascript:alert(1)", &["http", "https"]), ""); +/// assert_eq!(sanitize_with_protocols("https://example.com", &["http", "https"]), "https://example.com"); +/// assert_eq!(sanitize_with_protocols("https://a👍b.c/%20/%", &["http", "https"]), "https://a%F0%9F%91%8Db.c/%20/%25"); /// ``` /// /// ## References /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) -pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { - let value = encode(&*normalize_uri(value), true); +pub fn sanitize_with_protocols(value: &str, protocols: &[&str]) -> String { + let value = sanitize(value); - if let Some(protocols) = protocols { - let end = value.find(|c| matches!(c, '?' | '#' | '/')); - let mut colon = value.find(|c| matches!(c, ':')); + let end = value.find(|c| matches!(c, '?' | '#' | '/')); + let mut colon = value.find(|c| matches!(c, ':')); - // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. - if let Some(end) = end { - if let Some(index) = colon { - if index > end { - colon = None; - } + // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. + if let Some(end) = end { + if let Some(index) = colon { + if index > end { + colon = None; } } + } - // If there is no protocol, it’s relative, and fine. 
- if let Some(colon) = colon { - // If it is a protocol, it should be allowed. - let protocol = value[0..colon].to_lowercase(); - if !protocols.contains(&protocol.as_str()) { - return "".to_string(); - } + // If there is no protocol, it’s relative, and fine. + if let Some(colon) = colon { + // If it is a protocol, it should be allowed. + let protocol = value[0..colon].to_lowercase(); + if !protocols.contains(&protocol.as_str()) { + return "".to_string(); } } @@ -74,7 +90,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { /// ## Examples /// /// ```rust ignore -/// use micromark::util::sanitize_url::normalize_uri; +/// use micromark::util::sanitize_uri::normalize; /// /// assert_eq!(sanitize_uri("https://example.com"), "https://example.com"); /// assert_eq!(sanitize_uri("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25"); @@ -86,7 +102,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { /// /// [definition]: crate::construct::definition /// [label_end]: crate::construct::label_end -fn normalize_uri(value: &str) -> String { +fn normalize(value: &str) -> String { let chars = value.chars().collect::<Vec<_>>(); // Note: it’ll grow bigger for each non-ascii or non-safe character. let mut result = String::with_capacity(value.len()); |