From 6fd5d61ed9b8cb66c13f44893d50025c9a87b217 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 1 Sep 2022 12:18:43 +0200 Subject: Add support for GFM tagfilter --- src/compiler.rs | 89 ++++++++++++++++++++++----------------------- src/construct/definition.rs | 2 +- src/construct/label_end.rs | 2 +- src/lib.rs | 47 ++++++++++++++++++++++++ src/util/constant.rs | 31 ++++++++++++++++ src/util/gfm_tagfilter.rs | 77 +++++++++++++++++++++++++++++++++++++++ src/util/mod.rs | 1 + src/util/sanitize_uri.rs | 74 ++++++++++++++++++++++--------------- 8 files changed, 246 insertions(+), 77 deletions(-) create mode 100644 src/util/gfm_tagfilter.rs (limited to 'src') diff --git a/src/compiler.rs b/src/compiler.rs index 5626f8a..681ec00 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -4,8 +4,9 @@ use crate::util::{ constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}, decode_character_reference::{decode_named, decode_numeric}, encode::encode, + gfm_tagfilter::gfm_tagfilter, normalize_identifier::normalize_identifier, - sanitize_uri::sanitize_uri, + sanitize_uri::{sanitize, sanitize_with_protocols}, skip, slice::{Position, Slice}, }; @@ -156,16 +157,8 @@ struct CompileContext<'a> { /// Whether to encode HTML. pub encode_html: bool, // Configuration - /// Whether to sanitize `href`s, and in which case, which protocols to - /// allow. - pub protocol_href: Option>, - /// Whether to sanitize `src`s, and in which case, which protocols to - /// allow. - pub protocol_src: Option>, /// Line ending to use. pub line_ending_default: LineEnding, - /// Whether to allow HTML. - pub allow_dangerous_html: bool, // Intermediate results. /// Stack of buffers. 
pub buffers: Vec, @@ -203,18 +196,7 @@ impl<'a> CompileContext<'a> { slurp_one_line_ending: false, image_alt_inside: false, encode_html: true, - protocol_href: if options.allow_dangerous_protocol { - None - } else { - Some(SAFE_PROTOCOL_HREF.to_vec()) - }, - protocol_src: if options.allow_dangerous_protocol { - None - } else { - Some(SAFE_PROTOCOL_SRC.to_vec()) - }, line_ending_default: line_ending, - allow_dangerous_html: options.allow_dangerous_html, buffers: vec![String::new()], index: 0, options, @@ -701,14 +683,14 @@ fn on_enter_gfm_task_list_item_check(context: &mut CompileContext) { /// Handle [`Enter`][Kind::Enter]:[`HtmlFlow`][Name::HtmlFlow]. fn on_enter_html_flow(context: &mut CompileContext) { context.line_ending_if_needed(); - if context.allow_dangerous_html { + if context.options.allow_dangerous_html { context.encode_html = false; } } /// Handle [`Enter`][Kind::Enter]:[`HtmlText`][Name::HtmlText]. fn on_enter_html_text(context: &mut CompileContext) { - if context.allow_dangerous_html { + if context.options.allow_dangerous_html { context.encode_html = false; } } @@ -1198,7 +1180,7 @@ fn on_exit_gfm_footnote_call(context: &mut CompileContext) { let indices = context.media_stack.pop().unwrap().label_id.unwrap(); let id = normalize_identifier(Slice::from_indices(context.bytes, indices.0, indices.1).as_str()); - let safe_id = sanitize_uri(&id.to_lowercase(), &None); + let safe_id = sanitize(&id.to_lowercase()); let mut call_index = 0; // See if this has been called before. @@ -1428,14 +1410,19 @@ fn on_exit_html(context: &mut CompileContext) { /// Handle [`Exit`][Kind::Exit]:{[`HtmlFlowData`][Name::HtmlFlowData],[`HtmlTextData`][Name::HtmlTextData]}. 
fn on_exit_html_data(context: &mut CompileContext) { - context.push(&encode( - Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .as_str(), - context.encode_html, - )); + let slice = Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ); + let value = slice.as_str(); + + let encoded = if context.options.gfm_tagfilter && context.options.allow_dangerous_html { + encode(&gfm_tagfilter(value), context.encode_html) + } else { + encode(value, context.encode_html) + }; + + context.push(&encoded); } /// Handle [`Exit`][Kind::Exit]:[`Label`][Name::Label]. @@ -1585,14 +1572,19 @@ fn on_exit_media(context: &mut CompileContext) { }; if let Some(destination) = destination { - context.push(&sanitize_uri( - destination, - if media.image { - &context.protocol_src - } else { - &context.protocol_href - }, - )); + let url = if context.options.allow_dangerous_protocol { + sanitize(destination) + } else { + sanitize_with_protocols( + destination, + if media.image { + &SAFE_PROTOCOL_SRC + } else { + &SAFE_PROTOCOL_HREF + }, + ) + }; + context.push(&url); } if media.image { @@ -1728,7 +1720,7 @@ fn generate_footnote_section(context: &mut CompileContext) { /// Generate a footnote item from a call. fn generate_footnote_item(context: &mut CompileContext, index: usize) { let id = &context.gfm_footnote_definition_calls[index].0; - let safe_id = sanitize_uri(&id.to_lowercase(), &None); + let safe_id = sanitize(&id.to_lowercase()); // Find definition: we’ll always find it. 
let mut definition_index = 0; @@ -1833,14 +1825,19 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) { fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) { if !context.image_alt_inside { context.push(""); } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 1d67635..1071489 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -96,7 +96,7 @@ //! [label]: crate::construct::partial_label //! [label_end]: crate::construct::label_end //! [title]: crate::construct::partial_title -//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri +//! [sanitize_uri]: crate::util::sanitize_uri::sanitize //! [normalize_identifier]: crate::util::normalize_identifier //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 8a9edfb..ce1c295 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -173,7 +173,7 @@ //! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote //! [definition]: crate::construct::definition //! [autolink]: crate::construct::autolink -//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri +//! [sanitize_uri]: crate::util::sanitize_uri::sanitize //! [normalize_identifier]: crate::util::normalize_identifier::normalize_identifier //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element diff --git a/src/lib.rs b/src/lib.rs index 0cf4f49..e3fdfcb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -732,6 +732,52 @@ pub struct Options { /// ``` pub gfm_strikethrough_single_tilde: bool, + /// Whether to support the GFM tagfilter, when `allow_dangerous_html` is on + /// (default: `false`). 
+ /// + /// The tagfilter is somewhat odd and of limited practical use. + /// The tagfilter is a naïve attempt at XSS protection. + /// You should use a proper HTML sanitizing algorithm. + /// + /// ## Examples + /// + /// ``` + /// use micromark::{micromark_with_options, Options, Constructs}; + /// + /// // With `allow_dangerous_html`, micromark passes HTML through untouched: + /// assert_eq!( + /// micromark_with_options( + /// "