diff options
Diffstat (limited to '')
| -rw-r--r-- | src/compiler.rs | 89 | ||||
| -rw-r--r-- | src/construct/definition.rs | 2 | ||||
| -rw-r--r-- | src/construct/label_end.rs | 2 | ||||
| -rw-r--r-- | src/lib.rs | 47 | ||||
| -rw-r--r-- | src/util/constant.rs | 31 | ||||
| -rw-r--r-- | src/util/gfm_tagfilter.rs | 77 | ||||
| -rw-r--r-- | src/util/mod.rs | 1 | ||||
| -rw-r--r-- | src/util/sanitize_uri.rs | 74 | 
8 files changed, 246 insertions, 77 deletions
| diff --git a/src/compiler.rs b/src/compiler.rs index 5626f8a..681ec00 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -4,8 +4,9 @@ use crate::util::{      constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC},      decode_character_reference::{decode_named, decode_numeric},      encode::encode, +    gfm_tagfilter::gfm_tagfilter,      normalize_identifier::normalize_identifier, -    sanitize_uri::sanitize_uri, +    sanitize_uri::{sanitize, sanitize_with_protocols},      skip,      slice::{Position, Slice},  }; @@ -156,16 +157,8 @@ struct CompileContext<'a> {      /// Whether to encode HTML.      pub encode_html: bool,      // Configuration -    /// Whether to sanitize `href`s, and in which case, which protocols to -    /// allow. -    pub protocol_href: Option<Vec<&'static str>>, -    /// Whether to sanitize `src`s, and in which case, which protocols to -    /// allow. -    pub protocol_src: Option<Vec<&'static str>>,      /// Line ending to use.      pub line_ending_default: LineEnding, -    /// Whether to allow HTML. -    pub allow_dangerous_html: bool,      // Intermediate results.      /// Stack of buffers.      
pub buffers: Vec<String>, @@ -203,18 +196,7 @@ impl<'a> CompileContext<'a> {              slurp_one_line_ending: false,              image_alt_inside: false,              encode_html: true, -            protocol_href: if options.allow_dangerous_protocol { -                None -            } else { -                Some(SAFE_PROTOCOL_HREF.to_vec()) -            }, -            protocol_src: if options.allow_dangerous_protocol { -                None -            } else { -                Some(SAFE_PROTOCOL_SRC.to_vec()) -            },              line_ending_default: line_ending, -            allow_dangerous_html: options.allow_dangerous_html,              buffers: vec![String::new()],              index: 0,              options, @@ -701,14 +683,14 @@ fn on_enter_gfm_task_list_item_check(context: &mut CompileContext) {  /// Handle [`Enter`][Kind::Enter]:[`HtmlFlow`][Name::HtmlFlow].  fn on_enter_html_flow(context: &mut CompileContext) {      context.line_ending_if_needed(); -    if context.allow_dangerous_html { +    if context.options.allow_dangerous_html {          context.encode_html = false;      }  }  /// Handle [`Enter`][Kind::Enter]:[`HtmlText`][Name::HtmlText].  fn on_enter_html_text(context: &mut CompileContext) { -    if context.allow_dangerous_html { +    if context.options.allow_dangerous_html {          context.encode_html = false;      }  } @@ -1198,7 +1180,7 @@ fn on_exit_gfm_footnote_call(context: &mut CompileContext) {      let indices = context.media_stack.pop().unwrap().label_id.unwrap();      let id =          normalize_identifier(Slice::from_indices(context.bytes, indices.0, indices.1).as_str()); -    let safe_id = sanitize_uri(&id.to_lowercase(), &None); +    let safe_id = sanitize(&id.to_lowercase());      let mut call_index = 0;      // See if this has been called before. 
@@ -1428,14 +1410,19 @@ fn on_exit_html(context: &mut CompileContext) {  /// Handle [`Exit`][Kind::Exit]:{[`HtmlFlowData`][Name::HtmlFlowData],[`HtmlTextData`][Name::HtmlTextData]}.  fn on_exit_html_data(context: &mut CompileContext) { -    context.push(&encode( -        Slice::from_position( -            context.bytes, -            &Position::from_exit_event(context.events, context.index), -        ) -        .as_str(), -        context.encode_html, -    )); +    let slice = Slice::from_position( +        context.bytes, +        &Position::from_exit_event(context.events, context.index), +    ); +    let value = slice.as_str(); + +    let encoded = if context.options.gfm_tagfilter && context.options.allow_dangerous_html { +        encode(&gfm_tagfilter(value), context.encode_html) +    } else { +        encode(value, context.encode_html) +    }; + +    context.push(&encoded);  }  /// Handle [`Exit`][Kind::Exit]:[`Label`][Name::Label]. @@ -1585,14 +1572,19 @@ fn on_exit_media(context: &mut CompileContext) {          };          if let Some(destination) = destination { -            context.push(&sanitize_uri( -                destination, -                if media.image { -                    &context.protocol_src -                } else { -                    &context.protocol_href -                }, -            )); +            let url = if context.options.allow_dangerous_protocol { +                sanitize(destination) +            } else { +                sanitize_with_protocols( +                    destination, +                    if media.image { +                        &SAFE_PROTOCOL_SRC +                    } else { +                        &SAFE_PROTOCOL_HREF +                    }, +                ) +            }; +            context.push(&url);          }          if media.image { @@ -1728,7 +1720,7 @@ fn generate_footnote_section(context: &mut CompileContext) {  /// Generate a footnote item from a call.  
fn generate_footnote_item(context: &mut CompileContext, index: usize) {      let id = &context.gfm_footnote_definition_calls[index].0; -    let safe_id = sanitize_uri(&id.to_lowercase(), &None); +    let safe_id = sanitize(&id.to_lowercase());      // Find definition: we’ll always find it.      let mut definition_index = 0; @@ -1833,14 +1825,19 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) {  fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) {      if !context.image_alt_inside {          context.push("<a href=\""); -        if let Some(protocol) = protocol { -            context.push(&sanitize_uri( -                &format!("{}{}", protocol, value), -                &context.protocol_href, -            )); +        let url = if let Some(protocol) = protocol { +            format!("{}{}", protocol, value) +        } else { +            value.to_string() +        }; + +        let url = if context.options.allow_dangerous_protocol { +            sanitize(&url)          } else { -            context.push(&sanitize_uri(value, &context.protocol_href)); +            sanitize_with_protocols(&url, &SAFE_PROTOCOL_HREF)          }; + +        context.push(&url);          context.push("\">");      } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 1d67635..1071489 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -96,7 +96,7 @@  //! [label]: crate::construct::partial_label  //! [label_end]: crate::construct::label_end  //! [title]: crate::construct::partial_title -//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri +//! [sanitize_uri]: crate::util::sanitize_uri::sanitize  //! [normalize_identifier]: crate::util::normalize_identifier  //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element  //! 
[html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 8a9edfb..ce1c295 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -173,7 +173,7 @@  //! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote  //! [definition]: crate::construct::definition  //! [autolink]: crate::construct::autolink -//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri +//! [sanitize_uri]: crate::util::sanitize_uri::sanitize  //! [normalize_identifier]: crate::util::normalize_identifier::normalize_identifier  //! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element  //! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element @@ -732,6 +732,52 @@ pub struct Options {      /// ```      pub gfm_strikethrough_single_tilde: bool, +    /// Whether to support the GFM tagfilter, when `allow_dangerous_html` is on +    /// (default: `false`). +    /// +    /// The tagfilter is kinda weird and kinda useless. +    /// The tag filter is a naïve attempt at XSS protection. +    /// You should use a proper HTML sanitizing algorithm. 
+    /// +    /// ## Examples +    /// +    /// ``` +    /// use micromark::{micromark_with_options, Options, Constructs}; +    /// +    /// // With `allow_dangerous_html`, micromark passes HTML through untouched: +    /// assert_eq!( +    ///     micromark_with_options( +    ///         "<iframe>", +    ///         &Options { +    ///             allow_dangerous_html: true, +    ///             constructs: Constructs::gfm(), +    ///             ..Options::default() +    ///         } +    ///     ), +    ///     "<iframe>" +    /// ); +    /// +    /// // Pass `gfm_tagfilter: true` to make some of that safe: +    /// assert_eq!( +    ///     micromark_with_options( +    ///         "<iframe>", +    ///         &Options { +    ///             allow_dangerous_html: true, +    ///             constructs: Constructs::gfm(), +    ///             gfm_tagfilter: true, +    ///             ..Options::default() +    ///         } +    ///     ), +    ///     "&lt;iframe>" +    /// ); +    /// ``` +    /// +    /// ## References +    /// +    /// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) +    /// *   [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c) +    pub gfm_tagfilter: bool, +      /// Whether to support math (text) (if enabled in `constructs`) with a      /// single dollar (default: `true`).      
/// @@ -791,6 +837,7 @@ impl Default for Options {              gfm_footnote_back_label: None,              gfm_footnote_clobber_prefix: None,              gfm_strikethrough_single_tilde: true, +            gfm_tagfilter: false,              math_text_single_dollar: true,          }      } diff --git a/src/util/constant.rs b/src/util/constant.rs index f397f38..d6a6651 100644 --- a/src/util/constant.rs +++ b/src/util/constant.rs @@ -74,6 +74,31 @@ pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;  /// [frontmatter]: crate::construct::frontmatter  pub const FRONTMATTER_SEQUENCE_SIZE: usize = 3; +/// The number of the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][]. +/// +/// This is currently the size of `plaintext`. +pub const GFM_HTML_TAGFILTER_SIZE_MAX: usize = 9; + +/// List of HTML tag names that are escaped by GFMs tag filter. +/// +/// Tag name matching must be performed insensitive to case, and thus this list +/// includes lowercase tag names. +/// +/// ## References +/// +/// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) +pub const GFM_HTML_TAGFILTER_NAMES: [&str; 9] = [ +    "iframe", +    "noembed", +    "noframes", +    "plaintext", +    "script", +    "style", +    "textarea", +    "title", +    "xmp", +]; +  /// The number of preceding spaces needed for a [hard break  /// (trailing)][whitespace] to form.  /// @@ -2427,6 +2452,12 @@ mod tests {          );          assert_eq!( +            GFM_HTML_TAGFILTER_SIZE_MAX, +            longest(&GFM_HTML_TAGFILTER_NAMES).unwrap().len(), +            "`GFM_HTML_TAGFILTER_SIZE_MAX`" +        ); + +        assert_eq!(              HTML_RAW_SIZE_MAX,              longest(&HTML_RAW_NAMES).unwrap().len(),              "`HTML_RAW_SIZE_MAX`" diff --git a/src/util/gfm_tagfilter.rs b/src/util/gfm_tagfilter.rs new file mode 100644 index 0000000..8023c66 --- /dev/null +++ b/src/util/gfm_tagfilter.rs @@ -0,0 +1,77 @@ +//! 
Make dangerous HTML a tiny bit safer. + +use crate::util::constant::{GFM_HTML_TAGFILTER_NAMES, GFM_HTML_TAGFILTER_SIZE_MAX}; +use alloc::string::String; +use core::str; + +/// Make dangerous HTML a tiny bit safer. +/// +/// The tagfilter is kinda weird and kinda useless. +/// The tag filter is a naïve attempt at XSS protection. +/// You should use a proper HTML sanitizing algorithm. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::gfm_tagfilter::gfm_tagfilter; +/// +/// assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>"); +/// ``` +/// +/// ## References +/// +/// *   [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-) +/// *   [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c) +pub fn gfm_tagfilter(value: &str) -> String { +    let bytes = value.as_bytes(); +    // It’ll grow a bit bigger for each encoded `<`. +    let mut result = String::with_capacity(bytes.len()); +    let mut index = 0; +    let mut start = 0; +    let len = bytes.len(); + +    while index < len { +        if bytes[index] == b'<' { +            let mut name_start = index + 1; + +            // Optional `/`. +            if name_start < len && bytes[name_start] == b'/' { +                name_start += 1; +            } + +            // Tag name. +            let mut name_end = name_start; + +            while name_end < len +                && name_end - name_start < GFM_HTML_TAGFILTER_SIZE_MAX +                && bytes[name_end].is_ascii_alphabetic() +            { +                name_end += 1; +            } + +            // Non-empty. +            if name_end != name_start && +                // HTML whitespace, closing slash, or closing angle bracket. +                matches!(bytes[name_end], b'\t' | b'\n' | 12 /* `\f` */ | b'\r' | b' ' | b'/' | b'>') && +                // Known name. 
+                GFM_HTML_TAGFILTER_NAMES.contains(&str::from_utf8(&bytes[name_start..name_end]) +                .unwrap() +                .to_ascii_lowercase().as_str()) +            { +                result.push_str(&value[start..index]); +                result.push_str("&lt;"); +                start = index + 1; +            } + +            // There was no `<` before `name_end`, so move to that next. +            index = name_end; +            continue; +        } + +        index += 1; +    } + +    result.push_str(&value[start..]); + +    result +} diff --git a/src/util/mod.rs b/src/util/mod.rs index d2ec0ed..e5823cf 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -5,6 +5,7 @@ pub mod constant;  pub mod decode_character_reference;  pub mod edit_map;  pub mod encode; +pub mod gfm_tagfilter;  pub mod normalize_identifier;  pub mod sanitize_uri;  pub mod skip; diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 969a4d8..0099347 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -10,55 +10,71 @@ use alloc::{  /// Make a value safe for injection as a URL.  ///  /// This encodes unsafe characters with percent-encoding and skips already -/// encoded sequences (see [`normalize_uri`][] below). +/// encoded sequences (see [`normalize`][] below).  /// Further unsafe characters are encoded as character references (see  /// [`encode`][]).  /// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::sanitize_uri::sanitize; +/// +/// assert_eq!(sanitize("javascript:alert(1)"), "javascript:alert(1)"); +/// assert_eq!(sanitize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25"); +/// ``` +/// +/// ## References +/// +/// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) +pub fn sanitize(value: &str) -> String { +    encode(&*normalize(value), true) +} + +/// Make a value safe for injection as a URL, and check protocols. 
+/// +/// This first uses [`sanitize`][sanitize].  /// Then, a vec of (lowercase) allowed protocols can be given, in which case -/// the URL is sanitized. +/// the URL is ignored or kept.  /// -/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])` -/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`. +/// For example, `&["http", "https", "irc", "ircs", "mailto", "xmpp"]` +/// can be used for `a[href]`, or `&["http", "https"]` for `img[src]`.  /// If the URL includes an unknown protocol (one not matched by `protocol`, such  /// as a dangerous example, `javascript:`), the value is ignored.  ///  /// ## Examples  ///  /// ```rust ignore -/// use micromark::util::sanitize_url::sanitize_url; +/// use micromark::util::sanitize_uri::sanitize_with_protocols;  /// -/// assert_eq!(sanitize_uri("javascript:alert(1)", &None), "javascript:alert(1)"); -/// assert_eq!(sanitize_uri("javascript:alert(1)", &Some(vec!["http", "https"])), ""); -/// assert_eq!(sanitize_uri("https://example.com", &Some(vec!["http", "https"])), "https://example.com"); -/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%", &Some(vec!["http", "https"])), "https://a%F0%9F%91%8Db.c/%20/%25"); +/// assert_eq!(sanitize_with_protocols("javascript:alert(1)", &["http", "https"]), ""); +/// assert_eq!(sanitize_with_protocols("https://example.com", &["http", "https"]), "https://example.com"); +/// assert_eq!(sanitize_with_protocols("https://a👍b.c/%20/%", &["http", "https"]), "https://a%F0%9F%91%8Db.c/%20/%25");  /// ```  ///  /// ## References  ///  /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) -pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { -    let value = encode(&*normalize_uri(value), true); +pub fn sanitize_with_protocols(value: &str, protocols: &[&str]) -> String { +    let value = sanitize(value); -    if let Some(protocols) = protocols 
{ -        let end = value.find(|c| matches!(c, '?' | '#' | '/')); -        let mut colon = value.find(|c| matches!(c, ':')); +    let end = value.find(|c| matches!(c, '?' | '#' | '/')); +    let mut colon = value.find(|c| matches!(c, ':')); -        // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. -        if let Some(end) = end { -            if let Some(index) = colon { -                if index > end { -                    colon = None; -                } +    // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. +    if let Some(end) = end { +        if let Some(index) = colon { +            if index > end { +                colon = None;              }          } +    } -        // If there is no protocol, it’s relative, and fine. -        if let Some(colon) = colon { -            // If it is a protocol, it should be allowed. -            let protocol = value[0..colon].to_lowercase(); -            if !protocols.contains(&protocol.as_str()) { -                return "".to_string(); -            } +    // If there is no protocol, it’s relative, and fine. +    if let Some(colon) = colon { +        // If it is a protocol, it should be allowed. 
+        let protocol = value[0..colon].to_lowercase(); +        if !protocols.contains(&protocol.as_str()) { +            return "".to_string();          }      } @@ -74,7 +90,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {  /// ## Examples  ///  /// ```rust ignore -/// use micromark::util::sanitize_url::normalize_uri; +/// use micromark::util::sanitize_uri::normalize;  ///  /// assert_eq!(sanitize_uri("https://example.com"), "https://example.com");  /// assert_eq!(sanitize_uri("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25"); @@ -86,7 +102,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {  ///  /// [definition]: crate::construct::definition  /// [label_end]: crate::construct::label_end -fn normalize_uri(value: &str) -> String { +fn normalize(value: &str) -> String {      let chars = value.chars().collect::<Vec<_>>();      // Note: it’ll grow bigger for each non-ascii or non-safe character.      let mut result = String::with_capacity(value.len()); | 
