aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/compiler.rs 89
-rw-r--r-- src/construct/definition.rs 2
-rw-r--r-- src/construct/label_end.rs 2
-rw-r--r-- src/lib.rs 47
-rw-r--r-- src/util/constant.rs 31
-rw-r--r-- src/util/gfm_tagfilter.rs 77
-rw-r--r-- src/util/mod.rs 1
-rw-r--r-- src/util/sanitize_uri.rs 74
8 files changed, 246 insertions, 77 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 5626f8a..681ec00 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -4,8 +4,9 @@ use crate::util::{
constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC},
decode_character_reference::{decode_named, decode_numeric},
encode::encode,
+ gfm_tagfilter::gfm_tagfilter,
normalize_identifier::normalize_identifier,
- sanitize_uri::sanitize_uri,
+ sanitize_uri::{sanitize, sanitize_with_protocols},
skip,
slice::{Position, Slice},
};
@@ -156,16 +157,8 @@ struct CompileContext<'a> {
/// Whether to encode HTML.
pub encode_html: bool,
// Configuration
- /// Whether to sanitize `href`s, and in which case, which protocols to
- /// allow.
- pub protocol_href: Option<Vec<&'static str>>,
- /// Whether to sanitize `src`s, and in which case, which protocols to
- /// allow.
- pub protocol_src: Option<Vec<&'static str>>,
/// Line ending to use.
pub line_ending_default: LineEnding,
- /// Whether to allow HTML.
- pub allow_dangerous_html: bool,
// Intermediate results.
/// Stack of buffers.
pub buffers: Vec<String>,
@@ -203,18 +196,7 @@ impl<'a> CompileContext<'a> {
slurp_one_line_ending: false,
image_alt_inside: false,
encode_html: true,
- protocol_href: if options.allow_dangerous_protocol {
- None
- } else {
- Some(SAFE_PROTOCOL_HREF.to_vec())
- },
- protocol_src: if options.allow_dangerous_protocol {
- None
- } else {
- Some(SAFE_PROTOCOL_SRC.to_vec())
- },
line_ending_default: line_ending,
- allow_dangerous_html: options.allow_dangerous_html,
buffers: vec![String::new()],
index: 0,
options,
@@ -701,14 +683,14 @@ fn on_enter_gfm_task_list_item_check(context: &mut CompileContext) {
/// Handle [`Enter`][Kind::Enter]:[`HtmlFlow`][Name::HtmlFlow].
fn on_enter_html_flow(context: &mut CompileContext) {
context.line_ending_if_needed();
- if context.allow_dangerous_html {
+ if context.options.allow_dangerous_html {
context.encode_html = false;
}
}
/// Handle [`Enter`][Kind::Enter]:[`HtmlText`][Name::HtmlText].
fn on_enter_html_text(context: &mut CompileContext) {
- if context.allow_dangerous_html {
+ if context.options.allow_dangerous_html {
context.encode_html = false;
}
}
@@ -1198,7 +1180,7 @@ fn on_exit_gfm_footnote_call(context: &mut CompileContext) {
let indices = context.media_stack.pop().unwrap().label_id.unwrap();
let id =
normalize_identifier(Slice::from_indices(context.bytes, indices.0, indices.1).as_str());
- let safe_id = sanitize_uri(&id.to_lowercase(), &None);
+ let safe_id = sanitize(&id.to_lowercase());
let mut call_index = 0;
// See if this has been called before.
@@ -1428,14 +1410,19 @@ fn on_exit_html(context: &mut CompileContext) {
/// Handle [`Exit`][Kind::Exit]:{[`HtmlFlowData`][Name::HtmlFlowData],[`HtmlTextData`][Name::HtmlTextData]}.
fn on_exit_html_data(context: &mut CompileContext) {
- context.push(&encode(
- Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .as_str(),
- context.encode_html,
- ));
+ let slice = Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ );
+ let value = slice.as_str();
+
+ let encoded = if context.options.gfm_tagfilter && context.options.allow_dangerous_html {
+ encode(&gfm_tagfilter(value), context.encode_html)
+ } else {
+ encode(value, context.encode_html)
+ };
+
+ context.push(&encoded);
}
/// Handle [`Exit`][Kind::Exit]:[`Label`][Name::Label].
@@ -1585,14 +1572,19 @@ fn on_exit_media(context: &mut CompileContext) {
};
if let Some(destination) = destination {
- context.push(&sanitize_uri(
- destination,
- if media.image {
- &context.protocol_src
- } else {
- &context.protocol_href
- },
- ));
+ let url = if context.options.allow_dangerous_protocol {
+ sanitize(destination)
+ } else {
+ sanitize_with_protocols(
+ destination,
+ if media.image {
+ &SAFE_PROTOCOL_SRC
+ } else {
+ &SAFE_PROTOCOL_HREF
+ },
+ )
+ };
+ context.push(&url);
}
if media.image {
@@ -1728,7 +1720,7 @@ fn generate_footnote_section(context: &mut CompileContext) {
/// Generate a footnote item from a call.
fn generate_footnote_item(context: &mut CompileContext, index: usize) {
let id = &context.gfm_footnote_definition_calls[index].0;
- let safe_id = sanitize_uri(&id.to_lowercase(), &None);
+ let safe_id = sanitize(&id.to_lowercase());
// Find definition: we’ll always find it.
let mut definition_index = 0;
@@ -1833,14 +1825,19 @@ fn generate_footnote_item(context: &mut CompileContext, index: usize) {
fn generate_autolink(context: &mut CompileContext, protocol: Option<&str>, value: &str) {
if !context.image_alt_inside {
context.push("<a href=\"");
- if let Some(protocol) = protocol {
- context.push(&sanitize_uri(
- &format!("{}{}", protocol, value),
- &context.protocol_href,
- ));
+ let url = if let Some(protocol) = protocol {
+ format!("{}{}", protocol, value)
+ } else {
+ value.to_string()
+ };
+
+ let url = if context.options.allow_dangerous_protocol {
+ sanitize(&url)
} else {
- context.push(&sanitize_uri(value, &context.protocol_href));
+ sanitize_with_protocols(&url, &SAFE_PROTOCOL_HREF)
};
+
+ context.push(&url);
context.push("\">");
}
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 1d67635..1071489 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -96,7 +96,7 @@
//! [label]: crate::construct::partial_label
//! [label_end]: crate::construct::label_end
//! [title]: crate::construct::partial_title
-//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri
+//! [sanitize_uri]: crate::util::sanitize_uri::sanitize
//! [normalize_identifier]: crate::util::normalize_identifier
//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
//! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 8a9edfb..ce1c295 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -173,7 +173,7 @@
//! [gfm_label_start_footnote]: crate::construct::gfm_label_start_footnote
//! [definition]: crate::construct::definition
//! [autolink]: crate::construct::autolink
-//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri
+//! [sanitize_uri]: crate::util::sanitize_uri::sanitize
//! [normalize_identifier]: crate::util::normalize_identifier::normalize_identifier
//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
//! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element
diff --git a/src/lib.rs b/src/lib.rs
index 0cf4f49..e3fdfcb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -732,6 +732,52 @@ pub struct Options {
/// ```
pub gfm_strikethrough_single_tilde: bool,
+ /// Whether to support the GFM tagfilter, when `allow_dangerous_html` is on
+ /// (default: `false`).
+ ///
+ /// The tagfilter is kinda weird and kinda useless.
+ /// The tag filter is a naïve attempt at XSS protection.
+ /// You should use a proper HTML sanitizing algorithm.
+ ///
+ /// ## Examples
+ ///
+ /// ```
+ /// use micromark::{micromark_with_options, Options, Constructs};
+ ///
+ /// // With `allow_dangerous_html`, micromark passes HTML through untouched:
+ /// assert_eq!(
+ /// micromark_with_options(
+ /// "<iframe>",
+ /// &Options {
+ /// allow_dangerous_html: true,
+ /// constructs: Constructs::gfm(),
+ /// ..Options::default()
+ /// }
+ /// ),
+ /// "<iframe>"
+ /// );
+ ///
+ /// // Pass `gfm_tagfilter: true` to make some of that safe:
+ /// assert_eq!(
+ /// micromark_with_options(
+ /// "<iframe>",
+ /// &Options {
+ /// allow_dangerous_html: true,
+ /// constructs: Constructs::gfm(),
+ /// gfm_tagfilter: true,
+ /// ..Options::default()
+ /// }
+ /// ),
+ /// "&lt;iframe>"
+ /// );
+ /// ```
+ ///
+ /// ## References
+ ///
+ /// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+ /// * [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c)
+ pub gfm_tagfilter: bool,
+
/// Whether to support math (text) (if enabled in `constructs`) with a
/// single dollar (default: `true`).
///
@@ -791,6 +837,7 @@ impl Default for Options {
gfm_footnote_back_label: None,
gfm_footnote_clobber_prefix: None,
gfm_strikethrough_single_tilde: true,
+ gfm_tagfilter: false,
math_text_single_dollar: true,
}
}
diff --git a/src/util/constant.rs b/src/util/constant.rs
index f397f38..d6a6651 100644
--- a/src/util/constant.rs
+++ b/src/util/constant.rs
@@ -74,6 +74,31 @@ pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
/// [frontmatter]: crate::construct::frontmatter
pub const FRONTMATTER_SEQUENCE_SIZE: usize = 3;
+/// The number of characters in the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][].
+///
+/// This is currently the size of `plaintext`.
+pub const GFM_HTML_TAGFILTER_SIZE_MAX: usize = 9;
+
+/// List of HTML tag names that are escaped by GFM’s tag filter.
+///
+/// Tag name matching must be performed insensitive to case, and thus this list
+/// includes lowercase tag names.
+///
+/// ## References
+///
+/// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+pub const GFM_HTML_TAGFILTER_NAMES: [&str; 9] = [
+ "iframe",
+ "noembed",
+ "noframes",
+ "plaintext",
+ "script",
+ "style",
+ "textarea",
+ "title",
+ "xmp",
+];
+
/// The number of preceding spaces needed for a [hard break
/// (trailing)][whitespace] to form.
///
@@ -2427,6 +2452,12 @@ mod tests {
);
assert_eq!(
+ GFM_HTML_TAGFILTER_SIZE_MAX,
+ longest(&GFM_HTML_TAGFILTER_NAMES).unwrap().len(),
+ "`GFM_HTML_TAGFILTER_SIZE_MAX`"
+ );
+
+ assert_eq!(
HTML_RAW_SIZE_MAX,
longest(&HTML_RAW_NAMES).unwrap().len(),
"`HTML_RAW_SIZE_MAX`"
diff --git a/src/util/gfm_tagfilter.rs b/src/util/gfm_tagfilter.rs
new file mode 100644
index 0000000..8023c66
--- /dev/null
+++ b/src/util/gfm_tagfilter.rs
@@ -0,0 +1,77 @@
+//! Make dangerous HTML a tiny bit safer.
+
+use crate::util::constant::{GFM_HTML_TAGFILTER_NAMES, GFM_HTML_TAGFILTER_SIZE_MAX};
+use alloc::string::String;
+use core::str;
+
+/// Make dangerous HTML a tiny bit safer.
+///
+/// The tagfilter is kinda weird and kinda useless.
+/// The tag filter is a naïve attempt at XSS protection.
+/// You should use a proper HTML sanitizing algorithm.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::gfm_tagfilter::gfm_tagfilter;
+///
+/// assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>");
+/// ```
+///
+/// ## References
+///
+/// * [*§ 6.1 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
+/// * [`cmark-gfm#extensions/tagfilter.c`](https://github.com/github/cmark-gfm/blob/master/extensions/tagfilter.c)
+pub fn gfm_tagfilter(value: &str) -> String {
+    let bytes = value.as_bytes();
+    // It’ll grow a bit bigger for each encoded `<`.
+    let mut result = String::with_capacity(bytes.len());
+    let mut index = 0;
+    let mut start = 0;
+    let len = bytes.len();
+
+    while index < len {
+        if bytes[index] == b'<' {
+            let mut name_start = index + 1;
+
+            // Optional `/`.
+            if name_start < len && bytes[name_start] == b'/' {
+                name_start += 1;
+            }
+
+            // Tag name.
+            let mut name_end = name_start;
+
+            while name_end < len
+                && name_end - name_start < GFM_HTML_TAGFILTER_SIZE_MAX
+                && bytes[name_end].is_ascii_alphabetic()
+            {
+                name_end += 1;
+            }
+
+            // Non-empty.
+            if name_end != name_start &&
+                // End of input (checked first, so we never index past it), HTML whitespace, closing slash, or closing angle bracket.
+                (name_end == len || matches!(bytes[name_end], b'\t' | b'\n' | 12 /* `\f` */ | b'\r' | b' ' | b'/' | b'>')) &&
+                // Known name (the name is ASCII letters only, so it is always valid UTF-8).
+                GFM_HTML_TAGFILTER_NAMES.contains(&str::from_utf8(&bytes[name_start..name_end])
+                    .unwrap()
+                    .to_ascii_lowercase().as_str())
+            {
+                result.push_str(&value[start..index]);
+                result.push_str("&lt;");
+                start = index + 1;
+            }
+
+            // There was no `<` before `name_end`, so move to that next.
+            index = name_end;
+            continue;
+        }
+
+        index += 1;
+    }
+
+    result.push_str(&value[start..]);
+
+    result
+}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index d2ec0ed..e5823cf 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -5,6 +5,7 @@ pub mod constant;
pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
+pub mod gfm_tagfilter;
pub mod normalize_identifier;
pub mod sanitize_uri;
pub mod skip;
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 969a4d8..0099347 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -10,55 +10,71 @@ use alloc::{
/// Make a value safe for injection as a URL.
///
/// This encodes unsafe characters with percent-encoding and skips already
-/// encoded sequences (see [`normalize_uri`][] below).
+/// encoded sequences (see [`normalize`][] below).
/// Further unsafe characters are encoded as character references (see
/// [`encode`][]).
///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::sanitize_uri::sanitize;
+///
+/// assert_eq!(sanitize("javascript:alert(1)"), "javascript:alert(1)");
+/// assert_eq!(sanitize("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// ```
+///
+/// ## References
+///
+/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
+pub fn sanitize(value: &str) -> String {
+ encode(&*normalize(value), true)
+}
+
+/// Make a value safe for injection as a URL, and check protocols.
+///
+/// This first uses [`sanitize`][sanitize].
/// Then, a vec of (lowercase) allowed protocols can be given, in which case
-/// the URL is sanitized.
+/// the URL is kept if its protocol is allowed and ignored otherwise.
///
-/// For example, `Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"])`
-/// can be used for `a[href]`, or `Some(vec!["http", "https"])` for `img[src]`.
+/// For example, `&["http", "https", "irc", "ircs", "mailto", "xmpp"]`
+/// can be used for `a[href]`, or `&["http", "https"]` for `img[src]`.
/// If the URL includes an unknown protocol (one not matched by `protocol`, such
/// as a dangerous example, `javascript:`), the value is ignored.
///
/// ## Examples
///
/// ```rust ignore
-/// use micromark::util::sanitize_url::sanitize_url;
+/// use micromark::util::sanitize_uri::sanitize_with_protocols;
///
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &None), "javascript:alert(1)");
-/// assert_eq!(sanitize_uri("javascript:alert(1)", &Some(vec!["http", "https"])), "");
-/// assert_eq!(sanitize_uri("https://example.com", &Some(vec!["http", "https"])), "https://example.com");
-/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%", &Some(vec!["http", "https"])), "https://a%F0%9F%91%8Db.c/%20/%25");
+/// assert_eq!(sanitize_with_protocols("javascript:alert(1)", &["http", "https"]), "");
+/// assert_eq!(sanitize_with_protocols("https://example.com", &["http", "https"]), "https://example.com");
+/// assert_eq!(sanitize_with_protocols("https://a👍b.c/%20/%", &["http", "https"]), "https://a%F0%9F%91%8Db.c/%20/%25");
/// ```
///
/// ## References
///
/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
-pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
- let value = encode(&*normalize_uri(value), true);
+pub fn sanitize_with_protocols(value: &str, protocols: &[&str]) -> String {
+ let value = sanitize(value);
- if let Some(protocols) = protocols {
- let end = value.find(|c| matches!(c, '?' | '#' | '/'));
- let mut colon = value.find(|c| matches!(c, ':'));
+ let end = value.find(|c| matches!(c, '?' | '#' | '/'));
+ let mut colon = value.find(|c| matches!(c, ':'));
- // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
- if let Some(end) = end {
- if let Some(index) = colon {
- if index > end {
- colon = None;
- }
+ // If the first colon is after `?`, `#`, or `/`, it’s not a protocol.
+ if let Some(end) = end {
+ if let Some(index) = colon {
+ if index > end {
+ colon = None;
}
}
+ }
- // If there is no protocol, it’s relative, and fine.
- if let Some(colon) = colon {
- // If it is a protocol, it should be allowed.
- let protocol = value[0..colon].to_lowercase();
- if !protocols.contains(&protocol.as_str()) {
- return "".to_string();
- }
+ // If there is no protocol, it’s relative, and fine.
+ if let Some(colon) = colon {
+ // If it is a protocol, it should be allowed.
+ let protocol = value[0..colon].to_lowercase();
+ if !protocols.contains(&protocol.as_str()) {
+ return "".to_string();
}
}
@@ -74,7 +90,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
/// ## Examples
///
/// ```rust ignore
-/// use micromark::util::sanitize_url::normalize_uri;
+/// use micromark::util::sanitize_uri::normalize;
///
/// assert_eq!(sanitize_uri("https://example.com"), "https://example.com");
/// assert_eq!(sanitize_uri("https://a👍b.c/%20/%"), "https://a%F0%9F%91%8Db.c/%20/%25");
@@ -86,7 +102,7 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
///
/// [definition]: crate::construct::definition
/// [label_end]: crate::construct::label_end
-fn normalize_uri(value: &str) -> String {
+fn normalize(value: &str) -> String {
let chars = value.chars().collect::<Vec<_>>();
// Note: it’ll grow bigger for each non-ascii or non-safe character.
let mut result = String::with_capacity(value.len());