diff options
Diffstat (limited to '')
-rw-r--r-- | src/util/decode_character_reference.rs | 42 | ||||
-rw-r--r-- | src/util/encode.rs | 48 | ||||
-rw-r--r-- | src/util/normalize_identifier.rs | 39 | ||||
-rw-r--r-- | src/util/sanitize_uri.rs | 2 | ||||
-rw-r--r-- | src/util/slice.rs | 36 |
5 files changed, 91 insertions, 76 deletions
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs index 5277f90..f8fd18f 100644 --- a/src/util/decode_character_reference.rs +++ b/src/util/decode_character_reference.rs @@ -57,9 +57,9 @@ pub fn decode_named(value: &str) -> String { /// ```rust ignore /// use micromark::util::decode_character_reference::decode_numeric; /// -/// assert_eq!(decode_numeric("123", 10), '{'); -/// assert_eq!(decode_numeric("9", 16), '\t'); -/// assert_eq!(decode_numeric("0", 10), '�'); // Not allowed. +/// assert_eq!(decode_numeric("123", 10), "{"); +/// assert_eq!(decode_numeric("9", 16), "\t"); +/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. /// ``` /// /// ## Panics @@ -74,27 +74,19 @@ pub fn decode_named(value: &str) -> String { /// /// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) /// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_numeric(value: &str, radix: u32) -> char { - let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int"); - - if - // C0 except for HT, LF, FF, CR, space - code < 0x09 || - code == 0x0B || - (code > 0x0D && code < 0x20) || - // Control character (DEL) of the basic block and C1 controls. - (code > 0x7E && code < 0xA0) || - // Lone high surrogates and low surrogates. - (code > 0xd7ff && code < 0xe000) || - // Noncharacters. 
- (code > 0xfdcf && code < 0xfdf0) || - ((code & 0xffff) == 0xffff) || - ((code & 0xffff) == 0xfffe) || - // Out of range - code > 0x0010_ffff - { - char::REPLACEMENT_CHARACTER - } else { - char::from_u32(code).expect("expected valid `code`") +pub fn decode_numeric(value: &str, radix: u32) -> String { + if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { + if !matches!(char, + // C0 except for HT, LF, FF, CR, space + '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | + // Control character (DEL) of c0, and C1 controls. + '\u{7F}'..='\u{9F}' + // Lone surrogates, noncharacters, and out of range are handled by + // Rust. + ) { + return char.to_string(); + } } + + char::REPLACEMENT_CHARACTER.to_string() } diff --git a/src/util/encode.rs b/src/util/encode.rs index 91c5462..d37a2de 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -20,37 +20,33 @@ /// ## References /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) -pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String { - let check = if encode_html { check_all } else { check_nil }; - let mut value = value.into(); - +pub fn encode(value: &str, encode_html: bool) -> String { // It’ll grow a bit bigger for each dangerous character. 
let mut result = String::with_capacity(value.len()); + let bytes = value.as_bytes(); + let mut index = 0; + let mut start = 0; - while let Some(indice) = value.find(check) { - let after = value.split_off(indice + 1); - let dangerous = value.pop().unwrap(); - result.push_str(&value); - result.push_str(match dangerous { - '\0' => "�", - '&' => "&amp;", - '"' => "&quot;", - '<' => "&lt;", - '>' => "&gt;", - _ => unreachable!("xxx"), - }); - value = after; - } + while index < bytes.len() { + let byte = bytes[index]; + if matches!(byte, b'\0') || (encode_html && matches!(byte, b'&' | b'"' | b'<' | b'>')) { + result.push_str(&value[start..index]); + result.push_str(match byte { + b'\0' => "�", + b'&' => "&amp;", + b'"' => "&quot;", + b'<' => "&lt;", + b'>' => "&gt;", + _ => panic!("impossible"), + }); - result.push_str(&value); + start = index + 1; + } - result -} + index += 1; + } -fn check_all(char: char) -> bool { - matches!(char, '\0' | '&' | '"' | '<' | '>') -} + result.push_str(&value[start..]); -fn check_nil(char: char) -> bool { - matches!(char, '\0') + result } diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index 42a2bb0..f5b12d0 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -34,25 +34,34 @@ pub fn normalize_identifier(value: &str) -> String { // Note: it’ll grow a bit smaller for consecutive whitespace. let mut result = String::with_capacity(value.len()); - let mut at_start = true; - let mut at_whitespace = true; + let bytes = value.as_bytes(); + let mut in_whitespace = true; + let mut index = 0; + let mut start = 0; - // Collapse markdown whitespace and trim it. - for char in value.chars() { - match char { - '\t' | '\n' | '\r' | ' ' => { - at_whitespace = true; + while index < bytes.len() { + if matches!(bytes[index], b'\t' | b'\n' | b'\r' | b' ') { + // First whitespace we see after non-whitespace. 
+ if !in_whitespace { + result.push_str(&value[start..index]); + in_whitespace = true; } - _ => { - if at_whitespace && !at_start { - result.push(' '); - } - - result.push(char); - at_start = false; - at_whitespace = false; + } + // First non-whitespace we see after whitespace. + else if in_whitespace { + if start != 0 { + result.push(' '); } + + start = index; + in_whitespace = false; } + + index += 1; + } + + if !in_whitespace { + result.push_str(&value[start..]); } // Some characters are considered “uppercase”, but if their lowercase diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 8c09549..051e1e1 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -32,7 +32,7 @@ use crate::util::encode::encode; /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { - let value = encode(normalize_uri(value), true); + let value = encode(&*normalize_uri(value), true); if let Some(protocols) = protocols { let end = value.find(|c| matches!(c, '?' | '#' | '/')); diff --git a/src/util/slice.rs b/src/util/slice.rs index cd3641e..d899dac 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -2,6 +2,7 @@ use crate::constant::TAB_SIZE; use crate::tokenizer::{Event, EventType, Point}; +use std::str; /// A range between two places. #[derive(Debug)] @@ -78,6 +79,15 @@ impl<'a> Slice<'a> { } } + /// To do. + pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> { + Slice { + bytes: &bytes[index..=index], + before: 0, + after: 0, + } + } + /// Get the slice belonging to a position. pub fn from_position(bytes: &'a [u8], position: &Position) -> Slice<'a> { let mut before = position.start.vs; @@ -107,14 +117,18 @@ impl<'a> Slice<'a> { } /// To do. - // To do: rename to `len`? 
- pub fn size(&self) -> usize { - self.bytes.len() + self.before + self.after + pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> { + Slice { + bytes: &bytes[start..end], + before: 0, + after: 0, + } } - // To do: - // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html> - // to implement an `as_str`. + /// To do. + pub fn len(&self) -> usize { + self.bytes.len() + self.before + self.after + } /// To do. pub fn head(&self) -> Option<u8> { @@ -127,16 +141,20 @@ impl<'a> Slice<'a> { } } + // To do: + pub fn as_str(&self) -> &str { + str::from_utf8(self.bytes).unwrap() + } + /// To do. pub fn serialize(&self) -> String { - let mut string = String::with_capacity(self.size()); + let mut string = String::with_capacity(self.len()); let mut index = self.before; while index > 0 { string.push(' '); index -= 1; } - // To do: invalid UTF8? - string.push_str(std::str::from_utf8(self.bytes).unwrap()); + string.push_str(self.as_str()); index = self.after; while index > 0 { string.push(' '); |