From a820d849c3e20a1d72137072d70a7c8e00306f98 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 20 Jul 2022 17:19:17 +0200 Subject: Refactor to improve allocation around strings --- src/util/codes.rs | 35 ++++++++++++++-------------- src/util/encode.rs | 50 ++++++++++++++++++---------------------- src/util/normalize_identifier.rs | 13 ++++------- src/util/sanitize_uri.rs | 49 +++++++++++++++++---------------------- 4 files changed, 65 insertions(+), 82 deletions(-) (limited to 'src/util') diff --git a/src/util/codes.rs b/src/util/codes.rs index 9b6ad39..d35d7d9 100644 --- a/src/util/codes.rs +++ b/src/util/codes.rs @@ -5,19 +5,21 @@ use crate::tokenizer::Code; /// Turn a string into codes. pub fn parse(value: &str) -> Vec { - let mut codes: Vec = vec![]; + // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller + // with `Code::CarriageReturnLineFeed`. + let mut codes: Vec = Vec::with_capacity(value.len()); let mut at_start = true; let mut at_carriage_return = false; let mut column = 1; for char in value.chars() { if at_start { + at_start = false; + if char == '\u{feff}' { // Ignore. continue; } - - at_start = false; } // Send a CRLF. @@ -83,34 +85,33 @@ pub fn parse(value: &str) -> Vec { /// Serialize codes, optionally expanding tabs. pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { let mut at_tab = false; - let mut index = 0; - let mut value: Vec = vec![]; + // Note: It’ll grow a bit smaller with each + // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false, + // and bigger with `Code::CarriageReturnLineFeed`, + let mut value = String::with_capacity(codes.len()); - while index < codes.len() { - let code = codes[index]; + for code in codes { let mut at_tab_next = false; match code { Code::CarriageReturnLineFeed => { - value.push('\r'); - value.push('\n'); + value.push_str("\r\n"); } - Code::Char(char) if char == '\n' || char == '\r' => { - value.push(char); + Code::Char(char) if *char == '\n' || *char == '\r' => { + value.push(*char); } - Code::Char(char) if char == '\t' => { + Code::Char(char) if *char == '\t' => { at_tab_next = true; - value.push(if expand_tabs { ' ' } else { char }); + value.push(if expand_tabs { ' ' } else { *char }); } Code::VirtualSpace => { if !expand_tabs && at_tab { - index += 1; continue; } value.push(' '); } Code::Char(char) => { - value.push(char); + value.push(*char); } Code::None => { unreachable!("unexpected EOF code in codes"); @@ -118,9 +119,7 @@ pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { } at_tab = at_tab_next; - - index += 1; } - value.into_iter().collect() + value } diff --git a/src/util/encode.rs b/src/util/encode.rs index a3bd589..965ea5c 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -20,37 +20,31 @@ /// ## References /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) -pub fn encode(value: &str) -> String { - let mut result: Vec<&str> = vec![]; - let mut start = 0; - let mut index = 0; +pub fn encode>(value: S) -> String { + let mut value = value.into(); - for byte in value.bytes() { - if let Some(replacement) = match byte { - b'&' => Some("&"), - b'"' => Some("""), - b'<' => Some("<"), - b'>' => Some(">"), - _ => None, - } { - if start != index { - result.push(&value[start..index]); - } + // It’ll grow a bit bigger for each dangerous character. + let mut result = String::with_capacity(value.len()); - result.push(replacement); - start = index + 1; - } - - index += 1; + while let Some(indice) = value.find(check) { + let after = value.split_off(indice + 1); + let dangerous = value.pop().unwrap(); + result.push_str(&value); + result.push_str(match dangerous { + '&' => "&", + '"' => """, + '<' => "<", + '>' => ">", + _ => unreachable!("xxx"), + }); + value = after; } - if start == 0 { - value.to_string() - } else { - if start < index { - result.push(&value[start..index]); - } + result.push_str(&value); - result.join("") - } + result +} + +fn check(char: char) -> bool { + matches!(char, '&' | '"' | '<' | '>') } diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index feb7239..42a2bb0 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -32,7 +32,8 @@ /// [definition]: crate::construct::definition /// [label_end]: crate::construct::label_end pub fn normalize_identifier(value: &str) -> String { - let mut codes = vec![]; + // Note: it’ll grow a bit smaller for consecutive whitespace. + let mut result = String::with_capacity(value.len()); let mut at_start = true; let mut at_whitespace = true; @@ -44,10 +45,10 @@ pub fn normalize_identifier(value: &str) -> String { } _ => { if at_whitespace && !at_start { - codes.push(' '); + result.push(' '); } - codes.push(char); + result.push(char); at_start = false; at_whitespace = false; } @@ -66,9 +67,5 @@ pub fn normalize_identifier(value: &str) -> String { // to `SS` (U+0053 U+0053). // If we’d inverse the steps, for `ẞ`, we’d first uppercase without a // change, and then lowercase to `ß`, which would not match `ss`. - codes - .iter() - .collect::() - .to_lowercase() - .to_uppercase() + result.to_lowercase().to_uppercase() } diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 55b15e4..81450ae 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -32,32 +32,25 @@ use crate::util::encode::encode; /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) pub fn sanitize_uri(value: &str, protocols: &Option>) -> String { - let value = encode(&normalize_uri(value)); + let value = encode(normalize_uri(value)); if let Some(protocols) = protocols { - let chars: Vec = value.chars().collect(); - let mut index = 0; - let mut colon: Option = None; - - while index < chars.len() { - let char = chars[index]; - - match char { - ':' => { - colon = Some(index); - break; + let end = value.find(|c| matches!(c, '?' | '#' | '/')); + let mut colon = value.find(|c| matches!(c, ':')); + + // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. + if let Some(end) = end { + if let Some(index) = colon { + if index > end { + colon = None; } - '?' | '#' | '/' => break, - _ => {} } - - index += 1; } - // If there is no protocol, or the first colon is after `?`, `#`, or `/`, it’s relative. - // It is a protocol, it should be allowed. + // If there is no protocol, it’s relative, and fine. if let Some(colon) = colon { - let protocol = chars[0..colon].iter().collect::().to_lowercase(); + // If it is a protocol, it should be allowed. + let protocol = value[0..colon].to_lowercase(); if !protocols.contains(&protocol.as_str()) { return "".to_string(); } @@ -85,8 +78,9 @@ pub fn sanitize_uri(value: &str, protocols: &Option>) -> String { /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) fn normalize_uri(value: &str) -> String { - let chars: Vec = value.chars().collect(); - let mut result: Vec = vec![]; + let chars = value.chars().collect::>(); + // Note: it’ll grow bigger for each non-ascii or non-safe character. + let mut result = String::with_capacity(value.len()); let mut index = 0; let mut start = 0; let mut buff = [0; 4]; @@ -104,16 +98,15 @@ fn normalize_uri(value: &str) -> String { continue; } - // Note: Rust already takes care of lone astral surrogates. + // Note: Rust already takes care of lone surrogates. // Non-ascii or not allowed ascii. if char >= '\u{0080}' || !matches!(char, '!' | '#' | '$' | '&'..=';' | '=' | '?'..='Z' | '_' | 'a'..='z' | '~') { - result.push(chars[start..index].iter().collect::()); - + result.push_str(&chars[start..index].iter().collect::()); char.encode_utf8(&mut buff); - result.push( - buff[0..char.len_utf8()] + result.push_str( + &buff[0..char.len_utf8()] .iter() .map(|&byte| format!("%{:>02X}", byte)) .collect::(), @@ -125,7 +118,7 @@ fn normalize_uri(value: &str) -> String { index += 1; } - result.push(chars[start..].iter().collect::()); + result.push_str(&chars[start..].iter().collect::()); - result.join("") + result } -- cgit