diff options
| author | 2022-07-20 17:19:17 +0200 | |
|---|---|---|
| committer | 2022-07-20 17:19:17 +0200 | |
| commit | a820d849c3e20a1d72137072d70a7c8e00306f98 (patch) | |
| tree | c2916ab31d6d481e0b53a06aa9b95dfcddd4163f | |
| parent | 7894ec75a7070591c3499fce1f409563c4edc7d7 (diff) | |
| download | markdown-rs-a820d849c3e20a1d72137072d70a7c8e00306f98.tar.gz markdown-rs-a820d849c3e20a1d72137072d70a7c8e00306f98.tar.bz2 markdown-rs-a820d849c3e20a1d72137072d70a7c8e00306f98.zip | |
Refactor to improve allocation around strings
Diffstat (limited to '')
| -rw-r--r-- | src/compiler.rs | 174 | ||||
| -rw-r--r-- | src/construct/character_reference.rs | 10 | ||||
| -rw-r--r-- | src/util/codes.rs | 35 | ||||
| -rw-r--r-- | src/util/encode.rs | 50 | ||||
| -rw-r--r-- | src/util/normalize_identifier.rs | 13 | ||||
| -rw-r--r-- | src/util/sanitize_uri.rs | 49 | 
6 files changed, 149 insertions, 182 deletions
| diff --git a/src/compiler.rs b/src/compiler.rs index c79abed..1723190 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -84,7 +84,7 @@ struct CompileContext<'a> {      pub line_ending_default: LineEnding,      pub allow_dangerous_html: bool,      /// Intermediate results. -    pub buffers: Vec<Vec<String>>, +    pub buffers: Vec<String>,      pub index: usize,  } @@ -125,78 +125,62 @@ impl<'a> CompileContext<'a> {              },              line_ending_default: line_ending,              allow_dangerous_html: options.allow_dangerous_html, -            buffers: vec![vec![]], +            buffers: vec![String::new()],              index: 0,          }      }      /// Push a buffer.      pub fn buffer(&mut self) { -        self.buffers.push(vec![]); +        self.buffers.push(String::new());      }      /// Pop a buffer, returning its value.      pub fn resume(&mut self) -> String { -        self.buffers -            .pop() -            .expect("Cannot resume w/o buffer") -            .concat() +        self.buffers.pop().expect("Cannot resume w/o buffer")      } -    pub fn push(&mut self, value: String) { +    pub fn push<'x, S: Into<&'x str>>(&mut self, value: S) { +        let value = value.into();          self.buffers              .last_mut()              .expect("Cannot push w/o buffer") -            .push(value); +            .push_str(value);          self.last_was_tag = false;      } -    pub fn tag(&mut self, value: String) { -        if self.tags { -            self.buffers -                .last_mut() -                .expect("Cannot push w/o buffer") -                .push(value); -            self.last_was_tag = true; +    pub fn push_raw<'x, S: Into<&'x str>>(&mut self, value: S) { +        let value = value.into(); +        if self.ignore_encode { +            self.push(value); +        } else { +            self.push(&*encode(value));          }      } -    /// Get the last chunk of current buffer. 
-    pub fn buf_tail_slice(&self) -> Option<&String> { -        self.buf_tail().last() +    pub fn tag<'x, S: Into<&'x str>>(&mut self, value: S) { +        if self.tags { +            self.push(value.into()); +            self.last_was_tag = true; +        }      }      /// Get the current buffer. -    pub fn buf_tail(&self) -> &Vec<String> { +    pub fn buf_tail(&self) -> &String {          self.buffers              .last()              .expect("at least one buffer should exist")      } -    /// Optionally encode. -    pub fn encode_opt(&self, value: &str) -> String { -        if self.ignore_encode { -            value.to_string() -        } else { -            encode(value) -        } -    } -      /// Add a line ending.      pub fn line_ending(&mut self) { -        let line_ending = self.line_ending_default.as_str().to_string(); -        // lastWasTag = false -        self.push(line_ending); +        let eol = self.line_ending_default.as_str().to_string(); +        self.push(&*eol);      }      /// Add a line ending if needed (as in, there’s no eol/eof already).      pub fn line_ending_if_needed(&mut self) { -        let slice = self.buf_tail_slice(); -        let last_char = if let Some(x) = slice { -            x.chars().last() -        } else { -            None -        }; +        let last_char = self.buf_tail().chars().last();          let mut add = true;          if let Some(x) = last_char { @@ -314,7 +298,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {          .buffers          .get(0)          .expect("expected 1 final buffer") -        .concat() +        .to_string()  }  /// Handle [`Enter`][EventType::Enter]. 
@@ -415,14 +399,14 @@ fn on_enter_buffer(context: &mut CompileContext) {  fn on_enter_block_quote(context: &mut CompileContext) {      context.tight_stack.push(false);      context.line_ending_if_needed(); -    context.tag("<blockquote>".to_string()); +    context.tag("<blockquote>");  }  /// Handle [`Enter`][EventType::Enter]:[`CodeIndented`][Token::CodeIndented].  fn on_enter_code_indented(context: &mut CompileContext) {      context.code_flow_seen_data = Some(false);      context.line_ending_if_needed(); -    context.tag("<pre><code>".to_string()); +    context.tag("<pre><code>");  }  /// Handle [`Enter`][EventType::Enter]:[`CodeFenced`][Token::CodeFenced]. @@ -430,14 +414,14 @@ fn on_enter_code_fenced(context: &mut CompileContext) {      context.code_flow_seen_data = Some(false);      context.line_ending_if_needed();      // Note that no `>` is used, which is added later. -    context.tag("<pre><code".to_string()); +    context.tag("<pre><code");      context.code_fenced_fences_count = Some(0);  }  /// Handle [`Enter`][EventType::Enter]:[`CodeText`][Token::CodeText].  fn on_enter_code_text(context: &mut CompileContext) {      context.code_text_inside = true; -    context.tag("<code>".to_string()); +    context.tag("<code>");      context.buffer();  } @@ -462,7 +446,7 @@ fn on_enter_definition_destination_string(context: &mut CompileContext) {  /// Handle [`Enter`][EventType::Enter]:[`Emphasis`][Token::Emphasis].  fn on_enter_emphasis(context: &mut CompileContext) { -    context.tag("<em>".to_string()); +    context.tag("<em>");  }  /// Handle [`Enter`][EventType::Enter]:[`HtmlFlow`][Token::HtmlFlow]. @@ -563,7 +547,7 @@ fn on_enter_list(context: &mut CompileContext) {      context.tight_stack.push(!loose);      context.line_ending_if_needed();      // Note: no `>`. 
-    context.tag(format!( +    context.tag(&*format!(          "<{}",          if *token_type == Token::ListOrdered {              "ol" @@ -579,11 +563,11 @@ fn on_enter_list_item_marker(context: &mut CompileContext) {      let expect_first_item = context.expect_first_item.take().unwrap();      if expect_first_item { -        context.tag(">".to_string()); +        context.tag(">");      }      context.line_ending_if_needed(); -    context.tag("<li>".to_string()); +    context.tag("<li>");      context.expect_first_item = Some(false);      // “Hack” to prevent a line ending from showing up if the item is empty.      context.last_was_tag = false; @@ -595,7 +579,7 @@ fn on_enter_paragraph(context: &mut CompileContext) {      if !tight {          context.line_ending_if_needed(); -        context.tag("<p>".to_string()); +        context.tag("<p>");      }  } @@ -616,7 +600,7 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) {  /// Handle [`Enter`][EventType::Enter]:[`Strong`][Token::Strong].  fn on_enter_strong(context: &mut CompileContext) { -    context.tag("<strong>".to_string()); +    context.tag("<strong>");  }  /// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][Token::AutolinkEmail]. @@ -626,15 +610,15 @@ fn on_exit_autolink_email(context: &mut CompileContext) {          &from_exit_event(context.events, context.index),          false,      ); -    context.tag(format!( +    context.tag(&*format!(          "<a href=\"{}\">",          sanitize_uri(              format!("mailto:{}", slice.as_str()).as_str(),              &context.protocol_href          )      )); -    context.push(context.encode_opt(&slice)); -    context.tag("</a>".to_string()); +    context.push_raw(&*slice); +    context.tag("</a>");  }  /// Handle [`Exit`][EventType::Exit]:[`AutolinkProtocol`][Token::AutolinkProtocol]. 
@@ -644,17 +628,17 @@ fn on_exit_autolink_protocol(context: &mut CompileContext) {          &from_exit_event(context.events, context.index),          false,      ); -    context.tag(format!( +    context.tag(&*format!(          "<a href=\"{}\">",          sanitize_uri(slice.as_str(), &context.protocol_href)      )); -    context.push(context.encode_opt(&slice)); -    context.tag("</a>".to_string()); +    context.push_raw(&*slice); +    context.tag("</a>");  }  /// Handle [`Exit`][EventType::Exit]:{[`HardBreakEscape`][Token::HardBreakEscape],[`HardBreakTrailing`][Token::HardBreakTrailing]}.  fn on_exit_break(context: &mut CompileContext) { -    context.tag("<br />".to_string()); +    context.tag("<br />");  }  /// Handle [`Exit`][EventType::Exit]:[`BlankLineEnding`][Token::BlankLineEnding]. @@ -669,7 +653,7 @@ fn on_exit_block_quote(context: &mut CompileContext) {      context.tight_stack.pop();      context.line_ending_if_needed();      context.slurp_one_line_ending = false; -    context.tag("</blockquote>".to_string()); +    context.tag("</blockquote>");  }  /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarker`][Token::CharacterReferenceMarker]. @@ -705,17 +689,17 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {          CharacterReferenceKind::Named => decode_named(ref_string),      }; -    context.push(context.encode_opt(&value)); +    context.push_raw(&*value);  }  /// Handle [`Exit`][EventType::Exit]:[`CodeFlowChunk`][Token::CodeFlowChunk].  fn on_exit_code_flow_chunk(context: &mut CompileContext) {      context.code_flow_seen_data = Some(true); -    context.push(context.encode_opt(&serialize( +    context.push_raw(&*serialize(          context.codes,          &from_exit_event(context.events, context.index),          false, -    ))); +    ));  }  /// Handle [`Exit`][EventType::Exit]:[`CodeFencedFence`][Token::CodeFencedFence]. 
@@ -727,7 +711,7 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) {      };      if count == 0 { -        context.tag(">".to_string()); +        context.tag(">");          context.slurp_one_line_ending = true;      } @@ -737,7 +721,7 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:[`CodeFencedFenceInfo`][Token::CodeFencedFenceInfo].  fn on_exit_code_fenced_fence_info(context: &mut CompileContext) {      let value = context.resume(); -    context.tag(format!(" class=\"language-{}\"", value)); +    context.tag(&*format!(" class=\"language-{}\"", value));  }  /// Handle [`Exit`][EventType::Exit]:{[`CodeFenced`][Token::CodeFenced],[`CodeIndented`][Token::CodeIndented]}. @@ -764,7 +748,7 @@ fn on_exit_code_flow(context: &mut CompileContext) {          context.line_ending_if_needed();      } -    context.tag("</code></pre>".to_string()); +    context.tag("</code></pre>");      if let Some(count) = context.code_fenced_fences_count.take() {          if count < 2 { @@ -792,12 +776,12 @@ fn on_exit_code_text(context: &mut CompileContext) {      }      context.code_text_inside = false; -    context.push(if trim { +    context.push(&*if trim {          result[1..(result.len() - 1)].to_string()      } else {          result      }); -    context.tag("</code>".to_string()); +    context.tag("</code>");  }  /// Handle [`Exit`][EventType::Exit]:*. @@ -810,11 +794,11 @@ fn on_exit_drop(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:{[`CodeTextData`][Token::CodeTextData],[`Data`][Token::Data],[`CharacterEscapeValue`][Token::CharacterEscapeValue]}.  fn on_exit_data(context: &mut CompileContext) {      // Just output it. -    context.push(context.encode_opt(&serialize( +    context.push_raw(&*serialize(          context.codes,          &from_exit_event(context.events, context.index),          false, -    ))); +    ));  }  /// Handle [`Exit`][EventType::Exit]:[`Definition`][Token::Definition]. 
@@ -870,7 +854,7 @@ fn on_exit_definition_title_string(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:[`Strong`][Token::Emphasis].  fn on_exit_emphasis(context: &mut CompileContext) { -    context.tag("</em>".to_string()); +    context.tag("</em>");  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingAtx`][Token::HeadingAtx]. @@ -880,7 +864,7 @@ fn on_exit_heading_atx(context: &mut CompileContext) {          .take()          .expect("`atx_opening_sequence_size` must be set in headings"); -    context.tag(format!("</h{}>", rank)); +    context.tag(&*format!("</h{}>", rank));  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingAtxSequence`][Token::HeadingAtxSequence]. @@ -895,14 +879,14 @@ fn on_exit_heading_atx_sequence(context: &mut CompileContext) {          .len();          context.line_ending_if_needed();          context.atx_opening_sequence_size = Some(rank); -        context.tag(format!("<h{}>", rank)); +        context.tag(&*format!("<h{}>", rank));      }  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingAtxText`][Token::HeadingAtxText].  fn on_exit_heading_atx_text(context: &mut CompileContext) {      let value = context.resume(); -    context.push(value); +    context.push(&*value);  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingSetextText`][Token::HeadingSetextText]. @@ -925,9 +909,9 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {      let level: usize = if head == Code::Char('-') { 2 } else { 1 };      context.line_ending_if_needed(); -    context.tag(format!("<h{}>", level)); -    context.push(text); -    context.tag(format!("</h{}>", level)); +    context.tag(&*format!("<h{}>", level)); +    context.push(&*text); +    context.tag(&*format!("</h{}>", level));  }  /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlow`][Token::HtmlFlow],[`HtmlText`][Token::HtmlText]}. 
@@ -942,7 +926,7 @@ fn on_exit_html_data(context: &mut CompileContext) {          &from_exit_event(context.events, context.index),          false,      ); -    context.push(context.encode_opt(&slice)); +    context.push_raw(&*slice);  }  /// Handle [`Exit`][EventType::Exit]:[`Label`][Token::Label]. @@ -965,15 +949,15 @@ fn on_exit_label_text(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:[`LineEnding`][Token::LineEnding].  fn on_exit_line_ending(context: &mut CompileContext) {      if context.code_text_inside { -        context.push(" ".to_string()); +        context.push(" ");      } else if context.slurp_one_line_ending {          context.slurp_one_line_ending = false;      } else { -        context.push(context.encode_opt(&serialize( +        context.push_raw(&*serialize(              context.codes,              &from_exit_event(context.events, context.index),              false, -        ))); +        ));      }  } @@ -986,7 +970,7 @@ fn on_exit_list(context: &mut CompileContext) {      };      context.tight_stack.pop();      context.line_ending(); -    context.tag(format!("</{}>", tag_name)); +    context.tag(&*format!("</{}>", tag_name));  }  /// Handle [`Exit`][EventType::Exit]:[`ListItem`][Token::ListItem]. @@ -1012,7 +996,7 @@ fn on_exit_list_item(context: &mut CompileContext) {          context.line_ending_if_needed();      } -    context.tag("</li>".to_string()); +    context.tag("</li>");  }  /// Handle [`Exit`][EventType::Exit]:[`ListItemValue`][Token::ListItemValue]. 
@@ -1028,7 +1012,9 @@ fn on_exit_list_item_value(context: &mut CompileContext) {          let value = slice.parse::<u32>().ok().unwrap();          if value != 1 { -            context.tag(format!(" start=\"{}\"", encode(&value.to_string()))); +            context.tag(" start=\""); +            context.tag(&*value.to_string()); +            context.tag("\"");          }      }  } @@ -1082,9 +1068,9 @@ fn on_exit_media(context: &mut CompileContext) {      };      let destination = if let Some(destination) = destination { -        destination.clone() +        destination      } else { -        "".to_string() +        ""      };      let title = if let Some(title) = title { @@ -1094,20 +1080,20 @@ fn on_exit_media(context: &mut CompileContext) {      };      if media.image { -        context.tag(format!( +        context.tag(&*format!(              "<img src=\"{}\" alt=\"", -            sanitize_uri(&destination, &context.protocol_src), +            sanitize_uri(destination, &context.protocol_src),          )); -        context.push(label); -        context.tag(format!("\"{} />", title)); +        context.push(&*label); +        context.tag(&*format!("\"{} />", title));      } else { -        context.tag(format!( +        context.tag(&*format!(              "<a href=\"{}\"{}>", -            sanitize_uri(&destination, &context.protocol_href), +            sanitize_uri(destination, &context.protocol_href),              title,          )); -        context.push(label); -        context.tag("</a>".to_string()); +        context.push(&*label); +        context.tag("</a>");      };  } @@ -1118,7 +1104,7 @@ fn on_exit_paragraph(context: &mut CompileContext) {      if *tight {          context.slurp_one_line_ending = true;      } else { -        context.tag("</p>".to_string()); +        context.tag("</p>");      }  } @@ -1151,11 +1137,11 @@ fn on_exit_resource_title_string(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:[`Strong`][Token::Strong].  
fn on_exit_strong(context: &mut CompileContext) { -    context.tag("</strong>".to_string()); +    context.tag("</strong>");  }  /// Handle [`Exit`][EventType::Exit]:[`ThematicBreak`][Token::ThematicBreak].  fn on_exit_thematic_break(context: &mut CompileContext) {      context.line_ending_if_needed(); -    context.tag("<hr />".to_string()); +    context.tag("<hr />");  } diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index ce7cd31..a4cbec1 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -121,7 +121,7 @@ impl Kind {  #[derive(Debug, Clone)]  struct Info {      /// All parsed characters. -    buffer: Vec<char>, +    buffer: String,      /// Kind of character reference.      kind: Kind,  } @@ -162,7 +162,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {  /// ```  fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {      let info = Info { -        buffer: vec![], +        buffer: String::new(),          kind: Kind::Named,      };      if let Code::Char('#') = code { @@ -216,10 +216,8 @@ fn numeric(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResu  fn value(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {      match code {          Code::Char(';') if !info.buffer.is_empty() => { -            let unknown_named = Kind::Named == info.kind && { -                let value = info.buffer.iter().collect::<String>(); -                !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) -            }; +            let unknown_named = Kind::Named == info.kind +                && !CHARACTER_REFERENCES.iter().any(|d| d.0 == info.buffer);              if unknown_named {                  (State::Nok, None) diff --git a/src/util/codes.rs b/src/util/codes.rs index 9b6ad39..d35d7d9 100644 --- a/src/util/codes.rs +++ b/src/util/codes.rs @@ -5,19 +5,21 @@ use crate::tokenizer::Code;  /// Turn a string into codes.  
pub fn parse(value: &str) -> Vec<Code> { -    let mut codes: Vec<Code> = vec![]; +    // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller +    // with `Code::CarriageReturnLineFeed`. +    let mut codes: Vec<Code> = Vec::with_capacity(value.len());      let mut at_start = true;      let mut at_carriage_return = false;      let mut column = 1;      for char in value.chars() {          if at_start { +            at_start = false; +              if char == '\u{feff}' {                  // Ignore.                  continue;              } - -            at_start = false;          }          // Send a CRLF. @@ -83,34 +85,33 @@ pub fn parse(value: &str) -> Vec<Code> {  /// Serialize codes, optionally expanding tabs.  pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {      let mut at_tab = false; -    let mut index = 0; -    let mut value: Vec<char> = vec![]; +    // Note: It’ll grow a bit smaller with each +    // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false, +    // and bigger with `Code::CarriageReturnLineFeed`, +    let mut value = String::with_capacity(codes.len()); -    while index < codes.len() { -        let code = codes[index]; +    for code in codes {          let mut at_tab_next = false;          match code {              Code::CarriageReturnLineFeed => { -                value.push('\r'); -                value.push('\n'); +                value.push_str("\r\n");              } -            Code::Char(char) if char == '\n' || char == '\r' => { -                value.push(char); +            Code::Char(char) if *char == '\n' || *char == '\r' => { +                value.push(*char);              } -            Code::Char(char) if char == '\t' => { +            Code::Char(char) if *char == '\t' => {                  at_tab_next = true; -                value.push(if expand_tabs { ' ' } else { char }); +                value.push(if expand_tabs { ' ' } else { *char });              }              
Code::VirtualSpace => {                  if !expand_tabs && at_tab { -                    index += 1;                      continue;                  }                  value.push(' ');              }              Code::Char(char) => { -                value.push(char); +                value.push(*char);              }              Code::None => {                  unreachable!("unexpected EOF code in codes"); @@ -118,9 +119,7 @@ pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {          }          at_tab = at_tab_next; - -        index += 1;      } -    value.into_iter().collect() +    value  } diff --git a/src/util/encode.rs b/src/util/encode.rs index a3bd589..965ea5c 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -20,37 +20,31 @@  /// ## References  ///  /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) -pub fn encode(value: &str) -> String { -    let mut result: Vec<&str> = vec![]; -    let mut start = 0; -    let mut index = 0; +pub fn encode<S: Into<String>>(value: S) -> String { +    let mut value = value.into(); -    for byte in value.bytes() { -        if let Some(replacement) = match byte { -            b'&' => Some("&amp;"), -            b'"' => Some("&quot;"), -            b'<' => Some("&lt;"), -            b'>' => Some("&gt;"), -            _ => None, -        } { -            if start != index { -                result.push(&value[start..index]); -            } +    // It’ll grow a bit bigger for each dangerous character. 
+    let mut result = String::with_capacity(value.len()); -            result.push(replacement); -            start = index + 1; -        } - -        index += 1; +    while let Some(indice) = value.find(check) { +        let after = value.split_off(indice + 1); +        let dangerous = value.pop().unwrap(); +        result.push_str(&value); +        result.push_str(match dangerous { +            '&' => "&amp;", +            '"' => "&quot;", +            '<' => "&lt;", +            '>' => "&gt;", +            _ => unreachable!("xxx"), +        }); +        value = after;      } -    if start == 0 { -        value.to_string() -    } else { -        if start < index { -            result.push(&value[start..index]); -        } +    result.push_str(&value); -        result.join("") -    } +    result +} + +fn check(char: char) -> bool { +    matches!(char, '&' | '"' | '<' | '>')  } diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index feb7239..42a2bb0 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -32,7 +32,8 @@  /// [definition]: crate::construct::definition  /// [label_end]: crate::construct::label_end  pub fn normalize_identifier(value: &str) -> String { -    let mut codes = vec![]; +    // Note: it’ll grow a bit smaller for consecutive whitespace. +    let mut result = String::with_capacity(value.len());      let mut at_start = true;      let mut at_whitespace = true; @@ -44,10 +45,10 @@ pub fn normalize_identifier(value: &str) -> String {              }              _ => {                  if at_whitespace && !at_start { -                    codes.push(' '); +                    result.push(' ');                  } -                codes.push(char); +                result.push(char);                  at_start = false;                  at_whitespace = false;              } @@ -66,9 +67,5 @@ pub fn normalize_identifier(value: &str) -> String {      // to `SS` (U+0053 U+0053).      
// If we’d inverse the steps, for `ẞ`, we’d first uppercase without a      // change, and then lowercase to `ß`, which would not match `ss`. -    codes -        .iter() -        .collect::<String>() -        .to_lowercase() -        .to_uppercase() +    result.to_lowercase().to_uppercase()  } diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 55b15e4..81450ae 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -32,32 +32,25 @@ use crate::util::encode::encode;  ///  /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)  pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { -    let value = encode(&normalize_uri(value)); +    let value = encode(normalize_uri(value));      if let Some(protocols) = protocols { -        let chars: Vec<char> = value.chars().collect(); -        let mut index = 0; -        let mut colon: Option<usize> = None; - -        while index < chars.len() { -            let char = chars[index]; - -            match char { -                ':' => { -                    colon = Some(index); -                    break; +        let end = value.find(|c| matches!(c, '?' | '#' | '/')); +        let mut colon = value.find(|c| matches!(c, ':')); + +        // If the first colon is after `?`, `#`, or `/`, it’s not a protocol. +        if let Some(end) = end { +            if let Some(index) = colon { +                if index > end { +                    colon = None;                  } -                '?' | '#' | '/' => break, -                _ => {}              } - -            index += 1;          } -        // If there is no protocol, or the first colon is after `?`, `#`, or `/`, it’s relative. -        // It is a protocol, it should be allowed. +        // If there is no protocol, it’s relative, and fine.          
if let Some(colon) = colon { -            let protocol = chars[0..colon].iter().collect::<String>().to_lowercase(); +            // If it is a protocol, it should be allowed. +            let protocol = value[0..colon].to_lowercase();              if !protocols.contains(&protocol.as_str()) {                  return "".to_string();              } @@ -85,8 +78,9 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {  ///  /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)  fn normalize_uri(value: &str) -> String { -    let chars: Vec<char> = value.chars().collect(); -    let mut result: Vec<String> = vec![]; +    let chars = value.chars().collect::<Vec<_>>(); +    // Note: it’ll grow bigger for each non-ascii or non-safe character. +    let mut result = String::with_capacity(value.len());      let mut index = 0;      let mut start = 0;      let mut buff = [0; 4]; @@ -104,16 +98,15 @@ fn normalize_uri(value: &str) -> String {              continue;          } -        // Note: Rust already takes care of lone astral surrogates. +        // Note: Rust already takes care of lone surrogates.          // Non-ascii or not allowed ascii.          if char >= '\u{0080}'              || !matches!(char, '!' 
| '#' | '$' | '&'..=';' | '=' | '?'..='Z' | '_' | 'a'..='z' | '~')          { -            result.push(chars[start..index].iter().collect::<String>()); - +            result.push_str(&chars[start..index].iter().collect::<String>());              char.encode_utf8(&mut buff); -            result.push( -                buff[0..char.len_utf8()] +            result.push_str( +                &buff[0..char.len_utf8()]                      .iter()                      .map(|&byte| format!("%{:>02X}", byte))                      .collect::<String>(), @@ -125,7 +118,7 @@ fn normalize_uri(value: &str) -> String {          index += 1;      } -    result.push(chars[start..].iter().collect::<String>()); +    result.push_str(&chars[start..].iter().collect::<String>()); -    result.join("") +    result  } | 
