Refactor code style

author: Titus Wormer <tituswormer@gmail.com> 2022-07-05 13:03:09 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-07-05 13:03:09 +0200
commit: fd860a975b84da9a79abfa247787e6adbd5ea34c (patch)
tree: bd9db168c57478f4f37c234eac4087c2d69a6445 /src/util
parent: 0bc099f8f8b6541a962e604b7ac25445a2a9252a (diff)
download: markdown-rs-fd860a975b84da9a79abfa247787e6adbd5ea34c.tar.gz
markdown-rs-fd860a975b84da9a79abfa247787e6adbd5ea34c.tar.bz2
markdown-rs-fd860a975b84da9a79abfa247787e6adbd5ea34c.zip
5 files changed, 161 insertions, 64 deletions
diff --git a/src/util/codes.rs b/src/util/codes.rs
new file mode 100644
index 0000000..8a46d02
--- /dev/null
+++ b/src/util/codes.rs
@@ -0,0 +1,126 @@
+//! Utilities to deal with character codes.
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::Code;
+
+/// Turn a string into codes.
+///
+/// Leading byte order marks are dropped, CRLF becomes one code, and tabs get virtual spaces.
+pub fn parse(value: &str) -> Vec<Code> {
+    let mut result: Vec<Code> = Vec::new();
+    // Whether nothing other than byte order marks was seen so far.
+    let mut before_content = true;
+    // Whether a `\r` is held back until the next character is known.
+    let mut pending_cr = false;
+    // One-indexed column of the character that comes next.
+    let mut column = 1;
+
+    for current in value.chars() {
+        if before_content {
+            if current == '\u{feff}' {
+                continue;
+            }
+            before_content = false;
+        }
+
+        if pending_cr {
+            pending_cr = false;
+            if current == '\n' {
+                // The held back CR and this LF are sent as a single code.
+                result.push(Code::CarriageReturnLineFeed);
+                continue;
+            }
+            // No LF follows: send the held back CR on its own.
+            result.push(Code::Char('\r'));
+        }
+
+        match current {
+            // Hold the CR back: whether it is part of a CRLF is not known yet.
+            '\r' => {
+                column = 1;
+                pending_cr = true;
+            }
+            // An LF starts a new line.
+            '\n' => {
+                column = 1;
+                result.push(Code::Char(current));
+            }
+            // A tab is padded with virtual spaces up to a multiple of `TAB_SIZE` columns.
+            '\t' => {
+                let remainder = column % TAB_SIZE;
+                let fill = if remainder == 0 {
+                    0
+                } else {
+                    TAB_SIZE - remainder
+                };
+                result.push(Code::Char(current));
+                for _ in 0..fill {
+                    result.push(Code::VirtualSpace);
+                }
+                column += 1 + fill;
+            }
+            // NUL is swapped for the replacement character.
+            '\0' => {
+                column += 1;
+                result.push(Code::Char('�'));
+            }
+            // Anything else is sent as is.
+            other => {
+                column += 1;
+                result.push(Code::Char(other));
+            }
+        }
+    }
+
+    // The input ended on a CR that was still held back.
+    if pending_cr {
+        result.push(Code::Char('\r'));
+    }
+
+    result
+}
+
+/// Serialize codes, optionally expanding tabs.
+///
+/// With `expand_tabs`, a tab and each virtual space become one space.
+/// Without it, the tab is kept and the virtual spaces right after it are dropped.
+///
+/// ## Panics
+///
+/// Panics on a `Code::None`, which is not expected in `codes`.
+pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
+    let mut result = String::new();
+    // Whether a tab came last, ignoring virtual spaces that were dropped after it.
+    let mut after_tab = false;
+
+    for code in codes {
+        after_tab = match *code {
+            // A CRLF code stands for two characters.
+            Code::CarriageReturnLineFeed => {
+                result.push_str("\r\n");
+                false
+            }
+            // Remember the tab: the virtual spaces after it depend on it.
+            Code::Char('\t') => {
+                result.push(if expand_tabs { ' ' } else { '\t' });
+                true
+            }
+            // Line endings and all other characters are sent as is.
+            Code::Char(char) => {
+                result.push(char);
+                false
+            }
+            // Dropped, so that a kept tab is not followed by its own padding.
+            Code::VirtualSpace if after_tab && !expand_tabs => true,
+            // Any other virtual space is sent as a regular space.
+            Code::VirtualSpace => {
+                result.push(' ');
+                false
+            }
+            // The EOF code is not expected here.
+            Code::None => unreachable!("unexpected EOF code in codes"),
+        };
+    }
+
+    result
+}
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 5762c22..a3bd589 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -21,11 +21,36 @@
 ///
 /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
 pub fn encode(value: &str) -> String {
-    // To do: replacing 4 times might just be slow.
-    // Perhaps we can walk the chars.
-    value
-        .replace('&', "&amp;")
-        .replace('"', "&quot;")
-        .replace('<', "&lt;")
-        .replace('>', "&gt;")
+    // Only single ASCII bytes are replaced. Such bytes never occur inside a
+    // multi-byte UTF-8 sequence, so slicing `value` around them is always valid.
+    let mut result = String::with_capacity(value.len());
+    // Byte offset of the first byte that is not yet copied to `result`.
+    let mut start = 0;
+
+    for (index, byte) in value.bytes().enumerate() {
+        let replacement = match byte {
+            b'&' => "&amp;",
+            b'"' => "&quot;",
+            b'<' => "&lt;",
+            b'>' => "&gt;",
+            // Safe bytes are copied later, as part of a larger slice.
+            _ => continue,
+        };
+
+        // Copy the safe bytes before this one.
+        if start < index {
+            result.push_str(&value[start..index]);
+        }
+
+        result.push_str(replacement);
+        // Continue after the replaced byte.
+        start = index + 1;
+    }
+
+    // Copy the safe bytes after the last replacement (all of them if there was none).
+    if start < value.len() {
+        result.push_str(&value[start..]);
+    }
+
+    result
 }
diff --git a/src/util/mod.rs b/src/util/mod.rs
index 68ef275..d1a0e01 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,5 +1,6 @@
 //! Utilities used when compiling markdown.
 
+pub mod codes;
 pub mod decode_character_reference;
 pub mod edit_map;
 pub mod encode;
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index 4753f7b..123a3a9 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -39,7 +39,7 @@ pub fn normalize_identifier(value: &str) -> String {
     // Collapse markdown whitespace and trim it.
     for char in value.chars() {
         match char {
-            '\t' | '\r' | '\n' | ' ' => {
+            '\t' | '\n' | '\r' | ' ' => {
                 at_whitespace = true;
             }
             _ => {
diff --git a/src/util/span.rs b/src/util/span.rs
index 02811cc..32dd00f 100644
--- a/src/util/span.rs
+++ b/src/util/span.rs
@@ -1,20 +1,15 @@
 //! Utilities to deal with semantic labels.
 
 use crate::tokenizer::{Code, Event, EventType};
+use crate::util::codes::serialize as serialize_codes;
 
 /// A struct representing the span of an opening and closing event of a token.
 #[derive(Debug)]
 pub struct Span {
-    // To do: probably needed in the future.
-    // start: Point,
     /// Absolute offset (and `index` in `codes`) of where this span starts.
     pub start_index: usize,
-    // To do: probably needed in the future.
-    // end: Point,
     /// Absolute offset (and `index` in `codes`) of where this span ends.
     pub end_index: usize,
-    // To do: probably needed in the future.
-    // token_type: TokenType,
 }
 
 /// Get a span from an event.
@@ -29,10 +24,8 @@ pub struct Span {
 /// When `micromark` is used, this function never panics.
 pub fn from_exit_event(events: &[Event], index: usize) -> Span {
     let exit = &events[index];
-    // let end = exit.point.clone();
     let end_index = exit.index;
     let token_type = exit.token_type.clone();
-    // To do: support `enter` events if needed and walk forwards?
     assert_eq!(
         exit.event_type,
         EventType::Exit,
@@ -44,11 +37,8 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span {
         let enter = &events[enter_index];
         if enter.event_type == EventType::Enter && enter.token_type == token_type {
             return Span {
-                // start: enter.point.clone(),
                 start_index: enter.index,
-                // end,
                 end_index,
-                // token_type,
             };
         }
 
@@ -65,48 +55,3 @@ pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {
 pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
     &codes[span.start_index..span.end_index]
 }
-
-/// Serialize a slice of codes, optionally expanding tabs.
-fn serialize_codes(codes: &[Code], expand_tabs: bool) -> String {
-    let mut at_tab = false;
-    let mut index = 0;
-    let mut value: Vec<char> = vec![];
-
-    while index < codes.len() {
-        let code = codes[index];
-        let mut at_tab_next = false;
-
-        match code {
-            Code::CarriageReturnLineFeed => {
-                value.push('\r');
-                value.push('\n');
-            }
-            Code::Char(char) if char == '\n' || char == '\r' => {
-                value.push(char);
-            }
-            Code::Char(char) if char == '\t' => {
-                at_tab_next = true;
-                value.push(if expand_tabs { ' ' } else { char });
-            }
-            Code::VirtualSpace => {
-                if !expand_tabs && at_tab {
-                    index += 1;
-                    continue;
-                }
-                value.push(' ');
-            }
-            Code::Char(char) => {
-                value.push(char);
-            }
-            Code::None => {
-                unreachable!("unexpected EOF code in codes");
-            }
-        }
-
-        at_tab = at_tab_next;
-
-        index += 1;
-    }
-
-    value.into_iter().collect()
-}
author	Titus Wormer <tituswormer@gmail.com>	2022-07-05 13:03:09 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-07-05 13:03:09 +0200
commit	fd860a975b84da9a79abfa247787e6adbd5ea34c (patch)
tree	bd9db168c57478f4f37c234eac4087c2d69a6445 /src/util
parent	0bc099f8f8b6541a962e604b7ac25445a2a9252a (diff)
download	markdown-rs-fd860a975b84da9a79abfa247787e6adbd5ea34c.tar.gz markdown-rs-fd860a975b84da9a79abfa247787e6adbd5ea34c.tar.bz2 markdown-rs-fd860a975b84da9a79abfa247787e6adbd5ea34c.zip