about summary refs log tree commit diff stats
path: root/src/util
diff options
context:
space:
mode:
Diffstat (limited to 'src/util')
-rw-r--r-- src/util/codes.rs 126
-rw-r--r-- src/util/encode.rs 39
-rw-r--r-- src/util/mod.rs 1
-rw-r--r-- src/util/normalize_identifier.rs 2
-rw-r--r-- src/util/span.rs 57
5 files changed, 161 insertions, 64 deletions
diff --git a/src/util/codes.rs b/src/util/codes.rs
new file mode 100644
index 0000000..8a46d02
--- /dev/null
+++ b/src/util/codes.rs
@@ -0,0 +1,126 @@
+//! Utilities to deal with character codes.
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::Code;
+
+/// Turn a string into codes (drops one leading BOM, joins CRLF, pads tabs with virtual spaces).
+pub fn parse(value: &str) -> Vec<Code> {
+    let mut codes: Vec<Code> = vec![];
+    let mut at_start = true;
+    let mut at_carriage_return = false; // A `\r` was seen but has not been sent yet.
+    let mut column = 1; // 1-indexed; only used to find the next tab stop.
+
+    for char in value.chars() {
+        if at_start {
+            // Clear the flag first: only a BOM in the very first position is skipped.
+            at_start = false;
+
+            if char == '\u{feff}' {
+                continue;
+            }
+        }
+
+        // Send a CRLF.
+        if at_carriage_return && '\n' == char {
+            at_carriage_return = false;
+            codes.push(Code::CarriageReturnLineFeed);
+        } else {
+            // Send the previous CR: we’re not at a next `\n`.
+            if at_carriage_return {
+                at_carriage_return = false;
+                codes.push(Code::Char('\r'));
+            }
+
+            match char {
+                // Send a replacement character.
+                '\0' => {
+                    column += 1;
+                    codes.push(Code::Char('�'));
+                }
+                // Send a tab and virtual spaces.
+                '\t' => {
+                    let remainder = column % TAB_SIZE; // 0: the tab itself fills the last cell.
+                    let mut virtual_spaces = if remainder == 0 {
+                        0
+                    } else {
+                        TAB_SIZE - remainder
+                    };
+                    codes.push(Code::Char(char));
+                    column += 1;
+                    while virtual_spaces > 0 {
+                        codes.push(Code::VirtualSpace);
+                        column += 1;
+                        virtual_spaces -= 1;
+                    }
+                }
+                // Send an LF.
+                '\n' => {
+                    column = 1;
+                    codes.push(Code::Char(char));
+                }
+                // Don’t send anything yet: the next char decides between CR and CRLF.
+                '\r' => {
+                    column = 1;
+                    at_carriage_return = true;
+                }
+                // Send the char.
+                _ => {
+                    column += 1;
+                    codes.push(Code::Char(char));
+                }
+            }
+        };
+    }
+
+    // Send the last CR: we’re not at a next `\n`.
+    if at_carriage_return {
+        codes.push(Code::Char('\r'));
+    }
+
+    codes
+}
+
+/// Serialize codes into a string; with `expand_tabs`, tabs and their virtual spaces become spaces.
+pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
+ let mut at_tab = false; // Set right after a tab, and kept while its virtual spaces are dropped.
+ let mut index = 0;
+ let mut value: Vec<char> = vec![];
+
+ while index < codes.len() {
+ let code = codes[index];
+ let mut at_tab_next = false;
+
+ match code {
+ Code::CarriageReturnLineFeed => {
+ value.push('\r');
+ value.push('\n');
+ }
+ Code::Char(char) if char == '\n' || char == '\r' => {
+ value.push(char);
+ }
+ Code::Char(char) if char == '\t' => {
+ at_tab_next = true;
+ value.push(if expand_tabs { ' ' } else { char });
+ }
+ Code::VirtualSpace => {
+ if !expand_tabs && at_tab { // Padding of a kept tab: drop it and leave `at_tab` set.
+ index += 1;
+ continue;
+ }
+ value.push(' '); // Expanding tabs, or a virtual space that does not follow a tab.
+ }
+ Code::Char(char) => {
+ value.push(char);
+ }
+ Code::None => {
+ unreachable!("unexpected EOF code in codes"); // Panics if `codes` contains `Code::None`.
+ }
+ }
+
+ at_tab = at_tab_next;
+
+ index += 1;
+ }
+
+ value.into_iter().collect()
+}
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 5762c22..a3bd589 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -21,11 +21,36 @@
///
/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
pub fn encode(value: &str) -> String {
- // To do: replacing 4 times might just be slow.
- // Perhaps we can walk the chars.
- value
- .replace('&', "&amp;")
- .replace('"', "&quot;")
- .replace('<', "&lt;")
- .replace('>', "&gt;")
+ let mut result: Vec<&str> = vec![]; // Untouched slices of `value` interleaved with replacements.
+ let mut start = 0; // Byte offset just past the most recently replaced byte.
+ let mut index = 0; // Byte offset of the byte currently being inspected.
+
+ for byte in value.bytes() {
+ if let Some(replacement) = match byte { // All four are ASCII, so slicing at `index` stays on a char boundary.
+ b'&' => Some("&amp;"),
+ b'"' => Some("&quot;"),
+ b'<' => Some("&lt;"),
+ b'>' => Some("&gt;"),
+ _ => None,
+ } {
+ if start != index { // Keep the unescaped text between the previous replacement and this one.
+ result.push(&value[start..index]);
+ }
+
+ result.push(replacement);
+ start = index + 1;
+ }
+
+ index += 1;
+ }
+
+ if start == 0 { // `start` only moves on a replacement (to at least 1), so nothing was replaced.
+ value.to_string()
+ } else {
+ if start < index { // Keep the unescaped tail after the last replacement.
+ result.push(&value[start..index]);
+ }
+
+ result.join("")
+ }
}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index 68ef275..d1a0e01 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,5 +1,6 @@
//! Utilities used when compiling markdown.
+pub mod codes;
pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index 4753f7b..123a3a9 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -39,7 +39,7 @@ pub fn normalize_identifier(value: &str) -> String {
// Collapse markdown whitespace and trim it.
for char in value.chars() {
match char {
- '\t' | '\r' | '\n' | ' ' => {
+ '\t' | '\n' | '\r' | ' ' => {
at_whitespace = true;
}
_ => {
diff --git a/src/util/span.rs b/src/util/span.rs
index 02811cc..32dd00f 100644
--- a/src/util/span.rs
+++ b/src/util/span.rs
@@ -1,20 +1,15 @@
//! Utilities to deal with semantic labels.
use crate::tokenizer::{Code, Event, EventType};
+use crate::util::codes::serialize as serialize_codes;
/// A struct representing the span of an opening and closing event of a token.
#[derive(Debug)]
pub struct Span {
- // To do: probably needed in the future.
- // start: Point,
/// Absolute offset (and `index` in `codes`) of where this span starts.
pub start_index: usize,
- // To do: probably needed in the future.
- // end: Point,
/// Absolute offset (and `index` in `codes`) of where this span ends.
pub end_index: usize,
- // To do: probably needed in the future.
- // token_type: TokenType,
}
/// Get a span from an event.
@@ -29,10 +24,8 @@ pub struct Span {
/// When `micromark` is used, this function never panics.
pub fn from_exit_event(events: &[Event], index: usize) -> Span {
let exit = &events[index];
- // let end = exit.point.clone();
let end_index = exit.index;
let token_type = exit.token_type.clone();
- // To do: support `enter` events if needed and walk forwards?
assert_eq!(
exit.event_type,
EventType::Exit,
@@ -44,11 +37,8 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span {
let enter = &events[enter_index];
if enter.event_type == EventType::Enter && enter.token_type == token_type {
return Span {
- // start: enter.point.clone(),
start_index: enter.index,
- // end,
end_index,
- // token_type,
};
}
@@ -65,48 +55,3 @@ pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {
pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
&codes[span.start_index..span.end_index]
}
-
-/// Serialize a slice of codes, optionally expanding tabs.
-fn serialize_codes(codes: &[Code], expand_tabs: bool) -> String {
- let mut at_tab = false;
- let mut index = 0;
- let mut value: Vec<char> = vec![];
-
- while index < codes.len() {
- let code = codes[index];
- let mut at_tab_next = false;
-
- match code {
- Code::CarriageReturnLineFeed => {
- value.push('\r');
- value.push('\n');
- }
- Code::Char(char) if char == '\n' || char == '\r' => {
- value.push(char);
- }
- Code::Char(char) if char == '\t' => {
- at_tab_next = true;
- value.push(if expand_tabs { ' ' } else { char });
- }
- Code::VirtualSpace => {
- if !expand_tabs && at_tab {
- index += 1;
- continue;
- }
- value.push(' ');
- }
- Code::Char(char) => {
- value.push(char);
- }
- Code::None => {
- unreachable!("unexpected EOF code in codes");
- }
- }
-
- at_tab = at_tab_next;
-
- index += 1;
- }
-
- value.into_iter().collect()
-}