diff options
Diffstat (limited to 'src/util')
-rw-r--r-- | src/util/codes.rs | 126 | ||||
-rw-r--r-- | src/util/encode.rs | 39 | ||||
-rw-r--r-- | src/util/mod.rs | 1 | ||||
-rw-r--r-- | src/util/normalize_identifier.rs | 2 | ||||
-rw-r--r-- | src/util/span.rs | 57 |
5 files changed, 161 insertions, 64 deletions
diff --git a/src/util/codes.rs b/src/util/codes.rs new file mode 100644 index 0000000..8a46d02 --- /dev/null +++ b/src/util/codes.rs @@ -0,0 +1,126 @@ +//! Utilities to deal with character codes. + +use crate::constant::TAB_SIZE; +use crate::tokenizer::Code; + +/// Turn a string into codes. +pub fn parse(value: &str) -> Vec<Code> { + let mut codes: Vec<Code> = vec![]; + let mut at_start = true; + let mut at_carriage_return = false; + let mut column = 1; + + for char in value.chars() { + if at_start { + if char == '\u{feff}' { + // Ignore. + continue; + } + + at_start = false; + } + + // Send a CRLF. + if at_carriage_return && '\n' == char { + at_carriage_return = false; + codes.push(Code::CarriageReturnLineFeed); + } else { + // Send the previous CR: we’re not at a next `\n`. + if at_carriage_return { + at_carriage_return = false; + codes.push(Code::Char('\r')); + } + + match char { + // Send a replacement character. + '\0' => { + column += 1; + codes.push(Code::Char('�')); + } + // Send a tab and virtual spaces. + '\t' => { + let remainder = column % TAB_SIZE; + let mut virtual_spaces = if remainder == 0 { + 0 + } else { + TAB_SIZE - remainder + }; + codes.push(Code::Char(char)); + column += 1; + while virtual_spaces > 0 { + codes.push(Code::VirtualSpace); + column += 1; + virtual_spaces -= 1; + } + } + // Send an LF. + '\n' => { + column = 1; + codes.push(Code::Char(char)); + } + // Don’t send anything yet. + '\r' => { + column = 1; + at_carriage_return = true; + } + // Send the char. + _ => { + column += 1; + codes.push(Code::Char(char)); + } + } + }; + } + + // Send the last CR: we’re not at a next `\n`. + if at_carriage_return { + codes.push(Code::Char('\r')); + } + + codes +} + +/// Serialize codes, optionally expanding tabs. 
+pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { + let mut at_tab = false; + let mut index = 0; + let mut value: Vec<char> = vec![]; + + while index < codes.len() { + let code = codes[index]; + let mut at_tab_next = false; + + match code { + Code::CarriageReturnLineFeed => { + value.push('\r'); + value.push('\n'); + } + Code::Char(char) if char == '\n' || char == '\r' => { + value.push(char); + } + Code::Char(char) if char == '\t' => { + at_tab_next = true; + value.push(if expand_tabs { ' ' } else { char }); + } + Code::VirtualSpace => { + if !expand_tabs && at_tab { + index += 1; + continue; + } + value.push(' '); + } + Code::Char(char) => { + value.push(char); + } + Code::None => { + unreachable!("unexpected EOF code in codes"); + } + } + + at_tab = at_tab_next; + + index += 1; + } + + value.into_iter().collect() +} diff --git a/src/util/encode.rs b/src/util/encode.rs index 5762c22..a3bd589 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -21,11 +21,36 @@ /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) pub fn encode(value: &str) -> String { - // To do: replacing 4 times might just be slow. - // Perhaps we can walk the chars. 
- value - .replace('&', "&amp;") - .replace('"', "&quot;") - .replace('<', "&lt;") - .replace('>', "&gt;") + let mut result: Vec<&str> = vec![]; + let mut start = 0; + let mut index = 0; + + for byte in value.bytes() { + if let Some(replacement) = match byte { + b'&' => Some("&amp;"), + b'"' => Some("&quot;"), + b'<' => Some("&lt;"), + b'>' => Some("&gt;"), + _ => None, + } { + if start != index { + result.push(&value[start..index]); + } + + result.push(replacement); + start = index + 1; + } + + index += 1; + } + + if start == 0 { + value.to_string() + } else { + if start < index { + result.push(&value[start..index]); + } + + result.join("") + } } diff --git a/src/util/mod.rs b/src/util/mod.rs index 68ef275..d1a0e01 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@ //! Utilities used when compiling markdown. +pub mod codes; pub mod decode_character_reference; pub mod edit_map; pub mod encode; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index 4753f7b..123a3a9 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -39,7 +39,7 @@ pub fn normalize_identifier(value: &str) -> String { // Collapse markdown whitespace and trim it. for char in value.chars() { match char { - '\t' | '\r' | '\n' | ' ' => { + '\t' | '\n' | '\r' | ' ' => { at_whitespace = true; } _ => { diff --git a/src/util/span.rs b/src/util/span.rs index 02811cc..32dd00f 100644 --- a/src/util/span.rs +++ b/src/util/span.rs @@ -1,20 +1,15 @@ //! Utilities to deal with semantic labels. use crate::tokenizer::{Code, Event, EventType}; +use crate::util::codes::serialize as serialize_codes; /// A struct representing the span of an opening and closing event of a token. #[derive(Debug)] pub struct Span { - // To do: probably needed in the future. - // start: Point, /// Absolute offset (and `index` in `codes`) of where this span starts. pub start_index: usize, - // To do: probably needed in the future. 
- // end: Point, /// Absolute offset (and `index` in `codes`) of where this span ends. pub end_index: usize, - // To do: probably needed in the future. - // token_type: TokenType, } /// Get a span from an event. @@ -29,10 +24,8 @@ pub struct Span { /// When `micromark` is used, this function never panics. pub fn from_exit_event(events: &[Event], index: usize) -> Span { let exit = &events[index]; - // let end = exit.point.clone(); let end_index = exit.index; let token_type = exit.token_type.clone(); - // To do: support `enter` events if needed and walk forwards? assert_eq!( exit.event_type, EventType::Exit, @@ -44,11 +37,8 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span { let enter = &events[enter_index]; if enter.event_type == EventType::Enter && enter.token_type == token_type { return Span { - // start: enter.point.clone(), start_index: enter.index, - // end, end_index, - // token_type, }; } @@ -65,48 +55,3 @@ pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String { pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] { &codes[span.start_index..span.end_index] } - -/// Serialize a slice of codes, optionally expanding tabs. 
-fn serialize_codes(codes: &[Code], expand_tabs: bool) -> String { - let mut at_tab = false; - let mut index = 0; - let mut value: Vec<char> = vec![]; - - while index < codes.len() { - let code = codes[index]; - let mut at_tab_next = false; - - match code { - Code::CarriageReturnLineFeed => { - value.push('\r'); - value.push('\n'); - } - Code::Char(char) if char == '\n' || char == '\r' => { - value.push(char); - } - Code::Char(char) if char == '\t' => { - at_tab_next = true; - value.push(if expand_tabs { ' ' } else { char }); - } - Code::VirtualSpace => { - if !expand_tabs && at_tab { - index += 1; - continue; - } - value.push(' '); - } - Code::Char(char) => { - value.push(char); - } - Code::None => { - unreachable!("unexpected EOF code in codes"); - } - } - - at_tab = at_tab_next; - - index += 1; - } - - value.into_iter().collect() -} |