From f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 28 Jul 2022 16:48:00 +0200 Subject: Refactor to work on `char`s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, a custom char implementation was used. This was easier to work with, as sometimes “virtual” characters are injected, or characters are ignored. This replaces that with working on actual `char`s. In the hope of in the future working on `u8`s, even. This simplifies the state machine somewhat, as only `\n` is fed, regardless of whether it was a CRLF, CR, or LF. It also feeds `' '` instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event. --- src/util/codes.rs | 125 ------------------------------------- src/util/encode.rs | 12 +++- src/util/mod.rs | 3 +- src/util/sanitize_uri.rs | 2 +- src/util/slice.rs | 156 +++++++++++++++++++++++++++++++++++++++++++++++ src/util/span.rs | 57 ----------------- 6 files changed, 167 insertions(+), 188 deletions(-) delete mode 100644 src/util/codes.rs create mode 100644 src/util/slice.rs delete mode 100644 src/util/span.rs (limited to 'src/util') diff --git a/src/util/codes.rs b/src/util/codes.rs deleted file mode 100644 index 5006a00..0000000 --- a/src/util/codes.rs +++ /dev/null @@ -1,125 +0,0 @@ -//! Utilities to deal with character codes. - -use crate::constant::TAB_SIZE; -use crate::tokenizer::Code; - -/// Turn a string into codes. -pub fn parse(value: &str) -> Vec { - // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller - // with `Code::CarriageReturnLineFeed`. - let mut codes = Vec::with_capacity(value.len()); - let mut at_start = true; - let mut at_carriage_return = false; - let mut column = 1; - - for char in value.chars() { - if at_start { - at_start = false; - - if char == '\u{feff}' { - // Ignore. - continue; - } - } - - // Send a CRLF. - if at_carriage_return && '\n' == char { - at_carriage_return = false; - codes.push(Code::CarriageReturnLineFeed); - } else { - // Send the previous CR: we’re not at a next `\n`. - if at_carriage_return { - at_carriage_return = false; - codes.push(Code::Char('\r')); - } - - match char { - // Send a replacement character. - '\0' => { - column += 1; - codes.push(Code::Char(char::REPLACEMENT_CHARACTER)); - } - // Send a tab and virtual spaces. - '\t' => { - let remainder = column % TAB_SIZE; - let mut virtual_spaces = if remainder == 0 { - 0 - } else { - TAB_SIZE - remainder - }; - codes.push(Code::Char(char)); - column += 1; - while virtual_spaces > 0 { - codes.push(Code::VirtualSpace); - column += 1; - virtual_spaces -= 1; - } - } - // Send an LF. - '\n' => { - column = 1; - codes.push(Code::Char(char)); - } - // Don’t send anything yet. - '\r' => { - column = 1; - at_carriage_return = true; - } - // Send the char. - _ => { - column += 1; - codes.push(Code::Char(char)); - } - } - }; - } - - // Send the last CR: we’re not at a next `\n`. - if at_carriage_return { - codes.push(Code::Char('\r')); - } - - codes -} - -/// Serialize codes, optionally expanding tabs. -pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { - let mut at_tab = false; - // Note: It’ll grow a bit smaller with each - // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false, - // and bigger with `Code::CarriageReturnLineFeed`, - let mut value = String::with_capacity(codes.len()); - - for code in codes { - let mut at_tab_next = false; - - match code { - Code::CarriageReturnLineFeed => { - value.push_str("\r\n"); - } - Code::Char(char) if *char == '\n' || *char == '\r' => { - value.push(*char); - } - Code::Char(char) if *char == '\t' => { - at_tab_next = true; - value.push(if expand_tabs { ' ' } else { *char }); - } - Code::VirtualSpace => { - if !expand_tabs && at_tab { - continue; - } - value.push(' '); - } - Code::Char(char) => { - value.push(*char); - } - Code::None => { - unreachable!("unexpected EOF code in codes"); - } - } - - at_tab = at_tab_next; - } - - value -} diff --git a/src/util/encode.rs b/src/util/encode.rs index 965ea5c..91c5462 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -20,7 +20,8 @@ /// ## References /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) -pub fn encode>(value: S) -> String { +pub fn encode>(value: S, encode_html: bool) -> String { + let check = if encode_html { check_all } else { check_nil }; let mut value = value.into(); // It’ll grow a bit bigger for each dangerous character. @@ -31,6 +32,7 @@ pub fn encode>(value: S) -> String { let dangerous = value.pop().unwrap(); result.push_str(&value); result.push_str(match dangerous { + '\0' => "�", '&' => "&", '"' => """, '<' => "<", @@ -45,6 +47,10 @@ pub fn encode>(value: S) -> String { result } -fn check(char: char) -> bool { - matches!(char, '&' | '"' | '<' | '>') +fn check_all(char: char) -> bool { + matches!(char, '\0' | '&' | '"' | '<' | '>') +} + +fn check_nil(char: char) -> bool { + matches!(char, '\0') } diff --git a/src/util/mod.rs b/src/util/mod.rs index ae1add6..a01f31e 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,10 +1,9 @@ //! Utilities used when compiling markdown. -pub mod codes; pub mod decode_character_reference; pub mod edit_map; pub mod encode; pub mod normalize_identifier; pub mod sanitize_uri; pub mod skip; -pub mod span; +pub mod slice; diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 81450ae..8c09549 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -32,7 +32,7 @@ use crate::util::encode::encode; /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) pub fn sanitize_uri(value: &str, protocols: &Option>) -> String { - let value = encode(normalize_uri(value)); + let value = encode(normalize_uri(value), true); if let Some(protocols) = protocols { let end = value.find(|c| matches!(c, '?' | '#' | '/')); diff --git a/src/util/slice.rs b/src/util/slice.rs new file mode 100644 index 0000000..2134069 --- /dev/null +++ b/src/util/slice.rs @@ -0,0 +1,156 @@ +//! Utilities to deal with characters. + +use crate::constant::TAB_SIZE; +use crate::tokenizer::{Event, EventType, Point}; + +/// A range between two places. +#[derive(Debug)] +pub struct Position<'a> { + pub start: &'a Point, + pub end: &'a Point, +} + +impl<'a> Position<'a> { + /// Get a position from an exit event. + /// + /// Looks backwards for the corresponding `enter` event. + /// This does not support nested events (such as lists in lists). + /// + /// ## Panics + /// + /// This function panics if an enter event is given. + /// When `micromark` is used, this function never panics. + pub fn from_exit_event(events: &'a [Event], index: usize) -> Position<'a> { + let exit = &events[index]; + assert_eq!( + exit.event_type, + EventType::Exit, + "expected `from_exit_event` to be called on `exit` event" + ); + let mut enter_index = index - 1; + + loop { + let enter = &events[enter_index]; + if enter.event_type == EventType::Enter && enter.token_type == exit.token_type { + return Position { + start: &enter.point, + end: &exit.point, + }; + } + + enter_index -= 1; + } + } +} + +/// Chars belonging to a range. +/// +/// Includes information on virtual spaces before and after the chars. +#[derive(Debug)] +pub struct Slice<'a> { + pub chars: &'a [char], + pub before: usize, + pub after: usize, +} + +impl<'a> Slice<'a> { + /// Get the slice belonging to a position. + pub fn from_point(list: &'a [char], point: &Point) -> Slice<'a> { + let mut before = point.vs; + let mut start = point.index; + let end = if start < list.len() { start + 1 } else { start }; + + // If we have virtual spaces before, it means we are past the actual + // character at that index, and those virtual spaces. + if before > 0 { + before = TAB_SIZE - before; + start += 1; + }; + + Slice { + chars: if start < end { &list[start..end] } else { &[] }, + before, + after: 0, + } + } + + /// Get the slice belonging to a position. + pub fn from_position(list: &'a [char], position: &Position) -> Slice<'a> { + let mut before = position.start.vs; + let mut after = position.end.vs; + let mut start = position.start.index; + let mut end = position.end.index; + + // If we have virtual spaces before, it means we are past the actual + // character at that index, and those virtual spaces. + if before > 0 { + before = TAB_SIZE - before; + start += 1; + }; + + // If we have virtual spaces after, it means that character is included, + // and one less virtual space. + if after > 0 { + after -= 1; + end += 1; + } + + Slice { + chars: &list[start..end], + before, + after, + } + } + + /// To do. + pub fn size(&self) -> usize { + self.chars.len() + self.before + self.after + } + + // To do: + // When we have u8s, we could use: + // to implement an `as_str`. + + /// To do. + pub fn head(&self) -> Option { + if self.before > 0 { + Some(' ') + } else if self.chars.is_empty() { + None + } else { + Some(self.chars[0]) + } + } + + /// To do. + pub fn tail(&self) -> Option { + if self.after > 0 { + Some(' ') + } else { + let index = self.chars.len(); + if index > 0 { + Some(self.chars[index - 1]) + } else { + None + } + } + } + + /// To do. + pub fn serialize(&self) -> String { + let mut string = String::with_capacity(self.size()); + let mut index = self.before; + while index > 0 { + string.push(' '); + index -= 1; + } + string.push_str(&self.chars.iter().collect::()); + index = self.after; + while index > 0 { + string.push(' '); + index -= 1; + } + + string + } +} diff --git a/src/util/span.rs b/src/util/span.rs deleted file mode 100644 index ca25924..0000000 --- a/src/util/span.rs +++ /dev/null @@ -1,57 +0,0 @@ -//! Utilities to deal with semantic labels. - -use crate::tokenizer::{Code, Event, EventType}; -use crate::util::codes::serialize as serialize_codes; - -/// A struct representing the span of an opening and closing event of a token. -#[derive(Debug)] -pub struct Span { - /// Absolute offset (an `index` in `codes`) of where this span starts. - pub start_index: usize, - /// Absolute offset (an `index` in `codes`) of where this span ends. - pub end_index: usize, -} - -/// Get a span from an event. -/// -/// Get the span of an `exit` event, by looking backwards through the events to -/// find the corresponding `enter` event. -/// This assumes that tokens with the same are not nested. -/// -/// ## Panics -/// -/// This function panics if an enter event is given. -/// When `micromark` is used, this function never panics. -pub fn from_exit_event(events: &[Event], index: usize) -> Span { - let exit = &events[index]; - let end_index = exit.point.index; - let token_type = exit.token_type.clone(); - assert_eq!( - exit.event_type, - EventType::Exit, - "expected `from_exit_event` to be called on `exit` event" - ); - let mut enter_index = index - 1; - - loop { - let enter = &events[enter_index]; - if enter.event_type == EventType::Enter && enter.token_type == token_type { - return Span { - start_index: enter.point.index, - end_index, - }; - } - - enter_index -= 1; - } -} - -/// Serialize a span, optionally expanding tabs. -pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String { - serialize_codes(codes(all_codes, span), expand_tabs) -} - -/// Get a slice of codes from a span. -pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] { - &codes[span.start_index..span.end_index] -} -- cgit