aboutsummaryrefslogtreecommitdiffstats
path: root/src/util
diff options
context:
space:
mode:
author: Titus Wormer <tituswormer@gmail.com> 2022-07-28 16:48:00 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-07-28 16:48:00 +0200
commit: f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 (patch)
tree: c1ac3f22473bd79566d835b2474d2ae9e00d6c55 /src/util
parent: d729b07712ca9cc91e68af1776dac9d7008a90cb (diff)
download: markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.gz
markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.bz2
markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.zip
Refactor to work on `char`s
Previously, a custom char implementation was used. That was easier to work with, as “virtual” characters are sometimes injected and other characters are sometimes ignored. This commit replaces it with working on actual `char`s, in the hope of working on `u8`s in the future. This simplifies the state machine somewhat, as only `\n` is fed, regardless of whether the input contained a CRLF, CR, or LF. It also feeds `' '` instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event.
Diffstat (limited to 'src/util')
-rw-r--r-- src/util/codes.rs 125
-rw-r--r-- src/util/encode.rs 12
-rw-r--r-- src/util/mod.rs 3
-rw-r--r-- src/util/sanitize_uri.rs 2
-rw-r--r-- src/util/slice.rs 156
-rw-r--r-- src/util/span.rs 57
6 files changed, 167 insertions, 188 deletions
diff --git a/src/util/codes.rs b/src/util/codes.rs
deleted file mode 100644
index 5006a00..0000000
--- a/src/util/codes.rs
+++ /dev/null
@@ -1,125 +0,0 @@
-//! Utilities to deal with character codes.
-
-use crate::constant::TAB_SIZE;
-use crate::tokenizer::Code;
-
-/// Turn a string into codes.
-pub fn parse(value: &str) -> Vec<Code> {
- // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller
- // with `Code::CarriageReturnLineFeed`.
- let mut codes = Vec::with_capacity(value.len());
- let mut at_start = true;
- let mut at_carriage_return = false;
- let mut column = 1;
-
- for char in value.chars() {
- if at_start {
- at_start = false;
-
- if char == '\u{feff}' {
- // Ignore.
- continue;
- }
- }
-
- // Send a CRLF.
- if at_carriage_return && '\n' == char {
- at_carriage_return = false;
- codes.push(Code::CarriageReturnLineFeed);
- } else {
- // Send the previous CR: we’re not at a next `\n`.
- if at_carriage_return {
- at_carriage_return = false;
- codes.push(Code::Char('\r'));
- }
-
- match char {
- // Send a replacement character.
- '\0' => {
- column += 1;
- codes.push(Code::Char(char::REPLACEMENT_CHARACTER));
- }
- // Send a tab and virtual spaces.
- '\t' => {
- let remainder = column % TAB_SIZE;
- let mut virtual_spaces = if remainder == 0 {
- 0
- } else {
- TAB_SIZE - remainder
- };
- codes.push(Code::Char(char));
- column += 1;
- while virtual_spaces > 0 {
- codes.push(Code::VirtualSpace);
- column += 1;
- virtual_spaces -= 1;
- }
- }
- // Send an LF.
- '\n' => {
- column = 1;
- codes.push(Code::Char(char));
- }
- // Don’t send anything yet.
- '\r' => {
- column = 1;
- at_carriage_return = true;
- }
- // Send the char.
- _ => {
- column += 1;
- codes.push(Code::Char(char));
- }
- }
- };
- }
-
- // Send the last CR: we’re not at a next `\n`.
- if at_carriage_return {
- codes.push(Code::Char('\r'));
- }
-
- codes
-}
-
-/// Serialize codes, optionally expanding tabs.
-pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
- let mut at_tab = false;
- // Note: It’ll grow a bit smaller with each
- // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false,
- // and bigger with `Code::CarriageReturnLineFeed`,
- let mut value = String::with_capacity(codes.len());
-
- for code in codes {
- let mut at_tab_next = false;
-
- match code {
- Code::CarriageReturnLineFeed => {
- value.push_str("\r\n");
- }
- Code::Char(char) if *char == '\n' || *char == '\r' => {
- value.push(*char);
- }
- Code::Char(char) if *char == '\t' => {
- at_tab_next = true;
- value.push(if expand_tabs { ' ' } else { *char });
- }
- Code::VirtualSpace => {
- if !expand_tabs && at_tab {
- continue;
- }
- value.push(' ');
- }
- Code::Char(char) => {
- value.push(*char);
- }
- Code::None => {
- unreachable!("unexpected EOF code in codes");
- }
- }
-
- at_tab = at_tab_next;
- }
-
- value
-}
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 965ea5c..91c5462 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -20,7 +20,8 @@
/// ## References
///
/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
-pub fn encode<S: Into<String>>(value: S) -> String {
+pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String {
+ let check = if encode_html { check_all } else { check_nil };
let mut value = value.into();
// It’ll grow a bit bigger for each dangerous character.
@@ -31,6 +32,7 @@ pub fn encode<S: Into<String>>(value: S) -> String {
let dangerous = value.pop().unwrap();
result.push_str(&value);
result.push_str(match dangerous {
+ '\0' => "�",
'&' => "&amp;",
'"' => "&quot;",
'<' => "&lt;",
@@ -45,6 +47,10 @@ pub fn encode<S: Into<String>>(value: S) -> String {
result
}
-fn check(char: char) -> bool {
- matches!(char, '&' | '"' | '<' | '>')
+fn check_all(char: char) -> bool {
+ matches!(char, '\0' | '&' | '"' | '<' | '>')
+}
+
+fn check_nil(char: char) -> bool {
+ matches!(char, '\0')
}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index ae1add6..a01f31e 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,10 +1,9 @@
//! Utilities used when compiling markdown.
-pub mod codes;
pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
pub mod normalize_identifier;
pub mod sanitize_uri;
pub mod skip;
-pub mod span;
+pub mod slice;
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 81450ae..8c09549 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -32,7 +32,7 @@ use crate::util::encode::encode;
///
/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
- let value = encode(normalize_uri(value));
+ let value = encode(normalize_uri(value), true);
if let Some(protocols) = protocols {
let end = value.find(|c| matches!(c, '?' | '#' | '/'));
diff --git a/src/util/slice.rs b/src/util/slice.rs
new file mode 100644
index 0000000..2134069
--- /dev/null
+++ b/src/util/slice.rs
@@ -0,0 +1,156 @@
+//! Utilities to deal with characters.
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::{Event, EventType, Point};
+
+/// A range between two places.
+#[derive(Debug)]
+pub struct Position<'a> {
+ pub start: &'a Point,
+ pub end: &'a Point,
+}
+
+impl<'a> Position<'a> {
+ /// Get a position from an exit event.
+ ///
+ /// Looks backwards for the corresponding `enter` event.
+ /// This does not support nested events (such as lists in lists).
+ ///
+ /// ## Panics
+ ///
+ /// This function panics if an enter event is given.
+ /// When `micromark` is used, this function never panics.
+ pub fn from_exit_event(events: &'a [Event], index: usize) -> Position<'a> {
+ let exit = &events[index];
+ assert_eq!(
+ exit.event_type,
+ EventType::Exit,
+ "expected `from_exit_event` to be called on `exit` event"
+ );
+ let mut enter_index = index - 1;
+
+ loop {
+ let enter = &events[enter_index];
+ if enter.event_type == EventType::Enter && enter.token_type == exit.token_type {
+ return Position {
+ start: &enter.point,
+ end: &exit.point,
+ };
+ }
+
+ enter_index -= 1;
+ }
+ }
+}
+
+/// Chars belonging to a range.
+///
+/// Includes information on virtual spaces before and after the chars.
+#[derive(Debug)]
+pub struct Slice<'a> {
+ pub chars: &'a [char],
+ pub before: usize,
+ pub after: usize,
+}
+
+impl<'a> Slice<'a> {
+ /// Get the slice belonging to a position.
+ pub fn from_point(list: &'a [char], point: &Point) -> Slice<'a> {
+ let mut before = point.vs;
+ let mut start = point.index;
+ let end = if start < list.len() { start + 1 } else { start };
+
+ // If we have virtual spaces before, it means we are past the actual
+ // character at that index, and those virtual spaces.
+ if before > 0 {
+ before = TAB_SIZE - before;
+ start += 1;
+ };
+
+ Slice {
+ chars: if start < end { &list[start..end] } else { &[] },
+ before,
+ after: 0,
+ }
+ }
+
+ /// Get the slice belonging to a position.
+ pub fn from_position(list: &'a [char], position: &Position) -> Slice<'a> {
+ let mut before = position.start.vs;
+ let mut after = position.end.vs;
+ let mut start = position.start.index;
+ let mut end = position.end.index;
+
+ // If we have virtual spaces before, it means we are past the actual
+ // character at that index, and those virtual spaces.
+ if before > 0 {
+ before = TAB_SIZE - before;
+ start += 1;
+ };
+
+ // If we have virtual spaces after, it means that character is included,
+ // and one less virtual space.
+ if after > 0 {
+ after -= 1;
+ end += 1;
+ }
+
+ Slice {
+ chars: &list[start..end],
+ before,
+ after,
+ }
+ }
+
+ /// To do.
+ pub fn size(&self) -> usize {
+ self.chars.len() + self.before + self.after
+ }
+
+ // To do:
+ // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html>
+ // to implement an `as_str`.
+
+ /// To do.
+ pub fn head(&self) -> Option<char> {
+ if self.before > 0 {
+ Some(' ')
+ } else if self.chars.is_empty() {
+ None
+ } else {
+ Some(self.chars[0])
+ }
+ }
+
+ /// To do.
+ pub fn tail(&self) -> Option<char> {
+ if self.after > 0 {
+ Some(' ')
+ } else {
+ let index = self.chars.len();
+ if index > 0 {
+ Some(self.chars[index - 1])
+ } else {
+ None
+ }
+ }
+ }
+
+ /// To do.
+ pub fn serialize(&self) -> String {
+ let mut string = String::with_capacity(self.size());
+ let mut index = self.before;
+ while index > 0 {
+ string.push(' ');
+ index -= 1;
+ }
+ string.push_str(&self.chars.iter().collect::<String>());
+ index = self.after;
+ while index > 0 {
+ string.push(' ');
+ index -= 1;
+ }
+
+ string
+ }
+}
diff --git a/src/util/span.rs b/src/util/span.rs
deleted file mode 100644
index ca25924..0000000
--- a/src/util/span.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-//! Utilities to deal with semantic labels.
-
-use crate::tokenizer::{Code, Event, EventType};
-use crate::util::codes::serialize as serialize_codes;
-
-/// A struct representing the span of an opening and closing event of a token.
-#[derive(Debug)]
-pub struct Span {
- /// Absolute offset (an `index` in `codes`) of where this span starts.
- pub start_index: usize,
- /// Absolute offset (an `index` in `codes`) of where this span ends.
- pub end_index: usize,
-}
-
-/// Get a span from an event.
-///
-/// Get the span of an `exit` event, by looking backwards through the events to
-/// find the corresponding `enter` event.
-/// This assumes that tokens with the same are not nested.
-///
-/// ## Panics
-///
-/// This function panics if an enter event is given.
-/// When `micromark` is used, this function never panics.
-pub fn from_exit_event(events: &[Event], index: usize) -> Span {
- let exit = &events[index];
- let end_index = exit.point.index;
- let token_type = exit.token_type.clone();
- assert_eq!(
- exit.event_type,
- EventType::Exit,
- "expected `from_exit_event` to be called on `exit` event"
- );
- let mut enter_index = index - 1;
-
- loop {
- let enter = &events[enter_index];
- if enter.event_type == EventType::Enter && enter.token_type == token_type {
- return Span {
- start_index: enter.point.index,
- end_index,
- };
- }
-
- enter_index -= 1;
- }
-}
-
-/// Serialize a span, optionally expanding tabs.
-pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {
- serialize_codes(codes(all_codes, span), expand_tabs)
-}
-
-/// Get a slice of codes from a span.
-pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
- &codes[span.start_index..span.end_index]
-}