aboutsummaryrefslogtreecommitdiffstats
path: root/src/util
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-08-15 18:22:40 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-08-15 18:22:40 +0200
commit2379c2749916483be68dbf816a4c56cd59ced958 (patch)
tree5db8ea01782212b3f465d40f912ed87481012bbb /src/util
parent3aa45de9dc359169ccaabc07ffa986d72a010cd8 (diff)
downloadmarkdown-rs-2379c2749916483be68dbf816a4c56cd59ced958.tar.gz
markdown-rs-2379c2749916483be68dbf816a4c56cd59ced958.tar.bz2
markdown-rs-2379c2749916483be68dbf816a4c56cd59ced958.zip
Refactor to proof docs, grammars
Diffstat (limited to 'src/util')
-rw-r--r--src/util/decode_character_reference.rs10
-rw-r--r--src/util/edit_map.rs4
-rw-r--r--src/util/encode.rs5
-rw-r--r--src/util/mod.rs2
-rw-r--r--src/util/normalize_identifier.rs30
-rw-r--r--src/util/sanitize_uri.rs10
-rw-r--r--src/util/skip.rs2
-rw-r--r--src/util/slice.rs9
8 files changed, 39 insertions, 33 deletions
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs
index f8fd18f..8ed32f4 100644
--- a/src/util/decode_character_reference.rs
+++ b/src/util/decode_character_reference.rs
@@ -1,4 +1,4 @@
-//! Utilities to decode character references.
+//! Decode character references.
use crate::constant::CHARACTER_REFERENCES;
@@ -43,11 +43,11 @@ pub fn decode_named(value: &str) -> String {
/// Decode numeric character references.
///
/// Turn the number (in string form as either hexadecimal or decimal) coming
-/// from a numeric character reference into a character.
-/// Whether the base of the string form is `10` (decimal) or `16` (hexadecimal)
-/// must be passed as the `radix` parameter.
+/// from a numeric character reference into a string.
+/// The base of the string form must be passed as the `radix` parameter, as
+/// `10` (decimal) or `16` (hexadecimal).
///
-/// This returns the `char` associated with that number or a replacement
+/// This returns a `String` form of the associated character or a replacement
/// character for C0 control characters (except for ASCII whitespace), C1
/// control characters, lone surrogates, noncharacters, and out of range
/// characters.
diff --git a/src/util/edit_map.rs b/src/util/edit_map.rs
index 11ac486..33c5706 100644
--- a/src/util/edit_map.rs
+++ b/src/util/edit_map.rs
@@ -1,6 +1,6 @@
-//! Helpers to deal with several changes in events, batching them together.
+//! Deal with several changes in events, batching them together.
//!
-//! Preferably, changes should be kept to a minumum.
+//! Preferably, changes should be kept to a minimum.
//! Sometimes, it’s needed to change the list of events, because parsing can be
//! messy, and it helps to expose a cleaner interface of events to the compiler
//! and other users.
diff --git a/src/util/encode.rs b/src/util/encode.rs
index d37a2de..6530011 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -1,10 +1,11 @@
-//! Utilities to encode HTML.
+//! Encode HTML.
/// Encode dangerous html characters.
///
/// This ensures that certain characters which have special meaning in HTML are
/// dealt with.
-/// Technically, we can skip `>` and `"` in many cases, but CM includes them.
+/// Technically, we can skip `>` and `"` in many cases, but `CommonMark`
+/// includes them.
///
/// This behavior is not explained in prose in `CommonMark` but can be inferred
/// from the input/output test cases.
diff --git a/src/util/mod.rs b/src/util/mod.rs
index a01f31e..f51845c 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,4 +1,4 @@
-//! Utilities used when compiling markdown.
+//! Utilities used when processing markdown.
pub mod decode_character_reference;
pub mod edit_map;
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index f5b12d0..ddc51f8 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -1,17 +1,25 @@
-//! Utility to normalize identifiers.
+//! Normalize identifiers.
/// Normalize an identifier, as found in [references][label_end] and
/// [definitions][definition], so it can be compared when matching.
///
/// This collapsed whitespace found in markdown (`\t`, `\r`, `\n`, and ` `)
-/// into one space, trims it (as in, dropping the first and last space),
-/// and then performs unicode case folding twice: first by uppercasing
-/// lowercase characters, and then lowercasing uppercase characters.
+/// into one space, trims it (as in, dropping the first and last space), and
+/// then performs unicode case folding twice: first by lowercasing uppercase
+/// characters, and then uppercasing lowercase characters.
///
/// Some characters are considered “uppercase”, such as U+03F4 (`ϴ`), but if
/// their lowercase counterpart (U+03B8 (`θ`)) is uppercased will result in a
/// different uppercase character (U+0398 (`Θ`)).
-/// Hence, to get that form, we perform both upper- and lowercase.
+/// Hence, to get that form, we perform both lower- and uppercase.
+///
+/// Performing these steps in that order works, but the inverse does not work.
+/// To illustrate, say the source markdown contains two identifiers
+/// `SS` (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to
+/// `ss` (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both
+/// uppercase to `SS` (U+0053 U+0053).
+/// If we’d invert the steps, for `ẞ`, we’d first uppercase without a
+/// change, and then lowercase to `ß`, which would not match `ss`.
///
/// ## Examples
///
@@ -64,17 +72,5 @@ pub fn normalize_identifier(value: &str) -> String {
result.push_str(&value[start..]);
}
- // Some characters are considered “uppercase”, but if their lowercase
- // counterpart is uppercased will result in a different uppercase
- // character.
- // Hence, to get that form, we perform both lower- and uppercase.
- // Performing these steps in that order works, but the inverse does not
- // work.
- // To illustrate, say the source markdown containes two identifiers `SS`
- // (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to `ss`
- // (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both uppercase
- // to `SS` (U+0053 U+0053).
- // If we’d inverse the steps, for `ẞ`, we’d first uppercase without a
- // change, and then lowercase to `ß`, which would not match `ss`.
result.to_lowercase().to_uppercase()
}
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 051e1e1..593a70e 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -1,4 +1,4 @@
-//! Utilities to make urls safe.
+//! Make urls safe.
use crate::util::encode::encode;
@@ -60,9 +60,10 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
value
}
-/// Normalize a URL (such as used in definitions).
+/// Normalize a URL (such as used in [definitions][definition] and
+/// [references][label_end]).
///
-/// Encode unsafe characters with percent-encoding, skipping already encoded
+/// It encodes unsafe characters with percent-encoding, skipping already encoded
/// sequences.
///
/// ## Examples
@@ -77,6 +78,9 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
/// ## References
///
/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
+///
+/// [definition]: crate::construct::definition
+/// [label_end]: crate::construct::label_end
fn normalize_uri(value: &str) -> String {
let chars = value.chars().collect::<Vec<_>>();
// Note: it’ll grow bigger for each non-ascii or non-safe character.
diff --git a/src/util/skip.rs b/src/util/skip.rs
index 46cbb4a..a7de408 100644
--- a/src/util/skip.rs
+++ b/src/util/skip.rs
@@ -1,4 +1,4 @@
-//! Utilities to deal with lists of events.
+//! Move across lists of events.
use crate::event::{Event, Kind, Name};
diff --git a/src/util/slice.rs b/src/util/slice.rs
index e70078a..be2a381 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -1,4 +1,4 @@
-//! Utilities to deal with characters.
+//! Deal with bytes.
use crate::constant::TAB_SIZE;
use crate::event::{Event, Kind, Point};
@@ -7,7 +7,9 @@ use std::str;
/// A range between two points.
#[derive(Debug)]
pub struct Position<'a> {
+ /// Start point.
pub start: &'a Point,
+ /// End point.
pub end: &'a Point,
}
@@ -55,11 +57,14 @@ impl<'a> Position<'a> {
/// Bytes belonging to a range.
///
-/// Includes information on virtual spaces before and after the bytes.
+/// Includes info on virtual spaces before and after the bytes.
#[derive(Debug)]
pub struct Slice<'a> {
+ /// Bytes.
pub bytes: &'a [u8],
+ /// Number of virtual spaces before the bytes.
pub before: usize,
+ /// Number of virtual spaces after the bytes.
pub after: usize,
}