diff options
Diffstat (limited to '')
-rw-r--r-- | src/util/decode_character_reference.rs | 10 | ||||
-rw-r--r-- | src/util/edit_map.rs | 4 | ||||
-rw-r--r-- | src/util/encode.rs | 5 | ||||
-rw-r--r-- | src/util/mod.rs | 2 | ||||
-rw-r--r-- | src/util/normalize_identifier.rs | 30 | ||||
-rw-r--r-- | src/util/sanitize_uri.rs | 10 | ||||
-rw-r--r-- | src/util/skip.rs | 2 | ||||
-rw-r--r-- | src/util/slice.rs | 9 |
8 files changed, 39 insertions, 33 deletions
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs index f8fd18f..8ed32f4 100644 --- a/src/util/decode_character_reference.rs +++ b/src/util/decode_character_reference.rs @@ -1,4 +1,4 @@ -//! Utilities to decode character references. +//! Decode character references. use crate::constant::CHARACTER_REFERENCES; @@ -43,11 +43,11 @@ pub fn decode_named(value: &str) -> String { /// Decode numeric character references. /// /// Turn the number (in string form as either hexadecimal or decimal) coming -/// from a numeric character reference into a character. -/// Whether the base of the string form is `10` (decimal) or `16` (hexadecimal) -/// must be passed as the `radix` parameter. +/// from a numeric character reference into a string. +/// The base of the string form must be passed as the `radix` parameter, as +/// `10` (decimal) or `16` (hexadecimal). /// -/// This returns the `char` associated with that number or a replacement +/// This returns a `String` form of the associated character or a replacement /// character for C0 control characters (except for ASCII whitespace), C1 /// control characters, lone surrogates, noncharacters, and out of range /// characters. diff --git a/src/util/edit_map.rs b/src/util/edit_map.rs index 11ac486..33c5706 100644 --- a/src/util/edit_map.rs +++ b/src/util/edit_map.rs @@ -1,6 +1,6 @@ -//! Helpers to deal with several changes in events, batching them together. +//! Deal with several changes in events, batching them together. //! -//! Preferably, changes should be kept to a minumum. +//! Preferably, changes should be kept to a minimum. //! Sometimes, it’s needed to change the list of events, because parsing can be //! messy, and it helps to expose a cleaner interface of events to the compiler //! and other users. diff --git a/src/util/encode.rs b/src/util/encode.rs index d37a2de..6530011 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -1,10 +1,11 @@ -//! Utilities to encode HTML. +//! 
Encode HTML. /// Encode dangerous html characters. /// /// This ensures that certain characters which have special meaning in HTML are /// dealt with. -/// Technically, we can skip `>` and `"` in many cases, but CM includes them. +/// Technically, we can skip `>` and `"` in many cases, but `CommonMark` +/// includes them. /// /// This behavior is not explained in prose in `CommonMark` but can be inferred /// from the input/output test cases. diff --git a/src/util/mod.rs b/src/util/mod.rs index a01f31e..f51845c 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,4 +1,4 @@ -//! Utilities used when compiling markdown. +//! Utilities used when processing markdown. pub mod decode_character_reference; pub mod edit_map; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index f5b12d0..ddc51f8 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -1,17 +1,25 @@ -//! Utility to normalize identifiers. +//! Normalize identifiers. /// Normalize an identifier, as found in [references][label_end] and /// [definitions][definition], so it can be compared when matching. /// /// This collapsed whitespace found in markdown (`\t`, `\r`, `\n`, and ` `) -/// into one space, trims it (as in, dropping the first and last space), -/// and then performs unicode case folding twice: first by uppercasing -/// lowercase characters, and then lowercasing uppercase characters. +/// into one space, trims it (as in, dropping the first and last space), and +/// then performs unicode case folding twice: first by lowercasing uppercase +/// characters, and then uppercasing lowercase characters. /// /// Some characters are considered “uppercase”, such as U+03F4 (`ϴ`), but if /// their lowercase counterpart (U+03B8 (`θ`)) is uppercased will result in a /// different uppercase character (U+0398 (`Θ`)). -/// Hence, to get that form, we perform both upper- and lowercase. +/// Hence, to get that form, we perform both lower- and uppercase. 
+/// +/// Performing these steps in that order works, but the inverse does not work. +/// To illustrate, say the source markdown contains two identifiers +/// `SS` (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to +/// `ss` (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both +/// uppercase to `SS` (U+0053 U+0053). +/// If we’d invert the steps, for `ẞ`, we’d first uppercase without a +/// change, and then lowercase to `ß`, which would not match `ss`. /// /// ## Examples /// @@ -64,17 +72,5 @@ pub fn normalize_identifier(value: &str) -> String { result.push_str(&value[start..]); } - // Some characters are considered “uppercase”, but if their lowercase - // counterpart is uppercased will result in a different uppercase - // character. - // Hence, to get that form, we perform both lower- and uppercase. - // Performing these steps in that order works, but the inverse does not - // work. - // To illustrate, say the source markdown containes two identifiers `SS` - // (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to `ss` - // (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both uppercase - // to `SS` (U+0053 U+0053). - // If we’d inverse the steps, for `ẞ`, we’d first uppercase without a - // change, and then lowercase to `ß`, which would not match `ss`. result.to_lowercase().to_uppercase() } diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 051e1e1..593a70e 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -1,4 +1,4 @@ -//! Utilities to make urls safe. +//! Make urls safe. use crate::util::encode::encode; @@ -60,9 +60,10 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { value } -/// Normalize a URL (such as used in definitions). +/// Normalize a URL (such as used in [definitions][definition], +/// [references][label_end]). 
/// -/// Encode unsafe characters with percent-encoding, skipping already encoded +/// It encodes unsafe characters with percent-encoding, skipping already encoded /// sequences. /// /// ## Examples @@ -77,6 +78,9 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { /// ## References /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) +/// +/// [definition]: crate::construct::definition +/// [label_end]: crate::construct::label_end fn normalize_uri(value: &str) -> String { let chars = value.chars().collect::<Vec<_>>(); // Note: it’ll grow bigger for each non-ascii or non-safe character. diff --git a/src/util/skip.rs b/src/util/skip.rs index 46cbb4a..a7de408 100644 --- a/src/util/skip.rs +++ b/src/util/skip.rs @@ -1,4 +1,4 @@ -//! Utilities to deal with lists of events. +//! Move across lists of events. use crate::event::{Event, Kind, Name}; diff --git a/src/util/slice.rs b/src/util/slice.rs index e70078a..be2a381 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -1,4 +1,4 @@ -//! Utilities to deal with characters. +//! Deal with bytes. use crate::constant::TAB_SIZE; use crate::event::{Event, Kind, Point}; @@ -7,7 +7,9 @@ use std::str; /// A range between two points. #[derive(Debug)] pub struct Position<'a> { + /// Start point. pub start: &'a Point, + /// End point. pub end: &'a Point, } @@ -55,11 +57,14 @@ impl<'a> Position<'a> { /// Bytes belonging to a range. /// -/// Includes information on virtual spaces before and after the bytes. +/// Includes info on virtual spaces before and after the bytes. #[derive(Debug)] pub struct Slice<'a> { + /// Bytes. pub bytes: &'a [u8], + /// Number of virtual spaces before the bytes. pub before: usize, + /// Number of virtual spaces after the bytes. pub after: usize, } |