From 5416b61f89155b475fa5674898b8ff963aa443b5 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 22 Jun 2022 17:51:14 +0200 Subject: Add docs on normalizing identifiers, matching --- src/construct/definition.rs | 44 +++++++++++++++++++++++++++++++++------- src/util/normalize_identifier.rs | 35 ++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 3291f7f..48d1192 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -31,16 +31,46 @@ //! ``` //! //! Definitions in markdown do not, on their own, relate to anything in HTML. -//! When connected with a link (reference), they together relate to the `<a>` +//! When matched with a link (reference), they together relate to the `<a>` //! element in HTML. //! The definition forms its `href`, and optionally `title`, attributes. -//! See [*§ 4.5.1 The `a` element*][html] in the HTML spec for more info. +//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info. +//! Definitions can also match with an image (reference), in which case they +//! form an `<img>` element, where the definition contributes the `src`, and +//! optionally `title`, attributes. +//! See [*§ 4.8.3 The `img` element*][html-img] in the HTML spec for more info. //! //! The `label`, `destination`, and `title` parts are interpreted as the //! [string][] content type. //! That means that [character escapes][character_escape] and //! [character references][character_reference] are allowed. //! +//! Definitions match to references through their label. +//! To match, both labels must be equal after normalizing with +//! [`normalize_identifier`][normalize_identifier]. +//! One definition can match multiple references. +//! Multiple definitions with the same, normalized, identifier are ignored: the +//! first definition is preferred. +//! To illustrate, the definition with a destination of `x` wins: +//! +//! 
```markdown +//! [a]: x +//! [a]: y +//! +//! [a] +//! ``` +//! +//! Importantly, while labels *can* include [string][] content (character +//! escapes and character references), these are not considered when matching. +//! To illustrate, neither definition matches the reference: +//! +//! ```markdown +//! [a&b]: x +//! [a\&b]: y +//! +//! [a&b] +//! ``` +//! //! For info on how to encode characters in URLs, see //! [`partial_destination`][destination]. //! For info on how to characters are encoded as `href` on `<a>` or `src` on @@ -75,12 +105,12 @@ //! [character_escape]: crate::construct::character_escape //! [character_reference]: crate::construct::character_reference //! [destination]: crate::construct::partial_destination -//! [sanitize_uri]: crate::util::sanitize_uri -//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element -//! -//! +//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri +//! [normalize_identifier]: crate::util::normalize_identifier +//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element +//! [html-img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element //! -//! +//! use crate::construct::{ partial_destination::{start as destination, Options as DestinationOptions}, diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index 870fd33..c287e1a 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -1,6 +1,37 @@ -//! To do. +//! Utility to normalize identifiers. -/// To do. +/// Normalize an identifier, as found in references and +/// [definitions][definition], so it can be compared when matching. 
+/// +/// This collapses whitespace found in markdown (`\t`, `\r`, `\n`, and ` `) +/// into one space, trims it (as in, dropping the first and last space), +/// and then performs unicode case folding twice: first by uppercasing +/// lowercase characters, and then lowercasing uppercase characters. +/// +/// Some characters are considered “uppercase”, such as U+03F4 (`ϴ`), but if +/// their lowercase counterpart (U+03B8 (`θ`)) is uppercased, the result is a +/// different uppercase character (U+0398 (`Θ`)). +/// Hence, to get that form, we perform both upper- and lowercasing. +/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::normalize_identifier::normalize_identifier; +/// +/// assert_eq!(normalize_identifier(" a "), "a"); +/// assert_eq!(normalize_identifier("a\t\r\nb"), "a b"); +/// assert_eq!(normalize_identifier("ПРИВЕТ"), "привет"); +/// assert_eq!(normalize_identifier("Привет"), "привет"); +/// assert_eq!(normalize_identifier("привет"), "привет"); +/// ``` +/// +/// ## References +/// +/// * [`micromark-util-normalize-identifier` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-normalize-identifier) +/// +/// [definition]: crate::construct::definition +/// +/// pub fn normalize_identifier(value: &str) -> String { let mut codes = vec![]; let mut at_start = true; -- cgit