Add docs on normalizing identifiers, matching

author: Titus Wormer <tituswormer@gmail.com> 2022-06-22 17:51:14 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-06-22 17:51:14 +0200
commit: 5416b61f89155b475fa5674898b8ff963aa443b5 (patch)
tree: f4709e5f350964f7735207cacd936dbb36f898d9
parent: 79c3275f91f1c0867a1bfba3085c0682aa5486ef (diff)
download: markdown-rs-5416b61f89155b475fa5674898b8ff963aa443b5.tar.gz
markdown-rs-5416b61f89155b475fa5674898b8ff963aa443b5.tar.bz2
markdown-rs-5416b61f89155b475fa5674898b8ff963aa443b5.zip
3 files changed, 72 insertions, 11 deletions
diff --git a/readme.md b/readme.md
index f2188ae..01a911b 100644
--- a/readme.md
+++ b/readme.md
@@ -68,8 +68,6 @@ cargo doc --document-private-items
 
 #### Docs
 
-- [ ] (1) Add docs to `normalize_identifier`
-- [ ] (1) Add docs for how references and definitions match (definition, reference)
 - [ ] (1) Go through all bnf
 - [ ] (1) Go through all docs
 - [ ] (1) Add module docs to parser
@@ -234,6 +232,8 @@ cargo doc --document-private-items
 - [x] (1) Add docs for tokenizer
 - [x] (1) Add docs for sanitation
 - [x] (1) Get definition identifiers (definition)
+- [x] (1) Add docs to `normalize_identifier`
+- [x] (1) Add docs for how references and definitions match
 
 ### Extensions
 
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 3291f7f..48d1192 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -31,16 +31,46 @@
 //! ```
 //!
 //! Definitions in markdown do not, on their own, relate to anything in HTML.
-//! When connected with a link (reference), they together relate to the `<a>`
+//! When matched with a link (reference), they together relate to the `<a>`
 //! element in HTML.
 //! The definition forms its `href`, and optionally `title`, attributes.
-//! See [*§ 4.5.1 The `a` element*][html] in the HTML spec for more info.
+//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info.
+//! Definitions can also match with image (reference), in which case they
+//! form an `<img>` element, where the definition contributes the `src`, and
+//! optionally `title`, attributes.
+//! See [*§ 4.8.3 The `img` element*][html-img] in the HTML spec for more info.
 //!
 //! The `label`, `destination`, and `title` parts are interpreted as the
 //! [string][] content type.
 //! That means that [character escapes][character_escape] and
 //! [character references][character_reference] are allowed.
 //!
+//! Definitions match to references through their label.
+//! To match, both labels must be equal after normalizing with
+//! [`normalize_identifier`][normalize_identifier].
+//! One definition can match to multiple references.
+//! When multiple definitions share the same normalized identifier, the later
+//! ones are ignored: the first definition is preferred.
+//! To illustrate, the definition with a destination of `x` wins:
+//!
+//! ```markdown
+//! [a]: x
+//! [a]: y
+//!
+//! [a]
+//! ```
+//!
+//! Importantly, while labels *can* include [string][] content (character
+//! escapes and character references), these are not considered when matching.
+//! To illustrate, neither definition matches the reference:
+//!
+//! ```markdown
+//! [a&amp;b]: x
+//! [a\&b]: y
+//!
+//! [a&b]
+//! ```
+//!
 //! For info on how to encode characters in URLs, see
 //! [`partial_destination`][destination].
 //! For info on how characters are encoded as `href` on `<a>` or `src` on
@@ -75,12 +105,12 @@
 //! [character_escape]: crate::construct::character_escape
 //! [character_reference]: crate::construct::character_reference
 //! [destination]: crate::construct::partial_destination
-//! [sanitize_uri]: crate::util::sanitize_uri
-//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
-//!
-//! <!-- To do: link link (reference) -->
+//! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri
+//! [normalize_identifier]: crate::util::normalize_identifier
+//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
+//! [html-img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element
 //!
-//! <!-- To do: describe how references and definitions match -->
+//! <!-- To do: link link/image (reference) -->
 
 use crate::construct::{
     partial_destination::{start as destination, Options as DestinationOptions},
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index 870fd33..c287e1a 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -1,6 +1,37 @@
-//! To do.
+//! Utility to normalize identifiers.
 
-/// To do.
+/// Normalize an identifier, as found in references and
+/// [definitions][definition], so it can be compared when matching.
+///
+/// This collapses whitespace found in markdown (`\t`, `\r`, `\n`, and ` `)
+/// into one space, trims it (as in, dropping the first and last space),
+/// and then performs unicode case folding twice: first by uppercasing
+/// lowercase characters, and then lowercasing uppercase characters.
+///
+/// Some characters are considered “uppercase”, such as U+03F4 (`ϴ`), but
+/// uppercasing their lowercase counterpart (U+03B8 (`θ`)) results in a
+/// different uppercase character (U+0398 (`Θ`)).
+/// Hence, to get that form, we perform both upper- and lowercase.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::normalize_identifier::normalize_identifier;
+///
+/// assert_eq!(normalize_identifier(" a "), "a");
+/// assert_eq!(normalize_identifier("a\t\r\nb"), "a b");
+/// assert_eq!(normalize_identifier("ПРИВЕТ"), "привет");
+/// assert_eq!(normalize_identifier("Привет"), "привет");
+/// assert_eq!(normalize_identifier("привет"), "привет");
+/// ```
+///
+/// ## References
+///
+/// *   [`micromark-util-normalize-identifier` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-normalize-identifier)
+///
+/// [definition]: crate::construct::definition
+///
+/// <!--To do: link resource.-->
 pub fn normalize_identifier(value: &str) -> String {
     let mut codes = vec![];
     let mut at_start = true;
author	Titus Wormer <tituswormer@gmail.com>	2022-06-22 17:51:14 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-06-22 17:51:14 +0200
commit	5416b61f89155b475fa5674898b8ff963aa443b5 (patch)
tree	f4709e5f350964f7735207cacd936dbb36f898d9
parent	79c3275f91f1c0867a1bfba3085c0682aa5486ef (diff)
download	markdown-rs-5416b61f89155b475fa5674898b8ff963aa443b5.tar.gz markdown-rs-5416b61f89155b475fa5674898b8ff963aa443b5.tar.bz2 markdown-rs-5416b61f89155b475fa5674898b8ff963aa443b5.zip