From 79c3275f91f1c0867a1bfba3085c0682aa5486ef Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Wed, 22 Jun 2022 17:24:05 +0200
Subject: Add support for normalizing identifiers

---
 readme.md                        |  5 +++--
 src/construct/definition.rs      |  5 -----
 src/content/flow.rs              | 25 ++++++++++++++++++++++++-
 src/util/encode.rs               |  2 ++
 src/util/mod.rs                  |  1 +
 src/util/normalize_identifier.rs | 37 +++++++++++++++++++++++++++++++++++++
 6 files changed, 67 insertions(+), 8 deletions(-)
 create mode 100644 src/util/normalize_identifier.rs
diff --git a/readme.md b/readme.md
index 6594148..f2188ae 100644
--- a/readme.md
+++ b/readme.md
@@ -68,6 +68,7 @@ cargo doc --document-private-items
 
 #### Docs
 
+- [ ] (1) Add docs to `normalize_identifier`
 - [ ] (1) Add docs for how references and definitions match (definition, reference)
 - [ ] (1) Go through all bnf
 - [ ] (1) Go through all docs
@@ -80,7 +81,6 @@ cargo doc --document-private-items
        test (`code_indented`, `hard_break_escape`, `hard_break_trailing`,
       `heading_atx`, `heading_setext`, `html_flow`, `misc_soft_break`,
       `misc_tabs`, `thematic_break`)
-- [ ] (1) Get definition identifiers (definition)
 - [ ] (3) Interrupting (html flow complete)
 - [ ] (5) labels\
        test (`character_escape`, `character_reference`, `definition`,
@@ -124,7 +124,7 @@ cargo doc --document-private-items
       `unicode_whitespace` or so the same?)
 - [ ] (1) Any special handling of surrogates?
 - [ ] (1) Make sure debugging, assertions are useful for other folks
-- [ ] (3) Add some benchmarks, do some perf testing
+- [ ] (3) Add some benchmarks (against comrak, pulldown-cmark, kramdown?), do some perf testing
 - [ ] (3) Write comparison to other parsers
 - [ ] (3) Add node/etc bindings?
 - [ ] (3) Bunch of docs
@@ -233,6 +233,7 @@ cargo doc --document-private-items
 - [x] (1) Clean attempts
 - [x] (1) Add docs for tokenizer
 - [x] (1) Add docs for sanitation
+- [x] (1) Get definition identifiers (definition)
 
 ### Extensions
 
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 57c62a5..3291f7f 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -131,11 +131,6 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// [a]|: b "c"
 /// ```
 fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    // To do: get the identifier:
-    // identifier = normalizeIdentifier(
-    //   self.sliceSerialize(self.events[self.events.length - 1][1]).slice(1, -1)
-    // )
-
     match code {
         Code::Char(':') => {
             tokenizer.enter(TokenType::DefinitionMarker);
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 6283fef..e71d25a 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -27,13 +27,36 @@ use crate::construct::{
     thematic_break::start as thematic_break,
 };
 use crate::subtokenize::subtokenize;
-use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::{
+    normalize_identifier::normalize_identifier,
+    span::{from_exit_event, serialize},
+};
 
 /// Turn `codes` as the flow content type into events.
 pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
     let mut tokenizer = Tokenizer::new(point, index);
     tokenizer.feed(codes, Box::new(start), true);
+
+    let mut index = 0;
+
+    while index < tokenizer.events.len() {
+        let event = &tokenizer.events[index];
+
+        if event.event_type == EventType::Exit
+            && event.token_type == TokenType::DefinitionLabelString
+        {
+            let id = normalize_identifier(
+                serialize(codes, &from_exit_event(&tokenizer.events, index), false).as_str(),
+            );
+            println!("to do: use identifier {:?}", id);
+        }
+
+        index += 1;
+    }
+
     let mut result = (tokenizer.events, false);
+
     while !result.1 {
         result = subtokenize(result.0, codes);
     }
diff --git a/src/util/encode.rs b/src/util/encode.rs
index f79c8ea..5762c22 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -21,6 +21,8 @@
 ///
 /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
 pub fn encode(value: &str) -> String {
+    // To do: replacing 4 times might just be slow.
+    // Perhaps we can walk the chars.
     value
         .replace('&', "&amp;")
         .replace('"', "&quot;")
diff --git a/src/util/mod.rs b/src/util/mod.rs
index c3db267..ee58518 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -2,5 +2,6 @@
 
 pub mod decode_character_reference;
 pub mod encode;
+pub mod normalize_identifier;
 pub mod sanitize_uri;
 pub mod span;
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
new file mode 100644
index 0000000..870fd33
--- /dev/null
+++ b/src/util/normalize_identifier.rs
@@ -0,0 +1,37 @@
+//! Utility to normalize identifiers (collapse whitespace, fold case) so they can be compared.
+
+/// Normalize an identifier, as found in definitions and references, so that
+/// labels which should match compare equal.
+///
+/// Runs of markdown whitespace (tab, line feed, carriage return, space) are
+/// collapsed into one space, leading and trailing whitespace is dropped, and
+/// case is folded by lowercasing and then uppercasing.
+///
+/// Returns a new, owned, uppercased `String`; `value` is not modified.
+pub fn normalize_identifier(value: &str) -> String {
+    // Consecutive whitespace makes the result shorter than `value`, so this
+    // capacity is an upper bound for the collapsed text.
+    let mut collapsed = String::with_capacity(value.len());
+    // Starts as `true` so that leading whitespace is dropped.
+    let mut at_whitespace = true;
+
+    for char in value.chars() {
+        if matches!(char, '\t' | '\n' | '\r' | ' ') {
+            at_whitespace = true;
+            continue;
+        }
+
+        // A separator is only emitted right before a non-whitespace character
+        // that is not the first one, so trailing whitespace is dropped too.
+        if at_whitespace && !collapsed.is_empty() {
+            collapsed.push(' ');
+        }
+
+        collapsed.push(char);
+        at_whitespace = false;
+    }
+
+    // Lowercase first, then uppercase: `ẞ` lowercases to `ß`, which uppercases
+    // to `SS`, so `ẞ`, `ß`, `ss`, and `SS` all end up as `SS`.
+    collapsed.to_lowercase().to_uppercase()
+}
-- 
cgit