aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-22 17:24:05 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-22 17:24:05 +0200
commit79c3275f91f1c0867a1bfba3085c0682aa5486ef (patch)
treebe30b9a8b755bc6bc01e3f9d59e7d69c60b80b24 /src
parentb0accb11f1aade55e9fc4dc0a1c1d1b8362ab5d9 (diff)
downloadmarkdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.gz
markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.bz2
markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.zip
Add support for normalizing identifiers
Diffstat (limited to '')
-rw-r--r--src/construct/definition.rs5
-rw-r--r--src/content/flow.rs25
-rw-r--r--src/util/encode.rs2
-rw-r--r--src/util/mod.rs1
-rw-r--r--src/util/normalize_identifier.rs37
5 files changed, 64 insertions, 6 deletions
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 57c62a5..3291f7f 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -131,11 +131,6 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// [a]|: b "c"
/// ```
fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- // To do: get the identifier:
- // identifier = normalizeIdentifier(
- // self.sliceSerialize(self.events[self.events.length - 1][1]).slice(1, -1)
- // )
-
match code {
Code::Char(':') => {
tokenizer.enter(TokenType::DefinitionMarker);
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 6283fef..e71d25a 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -27,13 +27,36 @@ use crate::construct::{
thematic_break::start as thematic_break,
};
use crate::subtokenize::subtokenize;
-use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::{
+ normalize_identifier::normalize_identifier,
+ span::{from_exit_event, serialize},
+};
/// Turn `codes` as the flow content type into events.
pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
let mut tokenizer = Tokenizer::new(point, index);
tokenizer.feed(codes, Box::new(start), true);
+
+ let mut index = 0;
+
+ while index < tokenizer.events.len() {
+ let event = &tokenizer.events[index];
+
+ if event.event_type == EventType::Exit
+ && event.token_type == TokenType::DefinitionLabelString
+ {
+ let id = normalize_identifier(
+ serialize(codes, &from_exit_event(&tokenizer.events, index), false).as_str(),
+ );
+ println!("to do: use identifier {:?}", id);
+ }
+
+ index += 1;
+ }
+
let mut result = (tokenizer.events, false);
+
while !result.1 {
result = subtokenize(result.0, codes);
}
diff --git a/src/util/encode.rs b/src/util/encode.rs
index f79c8ea..5762c22 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -21,6 +21,8 @@
///
/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
pub fn encode(value: &str) -> String {
+ // To do: replacing 4 times might just be slow.
+ // Perhaps we can walk the chars.
value
.replace('&', "&amp;")
.replace('"', "&quot;")
diff --git a/src/util/mod.rs b/src/util/mod.rs
index c3db267..ee58518 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -2,5 +2,6 @@
pub mod decode_character_reference;
pub mod encode;
+pub mod normalize_identifier;
pub mod sanitize_uri;
pub mod span;
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
new file mode 100644
index 0000000..870fd33
--- /dev/null
+++ b/src/util/normalize_identifier.rs
@@ -0,0 +1,37 @@
+//! To do.
+
/// Normalize an identifier (such as the label of a definition or reference)
/// so that identifiers which should match each other compare equal.
///
/// Normalization does three things:
///
/// * runs of markdown whitespace (tab, carriage return, line feed, space)
///   are collapsed into one space
/// * leading and trailing markdown whitespace is dropped
/// * case is folded by lowercasing and then uppercasing
///
/// The returned string is an opaque key meant for comparing identifiers with
/// each other; it is uppercase and not meant for display.
///
/// ## Examples
///
/// ```rust ignore
/// assert_eq!(normalize_identifier(" a\t\nb "), "A B");
/// assert_eq!(normalize_identifier("ẞ"), normalize_identifier("SS"));
/// ```
pub fn normalize_identifier(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    // Whether whitespace was seen after something was already emitted.
    // It is only turned into a space once another non-whitespace character
    // follows, which is what trims trailing whitespace.
    let mut pending_space = false;

    for character in value.chars() {
        match character {
            '\t' | '\r' | '\n' | ' ' => {
                // Whitespace before the first real character is dropped.
                pending_space = !collapsed.is_empty();
            }
            _ => {
                if pending_space {
                    collapsed.push(' ');
                    pending_space = false;
                }

                collapsed.push(character);
            }
        }
    }

    // Some characters are considered “uppercase”, but if their lowercase
    // counterpart is uppercased will result in a different uppercase
    // character (for example `ẞ` lowercases to `ß`, which uppercases to `SS`).
    // Hence, to get that form, we lowercase first and uppercase second; the
    // reverse order would leave `ẞ` as `ß` while `SS` becomes `ss`.
    collapsed.to_lowercase().to_uppercase()
}