diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-22 17:24:05 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-22 17:24:05 +0200 |
commit | 79c3275f91f1c0867a1bfba3085c0682aa5486ef (patch) | |
tree | be30b9a8b755bc6bc01e3f9d59e7d69c60b80b24 | |
parent | b0accb11f1aade55e9fc4dc0a1c1d1b8362ab5d9 (diff) | |
download | markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.gz markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.tar.bz2 markdown-rs-79c3275f91f1c0867a1bfba3085c0682aa5486ef.zip |
Add support for normalizing identifiers
Diffstat (limited to '')
-rw-r--r-- | readme.md | 5 | ||||
-rw-r--r-- | src/construct/definition.rs | 5 | ||||
-rw-r--r-- | src/content/flow.rs | 25 | ||||
-rw-r--r-- | src/util/encode.rs | 2 | ||||
-rw-r--r-- | src/util/mod.rs | 1 | ||||
-rw-r--r-- | src/util/normalize_identifier.rs | 37 |
6 files changed, 67 insertions, 8 deletions
@@ -68,6 +68,7 @@ cargo doc --document-private-items #### Docs +- [ ] (1) Add docs to `normalize_identifier` - [ ] (1) Add docs for how references and definitions match (definition, reference) - [ ] (1) Go through all bnf - [ ] (1) Go through all docs @@ -80,7 +81,6 @@ cargo doc --document-private-items test (`code_indented`, `hard_break_escape`, `hard_break_trailing`, `heading_atx`, `heading_setext`, `html_flow`, `misc_soft_break`, `misc_tabs`, `thematic_break`) -- [ ] (1) Get definition identifiers (definition) - [ ] (3) Interrupting (html flow complete) - [ ] (5) labels\ test (`character_escape`, `character_reference`, `definition`, @@ -124,7 +124,7 @@ cargo doc --document-private-items `unicode_whitespace` or so the same?) - [ ] (1) Any special handling of surrogates? - [ ] (1) Make sure debugging, assertions are useful for other folks -- [ ] (3) Add some benchmarks, do some perf testing +- [ ] (3) Add some benchmarks (against comrak, pulldown-cmark, kramdown?), do some perf testing - [ ] (3) Write comparison to other parsers - [ ] (3) Add node/etc bindings? 
- [ ] (3) Bunch of docs @@ -233,6 +233,7 @@ cargo doc --document-private-items - [x] (1) Clean attempts - [x] (1) Add docs for tokenizer - [x] (1) Add docs for sanitation +- [x] (1) Get definition identifiers (definition) ### Extensions diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 57c62a5..3291f7f 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -131,11 +131,6 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// [a]|: b "c" /// ``` fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - // To do: get the identifier: - // identifier = normalizeIdentifier( - // self.sliceSerialize(self.events[self.events.length - 1][1]).slice(1, -1) - // ) - match code { Code::Char(':') => { tokenizer.enter(TokenType::DefinitionMarker); diff --git a/src/content/flow.rs b/src/content/flow.rs index 6283fef..e71d25a 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -27,13 +27,36 @@ use crate::construct::{ thematic_break::start as thematic_break, }; use crate::subtokenize::subtokenize; -use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer}; +use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::{ + normalize_identifier::normalize_identifier, + span::{from_exit_event, serialize}, +}; /// Turn `codes` as the flow content type into events. 
pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { let mut tokenizer = Tokenizer::new(point, index); tokenizer.feed(codes, Box::new(start), true); + + let mut index = 0; + + while index < tokenizer.events.len() { + let event = &tokenizer.events[index]; + + if event.event_type == EventType::Exit + && event.token_type == TokenType::DefinitionLabelString + { + let id = normalize_identifier( + serialize(codes, &from_exit_event(&tokenizer.events, index), false).as_str(), + ); + println!("to do: use identifier {:?}", id); + } + + index += 1; + } + let mut result = (tokenizer.events, false); + while !result.1 { result = subtokenize(result.0, codes); } diff --git a/src/util/encode.rs b/src/util/encode.rs index f79c8ea..5762c22 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -21,6 +21,8 @@ /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) pub fn encode(value: &str) -> String { + // To do: replacing 4 times might just be slow. + // Perhaps we can walk the chars. value .replace('&', "&amp;") .replace('"', "&quot;") diff --git a/src/util/mod.rs b/src/util/mod.rs index c3db267..ee58518 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -2,5 +2,6 @@ pub mod decode_character_reference; pub mod encode; +pub mod normalize_identifier; pub mod sanitize_uri; pub mod span; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs new file mode 100644 index 0000000..870fd33 --- /dev/null +++ b/src/util/normalize_identifier.rs @@ -0,0 +1,37 @@ +//! To do. + +/// To do. +pub fn normalize_identifier(value: &str) -> String { + let mut codes = vec![]; + let mut at_start = true; + let mut at_whitespace = true; + + // Collapse markdown whitespace and trim it. 
+ for char in value.chars() { + match char { + '\t' | '\r' | '\n' | ' ' => { + at_whitespace = true; + } + _ => { + if at_whitespace && !at_start { + codes.push(' '); + } + + codes.push(char); + at_start = false; + at_whitespace = false; + } + } + } + + // To do: test if this matches unicode. + // Some characters are considered “uppercase”, but if their lowercase + // counterpart is uppercased will result in a different uppercase + // character. + // Hence, to get that form, we perform both lower- and uppercase. + codes + .iter() + .collect::<String>() + .to_uppercase() + .to_lowercase() +} |