aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-04 15:21:11 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-04 15:21:11 +0200
commit0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (patch)
treecc73bb48ae6102b27b7b864f13585eb77ef86c2c
parent8eb4631bd7c4345ec2a0c9b2ca2e05bdb1d79dd7 (diff)
downloadmarkdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.gz
markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.bz2
markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.zip
Add support for unicode punctuation
-rw-r--r--.gitignore1
-rw-r--r--readme.md3
-rw-r--r--script/generate-unicode.js68
-rw-r--r--script/package.json3
-rw-r--r--src/compiler.rs2
-rw-r--r--src/construct/attention.rs6
-rw-r--r--src/content/flow.rs2
-rw-r--r--src/lib.rs1
-rw-r--r--src/subtokenize.rs2
-rw-r--r--src/tokenizer.rs2
-rw-r--r--src/unicode.rs838
-rw-r--r--src/util/edit_map.rs2
12 files changed, 920 insertions, 10 deletions
diff --git a/.gitignore b/.gitignore
index 32a28f2..0247d2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
*.lock
coverage/
target
+script/unicode-data.txt
diff --git a/readme.md b/readme.md
index ef943b9..6c3ecf3 100644
--- a/readme.md
+++ b/readme.md
@@ -125,6 +125,7 @@ cargo doc --document-private-items
#### Refactor
- [ ] (1) Use `edit_map` in `subtokenize` (needs to support links in edits)
+- [ ] (1) Use rust to crawl unicode
#### Parse
@@ -151,7 +152,6 @@ cargo doc --document-private-items
#### Misc
- [ ] (1) use `char::REPLACEMENT_CHARACTER`?
-- [ ] (3) Unicode punctuation
- [ ] (3) `nostd`
- [ ] (3) Check subtokenizer unraveling is ok
- [ ] (3) Remove splicing and cloning in subtokenizer
@@ -275,3 +275,4 @@ important.
things interrupt them each line
- [x] (3) Add support for interrupting (or not)
- [x] (5) attention
+- [x] (3) Unicode punctuation
diff --git a/script/generate-unicode.js b/script/generate-unicode.js
new file mode 100644
index 0000000..35150af
--- /dev/null
+++ b/script/generate-unicode.js
@@ -0,0 +1,68 @@
+// To do: port to Rust with `reqwest`?
+import fs from "node:fs/promises";
+
+const dataUrl = new URL("unicode-data.txt", import.meta.url);
+const codeUrl = new URL("../src/unicode.rs", import.meta.url);
+/** @type {string} */
+let data;
+
+try {
+ data = String(await fs.readFile(dataUrl));
+} catch {
+ const response = await fetch(
+ "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
+ );
+ console.log(response);
+ const text = await response.text();
+ await fs.writeFile(dataUrl, text);
+ data = text;
+}
+
+let rows = data.split("\n");
+let index = -1;
+let search = [
+ "Pc", // Punctuation, Connector
+ "Pd", // Punctuation, Dash
+ "Pe", // Punctuation, Close
+ "Pf", // Punctuation, FinalQuote
+ "Pi", // Punctuation, InitialQuote
+ "Po", // Punctuation, Other
+ "Ps", // Punctuation, Open
+];
+/** @type {Array<string>} */
+let found = [];
+
+while (++index < rows.length) {
+ const cells = rows[index].split(";");
+ const [code, , category] = cells;
+ if (search.includes(category)) {
+ found.push(code);
+ }
+}
+
+await fs.writeFile(
+ codeUrl,
+ [
+ "//! Information on Unicode.",
+ "",
+ "/// List of characters that are considered punctuation according to Unicode.",
+ "///",
+ "/// > 👉 **Important**: this module is generated by `script/`.",
+ "/// > It is generate from the latest Unicode data.",
+ "///",
+ "/// Rust does not contain an `is_punctuation` method on `char`, while it does",
+ "/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).",
+ "///",
+ "/// `CommonMark` handles attention (emphasis, strong) markers based on what",
+ "/// comes before or after them.",
+ "/// One such difference is if those characters are Unicode punctuation.",
+ "///",
+ "/// ## References",
+ "///",
+ "/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)",
+ "pub const PUNCTUATION: [char; " + found.length + "] = [",
+ ...found.map((d) => " '\\u{" + d + "}',"),
+ "];",
+ "",
+ ].join("\n")
+);
diff --git a/script/package.json b/script/package.json
new file mode 100644
index 0000000..3dbc1ca
--- /dev/null
+++ b/script/package.json
@@ -0,0 +1,3 @@
+{
+ "type": "module"
+}
diff --git a/src/compiler.rs b/src/compiler.rs
index b0061ce..2c6fe68 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -1,5 +1,4 @@
//! Turn events into a string of HTML.
-use std::collections::HashMap;
use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC};
use crate::construct::character_reference::Kind as CharacterReferenceKind;
use crate::tokenizer::{Code, Event, EventType, TokenType};
@@ -10,6 +9,7 @@ use crate::util::{
sanitize_uri::sanitize_uri,
span::{codes as codes_from_span, from_exit_event, serialize},
};
+use std::collections::HashMap;
/// Type of line endings in markdown.
#[derive(Debug, Clone, PartialEq)]
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index d4541b4..f4bb841 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -1,6 +1,7 @@
//! To do.
use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer};
+use crate::unicode::PUNCTUATION;
use crate::util::edit_map::EditMap;
/// To do
@@ -421,10 +422,7 @@ fn classify_character(code: Code) -> GroupKind {
// Unicode whitespace.
Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace,
// Unicode punctuation.
- // To do: `is_punctuation` is not in rust? Why not?
- // Perhaps we need to generate stuff just like:
- // <https://github.com/micromark/micromark/blob/main/packages/micromark-util-character/dev/lib/unicode-punctuation-regex.js>.
- Code::Char(char) if char.is_ascii_punctuation() => GroupKind::Punctuation,
+ Code::Char(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation,
Code::Char(_) => GroupKind::Other,
}
}
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 4a12e0f..3ff948d 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -19,7 +19,6 @@
//! * [HTML (flow)][crate::construct::html_flow]
//! * [Thematic break][crate::construct::thematic_break]
-use std::collections::HashSet;
use crate::construct::{
blank_line::start as blank_line, code_fenced::start as code_fenced,
code_indented::start as code_indented, definition::start as definition,
@@ -34,6 +33,7 @@ use crate::util::{
normalize_identifier::normalize_identifier,
span::{from_exit_event, serialize},
};
+use std::collections::HashSet;
/// Turn `codes` as the flow content type into events.
pub fn flow(parse_state: &mut ParseState, point: Point, index: usize) -> Vec<Event> {
diff --git a/src/lib.rs b/src/lib.rs
index ba129dc..1a86286 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,7 @@ mod content;
mod parser;
mod subtokenize;
mod tokenizer;
+mod unicode;
mod util;
use crate::compiler::compile;
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index ad8aace..f3e9ae0 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -21,11 +21,11 @@
//! thus the whole document needs to be parsed up to the level of definitions,
//! before any level that can include references can be parsed.
-use std::collections::HashMap;
use crate::content::{string::start as string, text::start as text};
use crate::parser::ParseState;
use crate::tokenizer::{ContentType, Event, EventType, State, StateFn, StateFnResult, Tokenizer};
use crate::util::span;
+use std::collections::HashMap;
/// Create a link between two [`Event`][]s.
///
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9d870c9..1fa94d7 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -11,9 +11,9 @@
//! [`attempt`]: Tokenizer::attempt
//! [`check`]: Tokenizer::check
-use std::collections::HashMap;
use crate::constant::TAB_SIZE;
use crate::parser::ParseState;
+use std::collections::HashMap;
/// Semantic label of a span.
// To do: figure out how to share this so extensions can add their own stuff,
diff --git a/src/unicode.rs b/src/unicode.rs
new file mode 100644
index 0000000..8107440
--- /dev/null
+++ b/src/unicode.rs
@@ -0,0 +1,838 @@
+//! Information on Unicode.
+
+/// List of characters that are considered punctuation according to Unicode.
+///
+/// > 👉 **Important**: this module is generated by `script/`.
+/// > It is generated from the latest Unicode data.
+///
+/// Rust does not contain an `is_punctuation` method on `char`, while it does
+/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).
+///
+/// `CommonMark` handles attention (emphasis, strong) markers based on what
+/// comes before or after them.
+/// One such difference is if those characters are Unicode punctuation.
+///
+/// ## References
+///
+/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
+pub const PUNCTUATION: [char; 819] = [
+ '\u{0021}',
+ '\u{0022}',
+ '\u{0023}',
+ '\u{0025}',
+ '\u{0026}',
+ '\u{0027}',
+ '\u{0028}',
+ '\u{0029}',
+ '\u{002A}',
+ '\u{002C}',
+ '\u{002D}',
+ '\u{002E}',
+ '\u{002F}',
+ '\u{003A}',
+ '\u{003B}',
+ '\u{003F}',
+ '\u{0040}',
+ '\u{005B}',
+ '\u{005C}',
+ '\u{005D}',
+ '\u{005F}',
+ '\u{007B}',
+ '\u{007D}',
+ '\u{00A1}',
+ '\u{00A7}',
+ '\u{00AB}',
+ '\u{00B6}',
+ '\u{00B7}',
+ '\u{00BB}',
+ '\u{00BF}',
+ '\u{037E}',
+ '\u{0387}',
+ '\u{055A}',
+ '\u{055B}',
+ '\u{055C}',
+ '\u{055D}',
+ '\u{055E}',
+ '\u{055F}',
+ '\u{0589}',
+ '\u{058A}',
+ '\u{05BE}',
+ '\u{05C0}',
+ '\u{05C3}',
+ '\u{05C6}',
+ '\u{05F3}',
+ '\u{05F4}',
+ '\u{0609}',
+ '\u{060A}',
+ '\u{060C}',
+ '\u{060D}',
+ '\u{061B}',
+ '\u{061D}',
+ '\u{061E}',
+ '\u{061F}',
+ '\u{066A}',
+ '\u{066B}',
+ '\u{066C}',
+ '\u{066D}',
+ '\u{06D4}',
+ '\u{0700}',
+ '\u{0701}',
+ '\u{0702}',
+ '\u{0703}',
+ '\u{0704}',
+ '\u{0705}',
+ '\u{0706}',
+ '\u{0707}',
+ '\u{0708}',
+ '\u{0709}',
+ '\u{070A}',
+ '\u{070B}',
+ '\u{070C}',
+ '\u{070D}',
+ '\u{07F7}',
+ '\u{07F8}',
+ '\u{07F9}',
+ '\u{0830}',
+ '\u{0831}',
+ '\u{0832}',
+ '\u{0833}',
+ '\u{0834}',
+ '\u{0835}',
+ '\u{0836}',
+ '\u{0837}',
+ '\u{0838}',
+ '\u{0839}',
+ '\u{083A}',
+ '\u{083B}',
+ '\u{083C}',
+ '\u{083D}',
+ '\u{083E}',
+ '\u{085E}',
+ '\u{0964}',
+ '\u{0965}',
+ '\u{0970}',
+ '\u{09FD}',
+ '\u{0A76}',
+ '\u{0AF0}',
+ '\u{0C77}',
+ '\u{0C84}',
+ '\u{0DF4}',
+ '\u{0E4F}',
+ '\u{0E5A}',
+ '\u{0E5B}',
+ '\u{0F04}',
+ '\u{0F05}',
+ '\u{0F06}',
+ '\u{0F07}',
+ '\u{0F08}',
+ '\u{0F09}',
+ '\u{0F0A}',
+ '\u{0F0B}',
+ '\u{0F0C}',
+ '\u{0F0D}',
+ '\u{0F0E}',
+ '\u{0F0F}',
+ '\u{0F10}',
+ '\u{0F11}',
+ '\u{0F12}',
+ '\u{0F14}',
+ '\u{0F3A}',
+ '\u{0F3B}',
+ '\u{0F3C}',
+ '\u{0F3D}',
+ '\u{0F85}',
+ '\u{0FD0}',
+ '\u{0FD1}',
+ '\u{0FD2}',
+ '\u{0FD3}',
+ '\u{0FD4}',
+ '\u{0FD9}',
+ '\u{0FDA}',
+ '\u{104A}',
+ '\u{104B}',
+ '\u{104C}',
+ '\u{104D}',
+ '\u{104E}',
+ '\u{104F}',
+ '\u{10FB}',
+ '\u{1360}',
+ '\u{1361}',
+ '\u{1362}',
+ '\u{1363}',
+ '\u{1364}',
+ '\u{1365}',
+ '\u{1366}',
+ '\u{1367}',
+ '\u{1368}',
+ '\u{1400}',
+ '\u{166E}',
+ '\u{169B}',
+ '\u{169C}',
+ '\u{16EB}',
+ '\u{16EC}',
+ '\u{16ED}',
+ '\u{1735}',
+ '\u{1736}',
+ '\u{17D4}',
+ '\u{17D5}',
+ '\u{17D6}',
+ '\u{17D8}',
+ '\u{17D9}',
+ '\u{17DA}',
+ '\u{1800}',
+ '\u{1801}',
+ '\u{1802}',
+ '\u{1803}',
+ '\u{1804}',
+ '\u{1805}',
+ '\u{1806}',
+ '\u{1807}',
+ '\u{1808}',
+ '\u{1809}',
+ '\u{180A}',
+ '\u{1944}',
+ '\u{1945}',
+ '\u{1A1E}',
+ '\u{1A1F}',
+ '\u{1AA0}',
+ '\u{1AA1}',
+ '\u{1AA2}',
+ '\u{1AA3}',
+ '\u{1AA4}',
+ '\u{1AA5}',
+ '\u{1AA6}',
+ '\u{1AA8}',
+ '\u{1AA9}',
+ '\u{1AAA}',
+ '\u{1AAB}',
+ '\u{1AAC}',
+ '\u{1AAD}',
+ '\u{1B5A}',
+ '\u{1B5B}',
+ '\u{1B5C}',
+ '\u{1B5D}',
+ '\u{1B5E}',
+ '\u{1B5F}',
+ '\u{1B60}',
+ '\u{1B7D}',
+ '\u{1B7E}',
+ '\u{1BFC}',
+ '\u{1BFD}',
+ '\u{1BFE}',
+ '\u{1BFF}',
+ '\u{1C3B}',
+ '\u{1C3C}',
+ '\u{1C3D}',
+ '\u{1C3E}',
+ '\u{1C3F}',
+ '\u{1C7E}',
+ '\u{1C7F}',
+ '\u{1CC0}',
+ '\u{1CC1}',
+ '\u{1CC2}',
+ '\u{1CC3}',
+ '\u{1CC4}',
+ '\u{1CC5}',
+ '\u{1CC6}',
+ '\u{1CC7}',
+ '\u{1CD3}',
+ '\u{2010}',
+ '\u{2011}',
+ '\u{2012}',
+ '\u{2013}',
+ '\u{2014}',
+ '\u{2015}',
+ '\u{2016}',
+ '\u{2017}',
+ '\u{2018}',
+ '\u{2019}',
+ '\u{201A}',
+ '\u{201B}',
+ '\u{201C}',
+ '\u{201D}',
+ '\u{201E}',
+ '\u{201F}',
+ '\u{2020}',
+ '\u{2021}',
+ '\u{2022}',
+ '\u{2023}',
+ '\u{2024}',
+ '\u{2025}',
+ '\u{2026}',
+ '\u{2027}',
+ '\u{2030}',
+ '\u{2031}',
+ '\u{2032}',
+ '\u{2033}',
+ '\u{2034}',
+ '\u{2035}',
+ '\u{2036}',
+ '\u{2037}',
+ '\u{2038}',
+ '\u{2039}',
+ '\u{203A}',
+ '\u{203B}',
+ '\u{203C}',
+ '\u{203D}',
+ '\u{203E}',
+ '\u{203F}',
+ '\u{2040}',
+ '\u{2041}',
+ '\u{2042}',
+ '\u{2043}',
+ '\u{2045}',
+ '\u{2046}',
+ '\u{2047}',
+ '\u{2048}',
+ '\u{2049}',
+ '\u{204A}',
+ '\u{204B}',
+ '\u{204C}',
+ '\u{204D}',
+ '\u{204E}',
+ '\u{204F}',
+ '\u{2050}',
+ '\u{2051}',
+ '\u{2053}',
+ '\u{2054}',
+ '\u{2055}',
+ '\u{2056}',
+ '\u{2057}',
+ '\u{2058}',
+ '\u{2059}',
+ '\u{205A}',
+ '\u{205B}',
+ '\u{205C}',
+ '\u{205D}',
+ '\u{205E}',
+ '\u{207D}',
+ '\u{207E}',
+ '\u{208D}',
+ '\u{208E}',
+ '\u{2308}',
+ '\u{2309}',
+ '\u{230A}',
+ '\u{230B}',
+ '\u{2329}',
+ '\u{232A}',
+ '\u{2768}',
+ '\u{2769}',
+ '\u{276A}',
+ '\u{276B}',
+ '\u{276C}',
+ '\u{276D}',
+ '\u{276E}',
+ '\u{276F}',
+ '\u{2770}',
+ '\u{2771}',
+ '\u{2772}',
+ '\u{2773}',
+ '\u{2774}',
+ '\u{2775}',
+ '\u{27C5}',
+ '\u{27C6}',
+ '\u{27E6}',
+ '\u{27E7}',
+ '\u{27E8}',
+ '\u{27E9}',
+ '\u{27EA}',
+ '\u{27EB}',
+ '\u{27EC}',
+ '\u{27ED}',
+ '\u{27EE}',
+ '\u{27EF}',
+ '\u{2983}',
+ '\u{2984}',
+ '\u{2985}',
+ '\u{2986}',
+ '\u{2987}',
+ '\u{2988}',
+ '\u{2989}',
+ '\u{298A}',
+ '\u{298B}',
+ '\u{298C}',
+ '\u{298D}',
+ '\u{298E}',
+ '\u{298F}',
+ '\u{2990}',
+ '\u{2991}',
+ '\u{2992}',
+ '\u{2993}',
+ '\u{2994}',
+ '\u{2995}',
+ '\u{2996}',
+ '\u{2997}',
+ '\u{2998}',
+ '\u{29D8}',
+ '\u{29D9}',
+ '\u{29DA}',
+ '\u{29DB}',
+ '\u{29FC}',
+ '\u{29FD}',
+ '\u{2CF9}',
+ '\u{2CFA}',
+ '\u{2CFB}',
+ '\u{2CFC}',
+ '\u{2CFE}',
+ '\u{2CFF}',
+ '\u{2D70}',
+ '\u{2E00}',
+ '\u{2E01}',
+ '\u{2E02}',
+ '\u{2E03}',
+ '\u{2E04}',
+ '\u{2E05}',
+ '\u{2E06}',
+ '\u{2E07}',
+ '\u{2E08}',
+ '\u{2E09}',
+ '\u{2E0A}',
+ '\u{2E0B}',
+ '\u{2E0C}',
+ '\u{2E0D}',
+ '\u{2E0E}',
+ '\u{2E0F}',
+ '\u{2E10}',
+ '\u{2E11}',
+ '\u{2E12}',
+ '\u{2E13}',
+ '\u{2E14}',
+ '\u{2E15}',
+ '\u{2E16}',
+ '\u{2E17}',
+ '\u{2E18}',
+ '\u{2E19}',
+ '\u{2E1A}',
+ '\u{2E1B}',
+ '\u{2E1C}',
+ '\u{2E1D}',
+ '\u{2E1E}',
+ '\u{2E1F}',
+ '\u{2E20}',
+ '\u{2E21}',
+ '\u{2E22}',
+ '\u{2E23}',
+ '\u{2E24}',
+ '\u{2E25}',
+ '\u{2E26}',
+ '\u{2E27}',
+ '\u{2E28}',
+ '\u{2E29}',
+ '\u{2E2A}',
+ '\u{2E2B}',
+ '\u{2E2C}',
+ '\u{2E2D}',
+ '\u{2E2E}',
+ '\u{2E30}',
+ '\u{2E31}',
+ '\u{2E32}',
+ '\u{2E33}',
+ '\u{2E34}',
+ '\u{2E35}',
+ '\u{2E36}',
+ '\u{2E37}',
+ '\u{2E38}',
+ '\u{2E39}',
+ '\u{2E3A}',
+ '\u{2E3B}',
+ '\u{2E3C}',
+ '\u{2E3D}',
+ '\u{2E3E}',
+ '\u{2E3F}',
+ '\u{2E40}',
+ '\u{2E41}',
+ '\u{2E42}',
+ '\u{2E43}',
+ '\u{2E44}',
+ '\u{2E45}',
+ '\u{2E46}',
+ '\u{2E47}',
+ '\u{2E48}',
+ '\u{2E49}',
+ '\u{2E4A}',
+ '\u{2E4B}',
+ '\u{2E4C}',
+ '\u{2E4D}',
+ '\u{2E4E}',
+ '\u{2E4F}',
+ '\u{2E52}',
+ '\u{2E53}',
+ '\u{2E54}',
+ '\u{2E55}',
+ '\u{2E56}',
+ '\u{2E57}',
+ '\u{2E58}',
+ '\u{2E59}',
+ '\u{2E5A}',
+ '\u{2E5B}',
+ '\u{2E5C}',
+ '\u{2E5D}',
+ '\u{3001}',
+ '\u{3002}',
+ '\u{3003}',
+ '\u{3008}',
+ '\u{3009}',
+ '\u{300A}',
+ '\u{300B}',
+ '\u{300C}',
+ '\u{300D}',
+ '\u{300E}',
+ '\u{300F}',
+ '\u{3010}',
+ '\u{3011}',
+ '\u{3014}',
+ '\u{3015}',
+ '\u{3016}',
+ '\u{3017}',
+ '\u{3018}',
+ '\u{3019}',
+ '\u{301A}',
+ '\u{301B}',
+ '\u{301C}',
+ '\u{301D}',
+ '\u{301E}',
+ '\u{301F}',
+ '\u{3030}',
+ '\u{303D}',
+ '\u{30A0}',
+ '\u{30FB}',
+ '\u{A4FE}',
+ '\u{A4FF}',
+ '\u{A60D}',
+ '\u{A60E}',
+ '\u{A60F}',
+ '\u{A673}',
+ '\u{A67E}',
+ '\u{A6F2}',
+ '\u{A6F3}',
+ '\u{A6F4}',
+ '\u{A6F5}',
+ '\u{A6F6}',
+ '\u{A6F7}',
+ '\u{A874}',
+ '\u{A875}',
+ '\u{A876}',
+ '\u{A877}',
+ '\u{A8CE}',
+ '\u{A8CF}',
+ '\u{A8F8}',
+ '\u{A8F9}',
+ '\u{A8FA}',
+ '\u{A8FC}',
+ '\u{A92E}',
+ '\u{A92F}',
+ '\u{A95F}',
+ '\u{A9C1}',
+ '\u{A9C2}',
+ '\u{A9C3}',
+ '\u{A9C4}',
+ '\u{A9C5}',
+ '\u{A9C6}',
+ '\u{A9C7}',
+ '\u{A9C8}',
+ '\u{A9C9}',
+ '\u{A9CA}',
+ '\u{A9CB}',
+ '\u{A9CC}',
+ '\u{A9CD}',
+ '\u{A9DE}',
+ '\u{A9DF}',
+ '\u{AA5C}',
+ '\u{AA5D}',
+ '\u{AA5E}',
+ '\u{AA5F}',
+ '\u{AADE}',
+ '\u{AADF}',
+ '\u{AAF0}',
+ '\u{AAF1}',
+ '\u{ABEB}',
+ '\u{FD3E}',
+ '\u{FD3F}',
+ '\u{FE10}',
+ '\u{FE11}',
+ '\u{FE12}',
+ '\u{FE13}',
+ '\u{FE14}',
+ '\u{FE15}',
+ '\u{FE16}',
+ '\u{FE17}',
+ '\u{FE18}',
+ '\u{FE19}',
+ '\u{FE30}',
+ '\u{FE31}',
+ '\u{FE32}',
+ '\u{FE33}',
+ '\u{FE34}',
+ '\u{FE35}',
+ '\u{FE36}',
+ '\u{FE37}',
+ '\u{FE38}',
+ '\u{FE39}',
+ '\u{FE3A}',
+ '\u{FE3B}',
+ '\u{FE3C}',
+ '\u{FE3D}',
+ '\u{FE3E}',
+ '\u{FE3F}',
+ '\u{FE40}',
+ '\u{FE41}',
+ '\u{FE42}',
+ '\u{FE43}',
+ '\u{FE44}',
+ '\u{FE45}',
+ '\u{FE46}',
+ '\u{FE47}',
+ '\u{FE48}',
+ '\u{FE49}',
+ '\u{FE4A}',
+ '\u{FE4B}',
+ '\u{FE4C}',
+ '\u{FE4D}',
+ '\u{FE4E}',
+ '\u{FE4F}',
+ '\u{FE50}',
+ '\u{FE51}',
+ '\u{FE52}',
+ '\u{FE54}',
+ '\u{FE55}',
+ '\u{FE56}',
+ '\u{FE57}',
+ '\u{FE58}',
+ '\u{FE59}',
+ '\u{FE5A}',
+ '\u{FE5B}',
+ '\u{FE5C}',
+ '\u{FE5D}',
+ '\u{FE5E}',
+ '\u{FE5F}',
+ '\u{FE60}',
+ '\u{FE61}',
+ '\u{FE63}',
+ '\u{FE68}',
+ '\u{FE6A}',
+ '\u{FE6B}',
+ '\u{FF01}',
+ '\u{FF02}',
+ '\u{FF03}',
+ '\u{FF05}',
+ '\u{FF06}',
+ '\u{FF07}',
+ '\u{FF08}',
+ '\u{FF09}',
+ '\u{FF0A}',
+ '\u{FF0C}',
+ '\u{FF0D}',
+ '\u{FF0E}',
+ '\u{FF0F}',
+ '\u{FF1A}',
+ '\u{FF1B}',
+ '\u{FF1F}',
+ '\u{FF20}',
+ '\u{FF3B}',
+ '\u{FF3C}',
+ '\u{FF3D}',
+ '\u{FF3F}',
+ '\u{FF5B}',
+ '\u{FF5D}',
+ '\u{FF5F}',
+ '\u{FF60}',
+ '\u{FF61}',
+ '\u{FF62}',
+ '\u{FF63}',
+ '\u{FF64}',
+ '\u{FF65}',
+ '\u{10100}',
+ '\u{10101}',
+ '\u{10102}',
+ '\u{1039F}',
+ '\u{103D0}',
+ '\u{1056F}',
+ '\u{10857}',
+ '\u{1091F}',
+ '\u{1093F}',
+ '\u{10A50}',
+ '\u{10A51}',
+ '\u{10A52}',
+ '\u{10A53}',
+ '\u{10A54}',
+ '\u{10A55}',
+ '\u{10A56}',
+ '\u{10A57}',
+ '\u{10A58}',
+ '\u{10A7F}',
+ '\u{10AF0}',
+ '\u{10AF1}',
+ '\u{10AF2}',
+ '\u{10AF3}',
+ '\u{10AF4}',
+ '\u{10AF5}',
+ '\u{10AF6}',
+ '\u{10B39}',
+ '\u{10B3A}',
+ '\u{10B3B}',
+ '\u{10B3C}',
+ '\u{10B3D}',
+ '\u{10B3E}',
+ '\u{10B3F}',
+ '\u{10B99}',
+ '\u{10B9A}',
+ '\u{10B9B}',
+ '\u{10B9C}',
+ '\u{10EAD}',
+ '\u{10F55}',
+ '\u{10F56}',
+ '\u{10F57}',
+ '\u{10F58}',
+ '\u{10F59}',
+ '\u{10F86}',
+ '\u{10F87}',
+ '\u{10F88}',
+ '\u{10F89}',
+ '\u{11047}',
+ '\u{11048}',
+ '\u{11049}',
+ '\u{1104A}',
+ '\u{1104B}',
+ '\u{1104C}',
+ '\u{1104D}',
+ '\u{110BB}',
+ '\u{110BC}',
+ '\u{110BE}',
+ '\u{110BF}',
+ '\u{110C0}',
+ '\u{110C1}',
+ '\u{11140}',
+ '\u{11141}',
+ '\u{11142}',
+ '\u{11143}',
+ '\u{11174}',
+ '\u{11175}',
+ '\u{111C5}',
+ '\u{111C6}',
+ '\u{111C7}',
+ '\u{111C8}',
+ '\u{111CD}',
+ '\u{111DB}',
+ '\u{111DD}',
+ '\u{111DE}',
+ '\u{111DF}',
+ '\u{11238}',
+ '\u{11239}',
+ '\u{1123A}',
+ '\u{1123B}',
+ '\u{1123C}',
+ '\u{1123D}',
+ '\u{112A9}',
+ '\u{1144B}',
+ '\u{1144C}',
+ '\u{1144D}',
+ '\u{1144E}',
+ '\u{1144F}',
+ '\u{1145A}',
+ '\u{1145B}',
+ '\u{1145D}',
+ '\u{114C6}',
+ '\u{115C1}',
+ '\u{115C2}',
+ '\u{115C3}',
+ '\u{115C4}',
+ '\u{115C5}',
+ '\u{115C6}',
+ '\u{115C7}',
+ '\u{115C8}',
+ '\u{115C9}',
+ '\u{115CA}',
+ '\u{115CB}',
+ '\u{115CC}',
+ '\u{115CD}',
+ '\u{115CE}',
+ '\u{115CF}',
+ '\u{115D0}',
+ '\u{115D1}',
+ '\u{115D2}',
+ '\u{115D3}',
+ '\u{115D4}',
+ '\u{115D5}',
+ '\u{115D6}',
+ '\u{115D7}',
+ '\u{11641}',
+ '\u{11642}',
+ '\u{11643}',
+ '\u{11660}',
+ '\u{11661}',
+ '\u{11662}',
+ '\u{11663}',
+ '\u{11664}',
+ '\u{11665}',
+ '\u{11666}',
+ '\u{11667}',
+ '\u{11668}',
+ '\u{11669}',
+ '\u{1166A}',
+ '\u{1166B}',
+ '\u{1166C}',
+ '\u{116B9}',
+ '\u{1173C}',
+ '\u{1173D}',
+ '\u{1173E}',
+ '\u{1183B}',
+ '\u{11944}',
+ '\u{11945}',
+ '\u{11946}',
+ '\u{119E2}',
+ '\u{11A3F}',
+ '\u{11A40}',
+ '\u{11A41}',
+ '\u{11A42}',
+ '\u{11A43}',
+ '\u{11A44}',
+ '\u{11A45}',
+ '\u{11A46}',
+ '\u{11A9A}',
+ '\u{11A9B}',
+ '\u{11A9C}',
+ '\u{11A9E}',
+ '\u{11A9F}',
+ '\u{11AA0}',
+ '\u{11AA1}',
+ '\u{11AA2}',
+ '\u{11C41}',
+ '\u{11C42}',
+ '\u{11C43}',
+ '\u{11C44}',
+ '\u{11C45}',
+ '\u{11C70}',
+ '\u{11C71}',
+ '\u{11EF7}',
+ '\u{11EF8}',
+ '\u{11FFF}',
+ '\u{12470}',
+ '\u{12471}',
+ '\u{12472}',
+ '\u{12473}',
+ '\u{12474}',
+ '\u{12FF1}',
+ '\u{12FF2}',
+ '\u{16A6E}',
+ '\u{16A6F}',
+ '\u{16AF5}',
+ '\u{16B37}',
+ '\u{16B38}',
+ '\u{16B39}',
+ '\u{16B3A}',
+ '\u{16B3B}',
+ '\u{16B44}',
+ '\u{16E97}',
+ '\u{16E98}',
+ '\u{16E99}',
+ '\u{16E9A}',
+ '\u{16FE2}',
+ '\u{1BC9F}',
+ '\u{1DA87}',
+ '\u{1DA88}',
+ '\u{1DA89}',
+ '\u{1DA8A}',
+ '\u{1DA8B}',
+ '\u{1E95E}',
+ '\u{1E95F}',
+];
diff --git a/src/util/edit_map.rs b/src/util/edit_map.rs
index 417f42b..eba667d 100644
--- a/src/util/edit_map.rs
+++ b/src/util/edit_map.rs
@@ -8,8 +8,8 @@
//! And, in other cases, it’s needed to parse subcontent: pass some events
//! through another tokenizer and inject the result.
-use std::collections::HashMap;
use crate::tokenizer::Event;
+use std::collections::HashMap;
/// Shift `previous` and `next` links according to `jumps`.
///