author | Titus Wormer <tituswormer@gmail.com> | 2022-07-04 15:21:11 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-04 15:21:11 +0200 |
commit | 0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (patch) | |
tree | cc73bb48ae6102b27b7b864f13585eb77ef86c2c | |
parent | 8eb4631bd7c4345ec2a0c9b2ca2e05bdb1d79dd7 (diff) | |
download | markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.gz, markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.bz2, markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.zip | |
Add support for unicode punctuation
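The change in `src/construct/attention.rs` is the functional core: delimiter classification previously used `char::is_ascii_punctuation`, so only ASCII punctuation affected how emphasis and strong markers relate to what comes before or after them; it now consults a generated `PUNCTUATION` table instead. A minimal sketch of that classification, with simplified types and a four-entry excerpt standing in for the real 819-entry table (the actual `classify_character` works on `Code`, not a plain `char`):

```rust
/// Tiny excerpt of the generated table (the real `src/unicode.rs` has 819 entries).
const PUNCTUATION: [char; 4] = ['\u{0021}', '\u{2026}', '\u{3002}', '\u{FF01}'];

/// Simplified version of the grouping used when classifying attention markers.
#[derive(Debug, PartialEq)]
enum GroupKind {
    Whitespace,
    Punctuation,
    Other,
}

/// Classify the character before or after a delimiter run, mirroring the
/// shape of `classify_character` in `src/construct/attention.rs`.
fn classify(char: char) -> GroupKind {
    if char.is_whitespace() {
        GroupKind::Whitespace
    } else if PUNCTUATION.contains(&char) {
        GroupKind::Punctuation
    } else {
        GroupKind::Other
    }
}

fn main() {
    // U+3002 (ideographic full stop) now counts as punctuation instead of "other".
    assert_eq!(classify('\u{3002}'), GroupKind::Punctuation);
    assert_eq!(classify('!'), GroupKind::Punctuation);
    assert_eq!(classify(' '), GroupKind::Whitespace);
    assert_eq!(classify('a'), GroupKind::Other);
}
```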
-rw-r--r-- | .gitignore | 1 |
-rw-r--r-- | readme.md | 3 |
-rw-r--r-- | script/generate-unicode.js | 68 |
-rw-r--r-- | script/package.json | 3 |
-rw-r--r-- | src/compiler.rs | 2 |
-rw-r--r-- | src/construct/attention.rs | 6 |
-rw-r--r-- | src/content/flow.rs | 2 |
-rw-r--r-- | src/lib.rs | 1 |
-rw-r--r-- | src/subtokenize.rs | 2 |
-rw-r--r-- | src/tokenizer.rs | 2 |
-rw-r--r-- | src/unicode.rs | 838 |
-rw-r--r-- | src/util/edit_map.rs | 2 |
12 files changed, 920 insertions, 10 deletions
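Most of the 920 added lines are the generated table in `src/unicode.rs`. It is produced by `script/generate-unicode.js`, which reads `UnicodeData.txt` (downloading and caching it as `script/unicode-data.txt` on first run), keeps every code point whose general category is Pc, Pd, Pe, Pf, Pi, Po, or Ps, and writes the result out as a `PUNCTUATION` constant. The readme todo added below ("Use rust to crawl unicode") points at porting that step to Rust; a rough sketch of what such a port might look like, with only the file paths and categories taken from the script and everything else assumed:

```rust
// Hypothetical port of `script/generate-unicode.js`: filter UnicodeData.txt
// down to the punctuation general categories and emit `src/unicode.rs`.
// Fetching/caching the data file is left out; paths mirror the JS script.
use std::fs;

fn main() -> std::io::Result<()> {
    let data = fs::read_to_string("script/unicode-data.txt")?;
    // The seven "P*" general categories CommonMark treats as punctuation.
    let categories = ["Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps"];
    let mut found = Vec::new();

    for row in data.lines() {
        // UnicodeData.txt fields: code point; name; general category; ...
        let mut cells = row.split(';');
        let code = cells.next().unwrap_or("");
        let category = cells.nth(1).unwrap_or("");
        if categories.contains(&category) {
            found.push(code.to_string());
        }
    }

    let mut out = format!("pub const PUNCTUATION: [char; {}] = [\n", found.len());
    for code in &found {
        out.push_str(&format!("    '\\u{{{}}}',\n", code));
    }
    out.push_str("];\n");
    fs::write("src/unicode.rs", out)
}
```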
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@
 *.lock
 coverage/
 target
+script/unicode-data.txt
diff --git a/readme.md b/readme.md
@@ -125,6 +125,7 @@ cargo doc --document-private-items
 #### Refactor
 
 - [ ] (1) Use `edit_map` in `subtokenize` (needs to support links in edits)
+- [ ] (1) Use rust to crawl unicode
 
 #### Parse
 
@@ -151,7 +152,6 @@ cargo doc --document-private-items
 #### Misc
 
 - [ ] (1) use `char::REPLACEMENT_CHARACTER`?
-- [ ] (3) Unicode punctuation
 - [ ] (3) `nostd`
 - [ ] (3) Check subtokenizer unraveling is ok
 - [ ] (3) Remove splicing and cloning in subtokenizer
@@ -275,3 +275,4 @@ important.
 things interrupt them each line
 - [x] (3) Add support for interrupting (or not)
 - [x] (5) attention
+- [x] (3) Unicode punctuation
diff --git a/script/generate-unicode.js b/script/generate-unicode.js
new file mode 100644
index 0000000..35150af
--- /dev/null
+++ b/script/generate-unicode.js
@@ -0,0 +1,68 @@
+// To do: port to Rust with `reqwest`?
+import fs from "node:fs/promises";
+
+const dataUrl = new URL("unicode-data.txt", import.meta.url);
+const codeUrl = new URL("../src/unicode.rs", import.meta.url);
+/** @type {string} */
+let data;
+
+try {
+  data = String(await fs.readFile(dataUrl));
+} catch {
+  const response = await fetch(
+    "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
+  );
+  console.log(response);
+  const text = await response.text();
+  await fs.writeFile(dataUrl, text);
+  data = text;
+}
+
+let rows = data.split("\n");
+let index = -1;
+let search = [
+  "Pc", // Punctuation, Connector
+  "Pd", // Punctuation, Dash
+  "Pe", // Punctuation, Close
+  "Pf", // Punctuation, FinalQuote
+  "Pi", // Punctuation, InitialQuote
+  "Po", // Punctuation, Other
+  "Ps", // Punctuation, Open
+];
+/** @type {Array<string>} */
+let found = [];
+
+while (++index < rows.length) {
+  const cells = rows[index].split(";");
+  const [code, , category] = cells;
+  if (search.includes(category)) {
+    found.push(code);
+  }
+}
+
+await fs.writeFile(
+  codeUrl,
+  [
+    "//! Information on Unicode.",
+    "",
+    "/// List of characters that are considered punctuation according to Unicode.",
+    "///",
+    "/// > 👉 **Important**: this module is generated by `script/`.",
+    "/// > It is generate from the latest Unicode data.",
+    "///",
+    "/// Rust does not contain an `is_punctuation` method on `char`, while it does",
+    "/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).",
+    "///",
+    "/// `CommonMark` handles attention (emphasis, strong) markers based on what",
+    "/// comes before or after them.",
+    "/// One such difference is if those characters are Unicode punctuation.",
+    "///",
+    "/// ## References",
+    "///",
+    "/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)",
+    "pub const PUNCTUATION: [char; " + found.length + "] = [",
+    ...found.map((d) => "    '\\u{" + d + "}',"),
+    "];",
+    "",
+  ].join("\n")
+);
diff --git a/script/package.json b/script/package.json
new file mode 100644
index 0000000..3dbc1ca
--- /dev/null
+++ b/script/package.json
@@ -0,0 +1,3 @@
+{
+  "type": "module"
+}
diff --git a/src/compiler.rs b/src/compiler.rs
index b0061ce..2c6fe68 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -1,5 +1,4 @@
 //! Turn events into a string of HTML.
-use std::collections::HashMap;
 use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC};
 use crate::construct::character_reference::Kind as CharacterReferenceKind;
 use crate::tokenizer::{Code, Event, EventType, TokenType};
@@ -10,6 +9,7 @@ use crate::util::{
     sanitize_uri::sanitize_uri,
     span::{codes as codes_from_span, from_exit_event, serialize},
 };
+use std::collections::HashMap;
 
 /// Type of line endings in markdown.
 #[derive(Debug, Clone, PartialEq)]
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index d4541b4..f4bb841 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -1,6 +1,7 @@
 //! To do.
 
 use crate::tokenizer::{Code, Event, EventType, Point, State, StateFnResult, TokenType, Tokenizer};
+use crate::unicode::PUNCTUATION;
 use crate::util::edit_map::EditMap;
 
 /// To do
@@ -421,10 +422,7 @@ fn classify_character(code: Code) -> GroupKind {
         // Unicode whitespace.
         Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace,
         // Unicode punctuation.
-        // To do: `is_punctuation` is not in rust? Why not?
-        // Perhaps we need to generate stuff just like:
-        // <https://github.com/micromark/micromark/blob/main/packages/micromark-util-character/dev/lib/unicode-punctuation-regex.js>.
-        Code::Char(char) if char.is_ascii_punctuation() => GroupKind::Punctuation,
+        Code::Char(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation,
         Code::Char(_) => GroupKind::Other,
     }
 }
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 4a12e0f..3ff948d 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -19,7 +19,6 @@
 //! * [HTML (flow)][crate::construct::html_flow]
 //! * [Thematic break][crate::construct::thematic_break]
 
-use std::collections::HashSet;
 use crate::construct::{
     blank_line::start as blank_line, code_fenced::start as code_fenced,
     code_indented::start as code_indented, definition::start as definition,
@@ -34,6 +33,7 @@ use crate::util::{
     normalize_identifier::normalize_identifier,
     span::{from_exit_event, serialize},
 };
+use std::collections::HashSet;
 
 /// Turn `codes` as the flow content type into events.
 pub fn flow(parse_state: &mut ParseState, point: Point, index: usize) -> Vec<Event> {
diff --git a/src/lib.rs b/src/lib.rs
@@ -11,6 +11,7 @@ mod content;
 mod parser;
 mod subtokenize;
 mod tokenizer;
+mod unicode;
 mod util;
 
 use crate::compiler::compile;
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index ad8aace..f3e9ae0 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -21,11 +21,11 @@
 //! thus the whole document needs to be parsed up to the level of definitions,
 //! before any level that can include references can be parsed.
 
-use std::collections::HashMap;
 use crate::content::{string::start as string, text::start as text};
 use crate::parser::ParseState;
 use crate::tokenizer::{ContentType, Event, EventType, State, StateFn, StateFnResult, Tokenizer};
 use crate::util::span;
+use std::collections::HashMap;
 
 /// Create a link between two [`Event`][]s.
 ///
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9d870c9..1fa94d7 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -11,9 +11,9 @@
 //! [`attempt`]: Tokenizer::attempt
 //! [`check`]: Tokenizer::check
 
-use std::collections::HashMap;
 use crate::constant::TAB_SIZE;
 use crate::parser::ParseState;
+use std::collections::HashMap;
 
 /// Semantic label of a span.
 // To do: figure out how to share this so extensions can add their own stuff,
diff --git a/src/unicode.rs b/src/unicode.rs
new file mode 100644
index 0000000..8107440
--- /dev/null
+++ b/src/unicode.rs
@@ -0,0 +1,838 @@
+//!
Information on Unicode. + +/// List of characters that are considered punctuation according to Unicode. +/// +/// > 👉 **Important**: this module is generated by `script/`. +/// > It is generate from the latest Unicode data. +/// +/// Rust does not contain an `is_punctuation` method on `char`, while it does +/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation). +/// +/// `CommonMark` handles attention (emphasis, strong) markers based on what +/// comes before or after them. +/// One such difference is if those characters are Unicode punctuation. +/// +/// ## References +/// +/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character) +pub const PUNCTUATION: [char; 819] = [ + '\u{0021}', + '\u{0022}', + '\u{0023}', + '\u{0025}', + '\u{0026}', + '\u{0027}', + '\u{0028}', + '\u{0029}', + '\u{002A}', + '\u{002C}', + '\u{002D}', + '\u{002E}', + '\u{002F}', + '\u{003A}', + '\u{003B}', + '\u{003F}', + '\u{0040}', + '\u{005B}', + '\u{005C}', + '\u{005D}', + '\u{005F}', + '\u{007B}', + '\u{007D}', + '\u{00A1}', + '\u{00A7}', + '\u{00AB}', + '\u{00B6}', + '\u{00B7}', + '\u{00BB}', + '\u{00BF}', + '\u{037E}', + '\u{0387}', + '\u{055A}', + '\u{055B}', + '\u{055C}', + '\u{055D}', + '\u{055E}', + '\u{055F}', + '\u{0589}', + '\u{058A}', + '\u{05BE}', + '\u{05C0}', + '\u{05C3}', + '\u{05C6}', + '\u{05F3}', + '\u{05F4}', + '\u{0609}', + '\u{060A}', + '\u{060C}', + '\u{060D}', + '\u{061B}', + '\u{061D}', + '\u{061E}', + '\u{061F}', + '\u{066A}', + '\u{066B}', + '\u{066C}', + '\u{066D}', + '\u{06D4}', + '\u{0700}', + '\u{0701}', + '\u{0702}', + '\u{0703}', + '\u{0704}', + '\u{0705}', + '\u{0706}', + '\u{0707}', + '\u{0708}', + '\u{0709}', + '\u{070A}', + '\u{070B}', + '\u{070C}', + '\u{070D}', + '\u{07F7}', + '\u{07F8}', + '\u{07F9}', + '\u{0830}', + '\u{0831}', + '\u{0832}', + '\u{0833}', + '\u{0834}', + '\u{0835}', + '\u{0836}', + '\u{0837}', + '\u{0838}', + '\u{0839}', + '\u{083A}', + '\u{083B}', + '\u{083C}', + '\u{083D}', + '\u{083E}', + '\u{085E}', + '\u{0964}', + '\u{0965}', + '\u{0970}', + '\u{09FD}', + '\u{0A76}', + '\u{0AF0}', + '\u{0C77}', + '\u{0C84}', + '\u{0DF4}', + '\u{0E4F}', + '\u{0E5A}', + '\u{0E5B}', + '\u{0F04}', + '\u{0F05}', + '\u{0F06}', + '\u{0F07}', + '\u{0F08}', + '\u{0F09}', + '\u{0F0A}', + '\u{0F0B}', + '\u{0F0C}', + '\u{0F0D}', + '\u{0F0E}', + '\u{0F0F}', + '\u{0F10}', + '\u{0F11}', + '\u{0F12}', + '\u{0F14}', + '\u{0F3A}', + '\u{0F3B}', + '\u{0F3C}', + '\u{0F3D}', + '\u{0F85}', + '\u{0FD0}', + '\u{0FD1}', + '\u{0FD2}', + '\u{0FD3}', + '\u{0FD4}', + '\u{0FD9}', + '\u{0FDA}', + '\u{104A}', + '\u{104B}', + '\u{104C}', + '\u{104D}', + '\u{104E}', + '\u{104F}', + '\u{10FB}', + '\u{1360}', + '\u{1361}', + '\u{1362}', + '\u{1363}', + '\u{1364}', + '\u{1365}', + '\u{1366}', + '\u{1367}', + '\u{1368}', + '\u{1400}', + '\u{166E}', + '\u{169B}', + '\u{169C}', + '\u{16EB}', + '\u{16EC}', + '\u{16ED}', + '\u{1735}', + '\u{1736}', + '\u{17D4}', + '\u{17D5}', + '\u{17D6}', + '\u{17D8}', + '\u{17D9}', + '\u{17DA}', + '\u{1800}', + '\u{1801}', + '\u{1802}', + '\u{1803}', + '\u{1804}', + '\u{1805}', + '\u{1806}', + '\u{1807}', + '\u{1808}', + '\u{1809}', + '\u{180A}', + '\u{1944}', + '\u{1945}', + '\u{1A1E}', + '\u{1A1F}', + '\u{1AA0}', + '\u{1AA1}', + '\u{1AA2}', + '\u{1AA3}', + '\u{1AA4}', + '\u{1AA5}', + '\u{1AA6}', + '\u{1AA8}', + '\u{1AA9}', + '\u{1AAA}', + '\u{1AAB}', + '\u{1AAC}', + '\u{1AAD}', + '\u{1B5A}', + '\u{1B5B}', + '\u{1B5C}', + '\u{1B5D}', + '\u{1B5E}', + '\u{1B5F}', + '\u{1B60}', + '\u{1B7D}', + '\u{1B7E}', + 
'\u{1BFC}', + '\u{1BFD}', + '\u{1BFE}', + '\u{1BFF}', + '\u{1C3B}', + '\u{1C3C}', + '\u{1C3D}', + '\u{1C3E}', + '\u{1C3F}', + '\u{1C7E}', + '\u{1C7F}', + '\u{1CC0}', + '\u{1CC1}', + '\u{1CC2}', + '\u{1CC3}', + '\u{1CC4}', + '\u{1CC5}', + '\u{1CC6}', + '\u{1CC7}', + '\u{1CD3}', + '\u{2010}', + '\u{2011}', + '\u{2012}', + '\u{2013}', + '\u{2014}', + '\u{2015}', + '\u{2016}', + '\u{2017}', + '\u{2018}', + '\u{2019}', + '\u{201A}', + '\u{201B}', + '\u{201C}', + '\u{201D}', + '\u{201E}', + '\u{201F}', + '\u{2020}', + '\u{2021}', + '\u{2022}', + '\u{2023}', + '\u{2024}', + '\u{2025}', + '\u{2026}', + '\u{2027}', + '\u{2030}', + '\u{2031}', + '\u{2032}', + '\u{2033}', + '\u{2034}', + '\u{2035}', + '\u{2036}', + '\u{2037}', + '\u{2038}', + '\u{2039}', + '\u{203A}', + '\u{203B}', + '\u{203C}', + '\u{203D}', + '\u{203E}', + '\u{203F}', + '\u{2040}', + '\u{2041}', + '\u{2042}', + '\u{2043}', + '\u{2045}', + '\u{2046}', + '\u{2047}', + '\u{2048}', + '\u{2049}', + '\u{204A}', + '\u{204B}', + '\u{204C}', + '\u{204D}', + '\u{204E}', + '\u{204F}', + '\u{2050}', + '\u{2051}', + '\u{2053}', + '\u{2054}', + '\u{2055}', + '\u{2056}', + '\u{2057}', + '\u{2058}', + '\u{2059}', + '\u{205A}', + '\u{205B}', + '\u{205C}', + '\u{205D}', + '\u{205E}', + '\u{207D}', + '\u{207E}', + '\u{208D}', + '\u{208E}', + '\u{2308}', + '\u{2309}', + '\u{230A}', + '\u{230B}', + '\u{2329}', + '\u{232A}', + '\u{2768}', + '\u{2769}', + '\u{276A}', + '\u{276B}', + '\u{276C}', + '\u{276D}', + '\u{276E}', + '\u{276F}', + '\u{2770}', + '\u{2771}', + '\u{2772}', + '\u{2773}', + '\u{2774}', + '\u{2775}', + '\u{27C5}', + '\u{27C6}', + '\u{27E6}', + '\u{27E7}', + '\u{27E8}', + '\u{27E9}', + '\u{27EA}', + '\u{27EB}', + '\u{27EC}', + '\u{27ED}', + '\u{27EE}', + '\u{27EF}', + '\u{2983}', + '\u{2984}', + '\u{2985}', + '\u{2986}', + '\u{2987}', + '\u{2988}', + '\u{2989}', + '\u{298A}', + '\u{298B}', + '\u{298C}', + '\u{298D}', + '\u{298E}', + '\u{298F}', + '\u{2990}', + '\u{2991}', + '\u{2992}', + '\u{2993}', + '\u{2994}', + '\u{2995}', + '\u{2996}', + '\u{2997}', + '\u{2998}', + '\u{29D8}', + '\u{29D9}', + '\u{29DA}', + '\u{29DB}', + '\u{29FC}', + '\u{29FD}', + '\u{2CF9}', + '\u{2CFA}', + '\u{2CFB}', + '\u{2CFC}', + '\u{2CFE}', + '\u{2CFF}', + '\u{2D70}', + '\u{2E00}', + '\u{2E01}', + '\u{2E02}', + '\u{2E03}', + '\u{2E04}', + '\u{2E05}', + '\u{2E06}', + '\u{2E07}', + '\u{2E08}', + '\u{2E09}', + '\u{2E0A}', + '\u{2E0B}', + '\u{2E0C}', + '\u{2E0D}', + '\u{2E0E}', + '\u{2E0F}', + '\u{2E10}', + '\u{2E11}', + '\u{2E12}', + '\u{2E13}', + '\u{2E14}', + '\u{2E15}', + '\u{2E16}', + '\u{2E17}', + '\u{2E18}', + '\u{2E19}', + '\u{2E1A}', + '\u{2E1B}', + '\u{2E1C}', + '\u{2E1D}', + '\u{2E1E}', + '\u{2E1F}', + '\u{2E20}', + '\u{2E21}', + '\u{2E22}', + '\u{2E23}', + '\u{2E24}', + '\u{2E25}', + '\u{2E26}', + '\u{2E27}', + '\u{2E28}', + '\u{2E29}', + '\u{2E2A}', + '\u{2E2B}', + '\u{2E2C}', + '\u{2E2D}', + '\u{2E2E}', + '\u{2E30}', + '\u{2E31}', + '\u{2E32}', + '\u{2E33}', + '\u{2E34}', + '\u{2E35}', + '\u{2E36}', + '\u{2E37}', + '\u{2E38}', + '\u{2E39}', + '\u{2E3A}', + '\u{2E3B}', + '\u{2E3C}', + '\u{2E3D}', + '\u{2E3E}', + '\u{2E3F}', + '\u{2E40}', + '\u{2E41}', + '\u{2E42}', + '\u{2E43}', + '\u{2E44}', + '\u{2E45}', + '\u{2E46}', + '\u{2E47}', + '\u{2E48}', + '\u{2E49}', + '\u{2E4A}', + '\u{2E4B}', + '\u{2E4C}', + '\u{2E4D}', + '\u{2E4E}', + '\u{2E4F}', + '\u{2E52}', + '\u{2E53}', + '\u{2E54}', + '\u{2E55}', + '\u{2E56}', + '\u{2E57}', + '\u{2E58}', + '\u{2E59}', + '\u{2E5A}', + '\u{2E5B}', + '\u{2E5C}', + '\u{2E5D}', + '\u{3001}', + '\u{3002}', + '\u{3003}', 
+ '\u{3008}', + '\u{3009}', + '\u{300A}', + '\u{300B}', + '\u{300C}', + '\u{300D}', + '\u{300E}', + '\u{300F}', + '\u{3010}', + '\u{3011}', + '\u{3014}', + '\u{3015}', + '\u{3016}', + '\u{3017}', + '\u{3018}', + '\u{3019}', + '\u{301A}', + '\u{301B}', + '\u{301C}', + '\u{301D}', + '\u{301E}', + '\u{301F}', + '\u{3030}', + '\u{303D}', + '\u{30A0}', + '\u{30FB}', + '\u{A4FE}', + '\u{A4FF}', + '\u{A60D}', + '\u{A60E}', + '\u{A60F}', + '\u{A673}', + '\u{A67E}', + '\u{A6F2}', + '\u{A6F3}', + '\u{A6F4}', + '\u{A6F5}', + '\u{A6F6}', + '\u{A6F7}', + '\u{A874}', + '\u{A875}', + '\u{A876}', + '\u{A877}', + '\u{A8CE}', + '\u{A8CF}', + '\u{A8F8}', + '\u{A8F9}', + '\u{A8FA}', + '\u{A8FC}', + '\u{A92E}', + '\u{A92F}', + '\u{A95F}', + '\u{A9C1}', + '\u{A9C2}', + '\u{A9C3}', + '\u{A9C4}', + '\u{A9C5}', + '\u{A9C6}', + '\u{A9C7}', + '\u{A9C8}', + '\u{A9C9}', + '\u{A9CA}', + '\u{A9CB}', + '\u{A9CC}', + '\u{A9CD}', + '\u{A9DE}', + '\u{A9DF}', + '\u{AA5C}', + '\u{AA5D}', + '\u{AA5E}', + '\u{AA5F}', + '\u{AADE}', + '\u{AADF}', + '\u{AAF0}', + '\u{AAF1}', + '\u{ABEB}', + '\u{FD3E}', + '\u{FD3F}', + '\u{FE10}', + '\u{FE11}', + '\u{FE12}', + '\u{FE13}', + '\u{FE14}', + '\u{FE15}', + '\u{FE16}', + '\u{FE17}', + '\u{FE18}', + '\u{FE19}', + '\u{FE30}', + '\u{FE31}', + '\u{FE32}', + '\u{FE33}', + '\u{FE34}', + '\u{FE35}', + '\u{FE36}', + '\u{FE37}', + '\u{FE38}', + '\u{FE39}', + '\u{FE3A}', + '\u{FE3B}', + '\u{FE3C}', + '\u{FE3D}', + '\u{FE3E}', + '\u{FE3F}', + '\u{FE40}', + '\u{FE41}', + '\u{FE42}', + '\u{FE43}', + '\u{FE44}', + '\u{FE45}', + '\u{FE46}', + '\u{FE47}', + '\u{FE48}', + '\u{FE49}', + '\u{FE4A}', + '\u{FE4B}', + '\u{FE4C}', + '\u{FE4D}', + '\u{FE4E}', + '\u{FE4F}', + '\u{FE50}', + '\u{FE51}', + '\u{FE52}', + '\u{FE54}', + '\u{FE55}', + '\u{FE56}', + '\u{FE57}', + '\u{FE58}', + '\u{FE59}', + '\u{FE5A}', + '\u{FE5B}', + '\u{FE5C}', + '\u{FE5D}', + '\u{FE5E}', + '\u{FE5F}', + '\u{FE60}', + '\u{FE61}', + '\u{FE63}', + '\u{FE68}', + '\u{FE6A}', + '\u{FE6B}', + '\u{FF01}', + '\u{FF02}', + '\u{FF03}', + '\u{FF05}', + '\u{FF06}', + '\u{FF07}', + '\u{FF08}', + '\u{FF09}', + '\u{FF0A}', + '\u{FF0C}', + '\u{FF0D}', + '\u{FF0E}', + '\u{FF0F}', + '\u{FF1A}', + '\u{FF1B}', + '\u{FF1F}', + '\u{FF20}', + '\u{FF3B}', + '\u{FF3C}', + '\u{FF3D}', + '\u{FF3F}', + '\u{FF5B}', + '\u{FF5D}', + '\u{FF5F}', + '\u{FF60}', + '\u{FF61}', + '\u{FF62}', + '\u{FF63}', + '\u{FF64}', + '\u{FF65}', + '\u{10100}', + '\u{10101}', + '\u{10102}', + '\u{1039F}', + '\u{103D0}', + '\u{1056F}', + '\u{10857}', + '\u{1091F}', + '\u{1093F}', + '\u{10A50}', + '\u{10A51}', + '\u{10A52}', + '\u{10A53}', + '\u{10A54}', + '\u{10A55}', + '\u{10A56}', + '\u{10A57}', + '\u{10A58}', + '\u{10A7F}', + '\u{10AF0}', + '\u{10AF1}', + '\u{10AF2}', + '\u{10AF3}', + '\u{10AF4}', + '\u{10AF5}', + '\u{10AF6}', + '\u{10B39}', + '\u{10B3A}', + '\u{10B3B}', + '\u{10B3C}', + '\u{10B3D}', + '\u{10B3E}', + '\u{10B3F}', + '\u{10B99}', + '\u{10B9A}', + '\u{10B9B}', + '\u{10B9C}', + '\u{10EAD}', + '\u{10F55}', + '\u{10F56}', + '\u{10F57}', + '\u{10F58}', + '\u{10F59}', + '\u{10F86}', + '\u{10F87}', + '\u{10F88}', + '\u{10F89}', + '\u{11047}', + '\u{11048}', + '\u{11049}', + '\u{1104A}', + '\u{1104B}', + '\u{1104C}', + '\u{1104D}', + '\u{110BB}', + '\u{110BC}', + '\u{110BE}', + '\u{110BF}', + '\u{110C0}', + '\u{110C1}', + '\u{11140}', + '\u{11141}', + '\u{11142}', + '\u{11143}', + '\u{11174}', + '\u{11175}', + '\u{111C5}', + '\u{111C6}', + '\u{111C7}', + '\u{111C8}', + '\u{111CD}', + '\u{111DB}', + '\u{111DD}', + '\u{111DE}', + '\u{111DF}', + '\u{11238}', + '\u{11239}', + 
'\u{1123A}', + '\u{1123B}', + '\u{1123C}', + '\u{1123D}', + '\u{112A9}', + '\u{1144B}', + '\u{1144C}', + '\u{1144D}', + '\u{1144E}', + '\u{1144F}', + '\u{1145A}', + '\u{1145B}', + '\u{1145D}', + '\u{114C6}', + '\u{115C1}', + '\u{115C2}', + '\u{115C3}', + '\u{115C4}', + '\u{115C5}', + '\u{115C6}', + '\u{115C7}', + '\u{115C8}', + '\u{115C9}', + '\u{115CA}', + '\u{115CB}', + '\u{115CC}', + '\u{115CD}', + '\u{115CE}', + '\u{115CF}', + '\u{115D0}', + '\u{115D1}', + '\u{115D2}', + '\u{115D3}', + '\u{115D4}', + '\u{115D5}', + '\u{115D6}', + '\u{115D7}', + '\u{11641}', + '\u{11642}', + '\u{11643}', + '\u{11660}', + '\u{11661}', + '\u{11662}', + '\u{11663}', + '\u{11664}', + '\u{11665}', + '\u{11666}', + '\u{11667}', + '\u{11668}', + '\u{11669}', + '\u{1166A}', + '\u{1166B}', + '\u{1166C}', + '\u{116B9}', + '\u{1173C}', + '\u{1173D}', + '\u{1173E}', + '\u{1183B}', + '\u{11944}', + '\u{11945}', + '\u{11946}', + '\u{119E2}', + '\u{11A3F}', + '\u{11A40}', + '\u{11A41}', + '\u{11A42}', + '\u{11A43}', + '\u{11A44}', + '\u{11A45}', + '\u{11A46}', + '\u{11A9A}', + '\u{11A9B}', + '\u{11A9C}', + '\u{11A9E}', + '\u{11A9F}', + '\u{11AA0}', + '\u{11AA1}', + '\u{11AA2}', + '\u{11C41}', + '\u{11C42}', + '\u{11C43}', + '\u{11C44}', + '\u{11C45}', + '\u{11C70}', + '\u{11C71}', + '\u{11EF7}', + '\u{11EF8}', + '\u{11FFF}', + '\u{12470}', + '\u{12471}', + '\u{12472}', + '\u{12473}', + '\u{12474}', + '\u{12FF1}', + '\u{12FF2}', + '\u{16A6E}', + '\u{16A6F}', + '\u{16AF5}', + '\u{16B37}', + '\u{16B38}', + '\u{16B39}', + '\u{16B3A}', + '\u{16B3B}', + '\u{16B44}', + '\u{16E97}', + '\u{16E98}', + '\u{16E99}', + '\u{16E9A}', + '\u{16FE2}', + '\u{1BC9F}', + '\u{1DA87}', + '\u{1DA88}', + '\u{1DA89}', + '\u{1DA8A}', + '\u{1DA8B}', + '\u{1E95E}', + '\u{1E95F}', +]; diff --git a/src/util/edit_map.rs b/src/util/edit_map.rs index 417f42b..eba667d 100644 --- a/src/util/edit_map.rs +++ b/src/util/edit_map.rs @@ -8,8 +8,8 @@ //! And, in other cases, it’s needed to parse subcontent: pass some events //! through another tokenizer and inject the result. -use std::collections::HashMap; use crate::tokenizer::Event; +use std::collections::HashMap; /// Shift `previous` and `next` links according to `jumps`. /// |
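One property of the generated output worth noting: because `UnicodeData.txt` is ordered by code point and the script preserves that order, the `PUNCTUATION` array is sorted ascending. The commit itself uses a linear `contains` check in `attention.rs`, but the sorted layout would also allow a binary search; a small sketch of that alternative, using a short excerpt of the table:

```rust
/// Short excerpt of the generated table; entries stay in ascending code point order.
const PUNCTUATION: [char; 5] = ['\u{0021}', '\u{002C}', '\u{2026}', '\u{3002}', '\u{FF01}'];

/// Equivalent to `PUNCTUATION.contains(&char)` as long as the table is sorted.
fn is_punctuation(char: char) -> bool {
    PUNCTUATION.binary_search(&char).is_ok()
}

fn main() {
    assert!(is_punctuation('\u{3002}'));
    assert!(!is_punctuation('a'));
}
```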