diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-04 17:28:11 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-04 17:28:11 +0200 |
commit | 2bd50cd8082e686ee74adc0770cc63593b1718f1 (patch) | |
tree | 0d2c7ddd3df290c2f89e4726f9bb6264e68a376b /script/generate-unicode.js | |
parent | 0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (diff) | |
download | markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.gz markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.bz2 markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.zip |
Use Rust to crawl unicode
Diffstat (limited to '')
-rw-r--r-- | script/generate-unicode.js | 68 |
1 files changed, 0 insertions, 68 deletions
diff --git a/script/generate-unicode.js b/script/generate-unicode.js deleted file mode 100644 index 35150af..0000000 --- a/script/generate-unicode.js +++ /dev/null @@ -1,68 +0,0 @@ -// To do: port to Rust with `reqwest`? -import fs from "node:fs/promises"; - -const dataUrl = new URL("unicode-data.txt", import.meta.url); -const codeUrl = new URL("../src/unicode.rs", import.meta.url); -/** @type {string} */ -let data; - -try { - data = String(await fs.readFile(dataUrl)); -} catch { - const response = await fetch( - "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" - ); - console.log(response); - const text = await response.text(); - await fs.writeFile(dataUrl, text); - data = text; -} - -let rows = data.split("\n"); -let index = -1; -let search = [ - "Pc", // Punctuation, Connector - "Pd", // Punctuation, Dash - "Pe", // Punctuation, Close - "Pf", // Punctuation, FinalQuote - "Pi", // Punctuation, InitialQuote - "Po", // Punctuation, Other - "Ps", // Punctuation, Open -]; -/** @type {Array<string>} */ -let found = []; - -while (++index < rows.length) { - const cells = rows[index].split(";"); - const [code, , category] = cells; - if (search.includes(category)) { - found.push(code); - } -} - -await fs.writeFile( - codeUrl, - [ - "//! 
Information on Unicode.", - "", - "/// List of characters that are considered punctuation according to Unicode.", - "///", - "/// > 👉 **Important**: this module is generated by `script/`.", - "/// > It is generate from the latest Unicode data.", - "///", - "/// Rust does not contain an `is_punctuation` method on `char`, while it does", - "/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).", - "///", - "/// `CommonMark` handles attention (emphasis, strong) markers based on what", - "/// comes before or after them.", - "/// One such difference is if those characters are Unicode punctuation.", - "///", - "/// ## References", - "///", - "/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)", - "pub const PUNCTUATION: [char; " + found.length + "] = [", - ...found.map((d) => " '\\u{" + d + "}',"), - "];", - "", - ].join("\n") -); |