// Generates `src/unicode.rs`: a Rust table of every character whose Unicode
// general category is a punctuation category (`P*`).
//
// The Unicode Character Database is read from a local cache next to this
// script and is downloaded (and cached) when that cache does not exist yet.
//
// To do: port to Rust with `reqwest`?
import fs from "node:fs/promises";

/** Where the Unicode Character Database is downloaded from. */
const sourceUrl =
  "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
/** Local cache of the downloaded data, stored next to this script. */
const dataUrl = new URL("unicode-data.txt", import.meta.url);
/** The Rust module that this script writes. */
const codeUrl = new URL("../src/unicode.rs", import.meta.url);

/** General categories (third field of each data row) to collect. */
const punctuationCategories = new Set([
  "Pc", // Punctuation, Connector
  "Pd", // Punctuation, Dash
  "Pe", // Punctuation, Close
  "Pf", // Punctuation, FinalQuote
  "Pi", // Punctuation, InitialQuote
  "Po", // Punctuation, Other
  "Ps", // Punctuation, Open
]);

const data = await loadUnicodeData();
const found = findPunctuation(data);

// An empty table means the data was not `UnicodeData.txt` at all (for example
// a truncated or corrupt cache); refuse to overwrite the Rust module with it.
if (found.length === 0) {
  throw new Error(
    "No punctuation found in Unicode data; remove `unicode-data.txt` and try again"
  );
}

await fs.writeFile(codeUrl, renderModule(found));

/**
 * Get the contents of `UnicodeData.txt`, from the cache if it exists and
 * from the network (writing the cache) otherwise.
 *
 * @returns {Promise<string>}
 *   Contents of `UnicodeData.txt`.
 * @throws {Error}
 *   When the cache cannot be read for a reason other than it being missing,
 *   or when the download does not succeed.
 */
async function loadUnicodeData() {
  try {
    return String(await fs.readFile(dataUrl));
  } catch (error) {
    // Only a missing cache is a reason to download; anything else (such as a
    // permission problem) is a real failure that must not be hidden.
    if (error?.code !== "ENOENT") {
      throw error;
    }
  }

  const response = await fetch(sourceUrl);

  // Without this check an HTTP error page would be cached as if it were data.
  if (!response.ok) {
    throw new Error(
      `Cannot download \`${sourceUrl}\`: ${response.status} ${response.statusText}`
    );
  }

  const text = await response.text();
  await fs.writeFile(dataUrl, text);
  return text;
}

/**
 * Collect the code points of all rows whose category is punctuation.
 *
 * Rows are `;`-separated; the first field is the code point (hexadecimal)
 * and the third field is the general category.
 *
 * @param {string} data
 *   Contents of `UnicodeData.txt`.
 * @returns {Array<string>}
 *   Code points as they appear in the data, in the order of the data.
 */
function findPunctuation(data) {
  /** @type {Array<string>} */
  const found = [];

  for (const row of data.split("\n")) {
    // Blank rows yield an undefined category, which is never in the set.
    const [code, , category] = row.split(";");

    if (punctuationCategories.has(category)) {
      found.push(code);
    }
  }

  return found;
}

/**
 * Render the Rust module that holds the punctuation table.
 *
 * @param {Array<string>} found
 *   Code points (hexadecimal, without prefix).
 * @returns {string}
 *   Rust source code.
 */
function renderModule(found) {
  return [
    "//! Information on Unicode.",
    "",
    "/// List of characters that are considered punctuation according to Unicode.",
    "///",
    "/// > 👉 **Important**: this module is generated by `script/`.",
    "/// > It is generated from the latest Unicode data.",
    "///",
    "/// Rust does not contain an `is_punctuation` method on `char`, while it does",
    "/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).",
    "///",
    "/// `CommonMark` handles attention (emphasis, strong) markers based on what",
    "/// comes before or after them.",
    "/// One such difference is if those characters are Unicode punctuation.",
    "///",
    "/// ## References",
    "///",
    "/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)",
    `pub const PUNCTUATION: [char; ${found.length}] = [`,
    // The doubled backslash emits a literal `\u{…}` Rust escape.
    ...found.map((code) => `    '\\u{${code}}',`),
    "];",
    "",
  ].join("\n");
}