Add support for unicode punctuation

author: Titus Wormer <tituswormer@gmail.com> 2022-07-04 15:21:11 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-07-04 15:21:11 +0200
commit: 0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (patch)
tree: cc73bb48ae6102b27b7b864f13585eb77ef86c2c /script/generate-unicode.js
parent: 8eb4631bd7c4345ec2a0c9b2ca2e05bdb1d79dd7 (diff)
download: markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.gz
markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.bz2
markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.zip
1 files changed, 68 insertions, 0 deletions
diff --git a/script/generate-unicode.js b/script/generate-unicode.js
new file mode 100644
index 0000000..35150af
--- /dev/null
+++ b/script/generate-unicode.js
@@ -0,0 +1,68 @@
+// Generate `src/unicode.rs` from the (locally cached) `UnicodeData.txt`.
+// To do: port to Rust with `reqwest`?
+import fs from "node:fs/promises";
+
+const dataUrl = new URL("unicode-data.txt", import.meta.url);
+const codeUrl = new URL("../src/unicode.rs", import.meta.url);
+/** @type {string} */
+let data;
+
+try {
+  data = String(await fs.readFile(dataUrl));
+} catch {
+  const response = await fetch(
+    "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
+  );
+  // Do not cache an HTTP error page: later runs would silently read it.
+  if (!response.ok) {
+    throw new Error("Could not fetch Unicode data: HTTP " + response.status);
+  }
+  data = await response.text();
+  await fs.writeFile(dataUrl, data);
+}
+
+const search = new Set([
+  "Pc", // Punctuation, Connector
+  "Pd", // Punctuation, Dash
+  "Pe", // Punctuation, Close
+  "Pf", // Punctuation, FinalQuote
+  "Pi", // Punctuation, InitialQuote
+  "Po", // Punctuation, Other
+  "Ps", // Punctuation, Open
+]);
+/** @type {Array<string>} */
+const found = [];
+
+for (const row of data.split("\n")) {
+  const [code, , category] = row.split(";"); // code;name;category;…
+  if (search.has(category)) {
+    found.push(code);
+  }
+}
+
+await fs.writeFile(
+  codeUrl,
+  [
+    "//! Information on Unicode.",
+    "",
+    "/// List of characters that are considered punctuation according to Unicode.",
+    "///",
+    "/// > 👉 **Important**: this module is generated by `script/`.",
+    "/// > It is generated from the latest Unicode data.",
+    "///",
+    "/// Rust does not contain an `is_punctuation` method on `char`, while it does",
+    "/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).",
+    "///",
+    "/// `CommonMark` handles attention (emphasis, strong) markers based on what",
+    "/// comes before or after them.",
+    "/// One such difference is if those characters are Unicode punctuation.",
+    "///",
+    "/// ## References",
+    "///",
+    "/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)",
+    "pub const PUNCTUATION: [char; " + found.length + "] = [",
+    ...found.map((d) => "    '\\u{" + d + "}',"),
+    "];",
+    "",
+  ].join("\n")
+);
author	Titus Wormer <tituswormer@gmail.com>	2022-07-04 15:21:11 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-07-04 15:21:11 +0200
commit	0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (patch)
tree	cc73bb48ae6102b27b7b864f13585eb77ef86c2c /script/generate-unicode.js
parent	8eb4631bd7c4345ec2a0c9b2ca2e05bdb1d79dd7 (diff)
download	markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.gz markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.bz2 markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.zip