aboutsummaryrefslogtreecommitdiffstats
path: root/script/generate-unicode.js
blob: 35150aff62052dc3d021837b2fa160693d103bd0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
// To do: port to Rust with `reqwest`?
import fs from "node:fs/promises";

// Local cache of the Unicode character database, stored next to this script.
const dataUrl = new URL("unicode-data.txt", import.meta.url);
// Generated Rust module, resolved relative to this script.
const codeUrl = new URL("../src/unicode.rs", import.meta.url);
/** @type {string} */
let data;

// Prefer the cached copy; if it cannot be read, download the data and cache
// it for later runs.
try {
  data = await fs.readFile(dataUrl, "utf8");
} catch {
  const response = await fetch(
    "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
  );

  // `fetch` only rejects on network failures: an HTTP error (404, 500, …)
  // still resolves.
  // Without this check the error page would be cached as if it were Unicode
  // data, and every later run would silently read that bad cache.
  if (!response.ok) {
    throw new Error(
      `Cannot download Unicode data: ${response.status} ${response.statusText}`
    );
  }

  data = await response.text();
  await fs.writeFile(dataUrl, data);
}

// General categories that count as punctuation.
// The category is the third `;`-separated field of each row in the data.
const punctuationCategories = new Set([
  "Pc", // Punctuation, Connector
  "Pd", // Punctuation, Dash
  "Pe", // Punctuation, Close
  "Pf", // Punctuation, FinalQuote
  "Pi", // Punctuation, InitialQuote
  "Po", // Punctuation, Other
  "Ps", // Punctuation, Open
]);

// Code values (first field of each row), in file order, of every row whose
// category is one of the punctuation categories above.
/** @type {Array<string>} */
const found = data
  .split("\n")
  .map((row) => row.split(";"))
  .filter(([, , category]) => punctuationCategories.has(category))
  .map(([code]) => code);

// Never replace the generated module with an empty table: finding no
// punctuation at all means the input data is missing or corrupt.
if (found.length === 0) {
  throw new Error(
    "No punctuation found in the Unicode data, refusing to overwrite `src/unicode.rs`"
  );
}

// Write the Rust module: a documented, fixed-size `char` array containing one
// `'\u{…}'` literal per code found above, in the same order.
await fs.writeFile(
  codeUrl,
  [
    "//! Information on Unicode.",
    "",
    "/// List of characters that are considered punctuation according to Unicode.",
    "///",
    "/// > 👉 **Important**: this module is generated by `script/`.",
    "/// > It is generated from the latest Unicode data.",
    "///",
    "/// Rust does not contain an `is_punctuation` method on `char`, while it does",
    "/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).",
    "///",
    "/// `CommonMark` handles attention (emphasis, strong) markers based on what",
    "/// comes before or after them.",
    "/// One such difference is if those characters are Unicode punctuation.",
    "///",
    "/// ## References",
    "///",
    "/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)",
    `pub const PUNCTUATION: [char; ${found.length}] = [`,
    ...found.map((code) => `    '\\u{${code}}',`),
    "];",
    "",
  ].join("\n")
);