diff options
| author | 2022-07-04 15:21:11 +0200 | |
|---|---|---|
| committer | 2022-07-04 15:21:11 +0200 | |
| commit | 0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (patch) | |
| tree | cc73bb48ae6102b27b7b864f13585eb77ef86c2c /script/generate-unicode.js | |
| parent | 8eb4631bd7c4345ec2a0c9b2ca2e05bdb1d79dd7 (diff) | |
| download | markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.gz markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.tar.bz2 markdown-rs-0450e7c2b12bd3ef53e0cffb60a3dd860325b478.zip | |
Add support for unicode punctuation
Diffstat (limited to 'script/generate-unicode.js')
| -rw-r--r-- | script/generate-unicode.js | 68 | 
1 files changed, 68 insertions, 0 deletions
// To do: port to Rust with `reqwest`?
//
// Generates `src/unicode.rs`: a Rust module with a `PUNCTUATION` constant that
// lists every code point whose Unicode general category is a punctuation
// category.
// `UnicodeData.txt` is read from a cache file next to this script
// (`unicode-data.txt`); when that file does not exist yet, the data is
// downloaded from unicode.org and stored there for later runs.
import fs from "node:fs/promises";

const dataUrl = new URL("unicode-data.txt", import.meta.url);
const codeUrl = new URL("../src/unicode.rs", import.meta.url);
const sourceUrl =
  "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";

/** General categories that are collected into `PUNCTUATION`. */
const punctuationCategories = new Set([
  "Pc", // Punctuation, Connector
  "Pd", // Punctuation, Dash
  "Pe", // Punctuation, Close
  "Pf", // Punctuation, FinalQuote
  "Pi", // Punctuation, InitialQuote
  "Po", // Punctuation, Other
  "Ps", // Punctuation, Open
]);

/**
 * Get the contents of `UnicodeData.txt`, from the local cache if it exists,
 * otherwise from unicode.org (and then write the cache).
 *
 * @returns {Promise<string>}
 *   Contents of `UnicodeData.txt`.
 * @throws {Error}
 *   When the cache exists but cannot be read, or when the download fails.
 */
async function loadUnicodeData() {
  try {
    return await fs.readFile(dataUrl, "utf8");
  } catch (error) {
    // Only a missing cache means "download it"; anything else (permissions,
    // the path being a directory, …) is a real problem that should surface.
    const { code } = /** @type {NodeJS.ErrnoException} */ (error);
    if (code !== "ENOENT") {
      throw error;
    }
  }

  const response = await fetch(sourceUrl);

  // Without this check an HTTP error page would be cached as if it were
  // Unicode data and every later run would silently use it.
  if (!response.ok) {
    throw new Error(
      `Could not download \`${sourceUrl}\`: ${response.status} ${response.statusText}`
    );
  }

  const text = await response.text();
  await fs.writeFile(dataUrl, text);
  return text;
}

/**
 * Find the code points that have a punctuation category.
 *
 * Each row of `UnicodeData.txt` is a `;`-separated record: the first field is
 * the code point in hexadecimal, the third field is the general category.
 *
 * NOTE(review): rows that describe a range (`<…, First>` / `<…, Last>`) are
 * not expanded; that is only correct as long as no such range has a
 * punctuation category — confirm when updating Unicode versions.
 *
 * @param {string} data
 *   Contents of `UnicodeData.txt`.
 * @returns {Array<string>}
 *   Hexadecimal code points, in the order they occur in `data`.
 */
function findPunctuation(data) {
  /** @type {Array<string>} */
  const codes = [];

  // Accept both `\n` and `\r\n` so a cache saved with Windows line endings
  // still parses. Blank rows have no category and are skipped by the lookup.
  for (const row of data.split(/\r?\n/)) {
    const [code, , category] = row.split(";");

    if (punctuationCategories.has(category)) {
      codes.push(code);
    }
  }

  return codes;
}

/**
 * Generate the source of the Rust module.
 *
 * @param {Array<string>} codes
 *   Hexadecimal code points to list in `PUNCTUATION`.
 * @returns {string}
 *   Rust source code.
 */
function generateModule(codes) {
  return [
    "//! Information on Unicode.",
    "",
    "/// List of characters that are considered punctuation according to Unicode.",
    "///",
    "/// > 👉 **Important**: this module is generated by `script/`.",
    "/// > It is generated from the latest Unicode data.",
    "///",
    "/// Rust does not contain an `is_punctuation` method on `char`, while it does",
    "/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).",
    "///",
    "/// `CommonMark` handles attention (emphasis, strong) markers based on what",
    "/// comes before or after them.",
    "/// One such difference is if those characters are Unicode punctuation.",
    "///",
    "/// ## References",
    "///",
    "/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)",
    `pub const PUNCTUATION: [char; ${codes.length}] = [`,
    ...codes.map((code) => `    '\\u{${code}}',`),
    "];",
    "",
  ].join("\n");
}

const data = await loadUnicodeData();
const found = findPunctuation(data);

// An empty list means the data is not `UnicodeData.txt` (for example a
// corrupt cache); do not overwrite the Rust module with an empty table.
if (found.length === 0) {
  throw new Error(
    "No punctuation found in Unicode data; remove `script/unicode-data.txt` and try again"
  );
}

await fs.writeFile(codeUrl, generateModule(found));
