Use Rust to crawl unicode

author: Titus Wormer <tituswormer@gmail.com> 2022-07-04 17:28:11 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-07-04 17:28:11 +0200
commit: 2bd50cd8082e686ee74adc0770cc63593b1718f1 (patch)
tree: 0d2c7ddd3df290c2f89e4726f9bb6264e68a376b
parent: 0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (diff)
download: markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.gz
markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.bz2
markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.zip
6 files changed, 78 insertions, 75 deletions
diff --git a/.gitignore b/.gitignore
index 0247d2c..ec3af4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,4 @@
 *.lock
 coverage/
 target
-script/unicode-data.txt
+unicode-data.txt
diff --git a/Cargo.toml b/Cargo.toml
index 96f23d7..1c443c3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 name = "micromark"
 version = "0.0.0"
 authors = ["Titus Wormer <tituswormer@gmail.com>"]
-edition = "2015"
+edition = "2018"
 rust-version = "1.56"
 description = "small commonmark compliant markdown parser with positional info and concrete tokens"
 homepage = "https://github.com/micromark/micromark-rs"
@@ -14,5 +14,9 @@ include = ["src/", "license"]
 publish = false
 
 [dependencies]
-log = "0.4"
 env_logger = "0.9"
+log = "0.4"
+
+[build-dependencies]
+reqwest = "0.11"
+tokio = { version = "1.12", features = ["full"] }
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..6a05dfe
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,70 @@
+extern crate reqwest;
+use std::fs;
+
+/// Generates `src/unicode.rs` from `UnicodeData.txt`, cached as `unicode-data.txt`.
+#[tokio::main]
+async fn main() {
+    let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
+    let data_url = "unicode-data.txt";
+    let code_url = "src/unicode.rs";
+
+    // Reuse the cached copy; only download when it cannot be read.
+    let value = if let Ok(value) = fs::read_to_string(data_url) {
+        value
+    } else {
+        let response = reqwest::get(url).await.and_then(|r| r.error_for_status());
+        let value = response.expect("download failed").text().await.expect("invalid body");
+        // Borrow instead of cloning: the text is returned right after caching it.
+        fs::write(data_url, &value).expect("cache Unicode data");
+        value
+    };
+
+    let search = [
+        "Pc", // Punctuation, Connector
+        "Pd", // Punctuation, Dash
+        "Pe", // Punctuation, Close
+        "Pf", // Punctuation, FinalQuote
+        "Pi", // Punctuation, InitialQuote
+        "Po", // Punctuation, Other
+        "Ps", // Punctuation, Open
+    ];
+
+    // Rows are `code;name;category;…`; malformed rows are skipped, not indexed blindly.
+    let found = value
+        .lines()
+        .filter_map(|line| {
+            let mut cells = line.split(';');
+            let code = cells.next()?;
+            search.contains(&cells.nth(1)?).then(|| code)
+        })
+        .collect::<Vec<_>>();
+    assert!(!found.is_empty(), "no punctuation found in `{}`", data_url);
+
+    let doc = format!(
+        "//! Information on Unicode.
+
+/// List of characters that are considered punctuation according to Unicode.
+///
+/// > 👉 **Important**: this module is generated by `build.rs`.
+/// > It is generated from the latest Unicode data.
+///
+/// Rust does not contain an `is_punctuation` method on `char`, while it does
+/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).
+///
+/// `CommonMark` handles attention (emphasis, strong) markers based on what
+/// comes before or after them.
+/// One such difference is if those characters are Unicode punctuation.
+///
+/// ## References
+///
+/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
+pub const PUNCTUATION: [char; {}] = [
+{}
+];
+",
+    found.len(),
+    found.iter().map(|d| format!("    '\\u{{{}}}',", d)).collect::<Vec<_>>().join("\n")
+    );
+
+    fs::write(code_url, doc).expect("write `src/unicode.rs`");
+}
diff --git a/readme.md b/readme.md
index 6c3ecf3..5d0ffff 100644
--- a/readme.md
+++ b/readme.md
@@ -125,7 +125,6 @@ cargo doc --document-private-items
 #### Refactor
 
 - [ ] (1) Use `edit_map` in `subtokenize` (needs to support links in edits)
-- [ ] (1) Use rust to crawl unicode
 
 #### Parse
 
@@ -276,3 +275,4 @@ important.
 - [x] (3) Add support for interrupting (or not)
 - [x] (5) attention
 - [x] (3) Unicode punctuation
+- [x] (1) Use rust to crawl unicode
diff --git a/script/generate-unicode.js b/script/generate-unicode.js
deleted file mode 100644
index 35150af..0000000
--- a/script/generate-unicode.js
+++ /dev/null
@@ -1,68 +0,0 @@
-// To do: port to Rust with `reqwest`?
-import fs from "node:fs/promises";
-
-const dataUrl = new URL("unicode-data.txt", import.meta.url);
-const codeUrl = new URL("../src/unicode.rs", import.meta.url);
-/** @type {string} */
-let data;
-
-try {
-  data = String(await fs.readFile(dataUrl));
-} catch {
-  const response = await fetch(
-    "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
-  );
-  console.log(response);
-  const text = await response.text();
-  await fs.writeFile(dataUrl, text);
-  data = text;
-}
-
-let rows = data.split("\n");
-let index = -1;
-let search = [
-  "Pc", // Punctuation, Connector
-  "Pd", // Punctuation, Dash
-  "Pe", // Punctuation, Close
-  "Pf", // Punctuation, FinalQuote
-  "Pi", // Punctuation, InitialQuote
-  "Po", // Punctuation, Other
-  "Ps", // Punctuation, Open
-];
-/** @type {Array<string>} */
-let found = [];
-
-while (++index < rows.length) {
-  const cells = rows[index].split(";");
-  const [code, , category] = cells;
-  if (search.includes(category)) {
-    found.push(code);
-  }
-}
-
-await fs.writeFile(
-  codeUrl,
-  [
-    "//! Information on Unicode.",
-    "",
-    "/// List of characters that are considered punctuation according to Unicode.",
-    "///",
-    "/// > 👉 **Important**: this module is generated by `script/`.",
-    "/// > It is generate from the latest Unicode data.",
-    "///",
-    "/// Rust does not contain an `is_punctuation` method on `char`, while it does",
-    "/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).",
-    "///",
-    "/// `CommonMark` handles attention (emphasis, strong) markers based on what",
-    "/// comes before or after them.",
-    "/// One such difference is if those characters are Unicode punctuation.",
-    "///",
-    "/// ## References",
-    "///",
-    "/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)",
-    "pub const PUNCTUATION: [char; " + found.length + "] = [",
-    ...found.map((d) => "    '\\u{" + d + "}',"),
-    "];",
-    "",
-  ].join("\n")
-);
diff --git a/script/package.json b/script/package.json
deleted file mode 100644
index 3dbc1ca..0000000
--- a/script/package.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "type": "module"
-}
author	Titus Wormer <tituswormer@gmail.com>	2022-07-04 17:28:11 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-07-04 17:28:11 +0200
commit	2bd50cd8082e686ee74adc0770cc63593b1718f1 (patch)
tree	0d2c7ddd3df290c2f89e4726f9bb6264e68a376b
parent	0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (diff)
download	markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.gz markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.bz2 markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.zip