From 2bd50cd8082e686ee74adc0770cc63593b1718f1 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 4 Jul 2022 17:28:11 +0200 Subject: Use Rust to crawl unicode --- .gitignore | 2 +- Cargo.toml | 8 ++++-- build.rs | 70 ++++++++++++++++++++++++++++++++++++++++++++++ readme.md | 2 +- script/generate-unicode.js | 68 -------------------------------------------- script/package.json | 3 -- 6 files changed, 78 insertions(+), 75 deletions(-) create mode 100644 build.rs delete mode 100644 script/generate-unicode.js delete mode 100644 script/package.json diff --git a/.gitignore b/.gitignore index 0247d2c..ec3af4f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ *.lock coverage/ target -script/unicode-data.txt +unicode-data.txt diff --git a/Cargo.toml b/Cargo.toml index 96f23d7..1c443c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "micromark" version = "0.0.0" authors = ["Titus Wormer "] -edition = "2015" +edition = "2018" rust-version = "1.56" description = "small commonmark compliant markdown parser with positional info and concrete tokens" homepage = "https://github.com/micromark/micromark-rs" @@ -14,5 +14,9 @@ include = ["src/", "license"] publish = false [dependencies] -log = "0.4" env_logger = "0.9" +log = "0.4" + +[build-dependencies] +reqwest = "0.11" +tokio = { version = "1.12", features = ["full"] } diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..6a05dfe --- /dev/null +++ b/build.rs @@ -0,0 +1,70 @@ +extern crate reqwest; +use std::fs; + +#[tokio::main] +async fn main() { + let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"; + let data_url = "unicode-data.txt"; + let code_url = "src/unicode.rs"; + + let value = if let Ok(value) = fs::read_to_string(data_url) { + value + } else { + let value = reqwest::get(url) + .await + .unwrap() + .text() + .await + .unwrap(); + + fs::write(data_url, value.clone()).unwrap(); + + value + }; + + let search = [ + "Pc", // Punctuation, Connector + "Pd", // 
Punctuation, Dash + "Pe", // Punctuation, Close + "Pf", // Punctuation, FinalQuote + "Pi", // Punctuation, InitialQuote + "Po", // Punctuation, Other + "Ps", // Punctuation, Open + ]; + + let found = value + .lines() + .map(|line| line.split(';').collect::<Vec<_>>()) + .map(|cells| (cells[0], cells[2])) + .filter(|c| search.contains(&c.1)) + .map(|c| c.0) + .collect::<Vec<_>>(); + + let doc = format!( + "//! Information on Unicode. + +/// List of characters that are considered punctuation according to Unicode. +/// +/// > 👉 **Important**: this module is generated by `script/`. +/// > It is generate from the latest Unicode data. +/// +/// Rust does not contain an `is_punctuation` method on `char`, while it does +/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation). +/// +/// `CommonMark` handles attention (emphasis, strong) markers based on what +/// comes before or after them. +/// One such difference is if those characters are Unicode punctuation. +/// +/// ## References +/// +/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character) +pub const PUNCTUATION: [char; {}] = [ +{} +]; +", + found.len(), + found.iter().map(|d| format!(" '\\u{{{}}}',", d)).collect::<Vec<_>>().join("\n") + ); + + fs::write(code_url, doc).unwrap(); +} diff --git a/readme.md b/readme.md index 6c3ecf3..5d0ffff 100644 --- a/readme.md +++ b/readme.md @@ -125,7 +125,6 @@ cargo doc --document-private-items #### Refactor - [ ] (1) Use `edit_map` in `subtokenize` (needs to support links in edits) -- [ ] (1) Use rust to crawl unicode #### Parse @@ -276,3 +275,4 @@ important. - [x] (3) Add support for interrupting (or not) - [x] (5) attention - [x] (3) Unicode punctuation +- [x] (1) Use rust to crawl unicode diff --git a/script/generate-unicode.js b/script/generate-unicode.js deleted file mode 100644 index 35150af..0000000 --- a/script/generate-unicode.js +++ /dev/null @@ -1,68 +0,0 @@ -// To do: port to Rust with `reqwest`? 
-import fs from "node:fs/promises"; - -const dataUrl = new URL("unicode-data.txt", import.meta.url); -const codeUrl = new URL("../src/unicode.rs", import.meta.url); -/** @type {string} */ -let data; - -try { - data = String(await fs.readFile(dataUrl)); -} catch { - const response = await fetch( - "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" - ); - console.log(response); - const text = await response.text(); - await fs.writeFile(dataUrl, text); - data = text; -} - -let rows = data.split("\n"); -let index = -1; -let search = [ - "Pc", // Punctuation, Connector - "Pd", // Punctuation, Dash - "Pe", // Punctuation, Close - "Pf", // Punctuation, FinalQuote - "Pi", // Punctuation, InitialQuote - "Po", // Punctuation, Other - "Ps", // Punctuation, Open -]; -/** @type {Array} */ -let found = []; - -while (++index < rows.length) { - const cells = rows[index].split(";"); - const [code, , category] = cells; - if (search.includes(category)) { - found.push(code); - } -} - -await fs.writeFile( - codeUrl, - [ - "//! 
Information on Unicode.", - "", - "/// List of characters that are considered punctuation according to Unicode.", - "///", - "/// > 👉 **Important**: this module is generated by `script/`.", - "/// > It is generate from the latest Unicode data.", - "///", - "/// Rust does not contain an `is_punctuation` method on `char`, while it does", - "/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).", - "///", - "/// `CommonMark` handles attention (emphasis, strong) markers based on what", - "/// comes before or after them.", - "/// One such difference is if those characters are Unicode punctuation.", - "///", - "/// ## References", - "///", - "/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)", - "pub const PUNCTUATION: [char; " + found.length + "] = [", - ...found.map((d) => " '\\u{" + d + "}',"), - "];", - "", - ].join("\n") -); diff --git a/script/package.json b/script/package.json deleted file mode 100644 index 3dbc1ca..0000000 --- a/script/package.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "module" -} -- cgit