aboutsummaryrefslogtreecommitdiffstats
path: root/build.rs
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-04 17:28:11 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-04 17:28:11 +0200
commit 2bd50cd8082e686ee74adc0770cc63593b1718f1 (patch)
tree 0d2c7ddd3df290c2f89e4726f9bb6264e68a376b /build.rs
parent 0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (diff)
download markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.gz
markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.bz2
markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.zip
Use Rust to crawl unicode
Diffstat (limited to 'build.rs')
-rw-r--r-- build.rs 70
1 files changed, 70 insertions, 0 deletions
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..6a05dfe
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,70 @@
+extern crate reqwest;
+use std::fs;
+
+#[tokio::main]
+async fn main() {
+ let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
+ let data_url = "unicode-data.txt";
+ let code_url = "src/unicode.rs";
+
+ let value = if let Ok(value) = fs::read_to_string(data_url) {
+ value
+ } else {
+ let value = reqwest::get(url)
+ .await
+ .unwrap()
+ .text()
+ .await
+ .unwrap();
+
+ fs::write(data_url, value.clone()).unwrap();
+
+ value
+ };
+
+ let search = [
+ "Pc", // Punctuation, Connector
+ "Pd", // Punctuation, Dash
+ "Pe", // Punctuation, Close
+ "Pf", // Punctuation, FinalQuote
+ "Pi", // Punctuation, InitialQuote
+ "Po", // Punctuation, Other
+ "Ps", // Punctuation, Open
+ ];
+
+ let found = value
+ .lines()
+ .map(|line| line.split(';').collect::<Vec<_>>())
+ .map(|cells| (cells[0], cells[2]))
+ .filter(|c| search.contains(&c.1))
+ .map(|c| c.0)
+ .collect::<Vec<_>>();
+
+ let doc = format!(
+ "//! Information on Unicode.
+
+/// List of characters that are considered punctuation according to Unicode.
+///
+/// > 👉 **Important**: this module is generated by `script/`.
+/// > It is generate from the latest Unicode data.
+///
+/// Rust does not contain an `is_punctuation` method on `char`, while it does
+/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).
+///
+/// `CommonMark` handles attention (emphasis, strong) markers based on what
+/// comes before or after them.
+/// One such difference is if those characters are Unicode punctuation.
+///
+/// ## References
+///
+/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
+pub const PUNCTUATION: [char; {}] = [
+{}
+];
+",
+ found.len(),
+ found.iter().map(|d| format!(" '\\u{{{}}}',", d)).collect::<Vec<_>>().join("\n")
+ );
+
+ fs::write(code_url, doc).unwrap();
+}