diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-04 17:28:11 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-04 17:28:11 +0200 |
commit | 2bd50cd8082e686ee74adc0770cc63593b1718f1 (patch) | |
tree | 0d2c7ddd3df290c2f89e4726f9bb6264e68a376b /build.rs | |
parent | 0450e7c2b12bd3ef53e0cffb60a3dd860325b478 (diff) | |
download | markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.gz markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.tar.bz2 markdown-rs-2bd50cd8082e686ee74adc0770cc63593b1718f1.zip |
Use Rust to crawl unicode
Diffstat (limited to '')
-rw-r--r-- | build.rs | 70 |
1 files changed, 70 insertions, 0 deletions
diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..6a05dfe --- /dev/null +++ b/build.rs @@ -0,0 +1,70 @@ +extern crate reqwest; +use std::fs; + +#[tokio::main] +async fn main() { + let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"; + let data_url = "unicode-data.txt"; + let code_url = "src/unicode.rs"; + + let value = if let Ok(value) = fs::read_to_string(data_url) { + value + } else { + let value = reqwest::get(url) + .await + .unwrap() + .text() + .await + .unwrap(); + + fs::write(data_url, value.clone()).unwrap(); + + value + }; + + let search = [ + "Pc", // Punctuation, Connector + "Pd", // Punctuation, Dash + "Pe", // Punctuation, Close + "Pf", // Punctuation, FinalQuote + "Pi", // Punctuation, InitialQuote + "Po", // Punctuation, Other + "Ps", // Punctuation, Open + ]; + + let found = value + .lines() + .map(|line| line.split(';').collect::<Vec<_>>()) + .map(|cells| (cells[0], cells[2])) + .filter(|c| search.contains(&c.1)) + .map(|c| c.0) + .collect::<Vec<_>>(); + + let doc = format!( + "//! Information on Unicode. + +/// List of characters that are considered punctuation according to Unicode. +/// +/// > ๐ **Important**: this module is generated by `script/`. +/// > It is generate from the latest Unicode data. +/// +/// Rust does not contain an `is_punctuation` method on `char`, while it does +/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation). +/// +/// `CommonMark` handles attention (emphasis, strong) markers based on what +/// comes before or after them. +/// One such difference is if those characters are Unicode punctuation. +/// +/// ## References +/// +/// * [*ยง 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character) +pub const PUNCTUATION: [char; {}] = [ +{} +]; +", + found.len(), + found.iter().map(|d| format!(" '\\u{{{}}}',", d)).collect::<Vec<_>>().join("\n") + ); + + fs::write(code_url, doc).unwrap(); +} |