1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
extern crate reqwest;
use std::fs;
/// Regenerates `src/unicode.rs` from the Unicode Character Database.
///
/// `UnicodeData.txt` is taken from the local cache file `unicode-data.txt`
/// when that file can be read; otherwise it is downloaded from unicode.org
/// and written to the cache before use.
/// The code points whose general category is a punctuation category are then
/// rendered as a `PUNCTUATION` constant and written to `src/unicode.rs`.
///
/// # Panics
///
/// Panics if the download fails or answers with an HTTP error status, or if
/// the cache file or the generated module cannot be written.
#[tokio::main]
async fn main() {
    let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
    let data_path = "unicode-data.txt";
    let code_path = "src/unicode.rs";

    let data = match fs::read_to_string(data_path) {
        Ok(cached) => cached,
        // Any read failure (typically: no cache yet) falls back to the network.
        Err(_) => {
            let downloaded = reqwest::get(url)
                .await
                .expect("failed to request UnicodeData.txt")
                // Refuse to cache an HTTP error page as if it were the data file.
                .error_for_status()
                .expect("server answered with an error status for UnicodeData.txt")
                .text()
                .await
                .expect("failed to read the UnicodeData.txt response body");
            fs::write(data_path, &downloaded).expect("failed to write the local data cache");
            downloaded
        }
    };

    let found = punctuation_code_points(&data);
    fs::write(code_path, render_module(&found)).expect("failed to write the generated module");
}

/// General category values that mark a character as punctuation.
const PUNCTUATION_CATEGORIES: [&str; 7] = [
    "Pc", // Punctuation, Connector
    "Pd", // Punctuation, Dash
    "Pe", // Punctuation, Close
    "Pf", // Punctuation, FinalQuote
    "Pi", // Punctuation, InitialQuote
    "Po", // Punctuation, Other
    "Ps", // Punctuation, Open
];

/// Returns the first field (the code point) of every `;`-separated record in
/// `data` whose third field is one of [`PUNCTUATION_CATEGORIES`].
///
/// Lines with fewer than three fields (for example blank lines) are skipped
/// instead of causing a panic.
fn punctuation_code_points(data: &str) -> Vec<&str> {
    data.lines()
        .filter_map(|line| {
            let mut fields = line.split(';');
            let code_point = fields.next()?;
            // `nth(1)` skips the second field and yields the third one.
            let category = fields.nth(1)?;
            PUNCTUATION_CATEGORIES
                .contains(&category)
                .then_some(code_point)
        })
        .collect()
}

/// Renders the source text of the generated module: a documented
/// `PUNCTUATION` array with one `'\u{…}'` entry per item of `code_points`.
fn render_module(code_points: &[&str]) -> String {
    let entries = code_points
        .iter()
        .map(|code_point| format!(" '\\u{{{}}}',", code_point))
        .collect::<Vec<_>>()
        .join("\n");

    // The literal below is emitted verbatim, so its lines must stay unindented.
    format!(
        "//! Information on Unicode.
/// List of characters that are considered punctuation according to Unicode.
///
/// > 👉 **Important**: this module is generated by `script/`.
/// > It is generated from the latest Unicode data.
///
/// Rust does not contain an `is_punctuation` method on `char`, while it does
/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).
///
/// `CommonMark` handles attention (emphasis, strong) markers based on what
/// comes before or after them.
/// One such difference is if those characters are Unicode punctuation.
///
/// ## References
///
/// * [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
pub const PUNCTUATION: [char; {}] = [
{}
];
",
        code_points.len(),
        entries
    )
}
|