build.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

extern crate reqwest;
use std::fs;

#[tokio::main]
async fn main() {
    let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
    let data_url = "unicode-data.txt";
    let code_url = "src/unicode.rs";

    let value = if let Ok(value) = fs::read_to_string(data_url) {
        value
    } else {
        let value = reqwest::get(url)
            .await
            .unwrap()
            .text()
            .await
            .unwrap();

        fs::write(data_url, value.clone()).unwrap();

        value
    };

    let search = [
        "Pc", // Punctuation, Connector
        "Pd", // Punctuation, Dash
        "Pe", // Punctuation, Close
        "Pf", // Punctuation, FinalQuote
        "Pi", // Punctuation, InitialQuote
        "Po", // Punctuation, Other
        "Ps", // Punctuation, Open
    ];

    let found = value
        .lines()
        .map(|line| line.split(';').collect::<Vec<_>>())
        .map(|cells| (cells[0], cells[2]))
        .filter(|c| search.contains(&c.1))
        .map(|c| c.0)
        .collect::<Vec<_>>();

    let doc = format!(
        "//! Information on Unicode.

/// List of characters that are considered punctuation according to Unicode.
///
/// > 👉 **Important**: this module is generated by `script/`.
/// > It is generate from the latest Unicode data.
///
/// Rust does not contain an `is_punctuation` method on `char`, while it does
/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).
///
/// `CommonMark` handles attention (emphasis, strong) markers based on what
/// comes before or after them.
/// One such difference is if those characters are Unicode punctuation.
///
/// ## References
///
/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
pub const PUNCTUATION: [char; {}] = [
{}
];
",
    found.len(),
    found.iter().map(|d| format!("    '\\u{{{}}}',", d)).collect::<Vec<_>>().join("\n")
    );

    fs::write(code_url, doc).unwrap();
}