aboutsummaryrefslogblamecommitdiffstats
path: root/build.rs
blob: cdfeac4bc600c9310fb2ba26f1b70934ade856e5 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12











                                                                              
                                                                           




























                                                                            
                                                                 





















                                                                                                                        
extern crate reqwest;
use std::fs;

/// Generates `src/unicode.rs`, a table of every Unicode punctuation character.
///
/// The Unicode Character Database file is read from the local cache
/// (`unicode-data.txt`) when it exists; otherwise it is downloaded from
/// unicode.org and written to that cache for later runs.
///
/// # Panics
///
/// Panics if the download fails or the server answers with an error status,
/// if either file cannot be written, or if the data contains no punctuation
/// code points at all (which indicates a corrupt cache or download).
#[tokio::main]
async fn main() {
    let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
    let data_url = "unicode-data.txt";
    let code_url = "src/unicode.rs";

    let value = match fs::read_to_string(data_url) {
        Ok(cached) => cached,
        Err(_) => {
            // Reject non-success responses so that an HTML error page is never
            // cached and later parsed as if it were Unicode data.
            let downloaded = reqwest::get(url)
                .await
                .and_then(|response| response.error_for_status())
                .expect("failed to download `UnicodeData.txt`")
                .text()
                .await
                .expect("failed to read the `UnicodeData.txt` response body");

            fs::write(data_url, &downloaded).expect("failed to cache `unicode-data.txt`");

            downloaded
        }
    };

    let found = punctuation_code_points(&value);

    // Refuse to overwrite the generated module with an empty table.
    assert!(
        !found.is_empty(),
        "no punctuation code points found in `{}`; delete it to download a fresh copy",
        data_url
    );

    fs::write(code_url, render_module(&found)).expect("failed to write `src/unicode.rs`");
}

/// Extracts the code point (first field) of every `;`-separated record whose
/// general category (third field) is one of the punctuation categories.
///
/// Records with fewer than three fields, such as blank lines, are skipped.
fn punctuation_code_points(data: &str) -> Vec<&str> {
    const PUNCTUATION_CATEGORIES: [&str; 7] = [
        "Pc", // Punctuation, Connector
        "Pd", // Punctuation, Dash
        "Pe", // Punctuation, Close
        "Pf", // Punctuation, FinalQuote
        "Pi", // Punctuation, InitialQuote
        "Po", // Punctuation, Other
        "Ps", // Punctuation, Open
    ];

    data.lines()
        .filter_map(|line| {
            let mut fields = line.split(';');
            let code_point = fields.next()?;
            // `nth(1)` skips the name field and yields the general category.
            let category = fields.nth(1)?;

            if PUNCTUATION_CATEGORIES.contains(&category) {
                Some(code_point)
            } else {
                None
            }
        })
        .collect()
}

/// Renders the source text of the generated module: its documentation followed
/// by a `PUNCTUATION` array holding one `'\u{…}'` literal per code point.
fn render_module(code_points: &[&str]) -> String {
    let entries = code_points
        .iter()
        .map(|code_point| format!("    '\\u{{{}}}',", code_point))
        .collect::<Vec<_>>()
        .join("\n");

    format!(
        "//! Information on Unicode.

/// List of characters that are considered punctuation according to Unicode.
///
/// > 👉 **Important**: this module is generated by `build.rs`.
/// > It is generated from the latest Unicode data.
///
/// Rust does not contain an `is_punctuation` method on `char`, while it does
/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).
///
/// `CommonMark` handles attention (emphasis, strong) markers based on what
/// comes before or after them.
/// One such difference is if those characters are Unicode punctuation.
///
/// ## References
///
/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
pub const PUNCTUATION: [char; {}] = [
{}
];
",
        code_points.len(),
        entries
    )
}