build.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

extern crate reqwest;
use regex::Regex;
use std::fs;

/// Entry point of the generator script.
///
/// Runs the two generators one after the other on the Tokio runtime set up
/// by `#[tokio::main]`: first the CommonMark test suite, then the Unicode
/// punctuation table.
#[tokio::main]
async fn main() {
    // Step 1: (re)generate the CommonMark test module.
    commonmark().await;

    // Step 2: (re)generate the Unicode punctuation list.
    punctuation().await;
}

/// Generate `tests/commonmark.rs` from the CommonMark 0.30 specification.
///
/// The spec text is taken from the local cache file `commonmark-data.txt`.
/// When that file cannot be read, the spec is downloaded and the cache file
/// is written, so later runs work offline.
///
/// Every fenced ```` example ```` block found before the `<!-- END TESTS -->`
/// marker becomes one `assert_eq!` in the generated test function.
/// Each assertion is labelled with the most recent heading and a running
/// example number (starting at 1).
///
/// # Panics
///
/// Panics when the download fails or returns an HTTP error status, when a
/// file cannot be written, when an example appears before any heading, or
/// when an example lacks the `.` line that separates input from output.
async fn commonmark() {
    let url = "https://raw.githubusercontent.com/commonmark/commonmark-spec/0.30/spec.txt";
    let data_url = "commonmark-data.txt";
    let code_url = "tests/commonmark.rs";

    // Prefer the cached copy; any read failure falls back to downloading.
    let value = if let Ok(value) = fs::read_to_string(data_url) {
        value
    } else {
        let value = reqwest::get(url)
            .await
            // Refuse 4xx/5xx responses so an error page is never cached as the spec.
            .and_then(|response| response.error_for_status())
            .expect("failed to download the CommonMark spec")
            .text()
            .await
            .expect("failed to read the CommonMark spec response body");

        // Borrow instead of cloning: `fs::write` only needs the bytes.
        fs::write(data_url, &value).expect("failed to write the CommonMark spec cache");

        value
    };

    // Matches either a whole example block (32 backticks + ` example` up to
    // the closing 32 backticks) or a single ATX heading line.
    let re = Regex::new(r"(?m)(?:^`{32} example\n[\s\S]*?\n`{32}$|^#{1,6} *(.*)$)").unwrap();
    let re_heading_prefix = Regex::new(r"#{1,6} ").unwrap();
    // The line consisting of a single `.` that separates input from output.
    let re_in_out = Regex::new(r"\n\.(?:\n|$)").unwrap();

    // Ignore everything from the end-of-tests marker onwards.
    let value = value
        .find("<!-- END TESTS -->")
        .map_or(value.as_str(), |index| &value[..index]);
    // The spec writes tabs as `→`; turn them back into real tabs.
    let value = value.replace('→', "\t");

    let mut current_heading: Option<String> = None;
    let mut cases: Vec<String> = vec![];

    for mat in re.find_iter(&value) {
        let lines = mat.as_str().lines().collect::<Vec<_>>();

        if lines.len() == 1 {
            // A one-line match is a heading: remember it for the following examples.
            current_heading = Some(re_heading_prefix.replace(lines[0], "").into_owned());
        } else {
            let number = cases.len() + 1;
            let section = current_heading
                .as_ref()
                .expect("found an example before any heading in the CommonMark spec");
            // Strip the opening and closing fence lines.
            let case = lines[1..lines.len() - 1].join("\n");
            let mut parts = re_in_out.split(&case);
            let input = format!("{}\n", parts.next().unwrap_or(""));
            let output = match parts.next() {
                Some("") => String::new(),
                Some(output) => format!("{}\n", output),
                None => panic!(
                    "example {} in section `{}` has no `.` separator line",
                    number, section
                ),
            };

            let test = format!("    assert_eq!(\n        micromark_with_options(\n            r###\"{}\"###,\n            &danger\n        ),\n        r###\"{}\"###,\n        r###\"{} ({})\"###\n);", input, output, section, number);

            cases.push(test);
        }
    }

    let doc = format!(
        "//! CommonMark test suite.

// > 👉 **Important**: this module is generated by `build.rs`.
// > It is generated from the latest CommonMark website.

extern crate micromark;
use micromark::{{micromark_with_options, Options}};
use pretty_assertions::assert_eq;

#[rustfmt::skip]
#[test]
fn commonmark() {{
    let danger = Options {{
        allow_dangerous_html: true,
        allow_dangerous_protocol: true,
        ..Options::default()
    }};

{}
}}
",
        cases.join("\n\n")
    );

    fs::write(code_url, doc).expect("failed to write the generated CommonMark test module");
}

/// Generate `src/util/unicode.rs`, a constant list of Unicode punctuation.
///
/// The Unicode character database is taken from the local cache file
/// `unicode-data.txt`.
/// When that file cannot be read, `UnicodeData.txt` is downloaded and the
/// cache file is written.
///
/// Each line of the data is split on `;`.
/// The first field is kept when the third field is one of the punctuation
/// categories listed in `search`.
/// Lines with fewer than three fields are skipped.
///
/// # Panics
///
/// Panics when the download fails or returns an HTTP error status, or when
/// a file cannot be written.
async fn punctuation() {
    let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
    let data_url = "unicode-data.txt";
    let code_url = "src/util/unicode.rs";

    // Prefer the cached copy; any read failure falls back to downloading.
    let value = if let Ok(value) = fs::read_to_string(data_url) {
        value
    } else {
        let value = reqwest::get(url)
            .await
            // Refuse 4xx/5xx responses so an error page is never cached as data.
            .and_then(|response| response.error_for_status())
            .expect("failed to download the Unicode character database")
            .text()
            .await
            .expect("failed to read the Unicode character database response body");

        // Borrow instead of cloning: `fs::write` only needs the bytes.
        fs::write(data_url, &value).expect("failed to write the Unicode data cache");

        value
    };

    let search = [
        "Pc", // Punctuation, Connector
        "Pd", // Punctuation, Dash
        "Pe", // Punctuation, Close
        "Pf", // Punctuation, FinalQuote
        "Pi", // Punctuation, InitialQuote
        "Po", // Punctuation, Other
        "Ps", // Punctuation, Open
    ];

    let found = value
        .lines()
        .filter_map(|line| {
            let mut cells = line.split(';');
            // Field 0 is the code point.
            let code_point = cells.next()?;
            // Skip field 1; field 2 is the category.
            let category = cells.nth(1)?;

            if search.contains(&category) {
                Some(code_point)
            } else {
                None
            }
        })
        .collect::<Vec<_>>();

    // One `'\u{XXXX}',` entry per punctuation code point.
    let entries = found
        .iter()
        .map(|d| format!("    '\\u{{{}}}',", d))
        .collect::<Vec<_>>()
        .join("\n");

    let doc = format!(
        "//! Info on Unicode.

/// List of characters that are considered punctuation.
///
/// > 👉 **Important**: this module is generated by `build.rs`.
/// > It is generated from the latest Unicode data.
///
/// Rust does not contain an `is_punctuation` method on `char`, while it does
/// support [`is_ascii_alphanumeric`](char::is_ascii_alphanumeric).
///
/// `CommonMark` handles attention (emphasis, strong) markers based on what
/// comes before or after them.
/// One such difference is if those characters are Unicode punctuation.
///
/// ## References
///
/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
pub const PUNCTUATION: [char; {}] = [
{}
];
",
        found.len(),
        entries
    );

    fs::write(code_url, doc).expect("failed to write the generated Unicode module");
}