build.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

extern crate reqwest;
use regex::Regex;
use std::fs;

/// Entry point of the build script: runs both generators one after the other.
///
/// `commonmark` is awaited to completion before `punctuation` is started, so
/// the two never run concurrently.
#[tokio::main]
async fn main() {
    commonmark().await;
    punctuation().await;
}

/// Builds the source of the CommonMark conformance test module from the spec.
///
/// The spec text is read from the local cache file `commonmark-data.txt`; when
/// that file cannot be read, the spec is downloaded and the cache is written.
/// Every fenced `example` block of the spec becomes one `assert_eq!`, labelled
/// with the most recent heading and a running case index.
///
/// The generated module is currently only assembled in memory: writing it to
/// `tests/commonmark.rs` is disabled (see the to-do at the end of the body).
///
/// # Panics
///
/// Panics if the download or the cache write fails, if an example appears
/// before any heading, or if an example has no `.` separator line.
async fn commonmark() {
    let url = "https://raw.githubusercontent.com/commonmark/commonmark-spec/0.30/spec.txt";
    let data_url = "commonmark-data.txt";
    // Only needed once the generated file is written again (see the to-do below).
    #[allow(unused_variables)]
    let code_url = "tests/commonmark.rs";

    // Prefer the cached copy; fall back to the network and fill the cache.
    let value = match fs::read_to_string(data_url) {
        Ok(cached) => cached,
        Err(_) => {
            let fetched = reqwest::get(url)
                .await
                .expect("failed to request the CommonMark spec")
                .text()
                .await
                .expect("failed to read the CommonMark spec response body");

            fs::write(data_url, &fetched).expect("failed to write the CommonMark spec cache");

            fetched
        }
    };

    // Matches either a whole fenced example (multiple lines) or a single heading line.
    let re = Regex::new(r"(?m)(?:^`{32} example\n[\s\S]*?\n`{32}$|^#{1,6} *(.*)$)")
        .expect("hard-coded pattern is valid");
    let re_heading_prefix = Regex::new(r"#{1,6} ").expect("hard-coded pattern is valid");
    // The line holding a lone `.` separates an example's input from its output.
    let re_in_out = Regex::new(r"\n\.(?:\n|$)").expect("hard-coded pattern is valid");
    let mut current_heading: Option<String> = None;

    // Everything from the end marker onwards is dropped, and the spec's
    // visible tab symbol is turned into a real tab.
    let end = value.find("<!-- END TESTS -->").unwrap_or(value.len());
    let value = value[..end].replace('→', "\t");
    let mut cases: Vec<String> = vec![];

    for mat in re.find_iter(&value) {
        let lines = mat.as_str().lines().collect::<Vec<_>>();

        match lines.as_slice() {
            // A one-line match is a heading: remember it for the following cases.
            [heading] => {
                current_heading = Some(re_heading_prefix.replace(*heading, "").into_owned());
            }
            // A multi-line match is an example: strip the opening and closing fences.
            [_, body @ .., _] => {
                let section = current_heading
                    .as_deref()
                    .expect("an example must be preceded by a heading");
                let case = body.join("\n");
                let mut parts = re_in_out.split(&case);
                let input = format!("{}\n", parts.next().unwrap_or_default());
                let output = match parts
                    .next()
                    .expect("an example must contain a `.` separator line")
                {
                    "" => String::new(),
                    html => format!("{}\n", html),
                };

                // The number of cases collected so far is the index of this case.
                let case_index = cases.len();
                let test = format!("    assert_eq!(\n        micromark_with_options(r###\"{}\"###, DANGER),\n        r###\"{}\"###,\n        r###\"{} ({})\"###\n);", input, output, section, case_index);

                cases.push(test);
            }
            [] => unreachable!("a regex match always contains at least one line"),
        }
    }

    // Unused while writing the generated file is disabled (see the to-do below).
    #[allow(unused_variables)]
    let doc = format!(
        "//! CommonMark test suite.

// > 👉 **Important**: this module is generated by `build.rs`.
// > It is generated from the CommonMark spec.

extern crate micromark;
use micromark::{{micromark_with_options, Options}};

const DANGER: &Options = &Options {{
    allow_dangerous_html: true,
    allow_dangerous_protocol: true,
    default_line_ending: None,
}};

#[test]
fn commonmark() {{
{}
}}
",
        cases.join("\n\n")
    );

    // To do: enable when CM is completely fixed.
    // fs::write(code_url, doc).unwrap();
}

/// Generates `src/unicode.rs`, a constant listing every code point whose
/// category field is one of the Unicode punctuation categories.
///
/// The Unicode data is read from the local cache file `unicode-data.txt`; when
/// that file cannot be read, it is downloaded and the cache is written.
/// Each record is split on `;`: the first field is used as the code point and
/// the third as the category. Records with fewer than three fields (such as
/// blank lines) are skipped.
///
/// # Panics
///
/// Panics if the download, the cache write, or the write of the generated
/// file fails.
async fn punctuation() {
    let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
    let data_url = "unicode-data.txt";
    let code_url = "src/unicode.rs";

    // Prefer the cached copy; fall back to the network and fill the cache.
    let value = match fs::read_to_string(data_url) {
        Ok(cached) => cached,
        Err(_) => {
            let fetched = reqwest::get(url)
                .await
                .expect("failed to request the Unicode data")
                .text()
                .await
                .expect("failed to read the Unicode data response body");

            fs::write(data_url, &fetched).expect("failed to write the Unicode data cache");

            fetched
        }
    };

    let search = [
        "Pc", // Punctuation, Connector
        "Pd", // Punctuation, Dash
        "Pe", // Punctuation, Close
        "Pf", // Punctuation, FinalQuote
        "Pi", // Punctuation, InitialQuote
        "Po", // Punctuation, Other
        "Ps", // Punctuation, Open
    ];

    let found = value
        .lines()
        .filter_map(|line| {
            // Take field 0 (code point) and field 2 (category) without
            // collecting the whole record; `?` skips records that are too short.
            let mut cells = line.split(';');
            Some((cells.next()?, cells.nth(1)?))
        })
        .filter(|(_, category)| search.contains(category))
        .map(|(code_point, _)| code_point)
        .collect::<Vec<_>>();

    // NOTE(review): the generated docs below link the text `is_ascii_alphanumeric`
    // to `char::is_ascii_punctuation`; one of the two looks like a slip. It is left
    // untouched here because changing it alters the generated file — confirm intent.
    let doc = format!(
        "//! Information on Unicode.

/// List of characters that are considered punctuation according to Unicode.
///
/// > 👉 **Important**: this module is generated by `build.rs`.
/// > It is generate from the latest Unicode data.
///
/// Rust does not contain an `is_punctuation` method on `char`, while it does
/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).
///
/// `CommonMark` handles attention (emphasis, strong) markers based on what
/// comes before or after them.
/// One such difference is if those characters are Unicode punctuation.
///
/// ## References
///
/// *   [*§ 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character)
pub const PUNCTUATION: [char; {}] = [
{}
];
",
    found.len(),
    found.iter().map(|d| format!("    '\\u{{{}}}',", d)).collect::<Vec<_>>().join("\n")
    );

    fs::write(code_url, doc).expect("failed to write the generated Unicode module");
}