extern crate reqwest; use regex::Regex; use std::fs; #[tokio::main] async fn main() { commonmark().await; punctuation().await; } async fn commonmark() { let url = "https://raw.githubusercontent.com/commonmark/commonmark-spec/0.30/spec.txt"; let data_url = "commonmark-data.txt"; #[allow(unused_variables)] let code_url = "tests/commonmark.rs"; let value = if let Ok(value) = fs::read_to_string(data_url) { value } else { let value = reqwest::get(url).await.unwrap().text().await.unwrap(); fs::write(data_url, value.clone()).unwrap(); value }; let re = Regex::new(r"(?m)(?:^`{32} example\n[\s\S]*?\n`{32}$|^#{1,6} *(.*)$)").unwrap(); let re_heading_prefix = Regex::new(r"#{1,6} ").unwrap(); let re_in_out = Regex::new(r"\n\.(?:\n|$)").unwrap(); let mut current_heading: Option = None; let mut case_index = 0; let value = Regex::new(r"[\s\S]*") .unwrap() .replace(&value, ""); let value = Regex::new(r"โ†’").unwrap().replace_all(&value, "\t"); let mut cases: Vec = vec![]; for mat in re.find_iter(&value) { let mut lines = mat.as_str().lines().collect::>(); if lines.len() == 1 { current_heading = Some(re_heading_prefix.replace(lines[0], "").to_string()); } else { lines.remove(0); lines.pop(); let section = current_heading.as_ref().unwrap(); let case = lines.join("\n"); let parts = re_in_out.split(&case).collect::>(); let input = format!("{}\n", parts[0]); let output = if parts[1].is_empty() { "".to_string() } else { format!("{}\n", parts[1]) }; let test = format!(" assert_eq!(\n micromark_with_options(r###\"{}\"###, DANGER),\n r###\"{}\"###,\n r###\"{} ({})\"###\n);", input, output, section, case_index); cases.push(test); case_index += 1; } } #[allow(unused_variables)] let doc = format!( "//! CommonMark test suite. // > ๐Ÿ‘‰ **Important**: this module is generated by `build.rs`. // > It is generate from the latest Unicode data. extern crate micromark; use micromark::{{micromark_with_options, Options}}; const DANGER: &Options = &Options {{ allow_dangerous_html: true, allow_dangerous_protocol: true, default_line_ending: None, }}; #[test] fn commonmark() {{ {} }} ", cases.join("\n\n") ); // To do: enable when CM is completely fixed. // fs::write(code_url, doc).unwrap(); } async fn punctuation() { let url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"; let data_url = "unicode-data.txt"; let code_url = "src/unicode.rs"; let value = if let Ok(value) = fs::read_to_string(data_url) { value } else { let value = reqwest::get(url).await.unwrap().text().await.unwrap(); fs::write(data_url, value.clone()).unwrap(); value }; let search = [ "Pc", // Punctuation, Connector "Pd", // Punctuation, Dash "Pe", // Punctuation, Close "Pf", // Punctuation, FinalQuote "Pi", // Punctuation, InitialQuote "Po", // Punctuation, Other "Ps", // Punctuation, Open ]; let found = value .lines() .map(|line| line.split(';').collect::>()) .map(|cells| (cells[0], cells[2])) .filter(|c| search.contains(&c.1)) .map(|c| c.0) .collect::>(); let doc = format!( "//! Information on Unicode. /// List of characters that are considered punctuation according to Unicode. /// /// > ๐Ÿ‘‰ **Important**: this module is generated by `build.rs`. /// > It is generate from the latest Unicode data. /// /// Rust does not contain an `is_punctuation` method on `char`, while it does /// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation). /// /// `CommonMark` handles attention (emphasis, strong) markers based on what /// comes before or after them. /// One such difference is if those characters are Unicode punctuation. /// /// ## References /// /// * [*ยง 2.1 Characters and lines* in `CommonMark`](https://spec.commonmark.org/0.30/#unicode-punctuation-character) pub const PUNCTUATION: [char; {}] = [ {} ]; ", found.len(), found.iter().map(|d| format!(" '\\u{{{}}}',", d)).collect::>().join("\n") ); fs::write(code_url, doc).unwrap(); }