diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-21 12:06:51 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-21 12:06:51 +0200 |
commit | f99d131ec3ab60956344d001bcd40244343c241b (patch) | |
tree | ac798f9a6a1ab73021cdd5a5303e20424d37172e | |
parent | 182467c1d393dee2081ff80f1c049cb145f23123 (diff) | |
download | markdown-rs-f99d131ec3ab60956344d001bcd40244343c241b.tar.gz markdown-rs-f99d131ec3ab60956344d001bcd40244343c241b.tar.bz2 markdown-rs-f99d131ec3ab60956344d001bcd40244343c241b.zip |
Add support for inferring line ending, configurable
* Rename `CompileOptions` to `Options`
* Add support for an optional default line ending style
* Add support for inferring the used line ending style
Diffstat (limited to '')
-rw-r--r-- | examples/lib.rs | 7 | ||||
-rw-r--r-- | readme.md | 12 | ||||
-rw-r--r-- | src/compiler.rs | 89 | ||||
-rw-r--r-- | src/lib.rs | 11 | ||||
-rw-r--r-- | src/tokenizer.rs | 6 | ||||
-rw-r--r-- | tests/autolink.rs | 5 | ||||
-rw-r--r-- | tests/character_escape.rs | 5 | ||||
-rw-r--r-- | tests/character_reference.rs | 5 | ||||
-rw-r--r-- | tests/code_text.rs | 5 | ||||
-rw-r--r-- | tests/definition.rs | 5 | ||||
-rw-r--r-- | tests/html_flow.rs | 5 | ||||
-rw-r--r-- | tests/html_text.rs | 5 | ||||
-rw-r--r-- | tests/misc_dangerous_html.rs | 5 | ||||
-rw-r--r-- | tests/misc_default_line_ending.rs | 56 | ||||
-rw-r--r-- | tests/misc_line_ending.rs | 161 | ||||
-rw-r--r-- | tests/misc_tabs.rs | 5 |
16 files changed, 338 insertions, 49 deletions
diff --git a/examples/lib.rs b/examples/lib.rs index 00f45dc..718e400 100644 --- a/examples/lib.rs +++ b/examples/lib.rs @@ -1,5 +1,5 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; fn main() { // Turn on debugging. @@ -14,9 +14,10 @@ fn main() { "{:?}", micromark_with_options( "<div style=\"color: tomato\">\n\n# Hello, tomato!\n\n</div>", - &CompileOptions { + &Options { allow_dangerous_html: true, - allow_dangerous_protocol: true + allow_dangerous_protocol: true, + default_line_ending: None } ) ); @@ -66,16 +66,14 @@ cargo doc --document-private-items ### Small things +- [ ] (1) Use `impl fmt::Display for x` for a bunch of enums, e.g., markers - [ ] (1) Parse initial and final whitespace of paragraphs (in text) - [ ] (1) Add docs to subtokenize - [ ] (1) Add module docs to parser - [ ] (1) Add overview docs on how everything works - [ ] (1) Move safe protocols to constants - [ ] (3) Clean compiler -- [ ] (1) Use preferred line ending style in markdown -- [ ] (1) Add tests for `default-line-ending`, `line-ending` -- [ ] (1) Make sure tabs are handled properly and that positional info is perfect -- [ ] (1) Make sure crlf/cr/lf are working perfectly +- [ ] (1) Make sure positional info is perfect - [ ] (3) Figure out lifetimes of things (see `life time` in source) - [ ] (3) Use `commonmark` tests - [ ] (3) Share a bunch of tests with `micromark-js` @@ -159,7 +157,7 @@ cargo doc --document-private-items - [x] (1) Add docs to html (text) - [x] (1) Add docs on bnf - [x] (1) Reorganize to split util -- [x] (1) Add examples to `CompileOptions` docs +- [x] (1) Add examples to `Options` docs - [x] (3) Fix deep subtokenization - [x] (1) text in heading - [x] (1) Setext headings, solved in flow @@ -171,6 +169,10 @@ cargo doc --document-private-items - [x] (1) Connect `ChunkString` in label, destination, title - [x] (1) Add support for line endings in `string` - [x] (1) Handle BOM at start +- [x] (1) Make sure tabs are handled properly +- [x] (1) Add tests for `default-line-ending`, `line-ending` +- [x] (1) Use preferred line ending style in markdown +- [x] (1) Make sure crlf/cr/lf are working perfectly ### Extensions diff --git a/src/compiler.rs b/src/compiler.rs index 366dcd9..5c7f6d8 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -8,9 +8,17 @@ use crate::util::{ span::{codes as codes_from_span, from_exit_event, serialize}, }; +/// To do. +#[derive(Debug, Clone)] +pub enum LineEnding { + CarriageReturnLineFeed, + CarriageReturn, + LineFeed, +} + /// Configuration (optional). #[derive(Default, Debug)] -pub struct CompileOptions { +pub struct Options { /// Whether to allow (dangerous) HTML. /// The default is `false`, you can turn it on to `true` for trusted /// content. @@ -18,7 +26,7 @@ pub struct CompileOptions { /// ## Examples /// /// ```rust - /// use micromark::{micromark, micromark_with_options, CompileOptions}; + /// use micromark::{micromark, micromark_with_options, Options}; /// /// // micromark is safe by default: /// assert_eq!( @@ -30,9 +38,11 @@ pub struct CompileOptions { /// assert_eq!( /// micromark_with_options( /// "Hi, <i>venus</i>!", - /// &CompileOptions { + /// &Options { /// allow_dangerous_html: true, /// allow_dangerous_protocol: false, + /// default_line_ending: None, + /// /// } /// ), /// "<p>Hi, <i>venus</i>!</p>" @@ -47,7 +57,7 @@ pub struct CompileOptions { /// ## Examples /// /// ```rust - /// use micromark::{micromark, micromark_with_options, CompileOptions}; + /// use micromark::{micromark, micromark_with_options, Options}; /// /// // micromark is safe by default: /// assert_eq!( @@ -59,20 +69,24 @@ pub struct CompileOptions { /// assert_eq!( /// micromark_with_options( /// "<javascript:alert(1)>", - /// &CompileOptions { + /// &Options { /// allow_dangerous_html: false, /// allow_dangerous_protocol: true, + /// default_line_ending: None, /// } /// ), /// "<p><a href=\"javascript:alert(1)\">javascript:alert(1)</a></p>" /// ); /// ``` pub allow_dangerous_protocol: bool, + + /// To do. + pub default_line_ending: Option<LineEnding>, } /// Turn events and codes into a string of HTML. #[allow(clippy::too_many_lines)] -pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String { +pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { let mut index = 0; // let mut last_was_tag = false; let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]]; @@ -89,6 +103,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St } else { Some(vec!["http", "https", "irc", "ircs", "mailto", "xmpp"]) }; + let mut line_ending_inferred: Option<LineEnding> = None; // let protocol_src = if options.allow_dangerous_protocol { // None // } else { @@ -96,6 +111,40 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St // }; // let mut slurp_all_line_endings = false; + while index < events.len() { + let event = &events[index]; + + if event.event_type == EventType::Exit + && (event.token_type == TokenType::BlankLineEnding + || event.token_type == TokenType::CodeTextLineEnding + || event.token_type == TokenType::LineEnding) + { + let codes = codes_from_span(codes, &from_exit_event(events, index)); + let code = *codes.first().unwrap(); + line_ending_inferred = Some(if code == Code::CarriageReturnLineFeed { + LineEnding::CarriageReturnLineFeed + } else if code == Code::Char('\r') { + LineEnding::CarriageReturn + } else { + LineEnding::LineFeed + }); + break; + } + + index += 1; + } + + let line_ending_default: LineEnding; + + if let Some(value) = line_ending_inferred { + line_ending_default = value; + } else if let Some(value) = &options.default_line_ending { + line_ending_default = value.clone(); + } else { + line_ending_default = LineEnding::LineFeed; + } + + index = 0; while index < events.len() { let event = &events[index]; @@ -162,12 +211,12 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St } TokenType::CodeIndented => { code_flow_seen_data = Some(false); - line_ending_if_needed(buffers); + line_ending_if_needed(buffers, &line_ending_default); buf_tail_mut(buffers).push("<pre><code>".to_string()); } TokenType::CodeFenced => { code_flow_seen_data = Some(false); - line_ending_if_needed(buffers); + line_ending_if_needed(buffers, &line_ending_default); // Note that no `>` is used, which is added later. buf_tail_mut(buffers).push("<pre><code".to_string()); code_fenced_fences_count = Some(0); @@ -177,7 +226,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St buffer(buffers); } TokenType::HtmlFlow => { - line_ending_if_needed(buffers); + line_ending_if_needed(buffers, &line_ending_default); if options.allow_dangerous_html { ignore_encode = true; } @@ -297,14 +346,14 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St // But in most cases, it’s simpler: when we’ve seen some data, emit an extra // line ending when needed. if seen_data { - line_ending_if_needed(buffers); + line_ending_if_needed(buffers, &line_ending_default); } buf_tail_mut(buffers).push("</code></pre>".to_string()); if let Some(count) = code_fenced_fences_count { if count < 2 { - line_ending_if_needed(buffers); + line_ending_if_needed(buffers, &line_ending_default); } } @@ -506,15 +555,23 @@ fn buf_tail(buffers: &mut [Vec<String>]) -> &Vec<String> { } /// Add a line ending. -fn line_ending(buffers: &mut [Vec<String>]) { +fn line_ending(buffers: &mut [Vec<String>], default: &LineEnding) { let tail = buf_tail_mut(buffers); - // To do: use inferred line ending style. + + println!("xxx: {:?}", default); + + let line_ending = match default { + LineEnding::CarriageReturnLineFeed => "\r\n", + LineEnding::CarriageReturn => "\r", + LineEnding::LineFeed => "\n", + }; + // lastWasTag = false - tail.push("\n".to_string()); + tail.push(line_ending.to_string()); } /// Add a line ending if needed (as in, there’s no eol/eof already). -fn line_ending_if_needed(buffers: &mut [Vec<String>]) { +fn line_ending_if_needed(buffers: &mut [Vec<String>], default: &LineEnding) { let slice = buf_tail_slice(buffers); let last_char = if let Some(x) = slice { x.chars().last() @@ -532,6 +589,6 @@ fn line_ending_if_needed(buffers: &mut [Vec<String>]) { } if add { - line_ending(buffers); + line_ending(buffers, default); } } @@ -14,7 +14,7 @@ mod tokenizer; mod util; use crate::compiler::compile; -pub use crate::compiler::CompileOptions; +pub use crate::compiler::{LineEnding, Options}; use crate::parser::parse; /// Turn markdown into HTML. @@ -30,7 +30,7 @@ use crate::parser::parse; /// ``` #[must_use] pub fn micromark(value: &str) -> String { - micromark_with_options(value, &CompileOptions::default()) + micromark_with_options(value, &Options::default()) } /// Turn markdown into HTML, with configuration. @@ -38,17 +38,18 @@ pub fn micromark(value: &str) -> String { /// ## Examples /// /// ```rust -/// use micromark::{micromark_with_options, CompileOptions}; +/// use micromark::{micromark_with_options, Options}; /// -/// let result = micromark_with_options("<div>\n\n# Hello, world!\n\n</div>", &CompileOptions { +/// let result = micromark_with_options("<div>\n\n# Hello, world!\n\n</div>", &Options { /// allow_dangerous_html: true, /// allow_dangerous_protocol: true, +/// default_line_ending: None, /// }); /// /// assert_eq!(result, "<div>\n<h1>Hello, world!</h1>\n</div>"); /// ``` #[must_use] -pub fn micromark_with_options(value: &str, options: &CompileOptions) -> String { +pub fn micromark_with_options(value: &str, options: &Options) -> String { let (events, codes) = parse(value); compile(&events, &codes, options) } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c0a7105..ba9bcbb 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -677,7 +677,6 @@ fn attempt_impl( } /// Turn a string into codes. -// To do: handle BOM at start? pub fn as_codes(value: &str) -> Vec<Code> { let mut codes: Vec<Code> = vec![]; let mut at_start = true; @@ -748,7 +747,10 @@ pub fn as_codes(value: &str) -> Vec<Code> { }; } - // To do: handle a final CR? + // Send the last CR: we’re not at a next `\n`. + if at_carriage_return { + codes.push(Code::Char('\r')); + } codes } diff --git a/tests/autolink.rs b/tests/autolink.rs index 51873ed..3882264 100644 --- a/tests/autolink.rs +++ b/tests/autolink.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: true, + default_line_ending: None, }; #[test] diff --git a/tests/character_escape.rs b/tests/character_escape.rs index ba94ab3..e4f23d2 100644 --- a/tests/character_escape.rs +++ b/tests/character_escape.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: true, + default_line_ending: None, }; #[test] diff --git a/tests/character_reference.rs b/tests/character_reference.rs index f2337ab..136ce17 100644 --- a/tests/character_reference.rs +++ b/tests/character_reference.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: true, + default_line_ending: None, }; #[test] diff --git a/tests/code_text.rs b/tests/code_text.rs index bab6dd6..054d8e2 100644 --- a/tests/code_text.rs +++ b/tests/code_text.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: false, + default_line_ending: None, }; #[test] diff --git a/tests/definition.rs b/tests/definition.rs index c15e44b..a8e8164 100644 --- a/tests/definition.rs +++ b/tests/definition.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: true, + default_line_ending: None, }; #[test] diff --git a/tests/html_flow.rs b/tests/html_flow.rs index 53105a6..d942642 100644 --- a/tests/html_flow.rs +++ b/tests/html_flow.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: false, + default_line_ending: None, }; #[test] diff --git a/tests/html_text.rs b/tests/html_text.rs index 1f85ac4..e70a4da 100644 --- a/tests/html_text.rs +++ b/tests/html_text.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: false, + default_line_ending: None, }; #[test] diff --git a/tests/misc_dangerous_html.rs b/tests/misc_dangerous_html.rs index 7a0b49a..76031c1 100644 --- a/tests/misc_dangerous_html.rs +++ b/tests/misc_dangerous_html.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: true, + default_line_ending: None, }; #[test] diff --git a/tests/misc_default_line_ending.rs b/tests/misc_default_line_ending.rs new file mode 100644 index 0000000..fb4e1df --- /dev/null +++ b/tests/misc_default_line_ending.rs @@ -0,0 +1,56 @@ +extern crate micromark; +// use micromark::{micromark, micromark_with_options, Options}; + +#[test] +fn default_line_ending() { + // To do: blockquote. + // assert_eq!( + // micromark("> a"), + // "<blockquote>\n<p>a</p>\n</blockquote>", + // "should use `\\n` default" + // ); + + // assert_eq!( + // micromark("> a\n"), + // "<blockquote>\n<p>a</p>\n</blockquote>\n", + // "should infer the first line ending (1)" + // ); + + // assert_eq!( + // micromark("> a\r"), + // "<blockquote>\r<p>a</p>\r</blockquote>\r", + // "should infer the first line ending (2)" + // ); + + // assert_eq!( + // micromark("> a\r\n"), + // "<blockquote>\r\n<p>a</p>\r\n</blockquote>\r\n", + // "should infer the first line ending (3)" + // ); + + // assert_eq!( + // micromark_with_options( + // "> a", + // &Options { + // // default_line_ending: "\r", + // allow_dangerous_html: false, + // allow_dangerous_protocol: false + // } + // ), + // "<blockquote>\r<p>a</p>\r</blockquote>", + // "should support the given line ending" + // ); + + // assert_eq!( + // micromark_with_options( + // "> a\n", + // &Options { + // // default_line_ending: "\r", + // allow_dangerous_html: false, + // allow_dangerous_protocol: false + // } + // ), + // "<blockquote>\r<p>a</p>\r</blockquote>\n", + // "should support the given line ending, even if line endings exist" + // ); +} diff --git a/tests/misc_line_ending.rs b/tests/misc_line_ending.rs new file mode 100644 index 0000000..195ddaa --- /dev/null +++ b/tests/misc_line_ending.rs @@ -0,0 +1,161 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, Options}; + +const DANGER: &Options = &Options { + allow_dangerous_html: true, + allow_dangerous_protocol: true, + default_line_ending: None, +}; + +#[test] +fn line_ending() { + assert_eq!( + micromark("a\nb"), + "<p>a\nb</p>", + "should support a line feed for a line ending inside a paragraph" + ); + + assert_eq!( + micromark("a\rb"), + "<p>a\rb</p>", + "should support a carriage return for a line ending inside a paragraph" + ); + + assert_eq!( + micromark("a\r\nb"), + "<p>a\r\nb</p>", + "should support a carriage return + line feed for a line ending inside a paragraph" + ); + + assert_eq!( + micromark("\ta\n\tb"), + "<pre><code>a\nb\n</code></pre>", + "should support a line feed in indented code (and prefer it)" + ); + + assert_eq!( + micromark("\ta\r\tb"), + "<pre><code>a\rb\r</code></pre>", + "should support a carriage return in indented code (and prefer it)" + ); + + assert_eq!( + micromark("\ta\r\n\tb"), + "<pre><code>a\r\nb\r\n</code></pre>", + "should support a carriage return + line feed in indented code (and prefer it)" + ); + + assert_eq!( + micromark("***\n### Heading"), + "<hr />\n<h3>Heading</h3>", + "should support a line feed between flow" + ); + + assert_eq!( + micromark("***\r### Heading"), + "<hr />\r<h3>Heading</h3>", + "should support a carriage return between flow" + ); + + assert_eq!( + micromark("***\r\n### Heading"), + "<hr />\r\n<h3>Heading</h3>", + "should support a carriage return + line feed between flow" + ); + + assert_eq!( + micromark("***\n\n\n### Heading\n"), + "<hr />\n<h3>Heading</h3>\n", + "should support several line feeds between flow" + ); + + assert_eq!( + micromark("***\r\r\r### Heading\r"), + "<hr />\r<h3>Heading</h3>\r", + "should support several carriage returns between flow" + ); + + assert_eq!( + micromark("***\r\n\r\n\r\n### Heading\r\n"), + "<hr />\r\n<h3>Heading</h3>\r\n", + "should support several carriage return + line feeds between flow" + ); + + assert_eq!( + micromark("```x\n\n\ny\n\n\n```\n\n\n"), + "<pre><code class=\"language-x\">\n\ny\n\n\n</code></pre>\n", + "should support several line feeds in fenced code" + ); + + assert_eq!( + micromark("```x\r\r\ry\r\r\r```\r\r\r"), + "<pre><code class=\"language-x\">\r\ry\r\r\r</code></pre>\r", + "should support several carriage returns in fenced code" + ); + + assert_eq!( + micromark("```x\r\n\r\n\r\ny\r\n\r\n\r\n```\r\n\r\n\r\n"), + "<pre><code class=\"language-x\">\r\n\r\ny\r\n\r\n\r\n</code></pre>\r\n", + "should support several carriage return + line feeds in fenced code" + ); + + assert_eq!( + micromark("A\r\nB\r\n-\r\nC"), + "<h2>A\r\nB</h2>\r\n<p>C</p>", + "should support a carriage return + line feed in content" + ); + + assert_eq!( + micromark_with_options("<div\n", DANGER), + "<div\n", + "should support a line feed after html" + ); + + assert_eq!( + micromark_with_options("<div\r", DANGER), + "<div\r", + "should support a carriage return after html" + ); + + assert_eq!( + micromark_with_options("<div\r\n", DANGER), + "<div\r\n", + "should support a carriage return + line feed after html" + ); + + assert_eq!( + micromark_with_options("<div>\n\nx", DANGER), + "<div>\n<p>x</p>", + "should support a blank line w/ line feeds after html" + ); + + assert_eq!( + micromark_with_options("<div>\r\rx", DANGER), + "<div>\r<p>x</p>", + "should support a blank line w/ carriage returns after html" + ); + + assert_eq!( + micromark_with_options("<div>\r\n\r\nx", DANGER), + "<div>\r\n<p>x</p>", + "should support a blank line w/ carriage return + line feeds after html" + ); + + assert_eq!( + micromark_with_options("<div>\nx", DANGER), + "<div>\nx", + "should support a non-blank line w/ line feed in html" + ); + + assert_eq!( + micromark_with_options("<div>\rx", DANGER), + "<div>\rx", + "should support a non-blank line w/ carriage return in html" + ); + + assert_eq!( + micromark_with_options("<div>\r\nx", DANGER), + "<div>\r\nx", + "should support a non-blank line w/ carriage return + line feed in html" + ); +} diff --git a/tests/misc_tabs.rs b/tests/misc_tabs.rs index 46588e7..e9a0b72 100644 --- a/tests/misc_tabs.rs +++ b/tests/misc_tabs.rs @@ -1,9 +1,10 @@ extern crate micromark; -use micromark::{micromark, micromark_with_options, CompileOptions}; +use micromark::{micromark, micromark_with_options, Options}; -const DANGER: &CompileOptions = &CompileOptions { +const DANGER: &Options = &Options { allow_dangerous_html: true, allow_dangerous_protocol: true, + default_line_ending: None, }; #[test] |