author     Titus Wormer <tituswormer@gmail.com>   2022-06-08 15:52:16 +0200
committer  Titus Wormer <tituswormer@gmail.com>   2022-06-08 15:52:16 +0200
commit     4c06c8554c35887f8f5147783953b2b7e7c2327f (patch)
tree       1b2463848a3ae4c645f7f1a325877ee829ab65c5
.
34 files changed, 8997 insertions, 0 deletions
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..201f7b7
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.rs]
+indent_size = 4
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..cbee315
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,24 @@
+name: main
+on:
+  - pull_request
+  - push
+jobs:
+  main:
+    name: ${{matrix.rust}}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: ${{matrix.rust}}
+          components: rustfmt, clippy
+      - run: cargo clippy -- -W clippy::pedantic
+      - run: cargo fmt --all -- --check
+      - run: cargo test
+      - run: cargo install cargo-tarpaulin && cargo tarpaulin --out Xml
+      - uses: codecov/codecov-action@v1
+    strategy:
+      matrix:
+        rust:
+          - stable
+          - beta
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..32a28f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.DS_Store
+*.log
+*.lock
+coverage/
+target
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..96f23d7
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "micromark"
+version = "0.0.0"
+authors = ["Titus Wormer <tituswormer@gmail.com>"]
+edition = "2015"
+rust-version = "1.56"
+description = "small commonmark compliant markdown parser with positional info and concrete tokens"
+homepage = "https://github.com/micromark/micromark-rs"
+repository = "https://github.com/micromark/micromark-rs"
+license = "MIT"
+keywords = ["commonmark", "markdown", "parse", "render", "tokenize"]
+categories = ["compilers", "encoding", "parser-implementations", "parsing", "text-processing"]
+include = ["src/", "license"]
+publish = false
+
+[dependencies]
+log = "0.4"
+env_logger = "0.9"
diff --git a/Untitled.txt b/Untitled.txt
new file mode 100644
index 0000000..cc1576f
--- /dev/null
+++ b/Untitled.txt
@@ -0,0 +1 @@
+micromark.js: unquoted: is `completeAttributeValueUnquoted`s case for `completeAttributeNameAfter` missing a `/`? I’ve added it here.
diff --git a/examples/lib.rs b/examples/lib.rs
new file mode 100644
index 0000000..4d01161
--- /dev/null
+++ b/examples/lib.rs
@@ -0,0 +1,22 @@
+extern crate micromark;
+use micromark::{micromark, micromark_with_options, CompileOptions};
+
+fn main() {
+    // Turn on debugging.
+    // You can show it with `RUST_LOG=debug cargo run --example lib`
+    env_logger::init();
+
+    // Safely turn (untrusted?) markdown into HTML.
+    println!("{:?}", micromark("# Hello, world!"));
+
+    // Turn trusted markdown into HTML.
+    println!(
+        "{:?}",
+        micromark_with_options(
+            "<div style=\"color: tomato\">\n\n# Hello, tomato!\n\n</div>",
+            &CompileOptions {
+                allow_dangerous_html: true
+            }
+        )
+    );
+}
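For orientation, here is roughly what that example should print. The exact strings are an assumption based on how micromark treats these inputs in CommonMark terms; they are not captured from this commit:

```rust
extern crate micromark;
use micromark::{micromark, micromark_with_options, CompileOptions};

fn main() {
    // Assumed output: CommonMark turns the ATX heading into an `<h1>`.
    assert_eq!(micromark("# Hello, world!"), "<h1>Hello, world!</h1>");

    // Assumed output: with `allow_dangerous_html`, the `<div>` lines pass
    // through as-is, while the blank-line-separated heading between them is
    // still compiled as markdown.
    assert_eq!(
        micromark_with_options(
            "<div style=\"color: tomato\">\n\n# Hello, tomato!\n\n</div>",
            &CompileOptions {
                allow_dangerous_html: true
            }
        ),
        "<div style=\"color: tomato\">\n<h1>Hello, tomato!</h1>\n</div>"
    );
}
```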
diff --git a/funding.yml b/funding.yml
new file mode 100644
index 0000000..dee132d
--- /dev/null
+++ b/funding.yml
@@ -0,0 +1 @@
+github: wooorm
diff --git a/license b/license
new file mode 100644
--- /dev/null
+++ b/license
@@ -0,0 +1,22 @@
+(The MIT License)
+
+Copyright (c) 2022 Titus Wormer <tituswormer@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..8892183
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,183 @@
+# micromark-rs
+
+Here be dragons!
+🐉
+There’s a lot to do.
+Some major to dos are described here; smaller ones are in the code.
+
+## Some useful scripts for now
+
+Run examples:
+
+```sh
+RUST_BACKTRACE=1 RUST_LOG=debug cargo run --example lib
+```
+
+Format:
+
+```sh
+cargo fmt --all
+```
+
+Lint:
+
+```sh
+cargo fmt --all -- --check && cargo clippy -- -W clippy::pedantic
+```
+
+Tests:
+
+```sh
+RUST_BACKTRACE=1 cargo test
+```
+
+Docs:
+
+```sh
+cargo doc --document-private-items
+```
+
+(add `--open` to open them in a browser)
+
+## To do
+
+### Some major obstacles
+
+- [ ] (8) Subtokenization: figure out a good, fast way to deal with constructs in
+  one content type that also are another content type
+- [ ] (1) Setext headings: can they be solved in content, or do they have to be
+  solved in flow somehow?
+- [ ] (8) Can content (and to a lesser extent string and text) operate more
+  performantly than checking whether other flow constructs start a line,
+  before exiting and actually attempting flow constructs?
+- [ ] (5) Figure out definitions and sharing those identifiers, and references
+  before definitions
+- [ ] (3) Interrupting: sometimes flow can or cannot start depending on the
+  previous construct (typically paragraph)
+- [ ] (5) Containers: this will be rather messy, and depends a lot on how
+  subtokenization is solved
+- [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into by
+  containers
+- [ ] (3) Lazy lines: in containers, in flow and content in a paragraph, a line
+  does not need to be indented
+- [ ] (5) There’s a lot of rust-related choosing whether to pass (mutable)
+  references or whatever around that should be refactored
+- [ ] (5) Figure out extensions
+- [ ] (1) Support turning off constructs
+
+### Small things
+
+- [ ] (3) Clean compiler
+- [ ] (1) Optionally remove dangerous protocols when compiling
+- [ ] (1) Use preferred line ending style in markdown
+- [ ] (1) Handle BOM at start
+- [ ] (1) Make sure tabs are handled properly and that positional info is perfect
+- [ ] (1) Make sure crlf/cr/lf are working perfectly
+- [ ] (3) Figure out lifetimes of things (see `life time` in source)
+- [ ] (3) Use `commonmark` tests
+- [ ] (3) Share a bunch of tests with `micromark-js`
+- [ ] (5) Do some research on rust best practices for APIs, e.g., what to accept,
+  how to integrate with streams or so?
+- [ ] (1) Go through clippy rules, and such, to add strict code styles
+- [ ] (1) Make sure that rust character groups match CM character groups (e.g., is
+  `unicode_whitespace` or so the same?)
+- [ ] (1) Any special handling of surrogates?
+- [ ] (1) Make sure debugging is useful for other folks
+- [ ] (3) Add some benchmarks, do some perf testing
+- [ ] (3) Write comparison to other parsers
+- [ ] (3) Add node/etc bindings?
+- [ ] (8) After all extensions, including MDX, are done, see if we can integrate
+  this with SWC to compile MDX
+- [ ] (3) Bunch of docs
+- [ ] (5) Site
+
+### Constructs
+
+- [ ] (5) attention (strong, emphasis) (text)
+- [ ] (1) autolink
+- [x] blank line
+- [ ] (5) block quote
+- [x] character escape
+- [x] character reference
+- [x] code (fenced)
+- [x] code (indented)
+- [ ] (1) code (text)
+- [ ] (3) content
+- [ ] (3) definition
+- [ ] (1) hard break escape
+- [x] heading (atx)
+- [ ] (1) heading (setext)
+- [x] html (flow)
+- [ ] html (text)
+- [ ] (3) label end
+- [ ] (3) label start (image)
+- [ ] (3) label start (link)
+- [ ] (8) list
+- [ ] (1) paragraph
+- [x] thematic break
+
+### Content types
+
+- [ ] (8) container
+  - [ ] block quote
+  - [ ] list
+- [ ] (1) flow
+  - [x] blank line
+  - [x] code (fenced)
+  - [x] code (indented)
+  - [ ] content
+  - [x] heading (atx)
+  - [x] html (flow)
+  - [x] thematic break
+- [ ] (3) content
+  - [ ] definition
+  - [ ] heading (setext)
+  - [ ] paragraph
+- [ ] (5) text
+  - [ ] attention (strong, emphasis) (text)
+  - [ ] autolink
+  - [x] character escape
+  - [x] character reference
+  - [ ] code (text)
+  - [ ] hard break escape
+  - [ ] html (text)
+  - [ ] label end
+  - [ ] label start (image)
+  - [ ] label start (link)
+- [x] string
+  - [x] character escape
+  - [x] character reference
+
+### Extensions
+
+The main thing here is to figure out if folks could extend from the outside
+with their own code, or if we need to maintain it all here.
+Regardless, it is essential for the launch of `micromark-rs` that extensions
+are theoretically or practically possible.
+The extensions below are listed from top to bottom from more important to less
+important.
+
+- [ ] (1) frontmatter (yaml, toml) (flow)
+  — [`micromark-extension-frontmatter`](https://github.com/micromark/micromark-extension-frontmatter)
+- [ ] (3) autolink literal (GFM) (text)
+  — [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal)
+- [ ] (3) footnote (GFM) (content, text)
+  — [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote)
+- [ ] (3) strikethrough (GFM) (text)
+  — [`micromark-extension-gfm-strikethrough`](https://github.com/micromark/micromark-extension-gfm-strikethrough)
+- [ ] (5) table (GFM) (flow)
+  — [`micromark-extension-gfm-table`](https://github.com/micromark/micromark-extension-gfm-table)
+- [ ] (1) task list item (GFM) (text)
+  — [`micromark-extension-gfm-task-list-item`](https://github.com/micromark/micromark-extension-gfm-task-list-item)
+- [ ] (3) math (flow, text)
+  — [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math)
+- [ ] (8) directive (flow, text)
+  — [`micromark-extension-directive`](https://github.com/micromark/micromark-extension-directive)
+- [ ] (8) expression (MDX) (flow, text)
+  — [`micromark-extension-mdx-expression`](https://github.com/micromark/micromark-extension-mdx-expression)
+- [ ] (5) JSX (MDX) (flow, text)
+  — [`micromark-extension-mdx-jsx`](https://github.com/micromark/micromark-extension-mdx-jsx)
+- [ ] (3) ESM (MDX) (flow)
+  — [`micromark-extension-mdxjs-esm`](https://github.com/micromark/micromark-extension-mdxjs-esm)
+- [ ] (1) tagfilter (GFM) (n/a, renderer)
+  — [`micromark-extension-gfm-tagfilter`](https://github.com/micromark/micromark-extension-gfm-tagfilter)
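The content types in the readme layer into each other: containers hold flow, flow holds content, content holds text, and text holds string-level constructs. As a reading aid only, that hierarchy could be written down as a plain enum; the commit itself defines no such type:

```rust
/// Reading aid only: the readme’s content-type layering as an enum.
/// This type does not exist in the commit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContentType {
    /// Block quotes and lists; wraps flow.
    Container,
    /// Blank lines, code, headings, HTML (flow), thematic breaks.
    Flow,
    /// Definitions, setext headings, paragraphs.
    Content,
    /// Attention, autolinks, code (text), HTML (text), labels.
    Text,
    /// Only character escapes and character references.
    String,
}
```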
diff --git a/src/compiler.rs b/src/compiler.rs
new file mode 100644
index 0000000..166950e
--- /dev/null
+++ b/src/compiler.rs
@@ -0,0 +1,367 @@
+//! Turn events into a string of HTML.
+use crate::construct::character_reference::Kind as CharacterReferenceKind;
+use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::util::{
+    decode_named_character_reference, decode_numeric_character_reference, encode, get_span,
+    slice_serialize,
+};
+
+/// Configuration (optional).
+#[derive(Default, Debug)]
+pub struct CompileOptions {
+    /// Whether to allow (dangerous) HTML.
+    /// The default is `false`; set it to `true` for trusted content.
+    pub allow_dangerous_html: bool,
+}
+
+/// Turn events and codes into a string of HTML.
+#[allow(clippy::too_many_lines)]
+pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String {
+    let mut index = 0;
+    // let mut last_was_tag = false;
+    let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]];
+    let mut atx_opening_sequence_size: Option<usize> = None;
+    let mut atx_heading_buffer: Option<String> = None;
+    let mut code_flow_seen_data: Option<bool> = None;
+    let mut code_fenced_fences_count: Option<usize> = None;
+    let mut slurp_one_line_ending = false;
+    let mut ignore_encode = false;
+    let mut character_reference_kind: Option<CharacterReferenceKind> = None;
+    // let mut slurp_all_line_endings = false;
+
+    println!("events: {:#?}", events);
+
+    while index < events.len() {
+        let event = &events[index];
+        let token_type = &event.token_type;
+
+        match event.event_type {
+            EventType::Enter => match token_type {
+                TokenType::Content => {
+                    buf_tail_mut(buffers).push("<p>".to_string());
+                }
+                TokenType::CodeIndented => {
+                    code_flow_seen_data = Some(false);
+                    line_ending_if_needed(buffers);
+                    buf_tail_mut(buffers).push("<pre><code>".to_string());
+                }
+                TokenType::CodeFenced => {
+                    code_flow_seen_data = Some(false);
+                    line_ending_if_needed(buffers);
+                    // Note: no `>`, which is added later.
+                    buf_tail_mut(buffers).push("<pre><code".to_string());
+                    code_fenced_fences_count = Some(0);
+                }
+                TokenType::CodeFencedFenceInfo | TokenType::CodeFencedFenceMeta => {
+                    buffer(buffers);
+                }
+                TokenType::HtmlFlow => {
+                    line_ending_if_needed(buffers);
+                    if options.allow_dangerous_html {
+                        ignore_encode = true;
+                    }
+                }
+                TokenType::ContentPhrasing
+                | TokenType::AtxHeading
+                | TokenType::AtxHeadingSequence
+                | TokenType::AtxHeadingWhitespace
+                | TokenType::AtxHeadingText
+                | TokenType::LineEnding
+                | TokenType::ThematicBreak
+                | TokenType::ThematicBreakSequence
+                | TokenType::ThematicBreakWhitespace
+                | TokenType::CodeIndentedPrefixWhitespace
+                | TokenType::CodeFlowChunk
+                | TokenType::BlankLineEnding
+                | TokenType::BlankLineWhitespace
+                | TokenType::Whitespace
+                | TokenType::HtmlFlowData
+                | TokenType::CodeFencedFence
+                | TokenType::CodeFencedFenceSequence
+                | TokenType::ChunkString
+                | TokenType::CodeFencedFenceWhitespace
+                | TokenType::Data
+                | TokenType::CharacterEscape
+                | TokenType::CharacterEscapeMarker
+                | TokenType::CharacterEscapeValue
+                | TokenType::CharacterReference
+                | TokenType::CharacterReferenceMarker
+                | TokenType::CharacterReferenceMarkerNumeric
+                | TokenType::CharacterReferenceMarkerHexadecimal
+                | TokenType::CharacterReferenceMarkerSemi
+                | TokenType::CharacterReferenceValue => {}
+                #[allow(unreachable_patterns)]
+                _ => {
+                    unreachable!("unhandled `enter` of TokenType {:?}", token_type)
+                }
+            },
+            EventType::Exit => match token_type {
+                TokenType::ThematicBreakSequence
+                | TokenType::ThematicBreakWhitespace
+                | TokenType::CodeIndentedPrefixWhitespace
+                | TokenType::BlankLineEnding
+                | TokenType::BlankLineWhitespace
+                | TokenType::Whitespace
+                | TokenType::CodeFencedFenceSequence
+                | TokenType::CodeFencedFenceWhitespace
+                | TokenType::CharacterEscape
+                | TokenType::CharacterEscapeMarker
+                | TokenType::CharacterReference
+                | TokenType::CharacterReferenceMarkerSemi => {}
+                TokenType::HtmlFlow => {
+                    ignore_encode = false;
+                }
+                TokenType::HtmlFlowData => {
+                    let slice = slice_serialize(codes, &get_span(events, index), false);
+
+                    let res = if ignore_encode { slice } else { encode(&slice) };
+
+                    // last_was_tag = false;
+                    buf_tail_mut(buffers).push(res);
+                }
+                TokenType::Content => {
+                    buf_tail_mut(buffers).push("</p>".to_string());
+                }
+                TokenType::CodeIndented | TokenType::CodeFenced => {
+                    let seen_data =
+                        code_flow_seen_data.expect("`code_flow_seen_data` must be defined");
+
+                    // To do: containers.
+                    // One special case is if we are inside a container, and the fenced code was
+                    // not closed (meaning it runs to the end).
+                    // In that case, the following line ending is considered *outside* the
+                    // fenced code and block quote by micromark, but CM wants to treat that
+                    // ending as part of the code.
+                    // if fenced_count != None && fenced_count < 2 && tightStack.length > 0 && !last_was_tag {
+                    //     line_ending();
+                    // }
+
+                    // But in most cases, it’s simpler: when we’ve seen some data, emit an extra
+                    // line ending when needed.
+                    if seen_data {
+                        line_ending_if_needed(buffers);
+                    }
+
+                    buf_tail_mut(buffers).push("</code></pre>".to_string());
+
+                    if let Some(count) = code_fenced_fences_count {
+                        if count < 2 {
+                            line_ending_if_needed(buffers);
+                        }
+                    }
+
+                    code_flow_seen_data = None;
+                    code_fenced_fences_count = None;
+                    slurp_one_line_ending = false;
+                }
+                TokenType::CodeFencedFence => {
+                    let count = if let Some(count) = code_fenced_fences_count {
+                        count
+                    } else {
+                        0
+                    };
+
+                    if count == 0 {
+                        buf_tail_mut(buffers).push(">".to_string());
+                        // tag = true;
+                        slurp_one_line_ending = true;
+                    }
+
+                    code_fenced_fences_count = Some(count + 1);
+                }
+                TokenType::CodeFencedFenceInfo => {
+                    let value = resume(buffers);
+                    buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value));
+                    // tag = true;
+                }
+                TokenType::CodeFencedFenceMeta => {
+                    resume(buffers);
+                }
+                TokenType::CodeFlowChunk => {
+                    code_flow_seen_data = Some(true);
+                    buf_tail_mut(buffers).push(encode(&slice_serialize(
+                        codes,
+                        &get_span(events, index),
+                        false,
+                    )));
+                }
+                // `AtxHeadingWhitespace` is ignored after the opening sequence,
+                // before the closing sequence, and after the closing sequence.
+                // But it is used around intermediate sequences.
+                // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`.
+                // `AtxHeadingSequence` is ignored as the opening and closing sequence,
+                // but not when intermediate.
+                TokenType::AtxHeadingWhitespace | TokenType::AtxHeadingSequence => {
+                    if let Some(buf) = atx_heading_buffer {
+                        atx_heading_buffer = Some(
+                            buf.to_string()
+                                + &encode(&slice_serialize(codes, &get_span(events, index), false)),
+                        );
+                    }
+
+                    // First fence we see.
+                    if None == atx_opening_sequence_size {
+                        let rank = slice_serialize(codes, &get_span(events, index), false).len();
+                        atx_opening_sequence_size = Some(rank);
+                        buf_tail_mut(buffers).push(format!("<h{}>", rank));
+                    }
+                }
+                TokenType::AtxHeadingText => {
+                    println!("text: {:?}", atx_heading_buffer);
+                    if let Some(ref buf) = atx_heading_buffer {
+                        if !buf.is_empty() {
+                            buf_tail_mut(buffers).push(encode(buf));
+                            atx_heading_buffer = Some("".to_string());
+                        }
+                    } else {
+                        atx_heading_buffer = Some("".to_string());
+                    }
+
+                    let slice = encode(&slice_serialize(codes, &get_span(events, index), false));
+                    println!("slice: {:?}", slice);
+                    buf_tail_mut(buffers).push(slice);
+                }
+                TokenType::AtxHeading => {
+                    let rank = atx_opening_sequence_size
+                        .expect("`atx_opening_sequence_size` must be set in headings");
+                    buf_tail_mut(buffers).push(format!("</h{}>", rank));
+                    atx_opening_sequence_size = None;
+                    atx_heading_buffer = None;
+                }
+                TokenType::ThematicBreak => {
+                    buf_tail_mut(buffers).push("<hr />".to_string());
+                }
+                TokenType::LineEnding => {
+                    // if slurp_all_line_endings {
+                    //     // Empty.
+                    // } else
+                    if slurp_one_line_ending {
+                        slurp_one_line_ending = false;
+                    // } else if code_text_inside {
+                    //     buf_tail_mut(buffers).push(" ".to_string());
+                    } else {
+                        buf_tail_mut(buffers).push(encode(&slice_serialize(
+                            codes,
+                            &get_span(events, index),
+                            false,
+                        )));
+                    }
+                }
+                TokenType::CharacterReferenceMarker => {
+                    character_reference_kind = Some(CharacterReferenceKind::Named);
+                }
+                TokenType::CharacterReferenceMarkerNumeric => {
+                    character_reference_kind = Some(CharacterReferenceKind::Decimal);
+                }
+                TokenType::CharacterReferenceMarkerHexadecimal => {
+                    character_reference_kind = Some(CharacterReferenceKind::Hexadecimal);
+                }
+                TokenType::CharacterReferenceValue => {
+                    let kind = character_reference_kind
+                        .expect("expected `character_reference_kind` to be set");
+                    let reference = slice_serialize(codes, &get_span(events, index), false);
+                    let ref_string = reference.as_str();
+                    let value = match kind {
+                        CharacterReferenceKind::Decimal => {
+                            decode_numeric_character_reference(ref_string, 10).to_string()
+                        }
+                        CharacterReferenceKind::Hexadecimal => {
+                            decode_numeric_character_reference(ref_string, 16).to_string()
+                        }
+                        CharacterReferenceKind::Named => {
+                            decode_named_character_reference(ref_string)
+                        }
+                    };
+
+                    buf_tail_mut(buffers).push(value);
+
+                    character_reference_kind = None;
+                }
+                // To do: `ContentPhrasing` should be parsed as phrasing first.
+                // This branch below currently acts as the resulting `data` tokens.
+                TokenType::ContentPhrasing
+                // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported.
+                | TokenType::ChunkString
+                | TokenType::Data
+                | TokenType::CharacterEscapeValue => {
+                    // last_was_tag = false;
+                    buf_tail_mut(buffers).push(encode(&slice_serialize(
+                        codes,
+                        &get_span(events, index),
+                        false,
+                    )));
+                }
+                #[allow(unreachable_patterns)]
+                _ => {
+                    unreachable!("unhandled `exit` of TokenType {:?}", token_type)
+                }
+            },
+        }
+
+        index += 1;
+    }
+
+    assert!(buffers.len() == 1, "expected 1 final buffer");
+    buffers.get(0).expect("expected 1 final buffer").concat()
+}
+
+/// Push a buffer.
+fn buffer(buffers: &mut Vec<Vec<String>>) {
+    buffers.push(vec![]);
+}
+
+/// Pop a buffer, returning its value.
+fn resume(buffers: &mut Vec<Vec<String>>) -> String {
+    let buf = buffers.pop().expect("Cannot resume w/o buffer");
+    buf.concat()
+}
+
+/// Get the last chunk of current buffer.
+fn buf_tail_slice(buffers: &mut [Vec<String>]) -> Option<&String> {
+    let tail = buf_tail(buffers);
+    tail.last()
+}
+
+/// Get the mutable last chunk of current buffer.
+fn buf_tail_mut(buffers: &mut [Vec<String>]) -> &mut Vec<String> {
+    buffers
+        .last_mut()
+        .expect("at least one buffer should exist")
+}
+
+/// Get the current buffer.
+fn buf_tail(buffers: &mut [Vec<String>]) -> &Vec<String> {
+    buffers.last().expect("at least one buffer should exist")
+}
+
+/// Add a line ending.
+fn line_ending(buffers: &mut [Vec<String>]) {
+    let tail = buf_tail_mut(buffers);
+    // To do: use inferred line ending style.
+    // lastWasTag = false
+    tail.push("\n".to_string());
+}
+
+/// Add a line ending if needed (as in, there’s no eol/eof already).
+fn line_ending_if_needed(buffers: &mut [Vec<String>]) {
+    let slice = buf_tail_slice(buffers);
+    let last_char = if let Some(x) = slice {
+        x.chars().last()
+    } else {
+        None
+    };
+    let mut add = true;
+
+    if let Some(x) = last_char {
+        if x == '\n' || x == '\r' {
+            add = false;
+        }
+    } else {
+        add = false;
+    }
+
+    if add {
+        line_ending(buffers);
+    }
+}
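`compile` pushes a fresh buffer when it enters a token whose text it needs later (the fence info string, for example), and `resume` pops that buffer so the collected text can be folded into the parent. A self-contained sketch of that pattern, with hypothetical values and a simplified flow rather than the commit's API:

```rust
fn main() {
    // One outer buffer for the document, as in `compile`.
    let mut buffers: Vec<Vec<String>> = vec![vec![]];

    // Enter `CodeFencedFenceInfo`: open a scope to capture the info string.
    buffers.push(vec![]);
    buffers.last_mut().unwrap().push("rust".to_string());

    // Exit: pop the scope (`resume`) and fold it into the parent buffer.
    let info = buffers.pop().unwrap().concat();
    buffers
        .last_mut()
        .unwrap()
        .push(format!("<pre><code class=\"language-{}\">", info));

    // Balanced enter/exit pairs leave exactly one buffer at the end,
    // which is what the `assert!` in `compile` checks.
    assert_eq!(buffers.len(), 1);
    println!("{}", buffers[0].concat());
}
```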
diff --git a/src/constant.rs b/src/constant.rs
new file mode 100644
index 0000000..332fdaf
--- /dev/null
+++ b/src/constant.rs
@@ -0,0 +1,2561 @@
+//! Constants needed to parse markdown.
+//!
+//! Most of these constants are magic numbers, such as the number of markers
+//! needed to parse [code (fenced)][code_fenced]
+//! ([`CODE_FENCED_SEQUENCE_SIZE_MIN`][]) or the max number of allowed markers
+//! in a [heading (atx)][heading_atx]
+//! ([`HEADING_ATX_OPENING_FENCE_SIZE_MAX`][]).
+//!
+//! Some constants are instead lists of things, such as the list of tag names
+//! considered in the **raw** production of [HTML (flow)][html_flow]
+//! ([`HTML_RAW_NAMES`][]), or the list of allowed named character references
+//! ([`CHARACTER_REFERENCE_NAMES`][]).
+//!
+//! [code_fenced]: crate::construct::code_fenced
+//! [heading_atx]: crate::construct::heading_atx
+//! [html_flow]: crate::construct::html_flow
+
+/// The number of characters that form a tab stop.
+///
+/// This relates to the number of whitespace characters needed to form certain
+/// constructs in markdown, most notably the whitespace required to form
+/// [code (indented)][code_indented].
+///
+/// <!-- To do: link to somewhere that discusses virtual spaces. -->
+/// <!-- Ref: https://github.com/syntax-tree/mdast-util-to-markdown/issues/51 -->
+///
+/// [code_indented]: crate::construct::code_indented
+pub const TAB_SIZE: usize = 4;
+
+/// The number of markers needed for a [thematic break][thematic_break] to form.
+///
+/// Like many things in markdown, the number is `3`.
+///
+/// [thematic_break]: crate::construct::thematic_break
pub const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3;
+
+/// The max number of markers allowed to form a [heading (atx)][heading_atx].
+///
+/// This limitation is imposed by HTML, which imposes a max heading rank of
+/// `6`.
+///
+/// [heading_atx]: crate::construct::heading_atx
+pub const HEADING_ATX_OPENING_FENCE_SIZE_MAX: usize = 6;
+
+/// The number of markers needed for [code (fenced)][code_fenced] to form.
+///
+/// Like many things in markdown, the number is `3`.
+///
+/// [code_fenced]: crate::construct::code_fenced
+pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
+
+/// List of HTML tag names that form the **raw** production of
+/// [HTML (flow)][html_flow].
+///
+/// The **raw** production allows blank lines and thus no interleaving with
+/// markdown.
+/// Tag name matching must be performed insensitive to case, and thus this list
+/// includes lowercase tag names.
+///
+/// The length of the longest tag name is also stored as a constant in
+/// [`HTML_RAW_SIZE_MAX`][].
+///
+/// > 👉 **Note**: `textarea` was added in `CommonMark@0.30`.
+///
+/// ## References
+///
+/// * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+///
+/// [html_flow]: crate::construct::html_flow
+pub const HTML_RAW_NAMES: [&str; 4] = ["pre", "script", "style", "textarea"];
+
+/// The length of the longest tag name in [`HTML_RAW_NAMES`][].
+///
+/// This is currently the size of `textarea`.
+pub const HTML_RAW_SIZE_MAX: usize = 8;
+
+/// List of HTML tag names that form the **basic** production of
+/// [HTML (flow)][html_flow].
+///
+/// The **basic** production allows interleaving HTML and markdown with blank
+/// lines and allows flow (block) elements to interrupt content.
+/// Tag name matching must be performed insensitive to case, and thus this list
+/// includes lowercase tag names.
+///
+/// Tag names not on this list result in the **complete** production.
+///
+/// > 👉 **Note**: `source` was removed on `main` of the `CommonMark` spec and
+/// > is slated to be released in `CommonMark@0.31`.
+///
+/// ## References
+///
+/// * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+/// * [*Remove source element as HTML block start condition* as `commonmark/commonmark-spec#710`](https://github.com/commonmark/commonmark-spec/pull/710)
+///
+/// [html_flow]: crate::construct::html_flow
+pub const HTML_BLOCK_NAMES: [&str; 61] = [
+    "address",
+    "article",
+    "aside",
+    "base",
+    "basefont",
+    "blockquote",
+    "body",
+    "caption",
+    "center",
+    "col",
+    "colgroup",
+    "dd",
+    "details",
+    "dialog",
+    "dir",
+    "div",
+    "dl",
+    "dt",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
+    "frame",
+    "frameset",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "head",
+    "header",
+    "hr",
+    "html",
+    "iframe",
+    "legend",
+    "li",
+    "link",
+    "main",
+    "menu",
+    "menuitem",
+    "nav",
+    "noframes",
+    "ol",
+    "optgroup",
+    "option",
+    "p",
+    "param",
+    "section",
+    "summary",
+    "table",
+    "tbody",
+    "td",
+    "tfoot",
+    "th",
+    "thead",
+    "title",
+    "tr",
+    "track",
+    "ul",
+];
+
+/// The max number of characters in a hexadecimal numeric
+/// [character reference][character_reference].
+///
+/// To illustrate, this allows `&#xff9999;` and disallows `&#xff99999;`.
+/// This limit is imposed because all bigger numbers are invalid.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX: usize = 6;
+
+/// The max number of characters in a decimal numeric
+/// [character reference][character_reference].
+///
+/// To illustrate, this allows `&#9999999;` and disallows `&#99999999;`.
+/// This limit is imposed because all bigger numbers are invalid.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_DECIMAL_SIZE_MAX: usize = 7;
+
+/// The max number of characters in a named
+/// [character reference][character_reference].
+///
+/// This is the length of the longest name in [`CHARACTER_REFERENCE_NAMES`][].
+/// It allows `&CounterClockwiseContourIntegral;` and prevents the parser from
+/// continuing for eons.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_NAMED_SIZE_MAX: usize = 31;
+
+/// List of names that can form a named
+/// [character reference][character_reference].
+///
+/// This list is sensitive to casing.
+///
+/// The length of the longest name (`CounterClockwiseContourIntegral`) is also
+/// stored as a constant in [`CHARACTER_REFERENCE_NAMED_SIZE_MAX`][].
+///
+/// The corresponding values of this list are stored in
+/// [`CHARACTER_REFERENCE_VALUES`][].
+/// They correspond through their index.
+/// +/// ## References +/// +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_NAMES: [&str; 2222] = [ + "AEli", + "AElig", + "AM", + "AMP", + "Aacut", + "Aacute", + "Abreve", + "Acir", + "Acirc", + "Acy", + "Afr", + "Agrav", + "Agrave", + "Alpha", + "Amacr", + "And", + "Aogon", + "Aopf", + "ApplyFunction", + "Arin", + "Aring", + "Ascr", + "Assign", + "Atild", + "Atilde", + "Aum", + "Auml", + "Backslash", + "Barv", + "Barwed", + "Bcy", + "Because", + "Bernoullis", + "Beta", + "Bfr", + "Bopf", + "Breve", + "Bscr", + "Bumpeq", + "CHcy", + "COP", + "COPY", + "Cacute", + "Cap", + "CapitalDifferentialD", + "Cayleys", + "Ccaron", + "Ccedi", + "Ccedil", + "Ccirc", + "Cconint", + "Cdot", + "Cedilla", + "CenterDot", + "Cfr", + "Chi", + "CircleDot", + "CircleMinus", + "CirclePlus", + "CircleTimes", + "ClockwiseContourIntegral", + "CloseCurlyDoubleQuote", + "CloseCurlyQuote", + "Colon", + "Colone", + "Congruent", + "Conint", + "ContourIntegral", + "Copf", + "Coproduct", + "CounterClockwiseContourIntegral", + "Cross", + "Cscr", + "Cup", + "CupCap", + "DD", + "DDotrahd", + "DJcy", + "DScy", + "DZcy", + "Dagger", + "Darr", + "Dashv", + "Dcaron", + "Dcy", + "Del", + "Delta", + "Dfr", + "DiacriticalAcute", + "DiacriticalDot", + "DiacriticalDoubleAcute", + "DiacriticalGrave", + "DiacriticalTilde", + "Diamond", + "DifferentialD", + "Dopf", + "Dot", + "DotDot", + "DotEqual", + "DoubleContourIntegral", + "DoubleDot", + "DoubleDownArrow", + "DoubleLeftArrow", + "DoubleLeftRightArrow", + "DoubleLeftTee", + "DoubleLongLeftArrow", + "DoubleLongLeftRightArrow", + "DoubleLongRightArrow", + "DoubleRightArrow", + "DoubleRightTee", + "DoubleUpArrow", + "DoubleUpDownArrow", + "DoubleVerticalBar", + "DownArrow", + "DownArrowBar", + "DownArrowUpArrow", + "DownBreve", + "DownLeftRightVector", + "DownLeftTeeVector", + "DownLeftVector", + "DownLeftVectorBar", + "DownRightTeeVector", + "DownRightVector", + "DownRightVectorBar", + "DownTee", + "DownTeeArrow", + "Downarrow", + "Dscr", + "Dstrok", + "ENG", + "ET", + "ETH", + "Eacut", + "Eacute", + "Ecaron", + "Ecir", + "Ecirc", + "Ecy", + "Edot", + "Efr", + "Egrav", + "Egrave", + "Element", + "Emacr", + "EmptySmallSquare", + "EmptyVerySmallSquare", + "Eogon", + "Eopf", + "Epsilon", + "Equal", + "EqualTilde", + "Equilibrium", + "Escr", + "Esim", + "Eta", + "Eum", + "Euml", + "Exists", + "ExponentialE", + "Fcy", + "Ffr", + "FilledSmallSquare", + "FilledVerySmallSquare", + "Fopf", + "ForAll", + "Fouriertrf", + "Fscr", + "GJcy", + "G", + "GT", + "Gamma", + "Gammad", + "Gbreve", + "Gcedil", + "Gcirc", + "Gcy", + "Gdot", + "Gfr", + "Gg", + "Gopf", + "GreaterEqual", + "GreaterEqualLess", + "GreaterFullEqual", + "GreaterGreater", + "GreaterLess", + "GreaterSlantEqual", + "GreaterTilde", + "Gscr", + "Gt", + "HARDcy", + "Hacek", + "Hat", + "Hcirc", + "Hfr", + "HilbertSpace", + "Hopf", + "HorizontalLine", + "Hscr", + "Hstrok", + "HumpDownHump", + "HumpEqual", + "IEcy", + "IJlig", + "IOcy", + "Iacut", + "Iacute", + "Icir", + "Icirc", + "Icy", + "Idot", + "Ifr", + "Igrav", + "Igrave", + "Im", + "Imacr", + "ImaginaryI", + "Implies", + "Int", + "Integral", + "Intersection", + "InvisibleComma", + "InvisibleTimes", + "Iogon", + "Iopf", + "Iota", + "Iscr", + "Itilde", + "Iukcy", + "Ium", + "Iuml", + "Jcirc", + "Jcy", + "Jfr", + "Jopf", + "Jscr", + "Jsercy", + "Jukcy", + "KHcy", + "KJcy", + "Kappa", 
+ "Kcedil", + "Kcy", + "Kfr", + "Kopf", + "Kscr", + "LJcy", + "L", + "LT", + "Lacute", + "Lambda", + "Lang", + "Laplacetrf", + "Larr", + "Lcaron", + "Lcedil", + "Lcy", + "LeftAngleBracket", + "LeftArrow", + "LeftArrowBar", + "LeftArrowRightArrow", + "LeftCeiling", + "LeftDoubleBracket", + "LeftDownTeeVector", + "LeftDownVector", + "LeftDownVectorBar", + "LeftFloor", + "LeftRightArrow", + "LeftRightVector", + "LeftTee", + "LeftTeeArrow", + "LeftTeeVector", + "LeftTriangle", + "LeftTriangleBar", + "LeftTriangleEqual", + "LeftUpDownVector", + "LeftUpTeeVector", + "LeftUpVector", + "LeftUpVectorBar", + "LeftVector", + "LeftVectorBar", + "Leftarrow", + "Leftrightarrow", + "LessEqualGreater", + "LessFullEqual", + "LessGreater", + "LessLess", + "LessSlantEqual", + "LessTilde", + "Lfr", + "Ll", + "Lleftarrow", + "Lmidot", + "LongLeftArrow", + "LongLeftRightArrow", + "LongRightArrow", + "Longleftarrow", + "Longleftrightarrow", + "Longrightarrow", + "Lopf", + "LowerLeftArrow", + "LowerRightArrow", + "Lscr", + "Lsh", + "Lstrok", + "Lt", + "Map", + "Mcy", + "MediumSpace", + "Mellintrf", + "Mfr", + "MinusPlus", + "Mopf", + "Mscr", + "Mu", + "NJcy", + "Nacute", + "Ncaron", + "Ncedil", + "Ncy", + "NegativeMediumSpace", + "NegativeThickSpace", + "NegativeThinSpace", + "NegativeVeryThinSpace", + "NestedGreaterGreater", + "NestedLessLess", + "NewLine", + "Nfr", + "NoBreak", + "NonBreakingSpace", + "Nopf", + "Not", + "NotCongruent", + "NotCupCap", + "NotDoubleVerticalBar", + "NotElement", + "NotEqual", + "NotEqualTilde", + "NotExists", + "NotGreater", + "NotGreaterEqual", + "NotGreaterFullEqual", + "NotGreaterGreater", + "NotGreaterLess", + "NotGreaterSlantEqual", + "NotGreaterTilde", + "NotHumpDownHump", + "NotHumpEqual", + "NotLeftTriangle", + "NotLeftTriangleBar", + "NotLeftTriangleEqual", + "NotLess", + "NotLessEqual", + "NotLessGreater", + "NotLessLess", + "NotLessSlantEqual", + "NotLessTilde", + "NotNestedGreaterGreater", + "NotNestedLessLess", + "NotPrecedes", + "NotPrecedesEqual", + "NotPrecedesSlantEqual", + "NotReverseElement", + "NotRightTriangle", + "NotRightTriangleBar", + "NotRightTriangleEqual", + "NotSquareSubset", + "NotSquareSubsetEqual", + "NotSquareSuperset", + "NotSquareSupersetEqual", + "NotSubset", + "NotSubsetEqual", + "NotSucceeds", + "NotSucceedsEqual", + "NotSucceedsSlantEqual", + "NotSucceedsTilde", + "NotSuperset", + "NotSupersetEqual", + "NotTilde", + "NotTildeEqual", + "NotTildeFullEqual", + "NotTildeTilde", + "NotVerticalBar", + "Nscr", + "Ntild", + "Ntilde", + "Nu", + "OElig", + "Oacut", + "Oacute", + "Ocir", + "Ocirc", + "Ocy", + "Odblac", + "Ofr", + "Ograv", + "Ograve", + "Omacr", + "Omega", + "Omicron", + "Oopf", + "OpenCurlyDoubleQuote", + "OpenCurlyQuote", + "Or", + "Oscr", + "Oslas", + "Oslash", + "Otild", + "Otilde", + "Otimes", + "Oum", + "Ouml", + "OverBar", + "OverBrace", + "OverBracket", + "OverParenthesis", + "PartialD", + "Pcy", + "Pfr", + "Phi", + "Pi", + "PlusMinus", + "Poincareplane", + "Popf", + "Pr", + "Precedes", + "PrecedesEqual", + "PrecedesSlantEqual", + "PrecedesTilde", + "Prime", + "Product", + "Proportion", + "Proportional", + "Pscr", + "Psi", + "QUO", + "QUOT", + "Qfr", + "Qopf", + "Qscr", + "RBarr", + "RE", + "REG", + "Racute", + "Rang", + "Rarr", + "Rarrtl", + "Rcaron", + "Rcedil", + "Rcy", + "Re", + "ReverseElement", + "ReverseEquilibrium", + "ReverseUpEquilibrium", + "Rfr", + "Rho", + "RightAngleBracket", + "RightArrow", + "RightArrowBar", + "RightArrowLeftArrow", + "RightCeiling", + "RightDoubleBracket", + "RightDownTeeVector", + 
"RightDownVector", + "RightDownVectorBar", + "RightFloor", + "RightTee", + "RightTeeArrow", + "RightTeeVector", + "RightTriangle", + "RightTriangleBar", + "RightTriangleEqual", + "RightUpDownVector", + "RightUpTeeVector", + "RightUpVector", + "RightUpVectorBar", + "RightVector", + "RightVectorBar", + "Rightarrow", + "Ropf", + "RoundImplies", + "Rrightarrow", + "Rscr", + "Rsh", + "RuleDelayed", + "SHCHcy", + "SHcy", + "SOFTcy", + "Sacute", + "Sc", + "Scaron", + "Scedil", + "Scirc", + "Scy", + "Sfr", + "ShortDownArrow", + "ShortLeftArrow", + "ShortRightArrow", + "ShortUpArrow", + "Sigma", + "SmallCircle", + "Sopf", + "Sqrt", + "Square", + "SquareIntersection", + "SquareSubset", + "SquareSubsetEqual", + "SquareSuperset", + "SquareSupersetEqual", + "SquareUnion", + "Sscr", + "Star", + "Sub", + "Subset", + "SubsetEqual", + "Succeeds", + "SucceedsEqual", + "SucceedsSlantEqual", + "SucceedsTilde", + "SuchThat", + "Sum", + "Sup", + "Superset", + "SupersetEqual", + "Supset", + "THOR", + "THORN", + "TRADE", + "TSHcy", + "TScy", + "Tab", + "Tau", + "Tcaron", + "Tcedil", + "Tcy", + "Tfr", + "Therefore", + "Theta", + "ThickSpace", + "ThinSpace", + "Tilde", + "TildeEqual", + "TildeFullEqual", + "TildeTilde", + "Topf", + "TripleDot", + "Tscr", + "Tstrok", + "Uacut", + "Uacute", + "Uarr", + "Uarrocir", + "Ubrcy", + "Ubreve", + "Ucir", + "Ucirc", + "Ucy", + "Udblac", + "Ufr", + "Ugrav", + "Ugrave", + "Umacr", + "UnderBar", + "UnderBrace", + "UnderBracket", + "UnderParenthesis", + "Union", + "UnionPlus", + "Uogon", + "Uopf", + "UpArrow", + "UpArrowBar", + "UpArrowDownArrow", + "UpDownArrow", + "UpEquilibrium", + "UpTee", + "UpTeeArrow", + "Uparrow", + "Updownarrow", + "UpperLeftArrow", + "UpperRightArrow", + "Upsi", + "Upsilon", + "Uring", + "Uscr", + "Utilde", + "Uum", + "Uuml", + "VDash", + "Vbar", + "Vcy", + "Vdash", + "Vdashl", + "Vee", + "Verbar", + "Vert", + "VerticalBar", + "VerticalLine", + "VerticalSeparator", + "VerticalTilde", + "VeryThinSpace", + "Vfr", + "Vopf", + "Vscr", + "Vvdash", + "Wcirc", + "Wedge", + "Wfr", + "Wopf", + "Wscr", + "Xfr", + "Xi", + "Xopf", + "Xscr", + "YAcy", + "YIcy", + "YUcy", + "Yacut", + "Yacute", + "Ycirc", + "Ycy", + "Yfr", + "Yopf", + "Yscr", + "Yuml", + "ZHcy", + "Zacute", + "Zcaron", + "Zcy", + "Zdot", + "ZeroWidthSpace", + "Zeta", + "Zfr", + "Zopf", + "Zscr", + "aacut", + "aacute", + "abreve", + "ac", + "acE", + "acd", + "acir", + "acirc", + "acut", + "acute", + "acy", + "aeli", + "aelig", + "af", + "afr", + "agrav", + "agrave", + "alefsym", + "aleph", + "alpha", + "amacr", + "amalg", + "am", + "amp", + "and", + "andand", + "andd", + "andslope", + "andv", + "ang", + "ange", + "angle", + "angmsd", + "angmsdaa", + "angmsdab", + "angmsdac", + "angmsdad", + "angmsdae", + "angmsdaf", + "angmsdag", + "angmsdah", + "angrt", + "angrtvb", + "angrtvbd", + "angsph", + "angst", + "angzarr", + "aogon", + "aopf", + "ap", + "apE", + "apacir", + "ape", + "apid", + "apos", + "approx", + "approxeq", + "arin", + "aring", + "ascr", + "ast", + "asymp", + "asympeq", + "atild", + "atilde", + "aum", + "auml", + "awconint", + "awint", + "bNot", + "backcong", + "backepsilon", + "backprime", + "backsim", + "backsimeq", + "barvee", + "barwed", + "barwedge", + "bbrk", + "bbrktbrk", + "bcong", + "bcy", + "bdquo", + "becaus", + "because", + "bemptyv", + "bepsi", + "bernou", + "beta", + "beth", + "between", + "bfr", + "bigcap", + "bigcirc", + "bigcup", + "bigodot", + "bigoplus", + "bigotimes", + "bigsqcup", + "bigstar", + "bigtriangledown", + "bigtriangleup", + "biguplus", + "bigvee", + 
"bigwedge", + "bkarow", + "blacklozenge", + "blacksquare", + "blacktriangle", + "blacktriangledown", + "blacktriangleleft", + "blacktriangleright", + "blank", + "blk12", + "blk14", + "blk34", + "block", + "bne", + "bnequiv", + "bnot", + "bopf", + "bot", + "bottom", + "bowtie", + "boxDL", + "boxDR", + "boxDl", + "boxDr", + "boxH", + "boxHD", + "boxHU", + "boxHd", + "boxHu", + "boxUL", + "boxUR", + "boxUl", + "boxUr", + "boxV", + "boxVH", + "boxVL", + "boxVR", + "boxVh", + "boxVl", + "boxVr", + "boxbox", + "boxdL", + "boxdR", + "boxdl", + "boxdr", + "boxh", + "boxhD", + "boxhU", + "boxhd", + "boxhu", + "boxminus", + "boxplus", + "boxtimes", + "boxuL", + "boxuR", + "boxul", + "boxur", + "boxv", + "boxvH", + "boxvL", + "boxvR", + "boxvh", + "boxvl", + "boxvr", + "bprime", + "breve", + "brvba", + "brvbar", + "bscr", + "bsemi", + "bsim", + "bsime", + "bsol", + "bsolb", + "bsolhsub", + "bull", + "bullet", + "bump", + "bumpE", + "bumpe", + "bumpeq", + "cacute", + "cap", + "capand", + "capbrcup", + "capcap", + "capcup", + "capdot", + "caps", + "caret", + "caron", + "ccaps", + "ccaron", + "ccedi", + "ccedil", + "ccirc", + "ccups", + "ccupssm", + "cdot", + "cedi", + "cedil", + "cemptyv", + "cen", + "cent", + "centerdot", + "cfr", + "chcy", + "check", + "checkmark", + "chi", + "cir", + "cirE", + "circ", + "circeq", + "circlearrowleft", + "circlearrowright", + "circledR", + "circledS", + "circledast", + "circledcirc", + "circleddash", + "cire", + "cirfnint", + "cirmid", + "cirscir", + "clubs", + "clubsuit", + "colon", + "colone", + "coloneq", + "comma", + "commat", + "comp", + "compfn", + "complement", + "complexes", + "cong", + "congdot", + "conint", + "copf", + "coprod", + "cop", + "copy", + "copysr", + "crarr", + "cross", + "cscr", + "csub", + "csube", + "csup", + "csupe", + "ctdot", + "cudarrl", + "cudarrr", + "cuepr", + "cuesc", + "cularr", + "cularrp", + "cup", + "cupbrcap", + "cupcap", + "cupcup", + "cupdot", + "cupor", + "cups", + "curarr", + "curarrm", + "curlyeqprec", + "curlyeqsucc", + "curlyvee", + "curlywedge", + "curre", + "curren", + "curvearrowleft", + "curvearrowright", + "cuvee", + "cuwed", + "cwconint", + "cwint", + "cylcty", + "dArr", + "dHar", + "dagger", + "daleth", + "darr", + "dash", + "dashv", + "dbkarow", + "dblac", + "dcaron", + "dcy", + "dd", + "ddagger", + "ddarr", + "ddotseq", + "de", + "deg", + "delta", + "demptyv", + "dfisht", + "dfr", + "dharl", + "dharr", + "diam", + "diamond", + "diamondsuit", + "diams", + "die", + "digamma", + "disin", + "div", + "divid", + "divide", + "divideontimes", + "divonx", + "djcy", + "dlcorn", + "dlcrop", + "dollar", + "dopf", + "dot", + "doteq", + "doteqdot", + "dotminus", + "dotplus", + "dotsquare", + "doublebarwedge", + "downarrow", + "downdownarrows", + "downharpoonleft", + "downharpoonright", + "drbkarow", + "drcorn", + "drcrop", + "dscr", + "dscy", + "dsol", + "dstrok", + "dtdot", + "dtri", + "dtrif", + "duarr", + "duhar", + "dwangle", + "dzcy", + "dzigrarr", + "eDDot", + "eDot", + "eacut", + "eacute", + "easter", + "ecaron", + "ecir", + "ecirc", + "ecolon", + "ecy", + "edot", + "ee", + "efDot", + "efr", + "eg", + "egrav", + "egrave", + "egs", + "egsdot", + "el", + "elinters", + "ell", + "els", + "elsdot", + "emacr", + "empty", + "emptyset", + "emptyv", + "emsp13", + "emsp14", + "emsp", + "eng", + "ensp", + "eogon", + "eopf", + "epar", + "eparsl", + "eplus", + "epsi", + "epsilon", + "epsiv", + "eqcirc", + "eqcolon", + "eqsim", + "eqslantgtr", + "eqslantless", + "equals", + "equest", + "equiv", + "equivDD", + "eqvparsl", + "erDot", + 
"erarr", + "escr", + "esdot", + "esim", + "eta", + "et", + "eth", + "eum", + "euml", + "euro", + "excl", + "exist", + "expectation", + "exponentiale", + "fallingdotseq", + "fcy", + "female", + "ffilig", + "fflig", + "ffllig", + "ffr", + "filig", + "fjlig", + "flat", + "fllig", + "fltns", + "fnof", + "fopf", + "forall", + "fork", + "forkv", + "fpartint", + "frac1", + "frac12", + "frac13", + "frac14", + "frac15", + "frac16", + "frac18", + "frac23", + "frac25", + "frac3", + "frac34", + "frac35", + "frac38", + "frac45", + "frac56", + "frac58", + "frac78", + "frasl", + "frown", + "fscr", + "gE", + "gEl", + "gacute", + "gamma", + "gammad", + "gap", + "gbreve", + "gcirc", + "gcy", + "gdot", + "ge", + "gel", + "geq", + "geqq", + "geqslant", + "ges", + "gescc", + "gesdot", + "gesdoto", + "gesdotol", + "gesl", + "gesles", + "gfr", + "gg", + "ggg", + "gimel", + "gjcy", + "gl", + "glE", + "gla", + "glj", + "gnE", + "gnap", + "gnapprox", + "gne", + "gneq", + "gneqq", + "gnsim", + "gopf", + "grave", + "gscr", + "gsim", + "gsime", + "gsiml", + "g", + "gt", + "gtcc", + "gtcir", + "gtdot", + "gtlPar", + "gtquest", + "gtrapprox", + "gtrarr", + "gtrdot", + "gtreqless", + "gtreqqless", + "gtrless", + "gtrsim", + "gvertneqq", + "gvnE", + "hArr", + "hairsp", + "half", + "hamilt", + "hardcy", + "harr", + "harrcir", + "harrw", + "hbar", + "hcirc", + "hearts", + "heartsuit", + "hellip", + "hercon", + "hfr", + "hksearow", + "hkswarow", + "hoarr", + "homtht", + "hookleftarrow", + "hookrightarrow", + "hopf", + "horbar", + "hscr", + "hslash", + "hstrok", + "hybull", + "hyphen", + "iacut", + "iacute", + "ic", + "icir", + "icirc", + "icy", + "iecy", + "iexc", + "iexcl", + "iff", + "ifr", + "igrav", + "igrave", + "ii", + "iiiint", + "iiint", + "iinfin", + "iiota", + "ijlig", + "imacr", + "image", + "imagline", + "imagpart", + "imath", + "imof", + "imped", + "in", + "incare", + "infin", + "infintie", + "inodot", + "int", + "intcal", + "integers", + "intercal", + "intlarhk", + "intprod", + "iocy", + "iogon", + "iopf", + "iota", + "iprod", + "iques", + "iquest", + "iscr", + "isin", + "isinE", + "isindot", + "isins", + "isinsv", + "isinv", + "it", + "itilde", + "iukcy", + "ium", + "iuml", + "jcirc", + "jcy", + "jfr", + "jmath", + "jopf", + "jscr", + "jsercy", + "jukcy", + "kappa", + "kappav", + "kcedil", + "kcy", + "kfr", + "kgreen", + "khcy", + "kjcy", + "kopf", + "kscr", + "lAarr", + "lArr", + "lAtail", + "lBarr", + "lE", + "lEg", + "lHar", + "lacute", + "laemptyv", + "lagran", + "lambda", + "lang", + "langd", + "langle", + "lap", + "laqu", + "laquo", + "larr", + "larrb", + "larrbfs", + "larrfs", + "larrhk", + "larrlp", + "larrpl", + "larrsim", + "larrtl", + "lat", + "latail", + "late", + "lates", + "lbarr", + "lbbrk", + "lbrace", + "lbrack", + "lbrke", + "lbrksld", + "lbrkslu", + "lcaron", + "lcedil", + "lceil", + "lcub", + "lcy", + "ldca", + "ldquo", + "ldquor", + "ldrdhar", + "ldrushar", + "ldsh", + "le", + "leftarrow", + "leftarrowtail", + "leftharpoondown", + "leftharpoonup", + "leftleftarrows", + "leftrightarrow", + "leftrightarrows", + "leftrightharpoons", + "leftrightsquigarrow", + "leftthreetimes", + "leg", + "leq", + "leqq", + "leqslant", + "les", + "lescc", + "lesdot", + "lesdoto", + "lesdotor", + "lesg", + "lesges", + "lessapprox", + "lessdot", + "lesseqgtr", + "lesseqqgtr", + "lessgtr", + "lesssim", + "lfisht", + "lfloor", + "lfr", + "lg", + "lgE", + "lhard", + "lharu", + "lharul", + "lhblk", + "ljcy", + "ll", + "llarr", + "llcorner", + "llhard", + "lltri", + "lmidot", + "lmoust", + "lmoustache", + "lnE", + 
"lnap", + "lnapprox", + "lne", + "lneq", + "lneqq", + "lnsim", + "loang", + "loarr", + "lobrk", + "longleftarrow", + "longleftrightarrow", + "longmapsto", + "longrightarrow", + "looparrowleft", + "looparrowright", + "lopar", + "lopf", + "loplus", + "lotimes", + "lowast", + "lowbar", + "loz", + "lozenge", + "lozf", + "lpar", + "lparlt", + "lrarr", + "lrcorner", + "lrhar", + "lrhard", + "lrm", + "lrtri", + "lsaquo", + "lscr", + "lsh", + "lsim", + "lsime", + "lsimg", + "lsqb", + "lsquo", + "lsquor", + "lstrok", + "l", + "lt", + "ltcc", + "ltcir", + "ltdot", + "lthree", + "ltimes", + "ltlarr", + "ltquest", + "ltrPar", + "ltri", + "ltrie", + "ltrif", + "lurdshar", + "luruhar", + "lvertneqq", + "lvnE", + "mDDot", + "mac", + "macr", + "male", + "malt", + "maltese", + "map", + "mapsto", + "mapstodown", + "mapstoleft", + "mapstoup", + "marker", + "mcomma", + "mcy", + "mdash", + "measuredangle", + "mfr", + "mho", + "micr", + "micro", + "mid", + "midast", + "midcir", + "middo", + "middot", + "minus", + "minusb", + "minusd", + "minusdu", + "mlcp", + "mldr", + "mnplus", + "models", + "mopf", + "mp", + "mscr", + "mstpos", + "mu", + "multimap", + "mumap", + "nGg", + "nGt", + "nGtv", + "nLeftarrow", + "nLeftrightarrow", + "nLl", + "nLt", + "nLtv", + "nRightarrow", + "nVDash", + "nVdash", + "nabla", + "nacute", + "nang", + "nap", + "napE", + "napid", + "napos", + "napprox", + "natur", + "natural", + "naturals", + "nbs", + "nbsp", + "nbump", + "nbumpe", + "ncap", + "ncaron", + "ncedil", + "ncong", + "ncongdot", + "ncup", + "ncy", + "ndash", + "ne", + "neArr", + "nearhk", + "nearr", + "nearrow", + "nedot", + "nequiv", + "nesear", + "nesim", + "nexist", + "nexists", + "nfr", + "ngE", + "nge", + "ngeq", + "ngeqq", + "ngeqslant", + "nges", + "ngsim", + "ngt", + "ngtr", + "nhArr", + "nharr", + "nhpar", + "ni", + "nis", + "nisd", + "niv", + "njcy", + "nlArr", + "nlE", + "nlarr", + "nldr", + "nle", + "nleftarrow", + "nleftrightarrow", + "nleq", + "nleqq", + "nleqslant", + "nles", + "nless", + "nlsim", + "nlt", + "nltri", + "nltrie", + "nmid", + "nopf", + "no", + "not", + "notin", + "notinE", + "notindot", + "notinva", + "notinvb", + "notinvc", + "notni", + "notniva", + "notnivb", + "notnivc", + "npar", + "nparallel", + "nparsl", + "npart", + "npolint", + "npr", + "nprcue", + "npre", + "nprec", + "npreceq", + "nrArr", + "nrarr", + "nrarrc", + "nrarrw", + "nrightarrow", + "nrtri", + "nrtrie", + "nsc", + "nsccue", + "nsce", + "nscr", + "nshortmid", + "nshortparallel", + "nsim", + "nsime", + "nsimeq", + "nsmid", + "nspar", + "nsqsube", + "nsqsupe", + "nsub", + "nsubE", + "nsube", + "nsubset", + "nsubseteq", + "nsubseteqq", + "nsucc", + "nsucceq", + "nsup", + "nsupE", + "nsupe", + "nsupset", + "nsupseteq", + "nsupseteqq", + "ntgl", + "ntild", + "ntilde", + "ntlg", + "ntriangleleft", + "ntrianglelefteq", + "ntriangleright", + "ntrianglerighteq", + "nu", + "num", + "numero", + "numsp", + "nvDash", + "nvHarr", + "nvap", + "nvdash", + "nvge", + "nvgt", + "nvinfin", + "nvlArr", + "nvle", + "nvlt", + "nvltrie", + "nvrArr", + "nvrtrie", + "nvsim", + "nwArr", + "nwarhk", + "nwarr", + "nwarrow", + "nwnear", + "oS", + "oacut", + "oacute", + "oast", + "ocir", + "ocirc", + "ocy", + "odash", + "odblac", + "odiv", + "odot", + "odsold", + "oelig", + "ofcir", + "ofr", + "ogon", + "ograv", + "ograve", + "ogt", + "ohbar", + "ohm", + "oint", + "olarr", + "olcir", + "olcross", + "oline", + "olt", + "omacr", + "omega", + "omicron", + "omid", + "ominus", + "oopf", + "opar", + "operp", + "oplus", + "or", + "orarr", + "ord", + "order", + 
"orderof", + "ordf", + "ordm", + "origof", + "oror", + "orslope", + "orv", + "oscr", + "oslas", + "oslash", + "osol", + "otild", + "otilde", + "otimes", + "otimesas", + "oum", + "ouml", + "ovbar", + "par", + "para", + "parallel", + "parsim", + "parsl", + "part", + "pcy", + "percnt", + "period", + "permil", + "perp", + "pertenk", + "pfr", + "phi", + "phiv", + "phmmat", + "phone", + "pi", + "pitchfork", + "piv", + "planck", + "planckh", + "plankv", + "plus", + "plusacir", + "plusb", + "pluscir", + "plusdo", + "plusdu", + "pluse", + "plusm", + "plusmn", + "plussim", + "plustwo", + "pm", + "pointint", + "popf", + "poun", + "pound", + "pr", + "prE", + "prap", + "prcue", + "pre", + "prec", + "precapprox", + "preccurlyeq", + "preceq", + "precnapprox", + "precneqq", + "precnsim", + "precsim", + "prime", + "primes", + "prnE", + "prnap", + "prnsim", + "prod", + "profalar", + "profline", + "profsurf", + "prop", + "propto", + "prsim", + "prurel", + "pscr", + "psi", + "puncsp", + "qfr", + "qint", + "qopf", + "qprime", + "qscr", + "quaternions", + "quatint", + "quest", + "questeq", + "quo", + "quot", + "rAarr", + "rArr", + "rAtail", + "rBarr", + "rHar", + "race", + "racute", + "radic", + "raemptyv", + "rang", + "rangd", + "range", + "rangle", + "raqu", + "raquo", + "rarr", + "rarrap", + "rarrb", + "rarrbfs", + "rarrc", + "rarrfs", + "rarrhk", + "rarrlp", + "rarrpl", + "rarrsim", + "rarrtl", + "rarrw", + "ratail", + "ratio", + "rationals", + "rbarr", + "rbbrk", + "rbrace", + "rbrack", + "rbrke", + "rbrksld", + "rbrkslu", + "rcaron", + "rcedil", + "rceil", + "rcub", + "rcy", + "rdca", + "rdldhar", + "rdquo", + "rdquor", + "rdsh", + "real", + "realine", + "realpart", + "reals", + "rect", + "re", + "reg", + "rfisht", + "rfloor", + "rfr", + "rhard", + "rharu", + "rharul", + "rho", + "rhov", + "rightarrow", + "rightarrowtail", + "rightharpoondown", + "rightharpoonup", + "rightleftarrows", + "rightleftharpoons", + "rightrightarrows", + "rightsquigarrow", + "rightthreetimes", + "ring", + "risingdotseq", + "rlarr", + "rlhar", + "rlm", + "rmoust", + "rmoustache", + "rnmid", + "roang", + "roarr", + "robrk", + "ropar", + "ropf", + "roplus", + "rotimes", + "rpar", + "rpargt", + "rppolint", + "rrarr", + "rsaquo", + "rscr", + "rsh", + "rsqb", + "rsquo", + "rsquor", + "rthree", + "rtimes", + "rtri", + "rtrie", + "rtrif", + "rtriltri", + "ruluhar", + "rx", + "sacute", + "sbquo", + "sc", + "scE", + "scap", + "scaron", + "sccue", + "sce", + "scedil", + "scirc", + "scnE", + "scnap", + "scnsim", + "scpolint", + "scsim", + "scy", + "sdot", + "sdotb", + "sdote", + "seArr", + "searhk", + "searr", + "searrow", + "sec", + "sect", + "semi", + "seswar", + "setminus", + "setmn", + "sext", + "sfr", + "sfrown", + "sharp", + "shchcy", + "shcy", + "shortmid", + "shortparallel", + "sh", + "shy", + "sigma", + "sigmaf", + "sigmav", + "sim", + "simdot", + "sime", + "simeq", + "simg", + "simgE", + "siml", + "simlE", + "simne", + "simplus", + "simrarr", + "slarr", + "smallsetminus", + "smashp", + "smeparsl", + "smid", + "smile", + "smt", + "smte", + "smtes", + "softcy", + "sol", + "solb", + "solbar", + "sopf", + "spades", + "spadesuit", + "spar", + "sqcap", + "sqcaps", + "sqcup", + "sqcups", + "sqsub", + "sqsube", + "sqsubset", + "sqsubseteq", + "sqsup", + "sqsupe", + "sqsupset", + "sqsupseteq", + "squ", + "square", + "squarf", + "squf", + "srarr", + "sscr", + "ssetmn", + "ssmile", + "sstarf", + "star", + "starf", + "straightepsilon", + "straightphi", + "strns", + "sub", + "subE", + "subdot", + "sube", + "subedot", + "submult", + "subnE", + 
"subne", + "subplus", + "subrarr", + "subset", + "subseteq", + "subseteqq", + "subsetneq", + "subsetneqq", + "subsim", + "subsub", + "subsup", + "succ", + "succapprox", + "succcurlyeq", + "succeq", + "succnapprox", + "succneqq", + "succnsim", + "succsim", + "sum", + "sung", + "sup", + "sup1", + "sup2", + "sup3", + "supE", + "supdot", + "supdsub", + "supe", + "supedot", + "suphsol", + "suphsub", + "suplarr", + "supmult", + "supnE", + "supne", + "supplus", + "supset", + "supseteq", + "supseteqq", + "supsetneq", + "supsetneqq", + "supsim", + "supsub", + "supsup", + "swArr", + "swarhk", + "swarr", + "swarrow", + "swnwar", + "szli", + "szlig", + "target", + "tau", + "tbrk", + "tcaron", + "tcedil", + "tcy", + "tdot", + "telrec", + "tfr", + "there4", + "therefore", + "theta", + "thetasym", + "thetav", + "thickapprox", + "thicksim", + "thinsp", + "thkap", + "thksim", + "thor", + "thorn", + "tilde", + "time", + "times", + "timesb", + "timesbar", + "timesd", + "tint", + "toea", + "top", + "topbot", + "topcir", + "topf", + "topfork", + "tosa", + "tprime", + "trade", + "triangle", + "triangledown", + "triangleleft", + "trianglelefteq", + "triangleq", + "triangleright", + "trianglerighteq", + "tridot", + "trie", + "triminus", + "triplus", + "trisb", + "tritime", + "trpezium", + "tscr", + "tscy", + "tshcy", + "tstrok", + "twixt", + "twoheadleftarrow", + "twoheadrightarrow", + "uArr", + "uHar", + "uacut", + "uacute", + "uarr", + "ubrcy", + "ubreve", + "ucir", + "ucirc", + "ucy", + "udarr", + "udblac", + "udhar", + "ufisht", + "ufr", + "ugrav", + "ugrave", + "uharl", + "uharr", + "uhblk", + "ulcorn", + "ulcorner", + "ulcrop", + "ultri", + "umacr", + "um", + "uml", + "uogon", + "uopf", + "uparrow", + "updownarrow", + "upharpoonleft", + "upharpoonright", + "uplus", + "upsi", + "upsih", + "upsilon", + "upuparrows", + "urcorn", + "urcorner", + "urcrop", + "uring", + "urtri", + "uscr", + "utdot", + "utilde", + "utri", + "utrif", + "uuarr", + "uum", + "uuml", + "uwangle", + "vArr", + "vBar", + "vBarv", + "vDash", + "vangrt", + "varepsilon", + "varkappa", + "varnothing", + "varphi", + "varpi", + "varpropto", + "varr", + "varrho", + "varsigma", + "varsubsetneq", + "varsubsetneqq", + "varsupsetneq", + "varsupsetneqq", + "vartheta", + "vartriangleleft", + "vartriangleright", + "vcy", + "vdash", + "vee", + "veebar", + "veeeq", + "vellip", + "verbar", + "vert", + "vfr", + "vltri", + "vnsub", + "vnsup", + "vopf", + "vprop", + "vrtri", + "vscr", + "vsubnE", + "vsubne", + "vsupnE", + "vsupne", + "vzigzag", + "wcirc", + "wedbar", + "wedge", + "wedgeq", + "weierp", + "wfr", + "wopf", + "wp", + "wr", + "wreath", + "wscr", + "xcap", + "xcirc", + "xcup", + "xdtri", + "xfr", + "xhArr", + "xharr", + "xi", + "xlArr", + "xlarr", + "xmap", + "xnis", + "xodot", + "xopf", + "xoplus", + "xotime", + "xrArr", + "xrarr", + "xscr", + "xsqcup", + "xuplus", + "xutri", + "xvee", + "xwedge", + "yacut", + "yacute", + "yacy", + "ycirc", + "ycy", + "ye", + "yen", + "yfr", + "yicy", + "yopf", + "yscr", + "yucy", + "yum", + "yuml", + "zacute", + "zcaron", + "zcy", + "zdot", + "zeetrf", + "zeta", + "zfr", + "zhcy", + "zigrarr", + "zopf", + "zscr", + "zwj", + "zwnj", +]; + +/// List of values corresponding to names of named +/// [character references][character_reference]. +/// +/// The corresponding names of this list are stored in +/// [`CHARACTER_REFERENCE_NAMES`][]. +/// They correspond through their index. 
+/// +/// ## References +/// +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_VALUES: [&str; 2222] = [ + "Æ", "Æ", "&", "&", "Á", "Á", "Ă", "Â", "Â", "А", "𝔄", "À", "À", "Α", "Ā", "⩓", "Ą", "𝔸", "", + "Å", "Å", "𝒜", "≔", "Ã", "Ã", "Ä", "Ä", "∖", "⫧", "⌆", "Б", "∵", "ℬ", "Β", "𝔅", "𝔹", "˘", "ℬ", + "≎", "Ч", "©", "©", "Ć", "⋒", "ⅅ", "ℭ", "Č", "Ç", "Ç", "Ĉ", "∰", "Ċ", "¸", "·", "ℭ", "Χ", "⊙", + "⊖", "⊕", "⊗", "∲", "”", "’", "∷", "⩴", "≡", "∯", "∮", "ℂ", "∐", "∳", "⨯", "𝒞", "⋓", "≍", "ⅅ", + "⤑", "Ђ", "Ѕ", "Џ", "‡", "↡", "⫤", "Ď", "Д", "∇", "Δ", "𝔇", "´", "˙", "˝", "`", "˜", "⋄", "ⅆ", + "𝔻", "¨", "⃜", "≐", "∯", "¨", "⇓", "⇐", "⇔", "⫤", "⟸", "⟺", "⟹", "⇒", "⊨", "⇑", "⇕", "∥", "↓", + "⤓", "⇵", "̑", "⥐", "⥞", "↽", "⥖", "⥟", "⇁", "⥗", "⊤", "↧", "⇓", "𝒟", "Đ", "Ŋ", "Ð", "Ð", "É", + "É", "Ě", "Ê", "Ê", "Э", "Ė", "𝔈", "È", "È", "∈", "Ē", "◻", "▫", "Ę", "𝔼", "Ε", "⩵", "≂", "⇌", + "ℰ", "⩳", "Η", "Ë", "Ë", "∃", "ⅇ", "Ф", "𝔉", "◼", "▪", "𝔽", "∀", "ℱ", "ℱ", "Ѓ", ">", ">", "Γ", + "Ϝ", "Ğ", "Ģ", "Ĝ", "Г", "Ġ", "𝔊", "⋙", "𝔾", "≥", "⋛", "≧", "⪢", "≷", "⩾", "≳", "𝒢", "≫", "Ъ", + "ˇ", "^", "Ĥ", "ℌ", "ℋ", "ℍ", "─", "ℋ", "Ħ", "≎", "≏", "Е", "IJ", "Ё", "Í", "Í", "Î", "Î", "И", + "İ", "ℑ", "Ì", "Ì", "ℑ", "Ī", "ⅈ", "⇒", "∬", "∫", "⋂", "", "", "Į", "𝕀", "Ι", "ℐ", "Ĩ", "І", + "Ï", "Ï", "Ĵ", "Й", "𝔍", "𝕁", "𝒥", "Ј", "Є", "Х", "Ќ", "Κ", "Ķ", "К", "𝔎", "𝕂", "𝒦", "Љ", "<", + "<", "Ĺ", "Λ", "⟪", "ℒ", "↞", "Ľ", "Ļ", "Л", "⟨", "←", "⇤", "⇆", "⌈", "⟦", "⥡", "⇃", "⥙", "⌊", + "↔", "⥎", "⊣", "↤", "⥚", "⊲", "⧏", "⊴", "⥑", "⥠", "↿", "⥘", "↼", "⥒", "⇐", "⇔", "⋚", "≦", "≶", + "⪡", "⩽", "≲", "𝔏", "⋘", "⇚", "Ŀ", "⟵", "⟷", "⟶", "⟸", "⟺", "⟹", "𝕃", "↙", "↘", "ℒ", "↰", "Ł", + "≪", "⤅", "М", " ", "ℳ", "𝔐", "∓", "𝕄", "ℳ", "Μ", "Њ", "Ń", "Ň", "Ņ", "Н", "\u{200B}", + "\u{200B}", "\u{200B}", "\u{200B}", "≫", "≪", "\n", "𝔑", "\u{2060}", " ", "ℕ", "⫬", "≢", "≭", + "∦", "∉", "≠", "≂̸", "∄", "≯", "≱", "≧̸", "≫̸", "≹", "⩾̸", "≵", "≎̸", "≏̸", "⋪", "⧏̸", "⋬", "≮", "≰", + "≸", "≪̸", "⩽̸", "≴", "⪢̸", "⪡̸", "⊀", "⪯̸", "⋠", "∌", "⋫", "⧐̸", "⋭", "⊏̸", "⋢", "⊐̸", "⋣", "⊂⃒", "⊈", + "⊁", "⪰̸", "⋡", "≿̸", "⊃⃒", "⊉", "≁", "≄", "≇", "≉", "∤", "𝒩", "Ñ", "Ñ", "Ν", "Œ", "Ó", "Ó", "Ô", + "Ô", "О", "Ő", "𝔒", "Ò", "Ò", "Ō", "Ω", "Ο", "𝕆", "“", "‘", "⩔", "𝒪", "Ø", "Ø", "Õ", "Õ", "⨷", + "Ö", "Ö", "‾", "⏞", "⎴", "⏜", "∂", "П", "𝔓", "Φ", "Π", "±", "ℌ", "ℙ", "⪻", "≺", "⪯", "≼", "≾", + "″", "∏", "∷", "∝", "𝒫", "Ψ", "\"", "\"", "𝔔", "ℚ", "𝒬", "⤐", "®", "®", "Ŕ", "⟫", "↠", "⤖", + "Ř", "Ŗ", "Р", "ℜ", "∋", "⇋", "⥯", "ℜ", "Ρ", "⟩", "→", "⇥", "⇄", "⌉", "⟧", "⥝", "⇂", "⥕", "⌋", + "⊢", "↦", "⥛", "⊳", "⧐", "⊵", "⥏", "⥜", "↾", "⥔", "⇀", "⥓", "⇒", "ℝ", "⥰", "⇛", "ℛ", "↱", "⧴", + "Щ", "Ш", "Ь", "Ś", "⪼", "Š", "Ş", "Ŝ", "С", "𝔖", "↓", "←", "→", "↑", "Σ", "∘", "𝕊", "√", "□", + "⊓", "⊏", "⊑", "⊐", "⊒", "⊔", "𝒮", "⋆", "⋐", "⋐", "⊆", "≻", "⪰", "≽", "≿", "∋", "∑", "⋑", "⊃", + "⊇", "⋑", "Þ", "Þ", "™", "Ћ", "Ц", "\t", "Τ", "Ť", "Ţ", "Т", "𝔗", "∴", "Θ", " ", " ", "∼", + "≃", "≅", "≈", "𝕋", "⃛", "𝒯", "Ŧ", "Ú", "Ú", "↟", "⥉", "Ў", "Ŭ", "Û", "Û", "У", "Ű", "𝔘", "Ù", + "Ù", "Ū", "_", "⏟", "⎵", "⏝", "⋃", "⊎", "Ų", "𝕌", "↑", "⤒", "⇅", "↕", "⥮", "⊥", "↥", "⇑", "⇕", + "↖", "↗", "ϒ", "Υ", "Ů", "𝒰", "Ũ", "Ü", "Ü", "⊫", "⫫", "В", "⊩", "⫦", "⋁", "‖", "‖", "∣", "|", + "❘", "≀", " ", "𝔙", "𝕍", "𝒱", "⊪", "Ŵ", "⋀", "𝔚", "𝕎", "𝒲", "𝔛", "Ξ", "𝕏", "𝒳", "Я", "Ї", "Ю", + "Ý", "Ý", "Ŷ", "Ы", "𝔜", "𝕐", "𝒴", "Ÿ", "Ж", 
"Ź", "Ž", "З", "Ż", "\u{200B}", "Ζ", "ℨ", "ℤ", + "𝒵", "á", "á", "ă", "∾", "∾̳", "∿", "â", "â", "´", "´", "а", "æ", "æ", "", "𝔞", "à", "à", "ℵ", + "ℵ", "α", "ā", "⨿", "&", "&", "∧", "⩕", "⩜", "⩘", "⩚", "∠", "⦤", "∠", "∡", "⦨", "⦩", "⦪", "⦫", + "⦬", "⦭", "⦮", "⦯", "∟", "⊾", "⦝", "∢", "Å", "⍼", "ą", "𝕒", "≈", "⩰", "⩯", "≊", "≋", "'", "≈", + "≊", "å", "å", "𝒶", "*", "≈", "≍", "ã", "ã", "ä", "ä", "∳", "⨑", "⫭", "≌", "϶", "‵", "∽", "⋍", + "⊽", "⌅", "⌅", "⎵", "⎶", "≌", "б", "„", "∵", "∵", "⦰", "϶", "ℬ", "β", "ℶ", "≬", "𝔟", "⋂", "◯", + "⋃", "⨀", "⨁", "⨂", "⨆", "★", "▽", "△", "⨄", "⋁", "⋀", "⤍", "⧫", "▪", "▴", "▾", "◂", "▸", "␣", + "▒", "░", "▓", "█", "=⃥", "≡⃥", "⌐", "𝕓", "⊥", "⊥", "⋈", "╗", "╔", "╖", "╓", "═", "╦", "╩", "╤", + "╧", "╝", "╚", "╜", "╙", "║", "╬", "╣", "╠", "╫", "╢", "╟", "⧉", "╕", "╒", "┐", "┌", "─", "╥", + "╨", "┬", "┴", "⊟", "⊞", "⊠", "╛", "╘", "┘", "└", "│", "╪", "╡", "╞", "┼", "┤", "├", "‵", "˘", + "¦", "¦", "𝒷", "⁏", "∽", "⋍", "\\", "⧅", "⟈", "•", "•", "≎", "⪮", "≏", "≏", "ć", "∩", "⩄", "⩉", + "⩋", "⩇", "⩀", "∩︀", "⁁", "ˇ", "⩍", "č", "ç", "ç", "ĉ", "⩌", "⩐", "ċ", "¸", "¸", "⦲", "¢", "¢", + "·", "𝔠", "ч", "✓", "✓", "χ", "○", "⧃", "ˆ", "≗", "↺", "↻", "®", "Ⓢ", "⊛", "⊚", "⊝", "≗", "⨐", + "⫯", "⧂", "♣", "♣", ":", "≔", "≔", ",", "@", "∁", "∘", "∁", "ℂ", "≅", "⩭", "∮", "𝕔", "∐", "©", + "©", "℗", "↵", "✗", "𝒸", "⫏", "⫑", "⫐", "⫒", "⋯", "⤸", "⤵", "⋞", "⋟", "↶", "⤽", "∪", "⩈", "⩆", + "⩊", "⊍", "⩅", "∪︀", "↷", "⤼", "⋞", "⋟", "⋎", "⋏", "¤", "¤", "↶", "↷", "⋎", "⋏", "∲", "∱", "⌭", + "⇓", "⥥", "†", "ℸ", "↓", "‐", "⊣", "⤏", "˝", "ď", "д", "ⅆ", "‡", "⇊", "⩷", "°", "°", "δ", "⦱", + "⥿", "𝔡", "⇃", "⇂", "⋄", "⋄", "♦", "♦", "¨", "ϝ", "⋲", "÷", "÷", "÷", "⋇", "⋇", "ђ", "⌞", "⌍", + "$", "𝕕", "˙", "≐", "≑", "∸", "∔", "⊡", "⌆", "↓", "⇊", "⇃", "⇂", "⤐", "⌟", "⌌", "𝒹", "ѕ", "⧶", + "đ", "⋱", "▿", "▾", "⇵", "⥯", "⦦", "џ", "⟿", "⩷", "≑", "é", "é", "⩮", "ě", "ê", "ê", "≕", "э", + "ė", "ⅇ", "≒", "𝔢", "⪚", "è", "è", "⪖", "⪘", "⪙", "⏧", "ℓ", "⪕", "⪗", "ē", "∅", "∅", "∅", " ", + " ", " ", "ŋ", " ", "ę", "𝕖", "⋕", "⧣", "⩱", "ε", "ε", "ϵ", "≖", "≕", "≂", "⪖", "⪕", "=", "≟", + "≡", "⩸", "⧥", "≓", "⥱", "ℯ", "≐", "≂", "η", "ð", "ð", "ë", "ë", "€", "!", "∃", "ℰ", "ⅇ", "≒", + "ф", "♀", "ffi", "ff", "ffl", "𝔣", "fi", "fj", "♭", "fl", "▱", "ƒ", "𝕗", "∀", "⋔", "⫙", "⨍", "¼", "½", + "⅓", "¼", "⅕", "⅙", "⅛", "⅔", "⅖", "¾", "¾", "⅗", "⅜", "⅘", "⅚", "⅝", "⅞", "⁄", "⌢", "𝒻", "≧", + "⪌", "ǵ", "γ", "ϝ", "⪆", "ğ", "ĝ", "г", "ġ", "≥", "⋛", "≥", "≧", "⩾", "⩾", "⪩", "⪀", "⪂", "⪄", + "⋛︀", "⪔", "𝔤", "≫", "⋙", "ℷ", "ѓ", "≷", "⪒", "⪥", "⪤", "≩", "⪊", "⪊", "⪈", "⪈", "≩", "⋧", "𝕘", + "`", "ℊ", "≳", "⪎", "⪐", ">", ">", "⪧", "⩺", "⋗", "⦕", "⩼", "⪆", "⥸", "⋗", "⋛", "⪌", "≷", "≳", + "≩︀", "≩︀", "⇔", " ", "½", "ℋ", "ъ", "↔", "⥈", "↭", "ℏ", "ĥ", "♥", "♥", "…", "⊹", "𝔥", "⤥", "⤦", + "⇿", "∻", "↩", "↪", "𝕙", "―", "𝒽", "ℏ", "ħ", "⁃", "‐", "í", "í", "", "î", "î", "и", "е", "¡", + "¡", "⇔", "𝔦", "ì", "ì", "ⅈ", "⨌", "∭", "⧜", "℩", "ij", "ī", "ℑ", "ℐ", "ℑ", "ı", "⊷", "Ƶ", "∈", + "℅", "∞", "⧝", "ı", "∫", "⊺", "ℤ", "⊺", "⨗", "⨼", "ё", "į", "𝕚", "ι", "⨼", "¿", "¿", "𝒾", "∈", + "⋹", "⋵", "⋴", "⋳", "∈", "", "ĩ", "і", "ï", "ï", "ĵ", "й", "𝔧", "ȷ", "𝕛", "𝒿", "ј", "є", "κ", + "ϰ", "ķ", "к", "𝔨", "ĸ", "х", "ќ", "𝕜", "𝓀", "⇚", "⇐", "⤛", "⤎", "≦", "⪋", "⥢", "ĺ", "⦴", "ℒ", + "λ", "⟨", "⦑", "⟨", "⪅", "«", "«", "←", "⇤", "⤟", "⤝", "↩", "↫", "⤹", "⥳", "↢", "⪫", "⤙", "⪭", + "⪭︀", "⤌", "❲", "{", "[", "⦋", "⦏", "⦍", "ľ", "ļ", "⌈", "{", "л", "⤶", "“", "„", "⥧", "⥋", "↲", + "≤", "←", "↢", "↽", "↼", "⇇", "↔", "⇆", "⇋", "↭", "⋋", "⋚", "≤", "≦", "⩽", "⩽", "⪨", "⩿", "⪁", 
+ "⪃", "⋚︀", "⪓", "⪅", "⋖", "⋚", "⪋", "≶", "≲", "⥼", "⌊", "𝔩", "≶", "⪑", "↽", "↼", "⥪", "▄", "љ", + "≪", "⇇", "⌞", "⥫", "◺", "ŀ", "⎰", "⎰", "≨", "⪉", "⪉", "⪇", "⪇", "≨", "⋦", "⟬", "⇽", "⟦", "⟵", + "⟷", "⟼", "⟶", "↫", "↬", "⦅", "𝕝", "⨭", "⨴", "∗", "_", "◊", "◊", "⧫", "(", "⦓", "⇆", "⌟", "⇋", + "⥭", "", "⊿", "‹", "𝓁", "↰", "≲", "⪍", "⪏", "[", "‘", "‚", "ł", "<", "<", "⪦", "⩹", "⋖", "⋋", + "⋉", "⥶", "⩻", "⦖", "◃", "⊴", "◂", "⥊", "⥦", "≨︀", "≨︀", "∺", "¯", "¯", "♂", "✠", "✠", "↦", "↦", + "↧", "↤", "↥", "▮", "⨩", "м", "—", "∡", "𝔪", "℧", "µ", "µ", "∣", "*", "⫰", "·", "·", "−", "⊟", + "∸", "⨪", "⫛", "…", "∓", "⊧", "𝕞", "∓", "𝓂", "∾", "μ", "⊸", "⊸", "⋙̸", "≫⃒", "≫̸", "⇍", "⇎", "⋘̸", + "≪⃒", "≪̸", "⇏", "⊯", "⊮", "∇", "ń", "∠⃒", "≉", "⩰̸", "≋̸", "ʼn", "≉", "♮", "♮", "ℕ", " ", " ", "≎̸", + "≏̸", "⩃", "ň", "ņ", "≇", "⩭̸", "⩂", "н", "–", "≠", "⇗", "⤤", "↗", "↗", "≐̸", "≢", "⤨", "≂̸", "∄", + "∄", "𝔫", "≧̸", "≱", "≱", "≧̸", "⩾̸", "⩾̸", "≵", "≯", "≯", "⇎", "↮", "⫲", "∋", "⋼", "⋺", "∋", "њ", + "⇍", "≦̸", "↚", "‥", "≰", "↚", "↮", "≰", "≦̸", "⩽̸", "⩽̸", "≮", "≴", "≮", "⋪", "⋬", "∤", "𝕟", "¬", + "¬", "∉", "⋹̸", "⋵̸", "∉", "⋷", "⋶", "∌", "∌", "⋾", "⋽", "∦", "∦", "⫽⃥", "∂̸", "⨔", "⊀", "⋠", "⪯̸", + "⊀", "⪯̸", "⇏", "↛", "⤳̸", "↝̸", "↛", "⋫", "⋭", "⊁", "⋡", "⪰̸", "𝓃", "∤", "∦", "≁", "≄", "≄", "∤", + "∦", "⋢", "⋣", "⊄", "⫅̸", "⊈", "⊂⃒", "⊈", "⫅̸", "⊁", "⪰̸", "⊅", "⫆̸", "⊉", "⊃⃒", "⊉", "⫆̸", "≹", "ñ", + "ñ", "≸", "⋪", "⋬", "⋫", "⋭", "ν", "#", "№", " ", "⊭", "⤄", "≍⃒", "⊬", "≥⃒", ">⃒", "⧞", "⤂", "≤⃒", + "<⃒", "⊴⃒", "⤃", "⊵⃒", "∼⃒", "⇖", "⤣", "↖", "↖", "⤧", "Ⓢ", "ó", "ó", "⊛", "ô", "ô", "о", "⊝", "ő", + "⨸", "⊙", "⦼", "œ", "⦿", "𝔬", "˛", "ò", "ò", "⧁", "⦵", "Ω", "∮", "↺", "⦾", "⦻", "‾", "⧀", "ō", + "ω", "ο", "⦶", "⊖", "𝕠", "⦷", "⦹", "⊕", "∨", "↻", "º", "ℴ", "ℴ", "ª", "º", "⊶", "⩖", "⩗", "⩛", + "ℴ", "ø", "ø", "⊘", "õ", "õ", "⊗", "⨶", "ö", "ö", "⌽", "¶", "¶", "∥", "⫳", "⫽", "∂", "п", "%", + ".", "‰", "⊥", "‱", "𝔭", "φ", "ϕ", "ℳ", "☎", "π", "⋔", "ϖ", "ℏ", "ℎ", "ℏ", "+", "⨣", "⊞", "⨢", + "∔", "⨥", "⩲", "±", "±", "⨦", "⨧", "±", "⨕", "𝕡", "£", "£", "≺", "⪳", "⪷", "≼", "⪯", "≺", "⪷", + "≼", "⪯", "⪹", "⪵", "⋨", "≾", "′", "ℙ", "⪵", "⪹", "⋨", "∏", "⌮", "⌒", "⌓", "∝", "∝", "≾", "⊰", + "𝓅", "ψ", " ", "𝔮", "⨌", "𝕢", "⁗", "𝓆", "ℍ", "⨖", "?", "≟", "\"", "\"", "⇛", "⇒", "⤜", "⤏", + "⥤", "∽̱", "ŕ", "√", "⦳", "⟩", "⦒", "⦥", "⟩", "»", "»", "→", "⥵", "⇥", "⤠", "⤳", "⤞", "↪", "↬", + "⥅", "⥴", "↣", "↝", "⤚", "∶", "ℚ", "⤍", "❳", "}", "]", "⦌", "⦎", "⦐", "ř", "ŗ", "⌉", "}", "р", + "⤷", "⥩", "”", "”", "↳", "ℜ", "ℛ", "ℜ", "ℝ", "▭", "®", "®", "⥽", "⌋", "𝔯", "⇁", "⇀", "⥬", "ρ", + "ϱ", "→", "↣", "⇁", "⇀", "⇄", "⇌", "⇉", "↝", "⋌", "˚", "≓", "⇄", "⇌", "", "⎱", "⎱", "⫮", "⟭", + "⇾", "⟧", "⦆", "𝕣", "⨮", "⨵", ")", "⦔", "⨒", "⇉", "›", "𝓇", "↱", "]", "’", "’", "⋌", "⋊", "▹", + "⊵", "▸", "⧎", "⥨", "℞", "ś", "‚", "≻", "⪴", "⪸", "š", "≽", "⪰", "ş", "ŝ", "⪶", "⪺", "⋩", "⨓", + "≿", "с", "⋅", "⊡", "⩦", "⇘", "⤥", "↘", "↘", "§", "§", ";", "⤩", "∖", "∖", "✶", "𝔰", "⌢", "♯", + "щ", "ш", "∣", "∥", "\u{AD}", "\u{AD}", "σ", "ς", "ς", "∼", "⩪", "≃", "≃", "⪞", "⪠", "⪝", "⪟", + "≆", "⨤", "⥲", "←", "∖", "⨳", "⧤", "∣", "⌣", "⪪", "⪬", "⪬︀", "ь", "/", "⧄", "⌿", "𝕤", "♠", "♠", + "∥", "⊓", "⊓︀", "⊔", "⊔︀", "⊏", "⊑", "⊏", "⊑", "⊐", "⊒", "⊐", "⊒", "□", "□", "▪", "▪", "→", "𝓈", + "∖", "⌣", "⋆", "☆", "★", "ϵ", "ϕ", "¯", "⊂", "⫅", "⪽", "⊆", "⫃", "⫁", "⫋", "⊊", "⪿", "⥹", "⊂", + "⊆", "⫅", "⊊", "⫋", "⫇", "⫕", "⫓", "≻", "⪸", "≽", "⪰", "⪺", "⪶", "⋩", "≿", "∑", "♪", "⊃", "¹", + "²", "³", "⫆", "⪾", "⫘", "⊇", "⫄", "⟉", "⫗", "⥻", "⫂", "⫌", "⊋", "⫀", "⊃", "⊇", "⫆", "⊋", "⫌", + "⫈", "⫔", 
"⫖", "⇙", "⤦", "↙", "↙", "⤪", "ß", "ß", "⌖", "τ", "⎴", "ť", "ţ", "т", "⃛", "⌕", "𝔱", + "∴", "∴", "θ", "ϑ", "ϑ", "≈", "∼", " ", "≈", "∼", "þ", "þ", "˜", "×", "×", "⊠", "⨱", "⨰", "∭", + "⤨", "⊤", "⌶", "⫱", "𝕥", "⫚", "⤩", "‴", "™", "▵", "▿", "◃", "⊴", "≜", "▹", "⊵", "◬", "≜", "⨺", + "⨹", "⧍", "⨻", "⏢", "𝓉", "ц", "ћ", "ŧ", "≬", "↞", "↠", "⇑", "⥣", "ú", "ú", "↑", "ў", "ŭ", "û", + "û", "у", "⇅", "ű", "⥮", "⥾", "𝔲", "ù", "ù", "↿", "↾", "▀", "⌜", "⌜", "⌏", "◸", "ū", "¨", "¨", + "ų", "𝕦", "↑", "↕", "↿", "↾", "⊎", "υ", "ϒ", "υ", "⇈", "⌝", "⌝", "⌎", "ů", "◹", "𝓊", "⋰", "ũ", + "▵", "▴", "⇈", "ü", "ü", "⦧", "⇕", "⫨", "⫩", "⊨", "⦜", "ϵ", "ϰ", "∅", "ϕ", "ϖ", "∝", "↕", "ϱ", + "ς", "⊊︀", "⫋︀", "⊋︀", "⫌︀", "ϑ", "⊲", "⊳", "в", "⊢", "∨", "⊻", "≚", "⋮", "|", "|", "𝔳", "⊲", "⊂⃒", + "⊃⃒", "𝕧", "∝", "⊳", "𝓋", "⫋︀", "⊊︀", "⫌︀", "⊋︀", "⦚", "ŵ", "⩟", "∧", "≙", "℘", "𝔴", "𝕨", "℘", "≀", + "≀", "𝓌", "⋂", "◯", "⋃", "▽", "𝔵", "⟺", "⟷", "ξ", "⟸", "⟵", "⟼", "⋻", "⨀", "𝕩", "⨁", "⨂", "⟹", + "⟶", "𝓍", "⨆", "⨄", "△", "⋁", "⋀", "ý", "ý", "я", "ŷ", "ы", "¥", "¥", "𝔶", "ї", "𝕪", "𝓎", "ю", + "ÿ", "ÿ", "ź", "ž", "з", "ż", "ℨ", "ζ", "𝔷", "ж", "⇝", "𝕫", "𝓏", "", "", +]; diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs new file mode 100644 index 0000000..7b7962b --- /dev/null +++ b/src/construct/blank_line.rs @@ -0,0 +1,61 @@ +//! Blank lines are a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! blank_line ::= *(' ' '\t') +//! ``` +//! +//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! from another paragraph. +//! In several cases, blank lines are not needed between flow constructs, +//! such as between two headings. +//! Sometimes, whether blank lines are present, changes the behavior of how +//! HTML is rendered, such as whether blank lines are present between list +//! items in a list. +//! More than one blank line is never needed in `CommonMark`. +//! +//! Because blank lines can be empty (line endings are not considered part of +//! it), and events cannot be empty, blank lines are not present as a token. +//! +//! ## References +//! +//! * [`blank-line.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/blank-line.js) +//! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) +//! +//! <!-- To do: link `flow`, `heading`, `list`, `paragraph` --> + +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a blank line. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace), + |_ok| Box::new(after), + )(tokenizer, code) +} + +/// After zero or more spaces or tabs, before a line ending or EOF. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +fn after(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs new file mode 100644 index 0000000..5ea995e --- /dev/null +++ b/src/construct/character_escape.rs @@ -0,0 +1,69 @@ +//! 
Character escapes are a construct that occurs in the string and text
+//! content types.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! character_escape ::= '\\' ascii_punctuation
+//! ```
+//!
+//! Like much of markdown, there are no “invalid” character escapes: just a
+//! backslash, or a backslash followed by anything other than an ASCII
+//! punctuation character, is exactly that: just a backslash.
+//! To escape (most) arbitrary characters, use a
+//! [character reference][] instead
+//! (as in, `&amp;`, `&lcub;`, or say `&#9;`).
+//! It is also possible to escape a line ending in text with a similar
+//! construct: a backslash followed by a line ending (that is part of the
+//! construct instead of ending it).
+//!
+//! ## References
+//!
+//! * [`character-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-escape.js)
+//! * [*§ 2.4 Backslash escapes* in `CommonMark`](https://spec.commonmark.org/0.30/#backslash-escapes)
+//!
+//! [character reference]: crate::construct::character_reference
+//!
+//! <!-- To do: link `hard_break_escape`, `string`, `text` -->
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a character escape.
+///
+/// ```markdown
+/// a|\*b
+/// a|\b
+/// a|\ b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('\\') => {
+            tokenizer.enter(TokenType::CharacterEscape);
+            tokenizer.enter(TokenType::CharacterEscapeMarker);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::CharacterEscapeMarker);
+            (State::Fn(Box::new(inside)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside a character escape, after `\`.
+///
+/// ```markdown
+/// a\|*b
+/// a\|b
+/// a\| b
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char.is_ascii_punctuation() => {
+            tokenizer.enter(TokenType::CharacterEscapeValue);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::CharacterEscapeValue);
+            tokenizer.exit(TokenType::CharacterEscape);
+            (State::Ok, None)
+        }
+        _ => (State::Nok, None),
+    }
+}
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
new file mode 100644
index 0000000..27275d5
--- /dev/null
+++ b/src/construct/character_reference.rs
@@ -0,0 +1,237 @@
+//! Character references are a construct that occurs in the string and text
+//! content types.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! character_reference ::= '&' (numeric | named) ';'
+//!
+//! numeric ::= '#' (hexadecimal | decimal)
+//! ; Note: Limit of `6` imposed as all bigger numbers are invalid:
+//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit)
+//! ; Note: Limit of `7` imposed as all bigger numbers are invalid:
+//! decimal ::= 1*7(ascii_digit)
+//! ; Note: Limit of `31` imposed by `CounterClockwiseContourIntegral`:
+//! ; Note: Limited to any known named character reference (see `constants.rs`)
+//! named ::= 1*31(ascii_alphanumeric)
+//! ```
+//!
+//! Like much of markdown, there are no “invalid” character references.
+//! However, for security reasons, several numeric character references parse
+//! fine but are not rendered as their corresponding character and they are
+//! instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`).
+//! See [`decode_numeric_character_reference`][decode_numeric] for more info.
+//!
+//! To escape ASCII punctuation characters, use the terser
+//! [character escape][character_escape] construct instead (as in, `\&`).
+//!
+//! Character references in markdown are not the same as character references
+//! in HTML.
+//! Notably, HTML allows several character references without a closing
+//! semicolon.
+//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info.
+//!
+//! Character references are parsed insensitive to casing.
+//! The casing of hexadecimal numeric character references has no effect.
+//! The casing of named character references does not matter when parsing them,
+//! but does affect whether they match.
+//! Depending on the name, one or more cases are allowed, such as that `AMP`
+//! and `amp` are both allowed but other cases are not.
+//! See [`CHARACTER_REFERENCE_NAMES`][character_reference_names] for which
+//! names match.
+//!
+//! ## References
+//!
+//! * [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js)
+//! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+//!
+//! [character_escape]: crate::construct::character_escape
+//! [decode_numeric]: crate::util::decode_numeric_character_reference
+//! [character_reference_names]: crate::constant::CHARACTER_REFERENCE_NAMES
+//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
+//!
+//! <!-- To do: link `string`, `text` -->
+
+use crate::constant::{
+    CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
+    CHARACTER_REFERENCE_NAMED_SIZE_MAX, CHARACTER_REFERENCE_NAMES,
+};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of a character reference.
+#[derive(Debug, Clone)]
+pub enum Kind {
+    /// Numeric decimal character reference (`&#9;`).
+    Decimal,
+    /// Numeric hexadecimal character reference (`&#x7b;`).
+    Hexadecimal,
+    /// Named character reference (`&amp;`).
+    Named,
+}
+
+/// State needed to parse character references.
+#[derive(Debug, Clone)]
+struct Info {
+    /// All parsed characters.
+    buffer: Vec<char>,
+    /// Kind of character reference.
+    kind: Kind,
+}
+
+/// Start of a character reference.
+///
+/// ```markdown
+/// a|&amp;b
+/// a|&#123;b
+/// a|&#x9;b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('&') => {
+            tokenizer.enter(TokenType::CharacterReference);
+            tokenizer.enter(TokenType::CharacterReferenceMarker);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::CharacterReferenceMarker);
+            (State::Fn(Box::new(open)), None)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside a character reference, after `&`, before `#` for numeric references
+/// or an alphanumeric for named references.
+///
+/// ```markdown
+/// a&|amp;b
+/// a&|#123;b
+/// a&|#x9;b
+/// ```
+fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char('#') = code {
+        tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric);
+        tokenizer.consume(code);
+        tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric);
+        (State::Fn(Box::new(numeric)), None)
+    } else {
+        tokenizer.enter(TokenType::CharacterReferenceValue);
+        value(
+            tokenizer,
+            code,
+            Info {
+                buffer: vec![],
+                kind: Kind::Named,
+            },
+        )
+    }
+}
+
+/// Inside a numeric character reference, right before `x` for hexadecimals,
+/// or a digit for decimals.
+///
+/// ```markdown
+/// a&#|123;b
+/// a&#|x9;b
+/// ```
+fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == 'x' || char == 'X' => {
+            tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal);
+            tokenizer.enter(TokenType::CharacterReferenceValue);
+
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    value(
+                        tokenizer,
+                        code,
+                        Info {
+                            buffer: vec![],
+                            kind: Kind::Hexadecimal,
+                        },
+                    )
+                })),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.enter(TokenType::CharacterReferenceValue);
+
+            value(
+                tokenizer,
+                code,
+                Info {
+                    buffer: vec![],
+                    kind: Kind::Decimal,
+                },
+            )
+        }
+    }
+}
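The security note in the module docs (numeric references that parse but render as U+FFFD) can be illustrated with a minimal sketch. This `decode_numeric` is a simplified stand-in for the crate’s `decode_numeric_character_reference` utility and elides whatever extra rules that utility applies:

```rust
/// Simplified sketch: decode the digits of a numeric character reference,
/// mapping NULL, surrogates, and out-of-range values to U+FFFD REPLACEMENT
/// CHARACTER, as the module docs describe.
fn decode_numeric(value: &str, radix: u32) -> char {
    u32::from_str_radix(value, radix)
        .ok()
        .filter(|&code| code != 0)
        .and_then(char::from_u32)
        .unwrap_or('\u{FFFD}')
}

fn main() {
    assert_eq!(decode_numeric("123", 10), '{'); // `&#123;`
    assert_eq!(decode_numeric("9", 16), '\t'); // `&#x9;`
    assert_eq!(decode_numeric("0", 10), '\u{FFFD}'); // disallowed: NULL
    assert_eq!(decode_numeric("D800", 16), '\u{FFFD}'); // disallowed: surrogate
}
```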
+/// Inside a character reference value, after the markers (`&#x`, `&#`, or
+/// `&`) that define its kind, but before the `;`.
+/// The character reference kind defines what and how many characters are
+/// allowed.
+///
+/// ```markdown
+/// a&a|mp;b
+/// a&#1|23;b
+/// a&#x|9;b
+/// ```
+fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
+    match code {
+        Code::Char(';') if !info.buffer.is_empty() => {
+            tokenizer.exit(TokenType::CharacterReferenceValue);
+            let value = info.buffer.iter().collect::<String>();
+
+            if let Kind::Named = info.kind {
+                if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) {
+                    return (State::Nok, Some(vec![code]));
+                }
+            }
+
+            tokenizer.enter(TokenType::CharacterReferenceMarkerSemi);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::CharacterReferenceMarkerSemi);
+            tokenizer.exit(TokenType::CharacterReference);
+            (State::Ok, None)
+        }
+        Code::Char(char) => {
+            let len = info.buffer.len();
+
+            let cont = match info.kind {
+                Kind::Hexadecimal
+                    if char.is_ascii_hexdigit()
+                        && len < CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX =>
+                {
+                    true
+                }
+                Kind::Decimal
+                    if char.is_ascii_digit() && len < CHARACTER_REFERENCE_DECIMAL_SIZE_MAX =>
+                {
+                    true
+                }
+                Kind::Named
+                    if char.is_ascii_alphanumeric() && len < CHARACTER_REFERENCE_NAMED_SIZE_MAX =>
+                {
+                    true
+                }
+                _ => false,
+            };
+
+            if cont {
+                let mut clone = info;
+                clone.buffer.push(char);
+                tokenizer.consume(code);
+                (
+                    State::Fn(Box::new(|tokenizer, code| value(tokenizer, code, clone))),
+                    None,
+                )
+            } else {
+                (State::Nok, None)
+            }
+        }
+        _ => (State::Nok, None),
+    }
+}
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
new file mode 100644
index 0000000..2068a62
--- /dev/null
+++ b/src/construct/code_fenced.rs
@@ -0,0 +1,581 @@
+//! Code (fenced) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ]
+//!
+//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab
+//! ; Restriction: the number of markers in the closing fence sequence must be
+//! ; equal to or greater than the number of markers in the opening fence
+//! ; sequence.
+//! ; Restriction: the marker in the closing fence sequence must match the
+//! ; marker in the opening fence sequence
+//! fence_close ::= sequence *space_or_tab
+//! sequence ::= 3*'`' | 3*'~'
+//! info ::= 1*text
+//! meta ::= 1*text *( *space_or_tab 1*text )
+//!
+//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the
+//! ; marker of the opening fence sequence.
+//! text ::= code - eol - space_or_tab
+//! eol ::= '\r' | '\r\n' | '\n'
+//! space_or_tab ::= ' ' | '\t'
+//! code ::= . ; any unicode code point (other than line endings).
+//! ```
+//!
+//! The above grammar does not show how whitespace is handled.
+//! To parse code (fenced), let `X` be the number of whitespace characters
+//! before the opening fence sequence.
+//! Each line of content is then allowed (not required) to be indented with up
+//! to `X` spaces or tabs, which are then ignored as an indent instead of being
+//! considered as part of the code.
+//! This indent does not affect the closing fence.
+//! It can itself be indented up to 3 spaces or tabs, independent of that
+//! indent.
+//! A bigger indent makes it part of the code instead of a fence.
+//!
+//! Code (fenced) relates to both the `<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! The optional `meta` part is ignored: it is not used when parsing or
+//! rendering.
+//! The optional `info` part is used and is expected to specify the programming
+//! language that the code is in.
+//! Which value it holds depends on what your syntax highlighter supports, if
+//! one is used.
+//! The `info` is, when rendering to HTML, typically exposed as a class.
+//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
+//! element*][html-code]).
+//! For example:
+//!
+//! ```markdown
+//! ~~~css
+//! * { color: tomato }
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code class="language-css">* { color: tomato }
+//! </code></pre>
+//! ```
+//!
+//! The `info` and `meta` parts are interpreted as the string content type.
+//! That means that character escapes and character references are allowed.
+//!
+//! In markdown, it is also possible to use code (text) in the text content
+//! type.
+//! It is also possible to create code with the
+//! [code (indented)][code-indented] construct.
+//! That construct is less explicit, different from code (text), and has no
+//! support for specifying the programming language, so it is recommended to
+//! use code (fenced) instead of code (indented).
+//!
+//! ## References
+//!
+//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js)
+//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks)
+//!
+//! [code-indented]: crate::construct::code_indented
+//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+//!
+//! <!-- To do: link `flow`, `text`, `code_text`, `string` -->
+
+use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::get_span;
+
+/// Kind of fences.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Kind {
+    /// Grave accent (tick) code.
+    GraveAccent,
+    /// Tilde code.
+    Tilde,
+}
+
+/// State needed to parse code (fenced).
+#[derive(Debug, Clone)]
+struct Info {
+    /// Number of markers on the opening fence sequence.
+    size: usize,
+    /// Number of tabs or spaces of indentation before the opening fence
+    /// sequence.
+    prefix: usize,
+    /// Kind of fences.
+    kind: Kind,
+}
+
+/// Start of fenced code.
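Read as a whole, the grammar above condenses into a small line classifier. The following standalone sketch (not the tokenizer below; it assumes a single line and space-only indentation for brevity) shows the opening-fence rules:

```rust
/// Standalone sketch of the opening-fence rules: 3+ identical markers after
/// at most 3 spaces of indent, and no `` ` `` in the rest of the line when
/// the marker is a grave accent.
fn opening_fence(line: &str) -> Option<(char, usize, &str)> {
    let trimmed = line.trim_start_matches(' ');
    // An opening fence may be preceded by at most 3 spaces of indent.
    if line.len() - trimmed.len() > 3 {
        return None;
    }
    let marker = trimmed.chars().next().filter(|c| *c == '`' || *c == '~')?;
    let size = trimmed.chars().take_while(|&c| c == marker).count();
    // The sequence needs at least `CODE_FENCED_SEQUENCE_SIZE_MIN` (3) markers.
    if size < 3 {
        return None;
    }
    let rest = trimmed[size..].trim();
    // Restriction: no `` ` `` in the info/meta of a grave accent fence.
    if marker == '`' && rest.contains('`') {
        return None;
    }
    Some((marker, size, rest))
}
```

For example, `opening_fence("~~~css")` yields `Some(('~', 3, "css"))`, where `"css"` is the info (anything after a following space would be meta).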
+/// +/// ```markdown +/// | ~~~js +/// console.log(1); +/// ~~~ +/// ``` +/// +/// Parsing note: normally, the prefix is already stripped. +/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need +/// it. +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFenced); + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(before_sequence_open), + )(tokenizer, code) +} + +/// Inside the opening fence, after an optional prefix, before a sequence. +/// +/// ```markdown +/// |~~~js +/// console.log(1); +/// ~~~ +/// ``` +fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + match code { + Code::Char(char) if char == '`' || char == '~' => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + sequence_open( + tokenizer, + Info { + prefix, + size: 0, + kind: if char == '`' { + Kind::GraveAccent + } else { + Kind::Tilde + }, + }, + code, + ) + } + _ => (State::Nok, None), + } +} + +/// Inside the opening fence sequence. +/// +/// ```markdown +/// ~|~~js +/// console.log(1); +/// ~~~ +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + let mut info = info; + info.size += 1; + sequence_open(tokenizer, info, code) + })), + None, + ) + } + _ => { + if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN { + (State::Nok, None) + } else { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| { + whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace) + }, + |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)), + )(tokenizer, code) + } + } + } +} + +/// Inside the opening fence, after the sequence (and optional whitespace), before the info. +/// +/// ```markdown +/// ~~~|js +/// console.log(1); +/// ~~~ +/// ``` +fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceInfo); + tokenizer.enter(TokenType::ChunkString); + info_inside(tokenizer, info, code, vec![]) + } + } +} + +/// Inside the opening fence info. 
+/// +/// ```markdown +/// ~~~j|s +/// console.log(1); +/// ~~~ +/// ``` +fn info_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, + codes: Vec<Code>, +) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)), + )(tokenizer, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + Code::Char(_) => { + let mut codes = codes; + codes.push(code); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + info_inside(tokenizer, info, code, codes) + })), + None, + ) + } + } +} + +/// Inside the opening fence, after the info and whitespace, before the meta. +/// +/// ```markdown +/// ~~~js |eval +/// console.log(1); +/// ~~~ +/// ``` +fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceMeta); + tokenizer.enter(TokenType::ChunkString); + meta(tokenizer, info, code) + } + } +} + +/// Inside the opening fence meta. +/// +/// ```markdown +/// ~~~js e|val +/// console.log(1); +/// ~~~ +/// ``` +fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceMeta); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))), + None, + ) + } + } +} + +/// At an eol/eof in code, before a closing fence or before content. +/// +/// ```markdown +/// ~~~js| +/// aa| +/// ~~~ +/// ``` +fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let clone = info.clone(); + + match code { + Code::None => after(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt( + |tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + close_before(tokenizer, info, code) + })), + None, + ) + }, + |ok| { + if ok { + Box::new(after) + } else { + Box::new(|tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + content_start(tokenizer, clone, code) + })), + None, + ) + }) + } + }, + )(tokenizer, code), + _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), + } +} + +/// Before a closing fence, before optional whitespace. 
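For contrast with the opening fence, the closing-fence restrictions that the next few states enforce can be sketched as one predicate (again a simplification: one line at a time, space-only indent):

```rust
/// Sketch of the closing-fence restrictions: same marker as the opening
/// fence, at least as many of them, indented less than `TAB_SIZE` (4), and
/// nothing but optional whitespace after the sequence.
fn is_closing_fence(line: &str, marker: char, opening_size: usize) -> bool {
    let trimmed = line.trim_start_matches(' ');
    let size = trimmed.chars().take_while(|&c| c == marker).count();
    line.len() - trimmed.len() < 4 && size >= opening_size && trimmed[size..].trim().is_empty()
}
```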
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// | ~~~ +/// ``` +fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)), + )(tokenizer, code) +} + +/// In a closing fence, after optional whitespace, before sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// |~~~ +/// ``` +fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == marker => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + close_sequence(tokenizer, info, code, 0) + } + _ => (State::Nok, None), + } +} + +/// In the closing fence sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~|~~ +/// ``` +fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + close_sequence(tokenizer, info, code, size + 1) + })), + None, + ) + } + _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(close_whitespace_after), + )(tokenizer, code) + } + _ => (State::Nok, None), + } +} + +/// After the closing fence sequence after optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~ | +/// ``` +fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} + +/// Before code content, definitely not before a closing fence. +/// +/// ```markdown +/// ~~~js +/// |aa +/// ~~~ +/// ``` +fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => { + tokenizer.enter(TokenType::Whitespace); + content_prefix(tokenizer, info, 0, code) + } + _ => { + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// Before code content, in a prefix. 
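The prefix handling in the next two states amounts to stripping up to `info.prefix` leading spaces or tabs from each content line; roughly, as a hypothetical standalone helper:

```rust
/// Sketch: strip up to `prefix` leading spaces/tabs from a content line,
/// keeping any deeper indentation as part of the code.
fn strip_content_prefix(line: &str, prefix: usize) -> &str {
    let mut stripped = 0;
    for (index, ch) in line.char_indices() {
        if stripped == prefix || (ch != ' ' && ch != '\t') {
            return &line[index..];
        }
        stripped += 1;
    }
    // The line was nothing but a prefix-sized (or shorter) indent.
    ""
}
```

With a fence indented by 2 spaces, `strip_content_prefix("    a", 2)` keeps `"  a"`: two spaces are consumed as indent, the rest stays code.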
+/// +/// ```markdown +/// ~~~js +/// | aa +/// ~~~ +/// ``` +fn content_prefix( + tokenizer: &mut Tokenizer, + info: Info, + prefix: usize, + code: Code, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + content_prefix(tokenizer, info, prefix + 1, code) + })), + None, + ) + } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::Whitespace); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.exit(TokenType::Whitespace); + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// In code content. +/// +/// ```markdown +/// ~~~js +/// |ab +/// a|b +/// ab| +/// ~~~ +/// ``` +fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFlowChunk); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + content_continue(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// After fenced code. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::CodeFenced); + (State::Ok, Some(vec![code])) +} diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs new file mode 100644 index 0000000..6bf089b --- /dev/null +++ b/src/construct/code_indented.rs @@ -0,0 +1,190 @@ +//! Code (indented) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line ) +//! +//! ; Restriction: at least one `code` must not be whitespace. +//! indented_filled_line ::= 4space_or_tab *code +//! blank_line ::= *space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Code (indented) relates to both the `<pre>` and the `<code>` elements in +//! HTML. +//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` +//! element*][html-code] in the HTML spec for more info. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! It is also possible to create code with the [code (fenced)][code-fenced] +//! construct. +//! That construct is more explicit, more similar to code (text), and has +//! support for specifying the programming language that the code is in, so it +//! is recommended to use that instead of indented code. +//! +//! ## References +//! +//! * [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js) +//! * [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks) +//! +//! [code-fenced]: crate::construct::code_fenced +//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! 
<!-- To do: link `flow`, `code_text` --> + +use crate::constant::TAB_SIZE; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of code (indented). +/// +/// ```markdown +/// | asd +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char(' ' | '\t') => { + tokenizer.enter(TokenType::CodeIndented); + tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace); + indent(tokenizer, code, 0) + } + _ => (State::Nok, None), + } +} + +/// Inside the initial whitespace. +/// +/// ```markdown +/// | asd +/// | asd +/// | asd +/// |asd +/// ``` +/// +/// > **Parsing note**: it is not needed to check if this first line is a +/// > filled line (that it has a non-whitespace character), because blank lines +/// > are parsed already, so we never run into that. +fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { + match code { + _ if size == TAB_SIZE => { + tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); + at_break(tokenizer, code) + } + Code::VirtualSpace | Code::Char(' ' | '\t') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + indent(tokenizer, code, size + 1) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// At a break. +/// +/// ```markdown +/// |asd +/// asd| +/// ``` +fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => after(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer + .attempt(further_start, |ok| { + Box::new(if ok { at_break } else { after }) + })(tokenizer, code), + _ => { + tokenizer.enter(TokenType::CodeFlowChunk); + content(tokenizer, code) + } + } +} + +/// Inside code content. +/// +/// ```markdown +/// |ab +/// a|b +/// ab| +/// ``` +fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFlowChunk); + at_break(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(content)), None) + } + } +} + +/// After indented code. +/// +/// ```markdown +/// ab| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::CodeIndented); + (State::Ok, Some(vec![code])) +} + +/// Right at a line ending, trying to parse another indent. +/// +/// ```markdown +/// ab| +/// cd +/// ``` +fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + // To do: `nok` if lazy line. + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(further_start)), None) + } + Code::VirtualSpace | Code::Char(' ' | '\t') => { + tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace); + further_indent(tokenizer, code, 0) + } + _ => (State::Nok, None), + } +} + +/// Inside further whitespace. 
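Stripped to its essence, the indent states in this file ask one question per line. A naive sketch (treating a tab as a single unit, whereas the real tokenizer expands tabs into virtual spaces):

```rust
/// Naive sketch: does this line continue indented code? It must carry
/// `TAB_SIZE` (4) units of indent and not be blank; blank lines in between
/// are allowed but handled separately.
fn is_indented_code_line(line: &str) -> bool {
    let indent = line.chars().take_while(|&c| c == ' ' || c == '\t').count();
    indent >= 4 && !line.trim().is_empty()
}
```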
+/// +/// ```markdown +/// asd +/// | asd +/// ``` +fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { + match code { + _ if size == TAB_SIZE => { + tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); + (State::Ok, Some(vec![code])) + } + Code::VirtualSpace | Code::Char(' ' | '\t') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + further_indent(tokenizer, code, size + 1) + })), + None, + ) + } + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); + further_start(tokenizer, code) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs new file mode 100644 index 0000000..b3aef1b --- /dev/null +++ b/src/construct/heading_atx.rs @@ -0,0 +1,175 @@ +//! Heading (atx) is a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab +//! +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! `CommonMark` introduced the requirement on whitespace existing after the +//! opening sequence and before text. +//! In older markdown versions, this was not required, and headings would form +//! without it. +//! +//! In markdown, it is also possible to create headings with the setext heading +//! construct. +//! The benefit of setext headings is that their text can include line endings. +//! However, their limit is that they cannot form `<h3>` through `<h6>` +//! headings. +//! Due to this limitation, it is recommended to use atx headings. +//! +//! > 🏛 **Background**: the word *setext* originates from a small markup +//! > language by Ian Feldman from 1991. +//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > The word *atx* originates from a tiny markup language by Aaron Swartz +//! > from 2002. +//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for +//! > more info. +//! +//! ## References +//! +//! * [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js) +//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings) +//! +//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [atx]: http://www.aaronsw.com/2002/atx/ +//! +//! <!-- To do: link `flow`, `setext` --> + +use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a heading (atx). +/// +/// ```markdown +/// |## alpha +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if Code::Char('#') == code { + tokenizer.enter(TokenType::AtxHeading); + tokenizer.enter(TokenType::AtxHeadingSequence); + sequence_open(tokenizer, code, 0) + } else { + (State::Nok, None) + } +} + +/// In the opening sequence. 
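The opening-sequence rule that `start` and `sequence_open` implement, 1 to 6 number signs followed by whitespace or the end of the line, fits in one standalone sketch (a simplification: no handling of the up-to-3-spaces indent):

```rust
/// Standalone sketch: the rank (1-6) of an atx opening sequence, or `None`
/// when the line does not open a heading (too many `#`, or no whitespace
/// after the sequence).
fn atx_rank(line: &str) -> Option<usize> {
    let rank = line.chars().take_while(|&c| c == '#').count();
    let rest = &line[rank..];
    if (1..=6).contains(&rank) && (rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t')) {
        Some(rank)
    } else {
        None
    }
}
```

So `atx_rank("## alpha")` is `Some(2)`, while `atx_rank("#######")` and `atx_rank("#hi")` are `None`.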
+///
+/// ```markdown
+/// #|# alpha
+/// ```
+fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult {
+    match code {
+        Code::None
+        | Code::CarriageReturnLineFeed
+        | Code::VirtualSpace
+        | Code::Char('\t' | '\n' | '\r' | ' ')
+            if rank > 0 =>
+        {
+            tokenizer.exit(TokenType::AtxHeadingSequence);
+            at_break(tokenizer, code)
+        }
+        Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    sequence_open(tokenizer, code, rank + 1)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After something but before something else.
+///
+/// ```markdown
+/// ## |alpha
+/// ## alpha| bravo
+/// ## alpha |bravo
+/// ## alpha bravo|##
+/// ## alpha bravo ##|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::AtxHeading);
+            (State::Ok, Some(vec![code]))
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.enter(TokenType::AtxHeadingWhitespace);
+            whitespace(tokenizer, code)
+        }
+        Code::Char('#') => {
+            tokenizer.enter(TokenType::AtxHeadingSequence);
+            further_sequence(tokenizer, code)
+        }
+        Code::Char(_) => {
+            tokenizer.enter(TokenType::AtxHeadingText);
+            data(tokenizer, code)
+        }
+    }
+}
+
+/// In a further sequence (after whitespace).
+/// Could be normal “visible” hashes in the heading or a final sequence.
+///
+/// ```markdown
+/// ## alpha #|#
+/// ```
+fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if let Code::Char('#') = code {
+        tokenizer.consume(code);
+        (State::Fn(Box::new(further_sequence)), None)
+    } else {
+        tokenizer.exit(TokenType::AtxHeadingSequence);
+        at_break(tokenizer, code)
+    }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// ## alpha | bravo
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(whitespace)), None)
+        }
+        _ => {
+            tokenizer.exit(TokenType::AtxHeadingWhitespace);
+            at_break(tokenizer, code)
+        }
+    }
+}
+
+/// In text.
+///
+/// ```markdown
+/// ## al|pha
+/// ```
+fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
+            tokenizer.exit(TokenType::AtxHeadingText);
+            at_break(tokenizer, code)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(data)), None)
+        }
+    }
+}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
new file mode 100644
index 0000000..b7d5570
--- /dev/null
+++ b/src/construct/html_flow.rs
@@ -0,0 +1,1068 @@
+//! HTML (flow) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete
+//!
+//! ; Note: the closing tag name needs to match the opening tag name.
+//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ]
+//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ]
+//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ]
+//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ]
+//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ]
+//! basic ::= '<' [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ]
+//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional )
+//!
+//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive.
+//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive.
+//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>'
+//! closing_tag ::= '</' tag_name whitespace_optional '>'
+//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric )
+//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ]
+//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric )
+//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`')
+//!
+//! whitespace ::= 1*space_or_tab
+//! whitespace_optional ::= [ space_or_tab ]
+//! line ::= code - eol
+//! eol ::= '\r' | '\r\n' | '\n'
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! The grammar for HTML in markdown does not resemble the rules of parsing
+//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
+//! spec][html-parsing].
+//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?)
+//! attempt to parse an XML-like language.
+//! By extension, another notable property of the grammar is that it can
+//! result in invalid HTML, in that it allows things that wouldn’t work or
+//! wouldn’t work well in HTML, such as mismatched tags.
+//!
+//! Because the **basic** and **complete** productions in the grammar form with
+//! a tag, followed by more stuff, and stop at a blank line, it is possible to
+//! interleave (a word for switching between languages) markdown and HTML
+//! together, by placing the opening and closing tags on their own lines,
+//! with blank lines between them and markdown.
+//! For example:
+//!
+//! ```markdown
+//! <div>This is a <code>div</code> but *this* is not emphasis.</div>
+//!
+//! <div>
+//!
+//! This is a paragraph in a `div` and *this* is emphasis.
+//!
+//! </div>
+//! ```
+//!
+//! The **complete** production of HTML (flow) is not allowed to interrupt
+//! content.
+//! That means that a blank line is needed between a paragraph and it.
+//! However, HTML (text) has a similar production, which will typically kick
+//! in instead.
+//!
+//! The list of tag names allowed in the **raw** production is defined in
+//! [`HTML_RAW_NAMES`][html_raw_names].
+//! This production exists because there are a few cases where markdown
+//! *inside* some elements, and hence interleaving, does not make sense.
+//!
+//! The list of tag names allowed in the **basic** production is defined in
+//! [`HTML_BLOCK_NAMES`][html_block_names].
+//! This production exists because there are a few cases where we can decide
+//! early that something is going to be a flow (block) element instead of a
+//! phrasing (inline) element.
+//! We *can* interrupt and don’t have to care too much about it being
+//! well-formed.
+//!
+//! ## References
+//!
+//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js)
+//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+//!
+//! [html_raw_names]: crate::constant::HTML_RAW_NAMES
+//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
+//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
+//!
+//! <!-- To do: link stuff -->
+
+use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX};
+use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of HTML (flow).
+#[derive(Debug, Clone, PartialEq)]
+enum Kind {
+    /// Not yet known.
+    Unknown,
+    /// Symbol for `<script>` (condition 1).
+    Raw,
+    /// Symbol for `<!---->` (condition 2).
+    Comment,
+    /// Symbol for `<?php?>` (condition 3).
+    Instruction,
+    /// Symbol for `<!doctype>` (condition 4).
+    Declaration,
+    /// Symbol for `<![CDATA[]]>` (condition 5).
+    Cdata,
+    /// Symbol for `<div` (condition 6).
+    Basic,
+    /// Symbol for `<x>` (condition 7).
+    Complete,
+}
+
+/// Type of quote, if we’re in an attribute, in complete (condition 7).
+#[derive(Debug, Clone, PartialEq)]
+enum QuoteKind {
+    /// Not in a quoted attribute.
+    None,
+    /// In a double quoted (`"`) attribute.
+    Double,
+    /// In a single quoted (`'`) attribute.
+    Single,
+}
+
+/// State needed to parse HTML (flow).
+#[derive(Debug, Clone)]
+struct Info {
+    /// Kind of HTML (flow).
+    kind: Kind,
+    /// Whether this is a start tag (`<` not followed by `/`).
+    start_tag: bool,
+    /// Used depending on `kind` to either collect all parsed characters, or to
+    /// store expected characters.
+    buffer: Vec<char>,
+    /// `index` into `buffer` when expecting certain characters.
+    index: usize,
+    /// Current quote, when in a double or single quoted attribute value.
+    quote: QuoteKind,
+}
+
+// To do: mark as concrete (block quotes or lists can’t “pierce” into HTML).
+
+/// Start of HTML (flow), before optional whitespace.
+///
+/// ```markdown
+/// |<x />
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.enter(TokenType::HtmlFlow);
+    tokenizer.enter(TokenType::HtmlFlowData);
+    tokenizer.attempt(
+        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+        |_ok| Box::new(before),
+    )(tokenizer, code)
+}
+
+/// After optional whitespace, before `<`.
+///
+/// ```markdown
+/// |<x />
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    if Code::Char('<') == code {
+        tokenizer.consume(code);
+        (
+            State::Fn(Box::new(|tokenizer, code| {
+                open(
+                    tokenizer,
+                    Info {
+                        kind: Kind::Unknown,
+                        start_tag: false,
+                        buffer: vec![],
+                        index: 0,
+                        quote: QuoteKind::None,
+                    },
+                    code,
+                )
+            })),
+            None,
+        )
+    } else {
+        (State::Nok, None)
+    }
+}
+
+/// After `<`, before a tag name or other stuff.
+///
+/// ```markdown
+/// <|x />
+/// <|!doctype />
+/// <|!--xxx--/>
+/// ```
+fn open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('!') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    declaration_start(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('/') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    tag_close_start(tokenizer, info, code)
+                })),
+                None,
+            )
+        }
+        Code::Char('?') => {
+            // To do: life times.
+            let mut clone = info;
+            clone.kind = Kind::Instruction;
+            tokenizer.consume(code);
+            // While we’re in an instruction instead of a declaration, we’re on a `?`
+            // right now, so we do need to search for `>`, similar to declarations.
+ ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, clone, code) + })), + None, + ) + } + Code::Char(char) if char.is_ascii_alphabetic() => { + // To do: life times. + let mut clone = info; + clone.start_tag = true; + tag_name(tokenizer, clone, code) + } + _ => (State::Nok, None), + } +} + +/// After `<!`, so inside a declaration, comment, or CDATA. +/// +/// ```markdown +/// <!|doctype /> +/// <!|--xxx--/> +/// <!|[CDATA[>&<]]> +/// ``` +fn declaration_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') => { + tokenizer.consume(code); + let mut clone = info; + clone.kind = Kind::Comment; + ( + State::Fn(Box::new(|tokenizer, code| { + comment_open_inside(tokenizer, clone, code) + })), + None, + ) + } + Code::Char('[') => { + tokenizer.consume(code); + let mut clone = info; + clone.kind = Kind::Cdata; + clone.buffer = vec!['C', 'D', 'A', 'T', 'A', '[']; + clone.index = 0; + ( + State::Fn(Box::new(|tokenizer, code| { + cdata_open_inside(tokenizer, clone, code) + })), + None, + ) + } + Code::Char(char) if char.is_ascii_alphabetic() => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.kind = Kind::Declaration; + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, clone, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After `<!-`, inside a comment, before another `-`. +/// +/// ```markdown +/// <!-|-xxx--/> +/// ``` +fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After `<![`, inside CDATA, expecting `CDATA[`. +/// +/// ```markdown +/// <![|CDATA[>&<]]> +/// <![CD|ATA[>&<]]> +/// <![CDA|TA[>&<]]> +/// <![CDAT|A[>&<]]> +/// <![CDATA|[>&<]]> +/// ``` +fn cdata_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char == info.buffer[info.index] => { + let mut clone = info; + clone.index += 1; + tokenizer.consume(code); + + if clone.index == clone.buffer.len() { + clone.buffer.clear(); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation(tokenizer, clone, code) + })), + None, + ) + } else { + ( + State::Fn(Box::new(|tokenizer, code| { + cdata_open_inside(tokenizer, clone, code) + })), + None, + ) + } + } + _ => (State::Nok, None), + } +} + +/// After `</`, in a closing tag, before a tag name. +/// +/// ```markdown +/// </|x> +/// ``` +fn tag_close_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char.is_ascii_alphabetic() => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.buffer.push(char); + ( + State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// In a tag name. 
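+/// Tag names are matched case-insensitively: the buffer is lowercased before
+/// it is compared against the raw and basic name lists.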
+/// +/// ```markdown +/// <a|b> +/// </a|b> +/// ``` +fn tag_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => { + let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); + let name = tag_name_buffer.as_str(); + let slash = if let Code::Char(char) = code { + char == '/' + } else { + false + }; + + if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) { + // To do: life times. + let mut clone = info; + clone.kind = Kind::Raw; + clone.buffer.clear(); + continuation(tokenizer, clone, code) + } else if HTML_BLOCK_NAMES.contains(&name) { + // To do: life times. + let mut clone = info; + clone.kind = Kind::Basic; + clone.buffer.clear(); + + if slash { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + basic_self_closing(tokenizer, clone, code) + })), + None, + ) + } else { + continuation(tokenizer, clone, code) + } + } else { + // To do: life times. + let mut clone = info; + clone.kind = Kind::Complete; + + // To do: do not support complete HTML when interrupting. + if clone.start_tag { + complete_attribute_name_before(tokenizer, clone, code) + } else { + complete_closing_tag_after(tokenizer, clone, code) + } + } + } + Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { + tokenizer.consume(code); + let mut clone = info; + clone.buffer.push(char); + ( + State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))), + None, + ) + } + Code::Char(_) => (State::Nok, None), + } +} + +/// After a closing slash of a basic tag name. +/// +/// ```markdown +/// <div/|> +/// ``` +fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation(tokenizer, info, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After a closing slash of a complete tag name. +/// +/// ```markdown +/// <x/|> +/// </x/|> +/// ``` +fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_closing_tag_after(tokenizer, info, code) + })), + None, + ) + } + _ => complete_end(tokenizer, info, code), + } +} + +/// At a place where an attribute name would be valid. +/// +/// At first, this state is used after a complete tag name, after whitespace, +/// where it expects optional attributes or the end of the tag. +/// It is also reused after attributes, when expecting more optional +/// attributes. 
+/// +/// ```markdown +/// <x |/> +/// <x |:asd> +/// <x |_asd> +/// <x |asd> +/// <x | > +/// <x |> +/// ``` +fn complete_attribute_name_before( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char('/') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_end(tokenizer, info, code) + })), + None, + ) + } + Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name(tokenizer, info, code) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name_before(tokenizer, info, code) + })), + None, + ) + } + _ => complete_end(tokenizer, info, code), + } +} + +/// In an attribute name. +/// +/// ```markdown +/// <x :|> +/// <x _|> +/// <x a|> +/// ``` +fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char(char) + if char == '-' + || char == '.' + || char == ':' + || char == '_' + || char.is_ascii_alphanumeric() => + { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name(tokenizer, info, code) + })), + None, + ) + } + _ => complete_attribute_name_after(tokenizer, info, code), + } +} + +/// After an attribute name, before an attribute initializer, the end of the +/// tag, or whitespace. +/// +/// ```markdown +/// <x a|> +/// <x a|=b> +/// <x a|="c"> +/// ``` +fn complete_attribute_name_after( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char('=') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_before(tokenizer, info, code) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_name_after(tokenizer, info, code) + })), + None, + ) + } + _ => complete_attribute_name_before(tokenizer, info, code), + } +} + +/// Before an unquoted, double quoted, or single quoted attribute value, +/// allowing whitespace. +/// +/// ```markdown +/// <x a=|b> +/// <x a=|"c"> +/// ``` +fn complete_attribute_value_before( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None), + Code::Char(char) if char == '"' || char == '\'' => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.quote = if char == '"' { + QuoteKind::Double + } else { + QuoteKind::Single + }; + + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_quoted(tokenizer, clone, code) + })), + None, + ) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_before(tokenizer, info, code) + })), + None, + ) + } + _ => complete_attribute_value_unquoted(tokenizer, info, code), + } +} + +/// In a double or single quoted attribute value. 
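+/// The value runs until the matching closing marker; an eol or eof before
+/// that marker means this is not a valid complete tag.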
+/// +/// ```markdown +/// <x a="|"> +/// <x a='|'> +/// ``` +fn complete_attribute_value_quoted( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + let marker = if info.quote == QuoteKind::Double { + '"' + } else { + '\'' + }; + + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_quoted_after(tokenizer, info, code) + })), + None, + ) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_quoted(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// In an unquoted attribute value. +/// +/// ```markdown +/// <x a=b|c> +/// ``` +fn complete_attribute_value_unquoted( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => { + complete_attribute_name_after(tokenizer, info, code) + } + Code::Char(_) => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_attribute_value_unquoted(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// After a double or single quoted attribute value, before whitespace or the +/// end of the tag. +/// +/// ```markdown +/// <x a="b"|> +/// ``` +fn complete_attribute_value_quoted_after( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => { + complete_attribute_name_before(tokenizer, info, code) + } + _ => (State::Nok, None), + } +} + +/// In certain circumstances of a complete tag where only an `>` is allowed. +/// +/// ```markdown +/// <x a="b"|> +/// ``` +fn complete_end(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_after(tokenizer, info, code) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After `>` in a complete tag. +/// +/// ```markdown +/// <x>| +/// ``` +fn complete_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + continuation(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + complete_after(tokenizer, info, code) + })), + None, + ) + } + Code::Char(_) => (State::Nok, None), + } +} + +/// Inside continuation of any HTML kind. 
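+/// Depending on the kind, a specific closing condition is awaited here:
+/// `-->` for comments, `?>` for instructions, `]]>` for CDATA, `>` for
+/// declarations, a closing raw tag for raw, and a blank line for basic and
+/// complete.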
+/// +/// ```markdown +/// <!--x|xx--> +/// ``` +fn continuation(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_comment_inside(tokenizer, info, code) + })), + None, + ) + } + Code::Char('<') if info.kind == Kind::Raw => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_tag_open(tokenizer, info, code) + })), + None, + ) + } + Code::Char('>') if info.kind == Kind::Declaration => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, info, code) + })), + None, + ) + } + Code::Char('?') if info.kind == Kind::Instruction => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + Code::Char(']') if info.kind == Kind::Cdata => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_character_data_inside(tokenizer, info, code) + })), + None, + ) + } + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') + if info.kind == Kind::Basic || info.kind == Kind::Complete => + { + let clone = info; + + tokenizer.check(blank_line_before, |ok| { + if ok { + Box::new(|tokenizer, code| continuation_close(tokenizer, clone, code)) + } else { + Box::new(|tokenizer, code| continuation_at_line_ending(tokenizer, clone, code)) + } + })(tokenizer, code) + } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + continuation_at_line_ending(tokenizer, info, code) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// In continuation, before an eol or eof. +/// +/// ```markdown +/// <x>| +/// ``` +fn continuation_at_line_ending(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::HtmlFlowData); + html_continue_start(tokenizer, info, code) +} + +/// In continuation, after an eol. +/// +/// ```markdown +/// <x>| +/// asd +/// ``` +fn html_continue_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None => { + tokenizer.exit(TokenType::HtmlFlow); + (State::Ok, Some(vec![code])) + } + // To do: do not allow lazy lines. + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + html_continue_start(tokenizer, info, code) + })), + None, + ) + } + _ => { + tokenizer.enter(TokenType::HtmlFlowData); + continuation(tokenizer, info, code) + } + } +} + +/// In comment continuation, after one `-`, expecting another. +/// +/// ```markdown +/// <!--xxx-|-> +/// ``` +fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In raw continuation, after `<`, expecting a `/`. 
+/// +/// ```markdown +/// <script>console.log(1)<|/script> +/// ``` +fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('/') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_end_tag(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In raw continuation, after `</`, expecting or inside a raw tag name. +/// +/// ```markdown +/// <script>console.log(1)</|script> +/// <script>console.log(1)</s|cript> +/// <script>console.log(1)</script|> +/// ``` +fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::Char('>') => { + let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); + // To do: life times. + let mut clone = info; + clone.buffer.clear(); + + if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, clone, code) + })), + None, + ) + } else { + continuation(tokenizer, clone, code) + } + } + Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => { + tokenizer.consume(code); + // To do: life times. + let mut clone = info; + clone.buffer.push(char); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_raw_end_tag(tokenizer, clone, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In cdata continuation, after `]`, expecting `]>`. +/// +/// ```markdown +/// <![CDATA[>&<]|]> +/// ``` +fn continuation_character_data_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char(']') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In declaration or instruction continuation, waiting for `>` to close it. +/// +/// ```markdown +/// <!--|> +/// <?ab?|> +/// <?|> +/// <!q|> +/// <!--ab--|> +/// <!--ab--|-> +/// <!--ab---|> +/// <![CDATA[>&<]]|> +/// ``` +fn continuation_declaration_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, +) -> StateFnResult { + match code { + Code::Char('>') => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, info, code) + })), + None, + ) + } + Code::Char('-') if info.kind == Kind::Comment => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_declaration_inside(tokenizer, info, code) + })), + None, + ) + } + _ => continuation(tokenizer, info, code), + } +} + +/// In closed continuation: everything we get until the eol/eof is part of it. +/// +/// ```markdown +/// <!doctype>| +/// ``` +fn continuation_close(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::HtmlFlowData); + tokenizer.exit(TokenType::HtmlFlow); + (State::Ok, Some(vec![code])) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + continuation_close(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// Before a line ending, expecting a blank line. 
+///
+/// ```markdown
+/// <div>|
+///
+/// ```
+fn blank_line_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.enter(TokenType::LineEnding);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::LineEnding);
+    (State::Fn(Box::new(blank_line)), None)
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
new file mode 100644
index 0000000..d671db6
--- /dev/null
+++ b/src/construct/mod.rs
@@ -0,0 +1,11 @@
+//! Constructs found in markdown.
+
+pub mod blank_line;
+pub mod character_escape;
+pub mod character_reference;
+pub mod code_fenced;
+pub mod code_indented;
+pub mod heading_atx;
+pub mod html_flow;
+pub mod partial_whitespace;
+pub mod thematic_break;
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
new file mode 100644
index 0000000..dd0d2b5
--- /dev/null
+++ b/src/construct/partial_whitespace.rs
@@ -0,0 +1,66 @@
+//! A little helper to parse `space_or_tab`.
+//!
+//! It’s formed with the following BNF:
+//!
+//! ```bnf
+//! space_or_tab ::= 1*( ' ' | '\t' )
+//! ```
+//!
+//! Depending on where whitespace can occur, it can be optional (or not),
+//! and present in the rendered result (or not).
+//!
+//! ## References
+//!
+//! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js)
+//!
+//! <!-- To do: link stuff -->
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+// To do: should `token_type` be a `Some`, with `None` defaulting to something?
+// To do: should `max: Some(usize)` be added?
+
+/// Before whitespace.
+///
+/// ```markdown
+/// alpha| bravo
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            // To do: lifetimes.
+            let clone = token_type.clone();
+            tokenizer.enter(token_type);
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| inside(tokenizer, code, clone))),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// alpha |bravo
+/// alpha | bravo
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    inside(tokenizer, code, token_type)
+                })),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.exit(token_type);
+            (State::Ok, Some(vec![code]))
+        }
+    }
+}
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
new file mode 100644
index 0000000..15ebac7
--- /dev/null
+++ b/src/construct/thematic_break.rs
@@ -0,0 +1,137 @@
+//! Thematic breaks, sometimes called horizontal rules, are a construct that
+//! occurs in the flow content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: all markers must be identical.
+//! ; Restriction: at least 3 markers must be used.
+//! thematic_break ::= *space_or_tab 1*(1*marker *space_or_tab)
+//!
+//! space_or_tab ::= ' ' | '\t'
+//! marker ::= '*' | '-' | '_'
+//! ```
+//!
+//! Thematic breaks in markdown typically relate to the HTML element `<hr>`.
+//! See [*§ 4.4.2 The `hr` element* in the HTML spec][html] for more info.
+//!
+//! It is recommended to use exactly three asterisks without whitespace when
+//! writing markdown.
+//! As using more than three markers has no effect other than wasting space,
+//! 
it is recommended to use exactly three markers.
+//! Thematic breaks formed with asterisks or dashes can interfere with lists
+//! if there is whitespace between them: `* * *` and `- - -`.
+//! For these reasons, it is recommended to not use spaces or tabs between the
+//! markers.
+//! Thematic breaks formed with dashes (without whitespace) can also form
+//! setext headings.
+//! As dashes and underscores frequently occur in natural language and URLs, it
+//! is recommended to use asterisks for thematic breaks to distinguish from
+//! such use.
+//! Because asterisks can be used to form most markdown constructs, using
+//! them has the added benefit of making it easier to gloss over markdown: you
+//! can look for asterisks to find syntax while not worrying about other
+//! characters.
+//!
+//! ## References
+//!
+//! * [`thematic-break.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/thematic-break.js)
+//! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element
+//!
+//! <!-- To do: link `flow` -->
+
+use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a thematic break.
+///
+/// ```markdown
+/// |***
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == '*' || char == '-' || char == '_' => {
+            tokenizer.enter(TokenType::ThematicBreak);
+            at_break(tokenizer, code, char, 0)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After something, before something else; that is, at a marker, at
+/// whitespace, or at the end of the line.
+///
+/// ```markdown
+/// |***
+/// *| * *
+/// * |* *
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == marker => {
+            tokenizer.enter(TokenType::ThematicBreakSequence);
+            sequence(tokenizer, code, marker, size)
+        }
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.enter(TokenType::ThematicBreakWhitespace);
+            whitespace(tokenizer, code, marker, size)
+        }
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+            if size >= THEMATIC_BREAK_MARKER_COUNT_MIN =>
+        {
+            tokenizer.exit(TokenType::ThematicBreak);
+            (State::Ok, Some(vec![code]))
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// In a sequence of markers.
+///
+/// ```markdown
+/// |***
+/// *|**
+/// **|*
+/// ```
+fn sequence(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    match code {
+        Code::Char(char) if char == marker => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    sequence(tokenizer, code, marker, size + 1)
+                })),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.exit(TokenType::ThematicBreakSequence);
+            at_break(tokenizer, code, marker, size)
+        }
+    }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// * |* *
+/// * | * *
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    whitespace(tokenizer, code, marker, size)
+                })),
+                None,
+            )
+        }
+        _ => {
+            tokenizer.exit(TokenType::ThematicBreakWhitespace);
+            at_break(tokenizer, code, marker, size)
+        }
+    }
+}
diff --git a/src/content/flow.rs b/src/content/flow.rs
new file mode 100644
index 0000000..21c5721
--- /dev/null
+++ b/src/content/flow.rs
@@ -0,0 +1,258 @@
+//! The flow content type.
+//!
+//! **Flow** represents the sections, such as headings, code, and content,
+//! which are parsed per line.
+//! An example is HTML, which has a certain starting condition (such as
+//! `<script>` on its own line), then continues for a while, until an end
+//! condition is found (such as `</style>`).
+//! If that line with an end condition is never found, that flow goes until
+//! the end.
+//!
+//! The constructs found in flow are:
+//!
+//! * [Blank line][crate::construct::blank_line]
+//! * [Code (fenced)][crate::construct::code_fenced]
+//! * [Code (indented)][crate::construct::code_indented]
+//! * [Heading (atx)][crate::construct::heading_atx]
+//! * [HTML (flow)][crate::construct::html_flow]
+//! * [Thematic break][crate::construct::thematic_break]
+//!
+//! <!-- To do: `setext` in content? Link to content. -->

+use crate::construct::{
+    blank_line::start as blank_line, code_fenced::start as code_fenced,
+    code_indented::start as code_indented, heading_atx::start as heading_atx,
+    html_flow::start as html_flow, partial_whitespace::start as whitespace,
+    thematic_break::start as thematic_break,
+};
+use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+
+/// Turn `codes` as the flow content type into events.
+// To do: remove this `allow` when all the content types are glued together.
+#[allow(dead_code)]
+pub fn flow(codes: Vec<Code>) -> Vec<Event> {
+    let mut tokenizer = Tokenizer::new();
+    let (state, remainder) = tokenizer.feed(codes, Box::new(start), true);
+
+    if let Some(ref x) = remainder {
+        if !x.is_empty() {
+            unreachable!("expected no final remainder {:?}", x);
+        }
+    }
+
+    match state {
+        State::Ok => {}
+        _ => unreachable!("expected final state to be `State::Ok`"),
+    }
+
+    tokenizer.events
+}
+
+/// Before flow.
+///
+/// First we assume a blank line.
+///
+/// ```markdown
+/// |
+/// |## alpha
+/// | bravo
+/// |***
+/// ```
+fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        _ => tokenizer.attempt(blank_line, |ok| {
+            Box::new(if ok { blank_line_after } else { initial_before })
+        })(tokenizer, code),
+    }
+}
+
+/// After a blank line.
+///
+/// Move to `start` afterwards.
+///
+/// ```markdown
+/// ␠␠|
+/// ```
+fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.enter(TokenType::BlankLineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::BlankLineEnding);
+            (State::Fn(Box::new(start)), None)
+        }
+        _ => unreachable!("expected eol/eof after blank line `{:?}`", code),
+    }
+}
+
+/// Before flow (initial).
+///
+/// “Initial” flow means unprefixed flow, so right at the start of a line.
+/// Interestingly, the only flow (initial) construct is indented code.
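+/// That is because indented code is the only construct where the leading
+/// whitespace is significant, so it has to be attempted before the shared
+/// whitespace prefix is stripped.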
+///
+/// Move to `before` afterwards.
+///
+/// ```markdown
+/// |qwe
+/// | asd
+/// ```
+fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        _ => tokenizer.attempt(code_indented, |ok| {
+            Box::new(if ok {
+                after
+            } else {
+                initial_before_not_code_indented
+            })
+        })(tokenizer, code),
+    }
+}
+
+/// After a flow construct.
+///
+/// ```markdown
+/// ## alpha|
+/// |
+/// ~~~js
+/// asd
+/// ~~~|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (State::Fn(Box::new(start)), None)
+        }
+        _ => unreachable!("unexpected non-eol/eof after flow `{:?}`", code),
+    }
+}
+
+/// Before flow (initial), but not at code (indented).
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn initial_before_not_code_indented(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        _ => tokenizer.attempt(code_fenced, |ok| {
+            Box::new(if ok {
+                after
+            } else {
+                initial_before_not_code_fenced
+            })
+        })(tokenizer, code),
+    }
+}
+
+/// Before flow (initial), but not at code (fenced).
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn initial_before_not_code_fenced(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        _ => tokenizer.attempt(html_flow, |ok| Box::new(if ok { after } else { before }))(
+            tokenizer, code,
+        ),
+    }
+}
+
+/// Before flow, but not at code (indented) or code (fenced).
+///
+/// Compared to flow (initial), normal flow can be arbitrarily prefixed.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.attempt(
+        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+        |_ok| Box::new(before_after_prefix),
+    )(tokenizer, code)
+}
+
+/// Before flow, after potential whitespace.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.attempt(heading_atx, |ok| {
+        Box::new(if ok { after } else { before_not_heading_atx })
+    })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx).
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_not_heading_atx(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.attempt(thematic_break, |ok| {
+        Box::new(if ok { after } else { before_not_thematic_break })
+    })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx) or thematic break.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_not_thematic_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.attempt(html_flow, |ok| {
+        Box::new(if ok { after } else { content_before })
+    })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx), a thematic break, or HTML
+/// (flow).
+///
+/// At this point, we’re at content (zero or more definitions and zero or one
+/// paragraph/setext heading).
+///
+/// ```markdown
+/// |qwe
+/// ```
+// To do: currently only parses a single line.
+// To do:
+// - Multiline
+// - One or more definitions.
+// - Setext heading.
+fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            after(tokenizer, code)
+        }
+        _ => {
+            tokenizer.enter(TokenType::Content);
+            tokenizer.enter(TokenType::ContentPhrasing);
+            tokenizer.consume(code);
+            (State::Fn(Box::new(content)), None)
+        }
+    }
+}
+
+/// In content.
+///
+/// ```markdown
+/// al|pha
+/// ```
+// To do: lift limitations as documented above.
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::ContentPhrasing);
+            tokenizer.exit(TokenType::Content);
+            after(tokenizer, code)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(content)), None)
+        }
+    }
+}
diff --git a/src/content/mod.rs b/src/content/mod.rs
new file mode 100644
index 0000000..d5771a3
--- /dev/null
+++ b/src/content/mod.rs
@@ -0,0 +1,4 @@
+//! Content types found in markdown.
+
+pub mod flow;
+pub mod string;
diff --git a/src/content/string.rs b/src/content/string.rs
new file mode 100644
index 0000000..a8a81b2
--- /dev/null
+++ b/src/content/string.rs
@@ -0,0 +1,120 @@
+//! The string content type.
+//!
+//! **String** is a limited **text**-like content type which only allows
+//! character escapes and character references.
+//! It exists in things such as identifiers (media references, definitions),
+//! titles, URLs, code (fenced) info and meta parts.
+//!
+//! The constructs found in string are:
+//!
+//! * [Character escape][crate::construct::character_escape]
+//! * [Character reference][crate::construct::character_reference]
+
+use crate::construct::{
+    character_escape::start as character_escape, character_reference::start as character_reference,
+};
+use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+
+/// Turn `codes` as the string content type into events.
+// To do: remove this `allow` when all the content types are glued together.
+#[allow(dead_code)]
+pub fn string(codes: Vec<Code>) -> Vec<Event> {
+    let mut tokenizer = Tokenizer::new();
+    let (state, remainder) = tokenizer.feed(codes, Box::new(before), true);
+
+    if let Some(ref x) = remainder {
+        if !x.is_empty() {
+            unreachable!("expected no final remainder {:?}", x);
+        }
+    }
+
+    match state {
+        State::Ok => {}
+        _ => unreachable!("expected final state to be `State::Ok`"),
+    }
+
+    tokenizer.events
+}
+
+/// Before string.
+///
+/// First we assume a character reference.
+///
+/// ```markdown
+/// |&
+/// |\&
+/// |qwe
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        _ => tokenizer.attempt(character_reference, |ok| {
+            Box::new(if ok {
+                before
+            } else {
+                before_not_character_reference
+            })
+        })(tokenizer, code),
+    }
+}
+
+/// Before string, not at a character reference.
+///
+/// Assume a character escape.
+///
+/// ```markdown
+/// |\&
+/// |qwe
+/// ```
+fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        _ => tokenizer.attempt(character_escape, |ok| {
+            Box::new(if ok {
+                before
+            } else {
+                before_not_character_escape
+            })
+        })(tokenizer, code),
+    }
+}
+
+/// Before string, not at a character reference or character escape.
+///
+/// We’re at data.
+/// +/// ```markdown +/// |qwe +/// ``` +fn before_not_character_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::None = code { + (State::Ok, None) + } else { + tokenizer.enter(TokenType::Data); + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } +} + +/// In data. +/// +/// ```markdown +/// q|w|e +/// ``` +fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => { + tokenizer.exit(TokenType::Data); + (State::Ok, None) + } + // To do: somehow get these markers from constructs. + Code::Char('&' | '\\') => { + tokenizer.exit(TokenType::Data); + before(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..1624a22 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,52 @@ +//! Public API of micromark. +//! +//! This module exposes [`micromark`][] (and [`micromark_with_options`][]). +//! `micromark` is a safe way to transform (untrusted?) markdown into HTML. +//! `micromark_with_options` allows you to configure how markdown is turned into +//! HTML, such as by allowing dangerous HTML when you trust it. +mod compiler; +mod constant; +mod construct; +mod content; +mod parser; +mod tokenizer; +mod util; + +use crate::compiler::compile; +pub use crate::compiler::CompileOptions; +use crate::parser::parse; + +/// Turn markdown into HTML. +/// +/// ## Examples +/// +/// ```rust +/// use micromark::micromark; +/// +/// let result = micromark("# Hello, world!"); +/// +/// assert_eq!(result, "<h1>Hello, world!</h1>"); +/// ``` +#[must_use] +pub fn micromark(value: &str) -> String { + micromark_with_options(value, &CompileOptions::default()) +} + +/// Turn markdown into HTML, with configuration. +/// +/// ## Examples +/// +/// ```rust +/// use micromark::{micromark_with_options, CompileOptions}; +/// +/// let result = micromark_with_options("<div>\n\n# Hello, world!\n\n</div>", &CompileOptions { +/// allow_dangerous_html: true, +/// }); +/// +/// assert_eq!(result, "<div>\n<h1>Hello, world!</h1>\n</div>"); +/// ``` +#[must_use] +pub fn micromark_with_options(value: &str, options: &CompileOptions) -> String { + let (events, codes) = parse(value); + compile(&events, &codes, options) +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..10c6e7a --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,14 @@ +//! Turn a string of markdown into events. +// To do: this should start with `containers`, when they’re done. +// To do: definitions and such will mean more data has to be passed around. +use crate::content::flow::flow; +use crate::tokenizer::{as_codes, Code, Event}; + +/// Turn a string of markdown into events. +/// Passes the codes back so the compiler can access the source. +pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) { + let codes = as_codes(value); + // To do: pass a reference to this around, and slices in the (back)feeding. Might be tough. + let events = flow(codes.clone()); + (events, codes) +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 0000000..c8b1440 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,580 @@ +//! The tokenizer glues states from the state machine together. +//! +//! It facilitates everything needed to turn codes into tokens and events with +//! a state machine. +//! It also enables logic needed for parsing markdown, such as an [`attempt`][] +//! to parse something, which can succeed or, when unsuccessful, revert the +//! 
attempt.
+//! Similarly, a [`check`][] exists, which does the same as an `attempt` but
+//! reverts even if successful.
+//!
+//! [`attempt`]: Tokenizer::attempt
+//! [`check`]: Tokenizer::check
+
+use crate::constant::TAB_SIZE;
+
+/// Semantic label of a span.
+// To do: figure out how to share this so extensions can add their own stuff,
+// though perhaps that’s impossible and we should inline all extensions?
+// To do: document each variant.
+#[derive(Debug, Clone, PartialEq)]
+pub enum TokenType {
+    AtxHeading,
+    AtxHeadingSequence,
+    AtxHeadingWhitespace,
+    AtxHeadingText,
+
+    CharacterEscape,
+    CharacterEscapeMarker,
+    CharacterEscapeValue,
+
+    CharacterReference,
+    CharacterReferenceMarker,
+    CharacterReferenceMarkerNumeric,
+    CharacterReferenceMarkerHexadecimal,
+    CharacterReferenceMarkerSemi,
+    CharacterReferenceValue,
+
+    CodeFenced,
+    CodeFencedFence,
+    CodeFencedFenceSequence,
+    CodeFencedFenceWhitespace,
+    CodeFencedFenceInfo,
+    CodeFencedFenceMeta,
+
+    CodeIndented,
+    CodeIndentedPrefixWhitespace,
+
+    CodeFlowChunk,
+
+    Data,
+
+    HtmlFlow,
+    HtmlFlowData,
+
+    ThematicBreak,
+    ThematicBreakSequence,
+    ThematicBreakWhitespace,
+
+    Whitespace,
+    LineEnding,
+    BlankLineEnding,
+    BlankLineWhitespace,
+
+    Content,
+    ContentPhrasing,
+    ChunkString,
+}
+
+/// Enum representing a character code.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum Code {
+    /// End of the input stream (called eof).
+    None,
+    /// Used to make parsing line endings easier as it represents both
+    /// `Code::Char('\r')` and `Code::Char('\n')` combined.
+    CarriageReturnLineFeed,
+    /// The expansion of a tab (`Code::Char('\t')`): depending on where the tab
+    /// occurred, it’s followed by 0 to 3 (both inclusive) `Code::VirtualSpace`s.
+    VirtualSpace,
+    /// The most frequent variant of this enum is `Code::Char(char)`, which just
+    /// represents a char, but micromark adds meaning to certain other values.
+    Char(char),
+}
+
+/// A location in the document (`line`/`column`/`offset`).
+///
+/// The interface for the location in the document comes from unist `Point`:
+/// <https://github.com/syntax-tree/unist#point>.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Point {
+    /// 1-indexed line number.
+    pub line: usize,
+    /// 1-indexed column number.
+    /// Note that this increases up to a tab stop for tabs.
+    /// Some editors count tabs as 1 character, so this position is not always
+    /// the same as in editors.
+    pub column: usize,
+    /// 0-indexed position in the document.
+    pub offset: usize,
+}
+
+/// Possible event types.
+#[derive(Debug, PartialEq)]
+pub enum EventType {
+    /// The start of something.
+    Enter,
+    /// The end of something.
+    Exit,
+}
+
+/// Something semantic happening somewhere.
+#[derive(Debug)]
+pub struct Event {
+    pub event_type: EventType,
+    pub token_type: TokenType,
+    pub point: Point,
+    pub index: usize,
+}
+
+/// The essence of the state machine is its functions: `StateFn`.
+/// Each is responsible for dealing with a single passed [`Code`][].
+/// It yields a [`StateFnResult`][].
+pub type StateFn = dyn FnOnce(&mut Tokenizer, Code) -> StateFnResult;
+/// Each [`StateFn`][] yields something back: primarily the state.
+/// In certain cases, it can also yield back up parsed codes that were passed down.
+pub type StateFnResult = (State, Option<Vec<Code>>);
+
+/// The result of a state.
+pub enum State {
+    /// There is a future state: a boxed [`StateFn`][] to pass the next code to.
+    Fn(Box<StateFn>),
+    /// The state is successful.
+    Ok,
+    /// The state is not successful.
+ Nok, +} + +/// The internal state of a tokenizer, not to be confused with states from the +/// state machine, this instead is all the information about where we currently +/// are and what’s going on. +#[derive(Debug, Clone)] +struct InternalState { + /// Length of `events`. We only add to events, so reverting will just pop stuff off. + events_len: usize, + /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt. + stack_len: usize, + /// Current code. + current: Code, + /// `index` in codes of the current code. + index: usize, + /// Current relative and absolute position in the file. + point: Point, +} + +/// A tokenizer itself. +#[derive(Debug)] +pub struct Tokenizer { + /// Track whether a character is expected to be consumed, and whether it’s + /// actually consumed + /// + /// Tracked to make sure everything’s valid. + consumed: bool, + /// Semantic labels of one or more codes in `codes`. + pub events: Vec<Event>, + /// Hierarchy of semantic labels. + /// + /// Tracked to make sure everything’s valid. + stack: Vec<TokenType>, + /// Current character code. + current: Code, + /// `index` in codes of the current code. + index: usize, + /// Current relative and absolute place in the file. + point: Point, +} + +impl Tokenizer { + /// Create a new tokenizer. + pub fn new() -> Tokenizer { + Tokenizer { + current: Code::None, + index: 0, + consumed: true, + point: Point { + line: 1, + column: 1, + offset: 0, + }, + stack: vec![], + events: vec![], + } + } + + /// Prepare for a next code to get consumed. + fn expect(&mut self, code: Code) { + assert!(self.consumed, "expected previous character to be consumed"); + self.consumed = false; + self.current = code; + } + + /// Consume the current character. + /// Each [`StateFn`][] is expected to call this to signal that this code is + /// used, or call a next `StateFn`. + pub fn consume(&mut self, code: Code) { + assert_eq!( + code, self.current, + "expected given code to equal expected code" + ); + log::debug!("consume: `{:?}` ({:?})", code, self.point); + assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned"); + + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + self.point.line += 1; + self.point.column = 1; + self.point.offset += if code == Code::CarriageReturnLineFeed { + 2 + } else { + 1 + }; + // To do: accountForPotentialSkip() + log::debug!("position: after eol: `{:?}`", self.point); + } + Code::VirtualSpace => { + // Empty. + } + _ => { + self.point.column += 1; + self.point.offset += 1; + } + } + + self.index += 1; + // Mark as consumed. + self.consumed = true; + } + + /// Mark the start of a semantic label. + pub fn enter(&mut self, token_type: TokenType) { + log::debug!("enter `{:?}` ({:?})", token_type, self.point); + let event = Event { + event_type: EventType::Enter, + token_type: token_type.clone(), + point: self.point.clone(), + index: self.index, + }; + + self.events.push(event); + self.stack.push(token_type); + } + + /// Mark the end of a semantic label. 
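+    ///
+    /// Panics when the given label does not match the most recently entered
+    /// label, or when the token would be empty.
+    ///
+    /// Roughly, constructs pair it with [`enter`][Tokenizer::enter] like so:
+    ///
+    /// ```rust ignore
+    /// tokenizer.enter(TokenType::Data);
+    /// tokenizer.consume(code);
+    /// tokenizer.exit(TokenType::Data);
+    /// ```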
+ pub fn exit(&mut self, token_type: TokenType) { + let token_on_stack = self.stack.pop().expect("cannot close w/o open tokens"); + + assert_eq!( + token_on_stack, token_type, + "expected exit TokenType to match current TokenType" + ); + + let ev = self.events.last().expect("cannot close w/o open event"); + + let point = self.point.clone(); + + assert!( + token_on_stack != ev.token_type || ev.point != point, + "expected non-empty TokenType" + ); + + log::debug!("exit `{:?}` ({:?})", token_type, self.point); + let event = Event { + event_type: EventType::Exit, + token_type, + point, + index: self.index, + }; + + self.events.push(event); + } + + /// Capture the internal state. + fn capture(&mut self) -> InternalState { + InternalState { + index: self.index, + current: self.current, + point: self.point.clone(), + events_len: self.events.len(), + stack_len: self.stack.len(), + } + } + + /// Apply the internal state. + fn free(&mut self, previous: InternalState) { + self.index = previous.index; + self.current = previous.current; + self.point = previous.point; + assert!( + self.events.len() >= previous.events_len, + "expected to restore less events than before" + ); + self.events.truncate(previous.events_len); + assert!( + self.stack.len() >= previous.stack_len, + "expected to restore less stack items than before" + ); + self.stack.truncate(previous.stack_len); + } + + /// Check if `state` and its future states are successful or not. + /// + /// This captures the current state of the tokenizer, returns a wrapped + /// state that captures all codes and feeds them to `state` and its future + /// states until it yields [`State::Ok`][] or [`State::Nok`][]. + /// It then applies the captured state, calls `done`, and feeds all + /// captured codes to its future states. + pub fn check( + &mut self, + state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + done: impl FnOnce(bool) -> Box<StateFn> + 'static, + ) -> Box<StateFn> { + let previous = self.capture(); + + attempt_impl( + state, + vec![], + |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| { + let codes = result.0; + tokenizer.free(previous); + log::debug!( + "check: {:?}, codes: {:?}, at {:?}", + ok, + codes, + tokenizer.point + ); + let result = done(ok); + tokenizer.feed(codes, result, false) + }, + ) + } + + /// Attempt to parse with `state` and its future states, reverting if + /// unsuccessful. + /// + /// This captures the current state of the tokenizer, returns a wrapped + /// state that captures all codes and feeds them to `state` and its future + /// states until it yields [`State::Ok`][], at which point it calls `done` + /// and yields its result. + /// If instead [`State::Nok`][] was yielded, the captured state is applied, + /// `done` is called, and all captured codes are fed to its future states. + pub fn attempt( + &mut self, + state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, + done: impl FnOnce(bool) -> Box<StateFn> + 'static, + ) -> Box<StateFn> { + let previous = self.capture(); + + attempt_impl( + state, + vec![], + |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| { + let codes = if ok { + result.1 + } else { + tokenizer.free(previous); + result.0 + }; + + log::debug!( + "attempt: {:?}, codes: {:?}, at {:?}", + ok, + codes, + tokenizer.point + ); + let result = done(ok); + tokenizer.feed(codes, result, false) + }, + ) + } + + /// Feed a list of `codes` into `start`. 
+    ///
+    /// This is set up to support repeatedly calling `feed`, and thus streaming
+    /// markdown into the state machine, and normally pauses after feeding.
+    /// When `drain: true` is passed, the EOF is fed.
+    pub fn feed(
+        &mut self,
+        codes: Vec<Code>,
+        start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        drain: bool,
+    ) -> StateFnResult {
+        let mut codes = codes;
+        let mut state = State::Fn(Box::new(start));
+        let mut index = 0;
+
+        self.consumed = true;
+
+        while index < codes.len() {
+            let code = codes[index];
+
+            match state {
+                State::Nok | State::Ok => {
+                    break;
+                }
+                State::Fn(func) => {
+                    log::debug!("main: passing `{:?}`", code);
+                    self.expect(code);
+                    let (next, remainder) = check_statefn_result(func(self, code));
+                    state = next;
+                    index = index + 1
+                        - (if let Some(ref x) = remainder {
+                            x.len()
+                        } else {
+                            0
+                        });
+                }
+            }
+        }
+
+        // Yield to a higher loop if we shouldn’t feed EOFs.
+        if !drain {
+            return (state, Some(codes.split_off(index)));
+        }
+
+        loop {
+            // Feed EOF.
+            match state {
+                State::Ok | State::Nok => break,
+                State::Fn(func) => {
+                    let code = Code::None;
+                    log::debug!("main: passing eof");
+                    self.expect(code);
+                    let (next, remainder) = check_statefn_result(func(self, code));
+
+                    if let Some(ref x) = remainder {
+                        if !x.is_empty() {
+                            // To do: handle?
+                            unreachable!("drain:remainder {:?}", x);
+                        }
+                    }
+
+                    state = next;
+                }
+            }
+        }
+
+        check_statefn_result((state, None))
+    }
+}
+
+/// Internal utility to wrap states to also capture codes.
+///
+/// Recurses into itself.
+/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
+fn attempt_impl(
+    state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+    codes: Vec<Code>,
+    done: impl FnOnce((Vec<Code>, Vec<Code>), bool, &mut Tokenizer) -> StateFnResult + 'static,
+) -> Box<StateFn> {
+    Box::new(|tokenizer, code| {
+        let mut codes = codes;
+
+        let (next, remainder) = check_statefn_result(state(tokenizer, code));
+
+        match code {
+            Code::None => {}
+            _ => {
+                codes.push(code);
+            }
+        }
+
+        // To do: `remainder` must never be bigger than `codes`, I guess?
+        // To do: `remainder` probably has to be taken *from* `codes`, in a similar vein to the `Ok` handling below.
+        match next {
+            State::Ok => {
+                let remaining = if let Some(x) = remainder { x } else { vec![] };
+                check_statefn_result(done((codes, remaining), true, tokenizer))
+            }
+            State::Nok => check_statefn_result(done((codes, vec![]), false, tokenizer)),
+            State::Fn(func) => {
+                check_statefn_result((State::Fn(attempt_impl(func, codes, done)), None))
+            }
+        }
+    })
+}
+
+/// Turn a string into codes.
+// To do: handle BOM at start?
+pub fn as_codes(value: &str) -> Vec<Code> {
+    let mut codes: Vec<Code> = vec![];
+    let mut at_carriage_return = false;
+    let mut column = 1;
+
+    for char in value.chars() {
+        // Send a CRLF.
+        if at_carriage_return && '\n' == char {
+            at_carriage_return = false;
+            codes.push(Code::CarriageReturnLineFeed);
+        } else {
+            // Send the previous CR: we’re not at a next `\n`.
+            if at_carriage_return {
+                at_carriage_return = false;
+                codes.push(Code::Char('\r'));
+            }
+
+            match char {
+                // Send a replacement character.
+                '\0' => {
+                    column += 1;
+                    codes.push(Code::Char('�'));
+                }
+                // Send a tab and virtual spaces.
+                '\t' => {
+                    // To do: is this correct?
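+                    // It seems to expand a tab at `column` to the next tab stop,
+                    // but when `column` is an exact multiple of `TAB_SIZE` this
+                    // yields `TAB_SIZE` virtual spaces instead of the expected 0.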
+                    let virtual_spaces = TAB_SIZE - (column % TAB_SIZE);
+                    println!("tabs, expand {:?}, {:?}", column, virtual_spaces);
+                    codes.push(Code::Char(char));
+                    column += 1;
+                    let mut index = 0;
+                    while index < virtual_spaces {
+                        codes.push(Code::VirtualSpace);
+                        column += 1;
+                        index += 1;
+                    }
+                }
+                // Send an LF.
+                '\n' => {
+                    column = 1;
+                    codes.push(Code::Char(char));
+                }
+                // Don’t send anything yet.
+                '\r' => {
+                    column = 1;
+                    at_carriage_return = true;
+                }
+                // Send the char.
+                _ => {
+                    column += 1;
+                    codes.push(Code::Char(char));
+                }
+            }
+        };
+    }
+
+    // To do: handle a final CR?
+
+    codes
+}
+
+/// Check a [`StateFnResult`][], make sure it’s valid (that there are no
+/// bugs), and clean a final eof passed back in `remainder`.
+fn check_statefn_result(result: StateFnResult) -> StateFnResult {
+    let (state, mut remainder) = result;
+
+    match state {
+        State::Nok | State::Fn(_) => {
+            if let Some(ref x) = remainder {
+                assert_eq!(
+                    x.len(),
+                    0,
+                    "expected `None` to be passed back as remainder from `State::Nok`, `State::Fn`"
+                );
+            }
+        }
+        State::Ok => {}
+    }
+
+    // Remove an eof.
+    // For convenience, feeding back an eof is allowed, but cleaned here.
+    // Most states handle eof and eol in the same branch, and hence pass
+    // all back.
+    // This might not be needed, because if EOF is passed back, we’re at the EOF.
+    // But they’re not supposed to be in codes, so here we remove them.
+    if let Some(ref mut list) = remainder {
+        if Some(&Code::None) == list.last() {
+            list.pop();
+        }
+    }
+
+    (state, remainder)
+}
diff --git a/src/util.rs b/src/util.rs
new file mode 100644
index 0000000..47359a3
--- /dev/null
+++ b/src/util.rs
@@ -0,0 +1,241 @@
+//! Some utilities helpful when parsing and compiling markdown.
+
+use crate::constant::{CHARACTER_REFERENCE_NAMES, CHARACTER_REFERENCE_VALUES};
+use crate::tokenizer::{Code, Event, EventType};
+
+/// Encode dangerous html characters.
+///
+/// This ensures that certain characters which have special meaning in HTML are
+/// dealt with.
+/// Technically, we can skip `>` and `"` in many cases, but `CommonMark`
+/// includes them.
+///
+/// This behavior is not explained in prose in `CommonMark` but can be inferred
+/// from the input/output test cases.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::encode;
+///
+/// assert_eq!(encode("I <3 🦀"), "I &lt;3 🦀");
+/// ```
+///
+/// ## References
+///
+/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
+pub fn encode(value: &str) -> String {
+    value
+        .replace('&', "&amp;")
+        .replace('"', "&quot;")
+        .replace('<', "&lt;")
+        .replace('>', "&gt;")
+}
+
+/// Decode numeric character references.
+///
+/// Turn the number (in string form as either hexadecimal or decimal) coming
+/// from a numeric character reference into a character.
+/// Whether the base of the string form is `10` (decimal) or `16` (hexadecimal)
+/// must be passed as the `radix` parameter.
+///
+/// This returns the `char` associated with that number or a replacement
+/// character for C0 control characters (except for ASCII whitespace), C1
+/// control characters, lone surrogates, noncharacters, and out of range
+/// characters.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::decode_numeric_character_reference;
+///
+/// assert_eq!(decode_numeric_character_reference("123", 10), '{');
+/// assert_eq!(decode_numeric_character_reference("9", 16), '\t');
+/// assert_eq!(decode_numeric_character_reference("0", 10), '�'); // Not allowed.
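+/// // Characters outside the basic plane decode as well:
+/// assert_eq!(decode_numeric_character_reference("1F980", 16), '🦀');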
+/// ```
+///
+/// ## Panics
+///
+/// This function panics if an invalid string or an out of bounds valid string
+/// is given.
+/// It is expected that figuring out whether a number is allowed is handled in
+/// the parser.
+/// When `micromark` is used, this function never panics.
+///
+/// ## References
+///
+/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_numeric_character_reference(value: &str, radix: u32) -> char {
+    let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int");
+
+    if
+    // C0 except for HT, LF, FF, CR, space
+    code < 0x09 ||
+    code == 0x0B ||
+    (code > 0x0D && code < 0x20) ||
+    // Control character (DEL) of the basic block and C1 controls.
+    (code > 0x7E && code < 0xA0) ||
+    // Lone high surrogates and low surrogates.
+    (code > 0xd7ff && code < 0xe000) ||
+    // Noncharacters.
+    (code > 0xfdcf && code < 0xfdf0) ||
+    ((code & 0xffff) == 0xffff) ||
+    ((code & 0xffff) == 0xfffe) ||
+    // Out of range
+    code > 0x0010_ffff
+    {
+        '�'
+    } else {
+        char::from_u32(code).expect("expected valid `code`")
+    }
+}
+
+/// Decode named character references.
+///
+/// Turn the name coming from a named character reference (without the `&` or
+/// `;`) into a string.
+/// This looks the given string up in [`CHARACTER_REFERENCE_NAMES`][] and then
+/// takes the corresponding value from [`CHARACTER_REFERENCE_VALUES`][].
+///
+/// The result is `String` instead of `char` because named character references
+/// can expand into multiple characters.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::decode_named_character_reference;
+///
+/// assert_eq!(decode_named_character_reference("amp"), "&");
+/// assert_eq!(decode_named_character_reference("AElig"), "Æ");
+/// assert_eq!(decode_named_character_reference("aelig"), "æ");
+/// ```
+///
+/// ## Panics
+///
+/// This function panics if a name not in [`CHARACTER_REFERENCE_NAMES`][] is
+/// given.
+/// It is expected that figuring out whether a name is allowed is handled in
+/// the parser.
+/// When `micromark` is used, this function never panics.
+///
+/// ## References
+///
+/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_named_character_reference(value: &str) -> String {
+    let position = CHARACTER_REFERENCE_NAMES.iter().position(|&x| x == value);
+    if let Some(index) = position {
+        CHARACTER_REFERENCE_VALUES[index].to_string()
+    } else {
+        unreachable!("expected valid `name`")
+    }
+}
+
+/// A struct representing the span of an opening and closing event of a token.
+#[derive(Debug)]
+pub struct Span {
+    // To do: probably needed in the future.
+    // start: Point,
+    /// Absolute offset (and `index` in `codes`) of where this span starts.
+    pub start_index: usize,
+    // To do: probably needed in the future.
+    // end: Point,
+    /// Absolute offset (and `index` in `codes`) of where this span ends.
+    pub end_index: usize,
+    // To do: probably needed in the future.
+    // token_type: TokenType,
+}
+
+/// Get a span from an event.
+
+/// A struct representing the span of an opening and closing event of a token.
+#[derive(Debug)]
+pub struct Span {
+    // To do: probably needed in the future.
+    // start: Point,
+    /// Absolute offset (and `index` in `codes`) of where this span starts.
+    pub start_index: usize,
+    // To do: probably needed in the future.
+    // end: Point,
+    /// Absolute offset (and `index` in `codes`) of where this span ends.
+    pub end_index: usize,
+    // To do: probably needed in the future.
+    // token_type: TokenType,
+}
+
+/// Get a span from an event.
+///
+/// Get the span of an `exit` event by looking backwards through the events to
+/// find the corresponding `enter` event.
+/// This assumes that tokens of the same type are not nested.
+///
+/// ## Panics
+///
+/// This function panics if an `enter` event is given.
+/// When `micromark` is used, this function never panics.
+pub fn get_span(events: &[Event], index: usize) -> Span {
+    let exit = &events[index];
+    // let end = exit.point.clone();
+    let end_index = exit.index;
+    let token_type = exit.token_type.clone();
+    // To do: support `enter` events if needed and walk forwards?
+    assert_eq!(
+        exit.event_type,
+        EventType::Exit,
+        "expected get_span to be called on `exit` event"
+    );
+    let mut start_index = index - 1;
+
+    loop {
+        let enter = &events[start_index];
+        if enter.event_type == EventType::Enter && enter.token_type == token_type {
+            return Span {
+                // start: enter.point.clone(),
+                start_index: enter.index,
+                // end,
+                end_index,
+                // token_type,
+            };
+        }
+
+        start_index -= 1;
+    }
+}
+
+/// Serialize a span, optionally expanding tabs.
+pub fn slice_serialize(codes: &[Code], span: &Span, expand_tabs: bool) -> String {
+    serialize_chunks(slice_codes(codes, span), expand_tabs)
+}
+
+/// Get a slice of codes from a span.
+pub fn slice_codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
+    &codes[span.start_index..span.end_index]
+}
+
+/// Serialize a slice of codes, optionally expanding tabs.
+pub fn serialize_chunks(codes: &[Code], expand_tabs: bool) -> String {
+    let mut at_tab = false;
+    let mut index = 0;
+    let mut value: Vec<char> = vec![];
+
+    while index < codes.len() {
+        let code = codes[index];
+        let mut at_tab_next = false;
+
+        match code {
+            Code::CarriageReturnLineFeed => {
+                value.push('\r');
+                value.push('\n');
+            }
+            Code::Char(char) if char == '\n' || char == '\r' => {
+                value.push(char);
+            }
+            Code::Char(char) if char == '\t' => {
+                at_tab_next = true;
+                value.push(if expand_tabs { ' ' } else { char });
+            }
+            Code::VirtualSpace => {
+                if !expand_tabs && at_tab {
+                    index += 1;
+                    continue;
+                }
+                value.push(' ');
+            }
+            Code::Char(char) => {
+                value.push(char);
+            }
+            Code::None => {
+                unreachable!("unexpected EOF code in codes");
+            }
+        }
+
+        at_tab = at_tab_next;
+
+        index += 1;
+    }
+
+    value.into_iter().collect()
+}
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
new file mode 100644
index 0000000..46fa9cb
--- /dev/null
+++ b/tests/code_fenced.rs
@@ -0,0 +1,266 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn code_fenced() {
+    assert_eq!(
+        micromark("```\n<\n >\n```"),
+        "<pre><code>&lt;\n &gt;\n</code></pre>",
+        "should support fenced code w/ grave accents"
+    );
+
+    assert_eq!(
+        micromark("~~~\n<\n >\n~~~"),
+        "<pre><code>&lt;\n &gt;\n</code></pre>",
+        "should support fenced code w/ tildes"
+    );
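The next group of tests pins down how closing fences match opening fences. As a reference point, here is an illustrative sketch of that rule (same marker, at least as many markers, nothing but markers and trailing spaces on the line); it is not the crate's tokenizer, and it ignores the indentation limits covered further down.

```rust
// Sketch: can `line` close a fence opened by `opening`?
fn closes(opening: &str, line: &str) -> bool {
    let marker = match opening.chars().next() {
        Some(c) => c,
        None => return false,
    };
    let open_len = opening.chars().take_while(|&c| c == marker).count();
    let trimmed = line.trim_start_matches(' ');
    let close_len = trimmed.chars().take_while(|&c| c == marker).count();
    // At least as long as the opening, and only spaces may follow.
    close_len >= open_len && trimmed.chars().skip(close_len).all(|c| c == ' ')
}

fn main() {
    assert!(closes("```", "```"));
    assert!(closes("````", "``````")); // longer is fine
    assert!(!closes("````", "```")); // shorter is not
    assert!(!closes("```", "~~~")); // marker must match
    println!("ok");
}
```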
+
+    // To do: code (text).
+    // assert_eq!(
+    //     micromark("``\nfoo\n``"),
+    //     "<p><code>foo</code></p>",
+    //     "should not support fenced code w/ less than three markers"
+    // );
+
+    assert_eq!(
+        micromark("```\naaa\n~~~\n```"),
+        "<pre><code>aaa\n~~~\n</code></pre>",
+        "should not support a tilde closing sequence for a grave accent opening sequence"
+    );
+
+    assert_eq!(
+        micromark("~~~\naaa\n```\n~~~"),
+        "<pre><code>aaa\n```\n</code></pre>",
+        "should not support a grave accent closing sequence for a tilde opening sequence"
+    );
+
+    assert_eq!(
+        micromark("````\naaa\n```\n``````"),
+        "<pre><code>aaa\n```\n</code></pre>",
+        "should support a closing sequence longer, but not shorter than, the opening"
+    );
+
+    assert_eq!(
+        micromark("~~~~\naaa\n~~~\n~~~~"),
+        "<pre><code>aaa\n~~~\n</code></pre>",
+        "should support a closing sequence equal to, but not shorter than, the opening"
+    );
+
+    assert_eq!(
+        micromark("```"),
+        "<pre><code></code></pre>\n",
+        "should support an eof right after an opening sequence"
+    );
+
+    assert_eq!(
+        micromark("`````\n\n```\naaa\n"),
+        "<pre><code>\n```\naaa\n</code></pre>\n",
+        "should support an eof somewhere in content"
+    );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark("> ```\n> aaa\n\nbbb"),
+    //     "<blockquote>\n<pre><code>aaa\n</code></pre>\n</blockquote>\n<p>bbb</p>",
+    //     "should support no closing sequence in a block quote"
+    // );
+
+    assert_eq!(
+        micromark("```\n\n  \n```"),
+        "<pre><code>\n  \n</code></pre>",
+        "should support blank lines in fenced code"
+    );
+
+    assert_eq!(
+        micromark("```\n```"),
+        "<pre><code></code></pre>",
+        "should support empty fenced code"
+    );
+
+    assert_eq!(
+        micromark(" ```\n aaa\naaa\n```"),
+        "<pre><code>aaa\naaa\n</code></pre>",
+        "should remove up to one space from the content if the opening sequence is indented w/ 1 space"
+    );
+
+    assert_eq!(
+        micromark("  ```\naaa\n  aaa\naaa\n  ```"),
+        "<pre><code>aaa\naaa\naaa\n</code></pre>",
+        "should remove up to two spaces from the content if the opening sequence is indented w/ 2 spaces"
+    );
+
+    assert_eq!(
+        micromark("   ```\n   aaa\n    aaa\n  aaa\n   ```"),
+        "<pre><code>aaa\n aaa\naaa\n</code></pre>",
+        "should remove up to three spaces from the content if the opening sequence is indented w/ 3 spaces"
+    );
+
+    assert_eq!(
+        micromark("    ```\n    aaa\n    ```"),
+        "<pre><code>```\naaa\n```\n</code></pre>",
+        "should not support indenting the opening sequence w/ 4 spaces"
+    );
+
+    assert_eq!(
+        micromark("```\naaa\n  ```"),
+        "<pre><code>aaa\n</code></pre>",
+        "should support an indented closing sequence"
+    );
+
+    assert_eq!(
+        micromark("   ```\naaa\n  ```"),
+        "<pre><code>aaa\n</code></pre>",
+        "should support a differently indented closing sequence than the opening sequence"
+    );
+
+    assert_eq!(
+        micromark("```\naaa\n    ```\n"),
+        "<pre><code>aaa\n    ```\n</code></pre>\n",
+        "should not support an indented closing sequence w/ 4 spaces"
+    );
+
+    // To do: code (text).
+    // assert_eq!(
+    //     micromark("``` ```\naaa"),
+    //     "<p><code> </code>\naaa</p>",
+    //     "should not support grave accents in the opening fence after the opening sequence"
+    // );
+
+    assert_eq!(
+        micromark("~~~~~~\naaa\n~~~ ~~\n"),
+        "<pre><code>aaa\n~~~ ~~\n</code></pre>\n",
+        "should not support spaces in the closing sequence"
+    );
+
+    assert_eq!(
+        micromark("foo\n```\nbar\n```\nbaz"),
+        "<p>foo</p>\n<pre><code>bar\n</code></pre>\n<p>baz</p>",
+        "should support interrupting paragraphs"
+    );
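The "remove up to N spaces" cases above reduce to a small rule: strip at most as many leading spaces as the opening fence was indented. A minimal sketch of that rule (a hypothetical helper of mine, not the crate's code):

```rust
// Strip up to `fence_indent` leading spaces from a content line.
fn strip_indent(line: &str, fence_indent: usize) -> &str {
    let spaces = line.chars().take_while(|&c| c == ' ').count();
    &line[spaces.min(fence_indent)..]
}

fn main() {
    // Fence indented w/ 3 spaces: up to 3 leading spaces are removed.
    assert_eq!(strip_indent("   aaa", 3), "aaa");
    assert_eq!(strip_indent(" aaa", 3), "aaa");
    // Deeper content keeps whatever remains after the strip.
    assert_eq!(strip_indent("    aaa", 3), " aaa");
    println!("ok");
}
```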
+
+    // To do: setext.
+    // assert_eq!(
+    //     micromark("foo\n---\n~~~\nbar\n~~~\n# baz"),
+    //     "<h2>foo</h2>\n<pre><code>bar\n</code></pre>\n<h1>baz</h1>",
+    //     "should support interrupting other content"
+    // );
+
+    assert_eq!(
+        micromark("```ruby\ndef foo(x)\n  return 3\nend\n```"),
+        "<pre><code class=\"language-ruby\">def foo(x)\n  return 3\nend\n</code></pre>",
+        "should support the info string as a `language-` class (1)"
+    );
+
+    assert_eq!(
+        micromark("````;\n````"),
+        "<pre><code class=\"language-;\"></code></pre>",
+        "should support the info string as a `language-` class (2)"
+    );
+
+    assert_eq!(
+        micromark("~~~~ ruby startline=3 $%@#$\ndef foo(x)\n  return 3\nend\n~~~~~~~"),
+        "<pre><code class=\"language-ruby\">def foo(x)\n  return 3\nend\n</code></pre>",
+        "should support the info string as a `language-` class, but not the meta string"
+    );
+
+    // To do: code (text).
+    // assert_eq!(
+    //     micromark("``` aa ```\nfoo"),
+    //     "<p><code>aa</code>\nfoo</p>",
+    //     "should not support grave accents in the meta string"
+    // );
+
+    assert_eq!(
+        micromark("~~~ aa ``` ~~~\nfoo\n~~~"),
+        "<pre><code class=\"language-aa\">foo\n</code></pre>",
+        "should support grave accents and tildes in the meta string of tilde fenced code"
+    );
+
+    assert_eq!(
+        micromark("```\n``` aaa\n```"),
+        "<pre><code>``` aaa\n</code></pre>",
+        "should not support info string on closing sequences"
+    );
+
+    // Our own:
+    assert_eq!(
+        micromark("``` "),
+        "<pre><code></code></pre>\n",
+        "should support an eof after whitespace after the opening fence sequence"
+    );
+
+    assert_eq!(
+        micromark("``` js\nalert(1)\n```"),
+        "<pre><code class=\"language-js\">alert(1)\n</code></pre>",
+        "should support whitespace between the sequence and the info string"
+    );
+
+    assert_eq!(
+        micromark("```js"),
+        "<pre><code class=\"language-js\"></code></pre>\n",
+        "should support an eof after the info string"
+    );
+
+    assert_eq!(
+        micromark("``` js \nalert(1)\n```"),
+        "<pre><code class=\"language-js\">alert(1)\n</code></pre>",
+        "should support whitespace after the info string"
+    );
+
+    assert_eq!(
+        micromark("```\n "),
+        "<pre><code> \n</code></pre>\n",
+        "should support an eof after whitespace in content"
+    );
+
+    assert_eq!(
+        micromark(" ```\n "),
+        "<pre><code></code></pre>\n",
+        "should support an eof in the prefix, in content"
+    );
+
+    // To do: strings.
+    // assert_eq!(
+    //     micromark("```j\\+s&copy;"),
+    //     "<pre><code class=\"language-j+s©\"></code></pre>\n",
+    //     "should support character escapes and character references in info strings"
+    // );
+
+    assert_eq!(
+        micromark(" ```\naaa\n    ```"),
+        "<pre><code>aaa\n    ```\n</code></pre>\n",
+        "should not support a closing sequence w/ too much indent, regardless of opening sequence (1)"
+    );
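The `language-` class cases above follow one rule: only the first word of the info string becomes the class; the rest (the "meta" string) is dropped. A sketch of that mapping, with a hypothetical helper name:

```rust
// Map a fence's info string to the `language-` class, if any.
fn info_to_class(info: &str) -> Option<String> {
    let first = info.trim().split_whitespace().next()?;
    Some(format!("language-{}", first))
}

fn main() {
    assert_eq!(
        info_to_class("ruby startline=3 $%@#$").as_deref(),
        Some("language-ruby") // meta string dropped
    );
    assert_eq!(info_to_class(";").as_deref(), Some("language-;"));
    assert_eq!(info_to_class(""), None); // no info string, no class
    println!("ok");
}
```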
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark("> ```\n>\n>\n>\n\na"),
+    //     "<blockquote>\n<pre><code>\n\n\n</code></pre>\n</blockquote>\n<p>a</p>",
+    //     "should not support a closing sequence w/ too much indent, regardless of opening sequence (2)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> ```a\nb"),
+    //     "<blockquote>\n<pre><code class=\"language-a\"></code></pre>\n</blockquote>\n<p>b</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> a\n```b"),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<pre><code class=\"language-b\"></code></pre>\n",
+    //     "should not support laziness (2)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> ```a\n```"),
+    //     "<blockquote>\n<pre><code class=\"language-a\"></code></pre>\n</blockquote>\n<pre><code></code></pre>\n",
+    //     "should not support laziness (3)"
+    // );
+
+    // To do: extensions.
+    // assert_eq!(
+    //     micromark("```", {extensions: [{disable: {null: ["codeFenced"]}}]}),
+    //     "<p>```</p>",
+    //     "should support turning off code (fenced)"
+    // );
+}
diff --git a/tests/code_indented.rs b/tests/code_indented.rs
new file mode 100644
index 0000000..f5926c0
--- /dev/null
+++ b/tests/code_indented.rs
@@ -0,0 +1,196 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn code_indented() {
+    assert_eq!(
+        micromark("    a simple\n      indented code block"),
+        "<pre><code>a simple\n  indented code block\n</code></pre>",
+        "should support indented code"
+    );
+
+    // To do: list.
+    // assert_eq!(
+    //     micromark("  - foo\n\n    bar"),
+    //     "<ul>\n<li>\n<p>foo</p>\n<p>bar</p>\n</li>\n</ul>",
+    //     "should prefer list item content over indented code (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("1.  foo\n\n    - bar"),
+    //     "<ol>\n<li>\n<p>foo</p>\n<ul>\n<li>bar</li>\n</ul>\n</li>\n</ol>",
+    //     "should prefer list item content over indented code (2)"
+    // );
+
+    assert_eq!(
+        micromark("    <a/>\n    *hi*\n\n    - one"),
+        "<pre><code>&lt;a/&gt;\n*hi*\n\n- one\n</code></pre>",
+        "should support blank lines in indented code (1)"
+    );
+
+    assert_eq!(
+        micromark("    chunk1\n\n    chunk2\n  \n \n \n    chunk3"),
+        "<pre><code>chunk1\n\nchunk2\n\n\n\nchunk3\n</code></pre>",
+        "should support blank lines in indented code (2)"
+    );
+
+    assert_eq!(
+        micromark("    chunk1\n      \n      chunk2"),
+        "<pre><code>chunk1\n  \n  chunk2\n</code></pre>",
+        "should support blank lines in indented code (3)"
+    );
+
+    // To do: paragraphs.
+    // assert_eq!(
+    //     micromark("Foo\n    bar"),
+    //     "<p>Foo\nbar</p>",
+    //     "should not support interrupting paragraphs"
+    // );
+
+    // To do: paragraphs.
+    // assert_eq!(
+    //     micromark("    foo\nbar"),
+    //     "<pre><code>foo\n</code></pre>\n<p>bar</p>",
+    //     "should support paragraphs directly after indented code"
+    // );
+
+    // To do: setext.
+    // assert_eq!(
+    //     micromark("# Heading\n    foo\nHeading\n------\n    foo\n----"),
+    //     "<h1>Heading</h1>\n<pre><code>foo\n</code></pre>\n<h2>Heading</h2>\n<pre><code>foo\n</code></pre>\n<hr />",
+    //     "should mix w/ other content"
+    // );
+
+    assert_eq!(
+        micromark("        foo\n    bar"),
+        "<pre><code>    foo\nbar\n</code></pre>",
+        "should support extra whitespace on the first line"
+    );
+
+    assert_eq!(
+        micromark("\n    \n    foo\n    "),
+        "<pre><code>foo\n</code></pre>",
+        "should not support initial blank lines"
+    );
+
+    assert_eq!(
+        micromark("    foo  "),
+        "<pre><code>foo  \n</code></pre>",
+        "should support trailing whitespace"
+    );
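The rule these cases exercise is the CommonMark four-space one: a non-blank line indented by four or more spaces is indented code, and exactly four spaces are stripped. A minimal sketch, with a hypothetical helper name:

```rust
// Return the code-block content of a line, if it is indented code.
fn indented_code_content(line: &str) -> Option<&str> {
    let spaces = line.chars().take_while(|&c| c == ' ').count();
    if spaces >= 4 && !line.trim().is_empty() {
        Some(&line[4..]) // strip exactly four spaces, keep the rest
    } else {
        None
    }
}

fn main() {
    assert_eq!(indented_code_content("    a simple"), Some("a simple"));
    // Extra indentation beyond four spaces is kept:
    assert_eq!(indented_code_content("        foo"), Some("    foo"));
    assert_eq!(indented_code_content("   not code"), None);
    println!("ok");
}
```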
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark(">     a\nb"),
+    //     "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<p>b</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> a\n    b"),
+    //     "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+    //     "should not support laziness (2)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> a\n     b"),
+    //     "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+    //     "should not support laziness (3)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> a\n      b"),
+    //     "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+    //     "should not support laziness (4)"
+    // );
+
+    // assert_eq!(
+    //     micromark(">     a\n    b"),
+    //     "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code>b\n</code></pre>",
+    //     "should not support laziness (5)"
+    // );
+
+    // assert_eq!(
+    //     micromark(">     a\n     b"),
+    //     "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code> b\n</code></pre>",
+    //     "should not support laziness (6)"
+    // );
+
+    // assert_eq!(
+    //     micromark(">     a\n      b"),
+    //     "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code>  b\n</code></pre>",
+    //     "should not support laziness (7)"
+    // );
+
+    // To do: extensions.
+    // assert_eq!(
+    //     micromark("    a", {extensions: [{disable: {null: ["codeIndented"]}}]}),
+    //     "<p>a</p>",
+    //     "should support turning off code (indented, 1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> a\n    b", {
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+    //     "should support turning off code (indented, 2)"
+    // );
+
+    // assert_eq!(
+    //     micromark("- a\n    b", {
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<ul>\n<li>a\nb</li>\n</ul>",
+    //     "should support turning off code (indented, 3)"
+    // );
+
+    // assert_eq!(
+    //     micromark("- a\n   - b", {
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<ul>\n<li>a\n<ul>\n<li>b</li>\n</ul>\n</li>\n</ul>",
+    //     "should support turning off code (indented, 4)"
+    // );
+
+    // assert_eq!(
+    //     micromark("- a\n    - b", {
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<ul>\n<li>a\n<ul>\n<li>b</li>\n</ul>\n</li>\n</ul>",
+    //     "should support turning off code (indented, 5)"
+    // );
+
+    // assert_eq!(
+    //     micromark("```\na\n    ```", {
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<pre><code>a\n</code></pre>",
+    //     "should support turning off code (indented, 6)"
+    // );
+
+    // assert_eq!(
+    //     micromark("a <?\n    ?>", {
+    //         allowDangerousHtml: true,
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<p>a <?\n?></p>",
+    //     "should support turning off code (indented, 7)"
+    // );
+
+    // assert_eq!(
+    //     micromark("- Foo\n---", {
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<ul>\n<li>Foo</li>\n</ul>\n<hr />",
+    //     "should support turning off code (indented, 8)"
+    // );
+
+    // assert_eq!(
+    //     micromark("- Foo\n  ---", {
+    //         extensions: [{disable: {null: ["codeIndented"]}}]
+    //     }),
+    //     "<ul>\n<li>\n<h2>Foo</h2>\n</li>\n</ul>",
+    //     "should support turning off code (indented, 9)"
+    // );
+}
diff --git a/tests/heading_atx.rs b/tests/heading_atx.rs
new file mode 100644
index 0000000..b75d058
--- /dev/null
+++ b/tests/heading_atx.rs
@@ -0,0 +1,208 @@
+extern crate micromark;
+use micromark::micromark;
+#[test]
+fn heading_atx() {
+    assert_eq!(
+        micromark("# foo"),
+        "<h1>foo</h1>",
+        "should support a heading w/ rank 1"
+    );
w/ rank 2" + ); + + assert_eq!( + micromark("### foo"), + "<h3>foo</h3>", + "should support a heading w/ rank 3" + ); + + assert_eq!( + micromark("#### foo"), + "<h4>foo</h4>", + "should support a heading w/ rank 4" + ); + + assert_eq!( + micromark("##### foo"), + "<h5>foo</h5>", + "should support a heading w/ rank 5" + ); + + assert_eq!( + micromark("###### foo"), + "<h6>foo</h6>", + "should support a heading w/ rank 6" + ); + + assert_eq!( + micromark("####### foo"), + "<p>####### foo</p>", + "should not support a heading w/ rank 7" + ); + + assert_eq!( + micromark("#5 bolt"), + "<p>#5 bolt</p>", + "should not support a heading for a number sign not followed by whitespace (1)" + ); + + assert_eq!( + micromark("#hashtag"), + "<p>#hashtag</p>", + "should not support a heading for a number sign not followed by whitespace (2)" + ); + + // To do: phrasing. + // assert_eq!( + // micromark("\\## foo"), + // "<p>## foo</p>", + // "should not support a heading for an escaped number sign" + // ); + + // assert_eq!( + // micromark("# foo *bar* \\*baz\\*"), + // "<h1>foo <em>bar</em> *baz*</h1>", + // "should support text content in headings" + // ); + + assert_eq!( + micromark("# foo "), + "<h1>foo</h1>", + "should support arbitrary initial and final whitespace" + ); + + assert_eq!( + micromark(" ### foo"), + "<h3>foo</h3>", + "should support an initial space" + ); + + assert_eq!( + micromark(" ## foo"), + "<h2>foo</h2>", + "should support two initial spaces" + ); + + assert_eq!( + micromark(" # foo"), + "<h1>foo</h1>", + "should support three initial spaces" + ); + + assert_eq!( + micromark(" # foo"), + "<pre><code># foo\n</code></pre>", + "should not support four initial spaces" + ); + + // To do: lazy. + // assert_eq!( + // micromark("foo\n # bar"), + // "<p>foo\n# bar</p>", + // "should not support four initial spaces when interrupting" + // ); + + assert_eq!( + micromark("## foo ##"), + "<h2>foo</h2>", + "should support a closing sequence (1)" + ); + + assert_eq!( + micromark(" ### bar ###"), + "<h3>bar</h3>", + "should support a closing sequence (2)" + ); + + assert_eq!( + micromark("# foo ##################################"), + "<h1>foo</h1>", + "should support a closing sequence w/ an arbitrary number of number signs (1)" + ); + + assert_eq!( + micromark("##### foo ##"), + "<h5>foo</h5>", + "should support a closing sequence w/ an arbitrary number of number signs (2)" + ); + + assert_eq!( + micromark("### foo ### "), + "<h3>foo</h3>", + "should support trailing whitespace after a closing sequence" + ); + + assert_eq!( + micromark("### foo ### b"), + "<h3>foo ### b</h3>", + "should not support other content after a closing sequence" + ); + + assert_eq!( + micromark("# foo#"), + "<h1>foo#</h1>", + "should not support a closing sequence w/o whitespace before it" + ); + + // Phrasing. 
+
+    // Phrasing.
+    // assert_eq!(
+    //     micromark("### foo \\###"),
+    //     "<h3>foo ###</h3>",
+    //     "should not support an “escaped” closing sequence (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("## foo #\\##"),
+    //     "<h2>foo ###</h2>",
+    //     "should not support an “escaped” closing sequence (2)"
+    // );
+
+    // assert_eq!(
+    //     micromark("# foo \\#"),
+    //     "<h1>foo #</h1>",
+    //     "should not support an “escaped” closing sequence (3)"
+    // );
+
+    assert_eq!(
+        micromark("****\n## foo\n****"),
+        "<hr />\n<h2>foo</h2>\n<hr />",
+        "should support atx headings when not surrounded by blank lines"
+    );
+
+    assert_eq!(
+        micromark("Foo bar\n# baz\nBar foo"),
+        "<p>Foo bar</p>\n<h1>baz</h1>\n<p>Bar foo</p>",
+        "should support atx headings interrupting paragraphs"
+    );
+
+    // Line endings.
+    assert_eq!(
+        micromark("## \n#\n### ###"),
+        "<h2></h2>\n<h1></h1>\n<h3></h3>",
+        "should support empty atx headings"
+    );
+
+    // To do: block quote.
+    // assert_eq!(
+    //     micromark("> #\na"),
+    //     "<blockquote>\n<h1></h1>\n</blockquote>\n<p>a</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> a\n#"),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<h1></h1>",
+    //     "should not support laziness (2)"
+    // );
+
+    // Extensions:
+    // assert_eq!(
+    //     micromark("# a", {extensions: [{disable: {null: ["headingAtx"]}}]}),
+    //     "<p># a</p>",
+    //     "should support turning off heading (atx)"
+    // );
+}
diff --git a/tests/html_flow.rs b/tests/html_flow.rs
new file mode 100644
index 0000000..51d1a2a
--- /dev/null
+++ b/tests/html_flow.rs
@@ -0,0 +1,1058 @@
+extern crate micromark;
+use micromark::{micromark, micromark_with_options, CompileOptions};
+
+const DANGER: &CompileOptions = &CompileOptions {
+    allow_dangerous_html: true,
+};
+
+#[test]
+fn html_flow() {
+    assert_eq!(
+        micromark("<!-- asd -->"),
+        "&lt;!-- asd --&gt;",
+        "should not support html (flow) by default"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!-- asd -->", DANGER),
+        "<!-- asd -->",
+        "should support html (flow) w/ `allow_dangerous_html`"
+    );
+
+    // To do: extensions.
+    // assert_eq!(
+    //     micromark_with_options("<x>", {extensions: [{disable: {null: ["htmlFlow"]}}]}),
+    //     "<p>&lt;x&gt;</p>",
+    //     "should support turning off html (flow)"
+    // );
+}
+
+#[test]
+fn html_flow_1_raw() {
+    assert_eq!(
+        micromark_with_options(
+            "<pre language=\"haskell\"><code>
+import Text.HTML.TagSoup
+
+main :: IO ()
+main = print $ parseTags tags
+</code></pre>
+okay",
+            DANGER
+        ),
+        "<pre language=\"haskell\"><code>
+import Text.HTML.TagSoup
+
+main :: IO ()
+main = print $ parseTags tags
+</code></pre>
+<p>okay</p>",
+        "should support raw pre tags (type 1)"
+    );
+
+    assert_eq!(
+        micromark_with_options(
+            "<script type=\"text/javascript\">
+// JavaScript example
+
+document.getElementById(\"demo\").innerHTML = \"Hello JavaScript!\";
+</script>
+okay",
+            DANGER
+        ),
+        "<script type=\"text/javascript\">
+// JavaScript example
+
+document.getElementById(\"demo\").innerHTML = \"Hello JavaScript!\";
+</script>
+<p>okay</p>",
+        "should support raw script tags"
+    );
+
+    assert_eq!(
+        micromark_with_options(
+            "<style
+  type=\"text/css\">
+h1 {color:red;}
+
+p {color:blue;}
+</style>
+okay",
+            DANGER
+        ),
+        "<style
+  type=\"text/css\">
+h1 {color:red;}
+
+p {color:blue;}
+</style>
+<p>okay</p>",
+        "should support raw style tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("<style\n  type=\"text/css\">\n\nfoo", DANGER),
+        "<style\n  type=\"text/css\">\n\nfoo",
+        "should support raw tags w/o ending"
+    );
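The "raw" (type 1) kind above is keyed off the tag name: per CommonMark 0.30 those names are `pre`, `script`, `style`, and `textarea`, and inside them even blank lines do not end the block. A small sketch of that check (illustrative, not the crate's code):

```rust
// Is `name` one of the tag names that opens raw (type 1) HTML flow?
fn is_raw_tag(name: &str) -> bool {
    matches!(
        name.to_ascii_lowercase().as_str(),
        "pre" | "script" | "style" | "textarea"
    )
}

fn main() {
    assert!(is_raw_tag("script"));
    assert!(is_raw_tag("STYLE")); // tag names are case-insensitive
    assert!(!is_raw_tag("div")); // `div` is "basic" (type 6) instead
    println!("ok");
}
```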
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<style>p{color:red;}</style>\n*foo*", DANGER),
+    //     "<style>p{color:red;}</style>\n<p><em>foo</em></p>",
+    //     "should support raw tags w/ start and end on a single line"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<script>\nfoo\n</script>1. *bar*", DANGER),
+        "<script>\nfoo\n</script>1. *bar*",
+        "should support raw tags w/ more data on ending line"
+    );
+
+    assert_eq!(
+        micromark_with_options("<script", DANGER),
+        "<script",
+        "should support an eof directly after a raw tag name"
+    );
+
+    // To do: paragraphs.
+    // assert_eq!(
+    //     micromark_with_options("</script\nmore", DANGER),
+    //     "<p>&lt;/script\nmore</p>",
+    //     "should not support a raw closing tag"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<script/", DANGER),
+        "<p>&lt;script/</p>",
+        "should not support an eof after a self-closing slash"
+    );
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<script/\n*asd*", DANGER),
+    //     "<p>&lt;script/\n<em>asd</em></p>",
+    //     "should not support a line ending after a self-closing slash"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<script/>", DANGER),
+        "<script/>",
+        "should support an eof after a self-closing tag"
+    );
+
+    assert_eq!(
+        micromark_with_options("<script/>\na", DANGER),
+        "<script/>\na",
+        "should support a line ending after a self-closing tag"
+    );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<script/>a", DANGER),
+    //     "<p><script/>a</p>",
+    //     "should not support other characters after a self-closing tag"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<script>a", DANGER),
+        "<script>a",
+        "should support other characters after a raw opening tag"
+    );
+
+    // Extra.
+    assert_eq!(
+        micromark_with_options("Foo\n<script", DANGER),
+        "<p>Foo</p>\n<script",
+        "should support interrupting paragraphs w/ raw tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("<script>\n  \n  \n</script>", DANGER),
+        "<script>\n  \n  \n</script>",
+        "should support blank lines in raw"
+    );
+
+    // To do: block quote.
+    // assert_eq!(
+    //     micromark_with_options("> <script>\na", DANGER),
+    //     "<blockquote>\n<script>\n</blockquote>\n<p>a</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> a\n<script>", DANGER),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<script>",
+    //     "should not support laziness (2)"
+    // );
+}
+
+#[test]
+fn html_flow_2_comment() {
+    assert_eq!(
+        micromark_with_options("<!-- Foo\n\nbar\n   baz -->\nokay", DANGER),
+        "<!-- Foo\n\nbar\n   baz -->\n<p>okay</p>",
+        "should support comments (type 2)"
+    );
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<!-- foo -->*bar*\n*baz*", DANGER),
+    //     "<!-- foo -->*bar*\n<p><em>baz</em></p>",
+    //     "should support comments w/ start and end on a single line"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<!-asd-->", DANGER),
+        "<p>&lt;!-asd--&gt;</p>",
+        "should not support a single dash to start comments"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!-->", DANGER),
+        "<!-->",
+        "should support comments where the start dashes are the end dashes (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!--->", DANGER),
+        "<!--->",
+        "should support comments where the start dashes are the end dashes (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!---->", DANGER),
+        "<!---->",
+        "should support empty comments"
+    );
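The dash cases that follow all come down to one scan: after the `<!--` opener, the comment ends at the first `-->`; a lone `->` does not close it, and extra dashes before the `>` are fine. A sketch of that scan, as an illustration only:

```rust
// Find where a comment's content ends, given the text after `<!--`.
fn comment_end(rest: &str) -> Option<usize> {
    rest.find("-->").map(|at| at + "-->".len())
}

fn main() {
    assert_eq!(comment_end("\n->\n\""), None); // `->` alone: still open
    assert_eq!(comment_end("\n-->\n\""), Some(4)); // closed by `-->`
    assert_eq!(comment_end("\n--->"), Some(5)); // extra dashes close too
    println!("ok");
}
```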
+
+    // If the `"` is encoded, we’re in text. If it remains, we’re in HTML.
+    assert_eq!(
+        micromark_with_options("<!--\n->\n\"", DANGER),
+        "<!--\n->\n\"",
+        "should not end a comment at one dash (`->`)"
+    );
+    assert_eq!(
+        micromark_with_options("<!--\n-->\n\"", DANGER),
+        "<!--\n-->\n<p>&quot;</p>",
+        "should end a comment at two dashes (`-->`)"
+    );
+    assert_eq!(
+        micromark_with_options("<!--\n--->\n\"", DANGER),
+        "<!--\n--->\n<p>&quot;</p>",
+        "should end a comment at three dashes (`--->`)"
+    );
+    assert_eq!(
+        micromark_with_options("<!--\n---->\n\"", DANGER),
+        "<!--\n---->\n<p>&quot;</p>",
+        "should end a comment at four dashes (`---->`)"
+    );
+
+    assert_eq!(
+        micromark_with_options("  <!-- foo -->", DANGER),
+        "  <!-- foo -->",
+        "should support comments w/ indent"
+    );
+
+    assert_eq!(
+        micromark_with_options("    <!-- foo -->", DANGER),
+        "<pre><code>&lt;!-- foo --&gt;\n</code></pre>",
+        "should not support comments w/ a 4 character indent"
+    );
+
+    // Extra.
+    assert_eq!(
+        micromark_with_options("Foo\n<!--", DANGER),
+        "<p>Foo</p>\n<!--",
+        "should support interrupting paragraphs w/ comments"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!--\n  \n  \n-->", DANGER),
+        "<!--\n  \n  \n-->",
+        "should support blank lines in comments"
+    );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark_with_options("> <!--\na", DANGER),
+    //     "<blockquote>\n<!--\n</blockquote>\n<p>a</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> a\n<!--", DANGER),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<!--",
+    //     "should not support laziness (2)"
+    // );
+}
+
+#[test]
+fn html_flow_3_instruction() {
+    assert_eq!(
+        micromark_with_options("<?php\n\n  echo \">\";\n\n?>\nokay", DANGER),
+        "<?php\n\n  echo \">\";\n\n?>\n<p>okay</p>",
+        "should support instructions (type 3)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<?>", DANGER),
+        "<?>",
+        "should support empty instructions where the `?` is part of both the start and the end"
+    );
+
+    assert_eq!(
+        micromark_with_options("<??>", DANGER),
+        "<??>",
+        "should support empty instructions"
+    );
+
+    // Extra.
+    assert_eq!(
+        micromark_with_options("Foo\n<?", DANGER),
+        "<p>Foo</p>\n<?",
+        "should support interrupting paragraphs w/ instructions"
+    );
+
+    assert_eq!(
+        micromark_with_options("<?\n  \n  \n?>", DANGER),
+        "<?\n  \n  \n?>",
+        "should support blank lines in instructions"
+    );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark_with_options("> <?\na", DANGER),
+    //     "<blockquote>\n<?\n</blockquote>\n<p>a</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> a\n<?", DANGER),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<?",
+    //     "should not support laziness (2)"
+    // );
+}
+
+#[test]
+fn html_flow_4_declaration() {
+    assert_eq!(
+        micromark_with_options("<!DOCTYPE html>", DANGER),
+        "<!DOCTYPE html>",
+        "should support declarations (type 4)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!123>", DANGER),
+        "<p>&lt;!123&gt;</p>",
+        "should not support declarations that start w/o an alpha"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!>", DANGER),
+        "<p>&lt;!&gt;</p>",
+        "should not support declarations w/o an identifier"
+    );
+
+    assert_eq!(
+        micromark_with_options("<!a>", DANGER),
+        "<!a>",
+        "should support declarations w/ a single alpha as identifier"
+    );
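For orientation across these test groups: CommonMark § 4.6 defines seven kinds of HTML (flow), dispatched on what follows the `<`. A simplified sketch of that dispatch for the bang- and question-mark-based kinds (the tag-name-based kinds 1, 6, and 7 need more context); this is illustrative, not the tokenizer's code:

```rust
#[derive(Debug, PartialEq)]
enum Kind {
    Comment,     // 2: `<!--`
    Instruction, // 3: `<?`
    Declaration, // 4: `<!` followed by an ASCII alpha
    Cdata,       // 5: `<![CDATA[`
}

// Classify HTML flow based on the text after `<`.
fn kind(after_lt: &str) -> Option<Kind> {
    if after_lt.starts_with("![CDATA[") {
        Some(Kind::Cdata)
    } else if after_lt.starts_with("!--") {
        Some(Kind::Comment)
    } else if after_lt.starts_with('?') {
        Some(Kind::Instruction)
    } else if after_lt.starts_with('!') {
        after_lt[1..]
            .chars()
            .next()
            .filter(char::is_ascii_alphabetic)
            .map(|_| Kind::Declaration)
    } else {
        None // raw (1), basic (6), or complete (7): inspect the tag name
    }
}

fn main() {
    assert_eq!(kind("!-- asd -->"), Some(Kind::Comment));
    assert_eq!(kind("?php"), Some(Kind::Instruction));
    assert_eq!(kind("!DOCTYPE html"), Some(Kind::Declaration));
    assert_eq!(kind("![CDATA[...]]>"), Some(Kind::Cdata));
    assert_eq!(kind("!123>"), None); // no alpha: not a declaration
    println!("ok");
}
```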
+
+    // Extra.
+    assert_eq!(
+        micromark_with_options("Foo\n<!d", DANGER),
+        "<p>Foo</p>\n<!d",
+        "should support interrupting paragraphs w/ declarations"
+    );
+
+    // Note about the lowercase letter:
+    // <https://github.com/commonmark/commonmark-spec/pull/621>
+    assert_eq!(
+        micromark_with_options("<!a\n  \n  \n>", DANGER),
+        "<!a\n  \n  \n>",
+        "should support blank lines in declarations"
+    );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark_with_options("> <!a\nb", DANGER),
+    //     "<blockquote>\n<!a\n</blockquote>\n<p>b</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> a\n<!b", DANGER),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<!b",
+    //     "should not support laziness (2)"
+    // );
+}
+
+#[test]
+fn html_flow_5_cdata() {
+    assert_eq!(
+        micromark_with_options(
+            "<![CDATA[\nfunction matchwo(a,b)\n{\n  if (a < b && a < 0) then {\n    return 1;\n\n  } else {\n\n    return 0;\n  }\n}\n]]>\nokay",
+            DANGER
+        ),
+        "<![CDATA[\nfunction matchwo(a,b)\n{\n  if (a < b && a < 0) then {\n    return 1;\n\n  } else {\n\n    return 0;\n  }\n}\n]]>\n<p>okay</p>",
+        "should support cdata (type 5)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<![CDATA[]]>", DANGER),
+        "<![CDATA[]]>",
+        "should support empty cdata"
+    );
+
+    assert_eq!(
+        micromark_with_options("<![CDATA]]>", DANGER),
+        "<p>&lt;![CDATA]]&gt;</p>",
+        "should not support cdata w/ a missing `[`"
+    );
+
+    assert_eq!(
+        micromark_with_options("<![CDATA[]]]>", DANGER),
+        "<![CDATA[]]]>",
+        "should support cdata w/ a single `]` as content"
+    );
+
+    // Extra.
+    assert_eq!(
+        micromark_with_options("Foo\n<![CDATA[", DANGER),
+        "<p>Foo</p>\n<![CDATA[",
+        "should support interrupting paragraphs w/ cdata"
+    );
+
+    // Note: cmjs parses this differently.
+    // See: <https://github.com/commonmark/commonmark.js/issues/193>
+    assert_eq!(
+        micromark_with_options("<![cdata[]]>", DANGER),
+        "<p>&lt;![cdata[]]&gt;</p>",
+        "should not support lowercase cdata"
+    );
+
+    assert_eq!(
+        micromark_with_options("<![CDATA[\n  \n  \n]]>", DANGER),
+        "<![CDATA[\n  \n  \n]]>",
+        "should support blank lines in cdata"
+    );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark_with_options("> <![CDATA[\na", DANGER),
+    //     "<blockquote>\n<![CDATA[\n</blockquote>\n<p>a</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> a\n<![CDATA[", DANGER),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<![CDATA[",
+    //     "should not support laziness (2)"
+    // );
+}
+
+#[test]
+fn html_flow_6_basic() {
+    // To do: phrasing, paragraphs, etc.
+    // assert_eq!(
+    //     micromark_with_options(
+    //         "<table><tr><td>\n<pre>\n**Hello**,\n\n_world_.\n</pre>\n</td></tr></table>",
+    //         DANGER
+    //     ),
+    //     "<table><tr><td>\n<pre>\n**Hello**,\n<p><em>world</em>.\n</pre></p>\n</td></tr></table>",
+    //     "should support html (basic)"
+    // );
+
+    // To do: paragraphs.
+    // assert_eq!(
+    //     micromark_with_options(
+    //         "<table>
+
+    //   <tr>
+
+    //     <td>
+    //            hi
+    //     </td>
+
+    //   </tr>
+
+    // </table>
+
+    // okay.",
+    //         DANGER
+    //     ),
+    //     "<table>
+    //   <tr>
+    //     <td>
+    //            hi
+    //     </td>
+    //   </tr>
+    // </table>
+    // <p>okay.</p>",
+    //     "should support html of type 6 (1)"
+    // );
+
+    assert_eq!(
+        micromark_with_options("  <div>\n  *hello*\n         <foo><a>", DANGER),
+        "  <div>\n  *hello*\n         <foo><a>",
+        "should support html of type 6 (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("</div>\n*foo*", DANGER),
+        "</div>\n*foo*",
+        "should support html starting w/ a closing tag"
+    );
+
+    // To do: phrasing
+    // assert_eq!(
+    //     micromark_with_options("<DIV CLASS=\"foo\">\n\n*Markdown*\n\n</DIV>", DANGER),
+    //     "<DIV CLASS=\"foo\">\n<p><em>Markdown</em></p>\n</DIV>",
+    //     "should support html w/ markdown in between"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<div id=\"foo\"\n  class=\"bar\">\n</div>", DANGER),
+        "<div id=\"foo\"\n  class=\"bar\">\n</div>",
+        "should support html w/ line endings (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div id=\"foo\" class=\"bar\n  baz\">\n</div>", DANGER),
+        "<div id=\"foo\" class=\"bar\n  baz\">\n</div>",
+        "should support html w/ line endings (2)"
+    );
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<div>\n*foo*\n\n*bar*", DANGER),
+    //     "<div>\n*foo*\n<p><em>bar</em></p>",
+    //     "should support an unclosed html element"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<div id=\"foo\"\n*hi*", DANGER),
+        "<div id=\"foo\"\n*hi*",
+        "should support garbage html (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div class\nfoo", DANGER),
+        "<div class\nfoo",
+        "should support garbage html (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div *???-&&&-<---\n*foo*", DANGER),
+        "<div *???-&&&-<---\n*foo*",
+        "should support garbage html (3)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div><a href=\"bar\">*foo*</a></div>", DANGER),
+        "<div><a href=\"bar\">*foo*</a></div>",
+        "should support other tags in the opening (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<table><tr><td>\nfoo\n</td></tr></table>", DANGER),
+        "<table><tr><td>\nfoo\n</td></tr></table>",
+        "should support other tags in the opening (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div></div>\n``` c\nint x = 33;\n```", DANGER),
+        "<div></div>\n``` c\nint x = 33;\n```",
+        "should include everything till a blank line"
+    );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark_with_options("> <div>\n> foo\n\nbar", DANGER),
+    //     "<blockquote>\n<div>\nfoo\n</blockquote>\n<p>bar</p>",
+    //     "should support basic tags w/o ending in containers (1)"
+    // );
+
+    // To do: list.
+    // assert_eq!(
+    //     micromark_with_options("- <div>\n- foo", DANGER),
+    //     "<ul>\n<li>\n<div>\n</li>\n<li>foo</li>\n</ul>",
+    //     "should support basic tags w/o ending in containers (2)"
+    // );
+
+    assert_eq!(
+        micromark_with_options("  <div>", DANGER),
+        "  <div>",
+        "should support basic tags w/ indent"
+    );
+
+    assert_eq!(
+        micromark_with_options("    <div>", DANGER),
+        "<pre><code>&lt;div&gt;\n</code></pre>",
+        "should not support basic tags w/ a 4 character indent"
+    );
+
+    assert_eq!(
+        micromark_with_options("Foo\n<div>\nbar\n</div>", DANGER),
+        "<p>Foo</p>\n<div>\nbar\n</div>",
+        "should support interrupting paragraphs w/ basic tags"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div>\nbar\n</div>\n*foo*", DANGER),
+        "<div>\nbar\n</div>\n*foo*",
+        "should require a blank line to end"
+    );
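The "require a blank line to end" behavior above is the defining property of basic (type 6) HTML flow: the block swallows every following line until a blank one, which is why `*foo*` right after `</div>` stays raw. A minimal sketch of that split, under that one assumption (illustrative, not the crate's code):

```rust
// Split lines into the html-flow part and the remainder at the first
// blank (whitespace-only) line.
fn html_flow_lines<'a>(lines: &'a [&'a str]) -> (&'a [&'a str], &'a [&'a str]) {
    let end = lines
        .iter()
        .position(|line| line.trim().is_empty())
        .unwrap_or(lines.len());
    lines.split_at(end)
}

fn main() {
    // No blank line: everything belongs to the html block.
    let doc = ["<div>", "*Emphasized* text.", "</div>"];
    let (html, rest) = html_flow_lines(&doc);
    assert_eq!(html.len(), 3);
    assert!(rest.is_empty());

    // A blank line ends the block; the rest is markdown again.
    let doc2 = ["<div>", "", "*Emphasized* text."];
    let (html2, rest2) = html_flow_lines(&doc2);
    assert_eq!(html2, ["<div>"]);
    assert_eq!(rest2, ["", "*Emphasized* text."]);
    println!("ok");
}
```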
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<div>\n\n*Emphasized* text.\n\n</div>", DANGER),
+    //     "<div>\n<p><em>Emphasized</em> text.</p>\n</div>",
+    //     "should support interleaving w/ blank lines"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<div>\n*Emphasized* text.\n</div>", DANGER),
+        "<div>\n*Emphasized* text.\n</div>",
+        "should not support interleaving w/o blank lines"
+    );
+
+    assert_eq!(
+        micromark_with_options(
+            "<table>\n\n<tr>\n\n<td>\nHi\n</td>\n\n</tr>\n\n</table>",
+            DANGER
+        ),
+        "<table>\n<tr>\n<td>\nHi\n</td>\n</tr>\n</table>",
+        "should support blank lines between adjacent html"
+    );
+
+    assert_eq!(
+        micromark_with_options(
+            "<table>
+
+  <tr>
+
+    <td>
+      Hi
+    </td>
+
+  </tr>
+
+</table>",
+            DANGER
+        ),
+        "<table>
+  <tr>
+<pre><code>&lt;td&gt;
+  Hi
+&lt;/td&gt;
+</code></pre>
+  </tr>
+</table>",
+        "should not support indented, blank-line delimited, adjacent html"
+    );
+
+    assert_eq!(
+        micromark_with_options("</1>", DANGER),
+        "<p>&lt;/1&gt;</p>",
+        "should not support basic tags w/ an incorrect name start character"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div", DANGER),
+        "<div",
+        "should support an eof directly after a basic tag name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div\n", DANGER),
+        "<div\n",
+        "should support a line ending directly after a tag name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div ", DANGER),
+        "<div ",
+        "should support an eof after a space directly after a tag name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div/", DANGER),
+        "<p>&lt;div/</p>",
+        "should not support an eof directly after a self-closing slash"
+    );
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<div/\n*asd*", DANGER),
+    //     "<p>&lt;div/\n<em>asd</em></p>",
+    //     "should not support a line ending after a self-closing slash"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<div/>", DANGER),
+        "<div/>",
+        "should support an eof after a self-closing tag"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div/>\na", DANGER),
+        "<div/>\na",
+        "should support a line ending after a self-closing tag"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div/>a", DANGER),
+        "<div/>a",
+        "should support another character after a self-closing tag"
+    );
+
+    assert_eq!(
+        micromark_with_options("<div>a", DANGER),
+        "<div>a",
+        "should support another character after a basic opening tag"
+    );
+
+    // Extra.
+    assert_eq!(
+        micromark_with_options("Foo\n<div/>", DANGER),
+        "<p>Foo</p>\n<div/>",
+        "should support interrupting paragraphs w/ self-closing basic tags"
+    );
+
+    // To do: block quote.
+    // assert_eq!(
+    //     micromark_with_options("<div\n  \n  \n>", DANGER),
+    //     "<div\n<blockquote>\n</blockquote>",
+    //     "should not support blank lines in basic"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> <div\na", DANGER),
+    //     "<blockquote>\n<div\n</blockquote>\n<p>a</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> a\n<div", DANGER),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<div",
+    //     "should not support laziness (2)"
+    // );
+}
+
+#[test]
+fn html_flow_7_complete() {
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<a href=\"foo\">\n*bar*\n</a>", DANGER),
+    //     "<a href=\"foo\">\n*bar*\n</a>",
+    //     "should support complete tags (type 7)"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<Warning>\n*bar*\n</Warning>", DANGER),
+        "<Warning>\n*bar*\n</Warning>",
+        "should support non-html tag names"
+    );
+
+    assert_eq!(
+        micromark_with_options("<i class=\"foo\">\n*bar*\n</i>", DANGER),
+        "<i class=\"foo\">\n*bar*\n</i>",
+        "should support non-“block” html tag names (1)"
+    );
+
+    assert_eq!(
+        micromark_with_options("<del>\n*foo*\n</del>", DANGER),
+        "<del>\n*foo*\n</del>",
+        "should support non-“block” html tag names (2)"
+    );
+
+    assert_eq!(
+        micromark_with_options("</ins>\n*bar*", DANGER),
+        "</ins>\n*bar*",
+        "should support closing tags"
+    );
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<del>\n\n*foo*\n\n</del>", DANGER),
+    //     "<del>\n<p><em>foo</em></p>\n</del>",
+    //     "should support interleaving"
+    // );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<del>*foo*</del>", DANGER),
+    //     "<p><del><em>foo</em></del></p>",
+    //     "should not support interleaving w/o blank lines"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<div>\n \nasd", DANGER),
+        "<div>\n<p>asd</p>",
+        "should support interleaving w/ whitespace-only blank lines"
+    );
+
+    // To do: interrupting.
+    // assert_eq!(
+    //     micromark_with_options("Foo\n<a href=\"bar\">\nbaz", DANGER),
+    //     "<p>Foo\n<a href=\"bar\">\nbaz</p>",
+    //     "should not support interrupting paragraphs w/ complete tags"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<x", DANGER),
+        "<p>&lt;x</p>",
+        "should not support an eof directly after a tag name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x/", DANGER),
+        "<p>&lt;x/</p>",
+        "should not support an eof directly after a self-closing slash"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x\n", DANGER),
+        "<p>&lt;x</p>\n",
+        "should not support a line ending directly after a tag name"
+    );
+
+    // To do: paragraphs (trailing whitespace).
+    // assert_eq!(
+    //     micromark_with_options("<x ", DANGER),
+    //     "<p>&lt;x</p>",
+    //     "should not support an eof after a space directly after a tag name"
+    // );
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark_with_options("<x/\n*asd*", DANGER),
+    //     "<p>&lt;x/\n<em>asd</em></p>",
+    //     "should not support a line ending after a self-closing slash"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<x/>", DANGER),
+        "<x/>",
+        "should support an eof after a self-closing tag"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x/>\na", DANGER),
+        "<x/>\na",
+        "should support a line ending after a self-closing tag"
+    );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x/>a", DANGER),
+    //     "<p><x/>a</p>",
+    //     "should not support another character after a self-closing tag"
+    // );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x>a", DANGER),
+    //     "<p><x>a</p>",
+    //     "should not support another character after an opening tag"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<x y>", DANGER),
+        "<x y>",
+        "should support boolean attributes in a complete tag"
+    );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x\ny>", DANGER),
+    //     "<p><x\ny></p>",
+    //     "should not support a line ending before an attribute name"
+    // );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x\n  y>", DANGER),
+    //     "<p><x\ny></p>",
+    //     "should not support a line ending w/ whitespace before an attribute name"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<x\n  \ny>", DANGER),
+        "<p>&lt;x</p>\n<p>y&gt;</p>",
+        "should not support a line ending w/ whitespace and another line ending before an attribute name"
+    );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x y\nz>", DANGER),
+    //     "<p><x y\nz></p>",
+    //     "should not support a line ending between attribute names"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<x y z>", DANGER),
+        "<x y z>",
+        "should support whitespace between attribute names"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x:y>", DANGER),
+        "<p>&lt;x:y&gt;</p>",
+        "should not support a colon in a tag name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x_y>", DANGER),
+        "<p>&lt;x_y&gt;</p>",
+        "should not support an underscore in a tag name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x.y>", DANGER),
+        "<p>&lt;x.y&gt;</p>",
+        "should not support a dot in a tag name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x :y>", DANGER),
+        "<x :y>",
+        "should support a colon to start an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x _y>", DANGER),
+        "<x _y>",
+        "should support an underscore to start an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x .y>", DANGER),
+        "<p>&lt;x .y&gt;</p>",
+        "should not support a dot to start an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y:>", DANGER),
+        "<x y:>",
+        "should support a colon to end an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y_>", DANGER),
+        "<x y_>",
+        "should support an underscore to end an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y.>", DANGER),
+        "<x y.>",
+        "should support a dot to end an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y123>", DANGER),
+        "<x y123>",
+        "should support numbers to end an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x data->", DANGER),
+        "<x data->",
+        "should support a dash to end an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=>", DANGER),
+        "<p>&lt;x y=&gt;</p>",
+        "should not support an initializer w/o a value"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y==>", DANGER),
+        "<p>&lt;x y==&gt;</p>",
+        "should not support an equals to as an initializer"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=z>", DANGER),
+        "<x y=z>",
+        "should support a single character as an unquoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=\"\">", DANGER),
+        "<x y=\"\">",
+        "should support an empty double quoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=''>", DANGER),
+        "<x y=''>",
+        "should support an empty single quoted attribute value"
+    );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x y=\"\n\">", DANGER),
+    //     "<p><x y=\"\n\"></p>",
+    //     "should not support a line ending in a double quoted attribute value"
+    // );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x y='\n'>", DANGER),
+    //     "<p><x y='\n'></p>",
+    //     "should not support a line ending in a single quoted attribute value"
+    // );
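The quoted-value cases above follow one rule: a value may be wrapped in single or double quotes, and the closing quote must match the opening one. A sketch of that scan, with a hypothetical helper name (not the crate's API):

```rust
// Extract a quoted attribute value from the start of `input`.
fn quoted_value(input: &str) -> Option<&str> {
    let quote = input.chars().next()?;
    if quote != '"' && quote != '\'' {
        return None;
    }
    let rest = &input[1..];
    let end = rest.find(quote)?; // closing quote must match
    Some(&rest[..end])
}

fn main() {
    assert_eq!(quoted_value("\"bar\""), Some("bar"));
    assert_eq!(quoted_value("''"), Some("")); // empty single quoted
    assert_eq!(quoted_value("\"bar'"), None); // mismatched quotes
    println!("ok");
}
```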
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<w x=y\nz>", DANGER),
+    //     "<p><w x=y\nz></p>",
+    //     "should not support a line ending in/after an unquoted attribute value"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<w x=y\"z>", DANGER),
+        "<p>&lt;w x=y&quot;z&gt;</p>",
+        "should not support a double quote in/after an unquoted attribute value"
+    );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<w x=y'z>", DANGER),
+    //     "<p>&lt;w x=y'z&gt;</p>",
+    //     "should not support a single quote in/after an unquoted attribute value"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<x y=\"\"z>", DANGER),
+        "<p>&lt;x y=&quot;&quot;z&gt;</p>",
+        "should not support an attribute after a double quoted attribute value"
+    );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark_with_options("<x>\n  \n  \n>", DANGER),
+    //     "<x>\n<blockquote>\n</blockquote>",
+    //     "should not support blank lines in complete"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> <a>\n*bar*", DANGER),
+    //     "<blockquote>\n<a>\n</blockquote>\n<p><em>bar</em></p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark_with_options("> a\n<a>", DANGER),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<a>",
+    //     "should not support laziness (2)"
+    // );
+}
diff --git a/tests/lib.rs b/tests/lib.rs
new file mode 100644
index 0000000..18fcef2
--- /dev/null
+++ b/tests/lib.rs
@@ -0,0 +1,8 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn basic() {
+    assert_eq!(micromark("asd"), "<p>asd</p>", "should work");
+    assert_eq!(micromark("1 < 3"), "<p>1 &lt; 3</p>", "should encode");
+}
diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs
new file mode 100644
index 0000000..833fa6f
--- /dev/null
+++ b/tests/thematic_break.rs
@@ -0,0 +1,181 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn thematic_break() {
+    assert_eq!(
+        micromark("***\n---\n___"),
+        "<hr />\n<hr />\n<hr />",
+        "should support thematic breaks w/ asterisks, dashes, and underscores"
+    );
+
+    assert_eq!(
+        micromark("+++"),
+        "<p>+++</p>",
+        "should not support thematic breaks w/ plusses"
+    );
+
+    assert_eq!(
+        micromark("==="),
+        "<p>===</p>",
+        "should not support thematic breaks w/ equals"
+    );
+
+    assert_eq!(
+        micromark("--"),
+        "<p>--</p>",
+        "should not support thematic breaks w/ two dashes"
+    );
+
+    assert_eq!(
+        micromark("**"),
+        "<p>**</p>",
+        "should not support thematic breaks w/ two asterisks"
+    );
+
+    assert_eq!(
+        micromark("__"),
+        "<p>__</p>",
+        "should not support thematic breaks w/ two underscores"
+    );
+
+    assert_eq!(
+        micromark(" ***"),
+        "<hr />",
+        "should support thematic breaks w/ 1 space"
+    );
+
+    assert_eq!(
+        micromark("  ***"),
+        "<hr />",
+        "should support thematic breaks w/ 2 spaces"
+    );
+
+    assert_eq!(
+        micromark("   ***"),
+        "<hr />",
+        "should support thematic breaks w/ 3 spaces"
+    );
+
+    assert_eq!(
+        micromark("    ***"),
+        "<pre><code>***\n</code></pre>",
+        "should not support thematic breaks w/ 4 spaces"
+    );
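The rule these cases (and the ones that follow) pin down: at most three spaces of indent, then three or more of the same marker (`*`, `-`, or `_`), with spaces allowed in between and nothing else on the line. An illustrative sketch of that check, not the crate's tokenizer:

```rust
// Is `line` a thematic break?
fn is_thematic_break(line: &str) -> bool {
    let rest = line.trim_start_matches(' ');
    let indent = line.len() - rest.len();
    let marker = match rest.chars().next() {
        Some(c @ ('*' | '-' | '_')) => c,
        _ => return false,
    };
    let mut count = 0;
    for char in rest.chars() {
        match char {
            c if c == marker => count += 1,
            ' ' | '\t' => {} // interior whitespace is fine
            _ => return false,
        }
    }
    indent < 4 && count >= 3
}

fn main() {
    assert!(is_thematic_break("***"));
    assert!(is_thematic_break(" - - -"));
    assert!(is_thematic_break("_____________________________________"));
    assert!(!is_thematic_break("    ***")); // 4 spaces: indented code
    assert!(!is_thematic_break("--")); // two markers are not enough
    assert!(!is_thematic_break("---a---")); // other characters: no break
    println!("ok");
}
```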
+
+    // To do: paragraphs.
+    // assert_eq!(
+    //     micromark("Foo\n    ***"),
+    //     "<p>Foo\n***</p>",
+    //     "should not support thematic breaks w/ 4 spaces as paragraph continuation"
+    // );
+
+    assert_eq!(
+        micromark("_____________________________________"),
+        "<hr />",
+        "should support thematic breaks w/ many markers"
+    );
+
+    assert_eq!(
+        micromark(" - - -"),
+        "<hr />",
+        "should support thematic breaks w/ spaces (1)"
+    );
+
+    assert_eq!(
+        micromark(" **  * ** * ** * **"),
+        "<hr />",
+        "should support thematic breaks w/ spaces (2)"
+    );
+
+    assert_eq!(
+        micromark("-     -      -      -"),
+        "<hr />",
+        "should support thematic breaks w/ spaces (3)"
+    );
+
+    assert_eq!(
+        micromark("- - - -    "),
+        "<hr />",
+        "should support thematic breaks w/ trailing spaces"
+    );
+
+    assert_eq!(
+        micromark("_ _ _ _ a"),
+        "<p>_ _ _ _ a</p>",
+        "should not support thematic breaks w/ other characters (1)"
+    );
+
+    assert_eq!(
+        micromark("a------"),
+        "<p>a------</p>",
+        "should not support thematic breaks w/ other characters (2)"
+    );
+
+    assert_eq!(
+        micromark("---a---"),
+        "<p>---a---</p>",
+        "should not support thematic breaks w/ other characters (3)"
+    );
+
+    // To do: phrasing.
+    // assert_eq!(
+    //     micromark(" *-*"),
+    //     "<p><em>-</em></p>",
+    //     "should not support thematic breaks w/ mixed markers"
+    // );
+
+    // To do: lists.
+    // assert_eq!(
+    //     micromark("- foo\n***\n- bar"),
+    //     "<ul>\n<li>foo</li>\n</ul>\n<hr />\n<ul>\n<li>bar</li>\n</ul>",
+    //     "should support thematic breaks mixed w/ lists (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("* Foo\n* * *\n* Bar"),
+    //     "<ul>\n<li>Foo</li>\n</ul>\n<hr />\n<ul>\n<li>Bar</li>\n</ul>",
+    //     "should support thematic breaks mixed w/ lists (2)"
+    // );
+
+    // To do: paragraph.
+    // assert_eq!(
+    //     micromark("Foo\n***\nbar"),
+    //     "<p>Foo</p>\n<hr />\n<p>bar</p>",
+    //     "should support thematic breaks interrupting paragraphs"
+    // );
+
+    // To do: setext.
+    // assert_eq!(
+    //     micromark("Foo\n---\nbar"),
+    //     "<h2>Foo</h2>\n<p>bar</p>",
+    //     "should not support thematic breaks w/ dashes interrupting paragraphs (setext heading)"
+    // );
+
+    // To do: list.
+    // assert_eq!(
+    //     micromark("- Foo\n- * * *"),
+    //     "<ul>\n<li>Foo</li>\n<li>\n<hr />\n</li>\n</ul>",
+    //     "should support thematic breaks in lists"
+    // );
+
+    // To do: blockquote.
+    // assert_eq!(
+    //     micromark("> ---\na"),
+    //     "<blockquote>\n<hr />\n</blockquote>\n<p>a</p>",
+    //     "should not support laziness (1)"
+    // );
+
+    // assert_eq!(
+    //     micromark("> a\n---"),
+    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<hr />",
+    //     "should not support laziness (2)"
+    // );
+
+    // To do: extensions.
+    // assert_eq!(
+    //     micromark("***", {extensions: [{disable: {null: ["thematicBreak"]}}]}),
+    //     "<p>***</p>",
+    //     "should support turning off thematic breaks"
+    // );
+}