From 4c06c8554c35887f8f5147783953b2b7e7c2327f Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 8 Jun 2022 15:52:16 +0200 Subject: . --- .editorconfig | 12 + .github/workflows/main.yml | 24 + .gitignore | 5 + Cargo.toml | 18 + Untitled.txt | 1 + examples/lib.rs | 22 + funding.yml | 1 + license | 22 + readme.md | 183 +++ src/compiler.rs | 367 +++++ src/constant.rs | 2561 ++++++++++++++++++++++++++++++++++ src/construct/blank_line.rs | 61 + src/construct/character_escape.rs | 69 + src/construct/character_reference.rs | 237 ++++ src/construct/code_fenced.rs | 581 ++++++++ src/construct/code_indented.rs | 190 +++ src/construct/heading_atx.rs | 175 +++ src/construct/html_flow.rs | 1068 ++++++++++++++ src/construct/mod.rs | 11 + src/construct/partial_whitespace.rs | 66 + src/construct/thematic_break.rs | 137 ++ src/content/flow.rs | 258 ++++ src/content/mod.rs | 4 + src/content/string.rs | 120 ++ src/lib.rs | 52 + src/parser.rs | 14 + src/tokenizer.rs | 580 ++++++++ src/util.rs | 241 ++++ tests/code_fenced.rs | 266 ++++ tests/code_indented.rs | 196 +++ tests/heading_atx.rs | 208 +++ tests/html_flow.rs | 1058 ++++++++++++++ tests/lib.rs | 8 + tests/thematic_break.rs | 181 +++ 34 files changed, 8997 insertions(+) create mode 100644 .editorconfig create mode 100644 .github/workflows/main.yml create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 Untitled.txt create mode 100644 examples/lib.rs create mode 100644 funding.yml create mode 100644 license create mode 100644 readme.md create mode 100644 src/compiler.rs create mode 100644 src/constant.rs create mode 100644 src/construct/blank_line.rs create mode 100644 src/construct/character_escape.rs create mode 100644 src/construct/character_reference.rs create mode 100644 src/construct/code_fenced.rs create mode 100644 src/construct/code_indented.rs create mode 100644 src/construct/heading_atx.rs create mode 100644 src/construct/html_flow.rs create mode 100644 src/construct/mod.rs create mode 100644 src/construct/partial_whitespace.rs create mode 100644 src/construct/thematic_break.rs create mode 100644 src/content/flow.rs create mode 100644 src/content/mod.rs create mode 100644 src/content/string.rs create mode 100644 src/lib.rs create mode 100644 src/parser.rs create mode 100644 src/tokenizer.rs create mode 100644 src/util.rs create mode 100644 tests/code_fenced.rs create mode 100644 tests/code_indented.rs create mode 100644 tests/heading_atx.rs create mode 100644 tests/html_flow.rs create mode 100644 tests/lib.rs create mode 100644 tests/thematic_break.rs diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..201f7b7 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.rs] +indent_size = 4 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..cbee315 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,24 @@ +name: main +on: + - pull_request + - push +jobs: + main: + name: ${{matrix.rust}} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: ${{matrix.rust}} + components: rustfmt, clippy + - run: cargo clippy -- -W clippy::pedantic + - run: cargo fmt --all -- --check + - run: cargo test + - run: cargo install cargo-tarpaulin && cargo tarpaulin --out Xml + - uses: codecov/codecov-action@v1 + strategy: + matrix: + rust: + 
- stable + - beta diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..32a28f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.DS_Store +*.log +*.lock +coverage/ +target diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..96f23d7 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "micromark" +version = "0.0.0" +authors = ["Titus Wormer "] +edition = "2015" +rust-version = "1.56" +description = "small commonmark compliant markdown parser with positional info and concrete tokens" +homepage = "https://github.com/micromark/micromark-rs" +repository = "https://github.com/micromark/micromark-rs" +license = "MIT" +keywords = ["commonmark", "markdown", "parse", "render", "tokenize"] +categories = ["compilers", "encoding", "parser-implementations", "parsing", "text-processing"] +include = ["src/", "license"] +publish = false + +[dependencies] +log = "0.4" +env_logger = "0.9" diff --git a/Untitled.txt b/Untitled.txt new file mode 100644 index 0000000..cc1576f --- /dev/null +++ b/Untitled.txt @@ -0,0 +1 @@ +micromark.js: unquoted: is `completeAttributeValueUnquoted`s case for `completeAttributeNameAfter` missing a `/`?. I’ve added it here. diff --git a/examples/lib.rs b/examples/lib.rs new file mode 100644 index 0000000..4d01161 --- /dev/null +++ b/examples/lib.rs @@ -0,0 +1,22 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, CompileOptions}; + +fn main() { + // Turn on debugging. + // You can show it with `RUST_LOG=debug cargo run --example lib` + env_logger::init(); + + // Safely turn (untrusted?) markdown into HTML. + println!("{:?}", micromark("# Hello, world!")); + + // Turn trusted markdown into HTML. + println!( + "{:?}", + micromark_with_options( + "
<div>\n\n# Hello, tomato!\n\n</div>
", + &CompileOptions { + allow_dangerous_html: true + } + ) + ); +} diff --git a/funding.yml b/funding.yml new file mode 100644 index 0000000..dee132d --- /dev/null +++ b/funding.yml @@ -0,0 +1 @@ +github: wooorm diff --git a/license b/license new file mode 100644 index 0000000..9ac1e96 --- /dev/null +++ b/license @@ -0,0 +1,22 @@ +(The MIT License) + +Copyright (c) 2022 Titus Wormer + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +'Software'), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..8892183 --- /dev/null +++ b/readme.md @@ -0,0 +1,183 @@ +# micromark-rs + +Here be dragons! +🐉 +There’s a lot to do. +Some major to dos are described here, more smaller ones are in the code. + +## Some useful scripts for now + +Run examples: + +```sh +RUST_BACKTRACE=1 RUST_LOG=debug cargo run --example lib +``` + +Format: + +```sh +cargo fmt --all +``` + +Lint: + +```sh +cargo fmt --all -- --check && cargo clippy -- -W clippy::pedantic +``` + +Tests: + +```sh +RUST_BACKTRACE=1 cargo test +``` + +Docs: + +```sh +cargo doc --document-private-items +``` + +(add `--open` to open them in a browser) + +## To do + +### Some major obstacles + +- [ ] (8) Subtokenization: figure out a good, fast way to deal with constructs in + one content type that also are another content type +- [ ] (1) Setext headings: can they be solved in content, or do they have to be + solved in flow somehow +- [ ] (8) Can content (and to a lesser extent string and text) operate more + performantly than checking whether other flow constructs start a line, + before exiting and actually attempting flow constructs? 
+- [ ] (5) Figure out definitions and sharing those identifiers, and references
+  before definitions
+- [ ] (3) Interrupting: sometimes flow can or cannot start depending on the
+  previous construct (typically paragraph)
+- [ ] (5) Containers: this will be rather messy, and depends a lot on how
+  subtokenization is solved
+- [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into by
+  containers
+- [ ] (3) Lazy lines, in containers, in flow and content in a paragraph, a line
+  does not need to be indented
+- [ ] (5) There’s a lot of rust-related choosing whether to pass (mutable)
+  references or whatever around that should be refactored
+- [ ] (5) Figure out extensions
+- [ ] (1) Support turning off constructs
+
+### Small things
+
+- [ ] (3) Clean compiler
+- [ ] (1) Optionally remove dangerous protocols when compiling
+- [ ] (1) Use preferred line ending style in markdown
+- [ ] (1) Handle BOM at start
+- [ ] (1) Make sure tabs are handled properly and that positional info is perfect
+- [ ] (1) Make sure crlf/cr/lf are working perfectly
+- [ ] (3) Figure out lifetimes of things (see `life time` in source)
+- [ ] (3) Use `commonmark` tests
+- [ ] (3) Share a bunch of tests with `micromark-js`
+- [ ] (5) Do some research on rust best practices for APIs, e.g., what to accept,
+  how to integrate with streams or so?
+- [ ] (1) Go through clippy rules, and such, to add strict code styles
+- [ ] (1) Make sure that rust character groups match CM character groups (e.g., is
+  `unicode_whitespace` or so the same?)
+- [ ] (1) Any special handling of surrogates?
+- [ ] (1) Make sure debugging is useful for other folks
+- [ ] (3) Add some benchmarks, do some perf testing
+- [ ] (3) Write comparison to other parsers
+- [ ] (3) Add node/etc bindings?
+- [ ] (8) After all extensions, including MDX, are done, see if we can integrate
+  this with SWC to compile MDX
+- [ ] (3) Bunch of docs
+- [ ] (5) Site
+
+### Constructs
+
+- [ ] (5) attention (strong, emphasis) (text)
+- [ ] (1) autolink
+- [x] blank line
+- [ ] (5) block quote
+- [x] character escape
+- [x] character reference
+- [x] code (fenced)
+- [x] code (indented)
+- [ ] (1) code (text)
+- [ ] (3) content
+- [ ] (3) definition
+- [ ] (1) hard break escape
+- [x] heading (atx)
+- [ ] (1) heading (setext)
+- [x] html (flow)
+- [ ] html (text)
+- [ ] (3) label end
+- [ ] (3) label start (image)
+- [ ] (3) label start (link)
+- [ ] (8) list
+- [ ] (1) paragraph
+- [x] thematic break
+
+### Content types
+
+- [ ] (8) container
+  - [ ] block quote
+  - [ ] list
+- [ ] (1) flow
+  - [x] blank line
+  - [x] code (fenced)
+  - [x] code (indented)
+  - [ ] content
+  - [x] heading (atx)
+  - [x] html (flow)
+  - [x] thematic break
+- [ ] (3) content
+  - [ ] definition
+  - [ ] heading (setext)
+  - [ ] paragraph
+- [ ] (5) text
+  - [ ] attention (strong, emphasis) (text)
+  - [ ] autolink
+  - [x] character escape
+  - [x] character reference
+  - [ ] code (text)
+  - [ ] hard break escape
+  - [ ] html (text)
+  - [ ] label end
+  - [ ] label start (image)
+  - [ ] label start (link)
+- [x] string
+  - [x] character escape
+  - [x] character reference
+
+### Extensions
+
+The main thing here is to figure out if folks could extend from the outside
+with their own code, or if we need to maintain it all here.
+Regardless, it is essential for the launch of `micromark-rs` that extensions
+are theoretically or practically possible.
+The extensions below are listed from top to bottom from more important to less
+important.
+- [ ] (1) frontmatter (yaml, toml) (flow)
+  — [`micromark-extension-frontmatter`](https://github.com/micromark/micromark-extension-frontmatter)
+- [ ] (3) autolink literal (GFM) (text)
+  — [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal)
+- [ ] (3) footnote (GFM) (content, text)
+  — [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote)
+- [ ] (3) strikethrough (GFM) (text)
+  — [`micromark-extension-gfm-strikethrough`](https://github.com/micromark/micromark-extension-gfm-strikethrough)
+- [ ] (5) table (GFM) (flow)
+  — [`micromark-extension-gfm-table`](https://github.com/micromark/micromark-extension-gfm-table)
+- [ ] (1) task list item (GFM) (text)
+  — [`micromark-extension-gfm-task-list-item`](https://github.com/micromark/micromark-extension-gfm-task-list-item)
+- [ ] (3) math (flow, text)
+  — [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math)
+- [ ] (8) directive (flow, text)
+  — [`micromark-extension-directive`](https://github.com/micromark/micromark-extension-directive)
+- [ ] (8) expression (MDX) (flow, text)
+  — [`micromark-extension-mdx-expression`](https://github.com/micromark/micromark-extension-mdx-expression)
+- [ ] (5) JSX (MDX) (flow, text)
+  — [`micromark-extension-mdx-jsx`](https://github.com/micromark/micromark-extension-mdx-jsx)
+- [ ] (3) ESM (MDX) (flow)
+  — [`micromark-extension-mdxjs-esm`](https://github.com/micromark/micromark-extension-mdxjs-esm)
+- [ ] (1) tagfilter (GFM) (n/a, renderer)
+  — [`micromark-extension-gfm-tagfilter`](https://github.com/micromark/micromark-extension-gfm-tagfilter)
diff --git a/src/compiler.rs b/src/compiler.rs
new file mode 100644
index 0000000..166950e
--- /dev/null
+++ b/src/compiler.rs
@@ -0,0 +1,367 @@
+//! Turn events into a string of HTML.
+use crate::construct::character_reference::Kind as CharacterReferenceKind;
+use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::util::{
+    decode_named_character_reference, decode_numeric_character_reference, encode, get_span,
+    slice_serialize,
+};
+
+/// Configuration (optional).
+#[derive(Default, Debug)]
+pub struct CompileOptions {
+    /// Whether to allow (dangerous) HTML.
+    /// The default is `false`; you can turn it on to `true` for trusted
+    /// content.
+    pub allow_dangerous_html: bool,
+}
+
+/// Turn events and codes into a string of HTML.
+#[allow(clippy::too_many_lines)]
+pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String {
+    let mut index = 0;
+    // let mut last_was_tag = false;
+    let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]];
+    let mut atx_opening_sequence_size: Option<usize> = None;
+    let mut atx_heading_buffer: Option<String> = None;
+    let mut code_flow_seen_data: Option<bool> = None;
+    let mut code_fenced_fences_count: Option<usize> = None;
+    let mut slurp_one_line_ending = false;
+    let mut ignore_encode = false;
+    let mut character_reference_kind: Option<CharacterReferenceKind> = None;
+    // let mut slurp_all_line_endings = false;
+
+    println!("events: {:#?}", events);
+
+    while index < events.len() {
+        let event = &events[index];
+        let token_type = &event.token_type;
+
+        match event.event_type {
+            EventType::Enter => match token_type {
+                TokenType::Content => {
+                    buf_tail_mut(buffers).push("
".to_string()); + } + TokenType::CodeIndented => { + code_flow_seen_data = Some(false); + line_ending_if_needed(buffers); + buf_tail_mut(buffers).push("

".to_string());
+                }
+                TokenType::CodeFenced => {
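+                    // Track whether any code was actually seen, so that the
+                    // exit handler only adds a final line ending when there
+                    // was content.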
+                    code_flow_seen_data = Some(false);
+                    line_ending_if_needed(buffers);
+                    // Note: no `>`, which is added later.
+                    buf_tail_mut(buffers).push("
 {
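+                    // Push a buffer to capture the fence info or meta; the
+                    // matching `Exit` pops it with `resume` (the info becomes
+                    // the `language-*` class, the meta is discarded).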
+                    buffer(buffers);
+                }
+                TokenType::HtmlFlow => {
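+                    // With `allow_dangerous_html` on, raw HTML is passed
+                    // through as-is: `ignore_encode` stays set until the
+                    // matching `Exit`.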
+                    line_ending_if_needed(buffers);
+                    if options.allow_dangerous_html {
+                        ignore_encode = true;
+                    }
+                }
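+                // Entering any of these tokens produces no output: they are
+                // compiled on exit or only mark positions.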
+                TokenType::ContentPhrasing
+                | TokenType::AtxHeading
+                | TokenType::AtxHeadingSequence
+                | TokenType::AtxHeadingWhitespace
+                | TokenType::AtxHeadingText
+                | TokenType::LineEnding
+                | TokenType::ThematicBreak
+                | TokenType::ThematicBreakSequence
+                | TokenType::ThematicBreakWhitespace
+                | TokenType::CodeIndentedPrefixWhitespace
+                | TokenType::CodeFlowChunk
+                | TokenType::BlankLineEnding
+                | TokenType::BlankLineWhitespace
+                | TokenType::Whitespace
+                | TokenType::HtmlFlowData
+                | TokenType::CodeFencedFence
+                | TokenType::CodeFencedFenceSequence
+                | TokenType::ChunkString
+                | TokenType::CodeFencedFenceWhitespace
+                | TokenType::Data
+                | TokenType::CharacterEscape
+                | TokenType::CharacterEscapeMarker
+                | TokenType::CharacterEscapeValue
+                | TokenType::CharacterReference
+                | TokenType::CharacterReferenceMarker
+                | TokenType::CharacterReferenceMarkerNumeric
+                | TokenType::CharacterReferenceMarkerHexadecimal
+                | TokenType::CharacterReferenceMarkerSemi
+                | TokenType::CharacterReferenceValue => {}
+                #[allow(unreachable_patterns)]
+                _ => {
+                    unreachable!("unhandled `enter` of TokenType {:?}", token_type)
+                }
+            },
+            EventType::Exit => match token_type {
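+                // Tokens that need no work on exit.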
+                TokenType::ThematicBreakSequence
+                | TokenType::ThematicBreakWhitespace
+                | TokenType::CodeIndentedPrefixWhitespace
+                | TokenType::BlankLineEnding
+                | TokenType::BlankLineWhitespace
+                | TokenType::Whitespace
+                | TokenType::CodeFencedFenceSequence
+                | TokenType::CodeFencedFenceWhitespace
+                | TokenType::CharacterEscape
+                | TokenType::CharacterEscapeMarker
+                | TokenType::CharacterReference
+                | TokenType::CharacterReferenceMarkerSemi => {}
+                TokenType::HtmlFlow => {
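+                    // Done with the raw HTML block: turn encoding back on.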
+                    ignore_encode = false;
+                }
+                TokenType::HtmlFlowData => {
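+                    // Serialize the raw chunk of HTML, escaping it unless
+                    // dangerous HTML is explicitly allowed.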
+                    let slice = slice_serialize(codes, &get_span(events, index), false);
+
+                    let res = if ignore_encode { slice } else { encode(&slice) };
+
+                    // last_was_tag = false;
+                    buf_tail_mut(buffers).push(res);
+                }
+                TokenType::Content => {
+                    buf_tail_mut(buffers).push("

".to_string()); + } + TokenType::CodeIndented | TokenType::CodeFenced => { + let seen_data = + code_flow_seen_data.expect("`code_flow_seen_data` must be defined"); + + // To do: containers. + // One special case is if we are inside a container, and the fenced code was + // not closed (meaning it runs to the end). + // In that case, the following line ending, is considered *outside* the + // fenced code and block quote by micromark, but CM wants to treat that + // ending as part of the code. + // if fenced_count != None && fenced_count < 2 && tightStack.length > 0 && !last_was_tag { + // line_ending(); + // } + + // But in most cases, it’s simpler: when we’ve seen some data, emit an extra + // line ending when needed. + if seen_data { + line_ending_if_needed(buffers); + } + + buf_tail_mut(buffers).push("
".to_string()); + + if let Some(count) = code_fenced_fences_count { + if count < 2 { + line_ending_if_needed(buffers); + } + } + + code_flow_seen_data = None; + code_fenced_fences_count = None; + slurp_one_line_ending = false; + } + TokenType::CodeFencedFence => { + let count = if let Some(count) = code_fenced_fences_count { + count + } else { + 0 + }; + + if count == 0 { + buf_tail_mut(buffers).push(">".to_string()); + // tag = true; + slurp_one_line_ending = true; + } + + code_fenced_fences_count = Some(count + 1); + } + TokenType::CodeFencedFenceInfo => { + let value = resume(buffers); + buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value)); + // tag = true; + } + TokenType::CodeFencedFenceMeta => { + resume(buffers); + } + TokenType::CodeFlowChunk => { + code_flow_seen_data = Some(true); + buf_tail_mut(buffers).push(encode(&slice_serialize( + codes, + &get_span(events, index), + false, + ))); + } + // `AtxHeadingWhitespace` is ignored after the opening sequence, + // before the closing sequence, and after the closing sequence. + // But it is used around intermediate sequences. + // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`. + // `AtxHeadingSequence` is ignored as the opening and closing sequence, + // but not when intermediate. + TokenType::AtxHeadingWhitespace | TokenType::AtxHeadingSequence => { + if let Some(buf) = atx_heading_buffer { + atx_heading_buffer = Some( + buf.to_string() + + &encode(&slice_serialize(codes, &get_span(events, index), false)), + ); + } + + // First fence we see. + if None == atx_opening_sequence_size { + let rank = slice_serialize(codes, &get_span(events, index), false).len(); + atx_opening_sequence_size = Some(rank); + buf_tail_mut(buffers).push(format!("", rank)); + } + } + TokenType::AtxHeadingText => { + println!("text: {:?}", atx_heading_buffer); + if let Some(ref buf) = atx_heading_buffer { + if !buf.is_empty() { + buf_tail_mut(buffers).push(encode(buf)); + atx_heading_buffer = Some("".to_string()); + } + } else { + atx_heading_buffer = Some("".to_string()); + } + + let slice = encode(&slice_serialize(codes, &get_span(events, index), false)); + println!("slice: {:?}", slice); + buf_tail_mut(buffers).push(slice); + } + TokenType::AtxHeading => { + let rank = atx_opening_sequence_size + .expect("`atx_opening_sequence_size` must be set in headings"); + buf_tail_mut(buffers).push(format!("", rank)); + atx_opening_sequence_size = None; + atx_heading_buffer = None; + } + TokenType::ThematicBreak => { + buf_tail_mut(buffers).push("
".to_string()); + } + TokenType::LineEnding => { + // if slurp_all_line_endings { + // // Empty. + // } else + if slurp_one_line_ending { + slurp_one_line_ending = false; + // } else if code_text_inside { + // buf_tail_mut(buffers).push(" ".to_string()); + } else { + buf_tail_mut(buffers).push(encode(&slice_serialize( + codes, + &get_span(events, index), + false, + ))); + } + } + TokenType::CharacterReferenceMarker => { + character_reference_kind = Some(CharacterReferenceKind::Named); + } + TokenType::CharacterReferenceMarkerNumeric => { + character_reference_kind = Some(CharacterReferenceKind::Decimal); + } + TokenType::CharacterReferenceMarkerHexadecimal => { + character_reference_kind = Some(CharacterReferenceKind::Hexadecimal); + } + TokenType::CharacterReferenceValue => { + let kind = character_reference_kind + .expect("expected `character_reference_kind` to be set"); + let reference = slice_serialize(codes, &get_span(events, index), false); + let ref_string = reference.as_str(); + let value = match kind { + CharacterReferenceKind::Decimal => { + decode_numeric_character_reference(ref_string, 10).to_string() + } + CharacterReferenceKind::Hexadecimal => { + decode_numeric_character_reference(ref_string, 16).to_string() + } + CharacterReferenceKind::Named => { + decode_named_character_reference(ref_string) + } + }; + + buf_tail_mut(buffers).push(value); + + character_reference_kind = None; + } + // To do: `ContentPhrasing` should be parsed as phrasing first. + // This branch below currently acts as the resulting `data` tokens. + TokenType::ContentPhrasing + // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported. + | TokenType::ChunkString + | TokenType::Data + | TokenType::CharacterEscapeValue => { + // last_was_tag = false; + buf_tail_mut(buffers).push(encode(&slice_serialize( + codes, + &get_span(events, index), + false, + ))); + } + #[allow(unreachable_patterns)] + _ => { + unreachable!("unhandled `exit` of TokenType {:?}", token_type) + } + }, + } + + index += 1; + } + + assert!(buffers.len() == 1, "expected 1 final buffer"); + buffers.get(0).expect("expected 1 final buffer").concat() +} + +/// Push a buffer. +fn buffer(buffers: &mut Vec>) { + buffers.push(vec![]); +} + +/// Pop a buffer, returning its value. +fn resume(buffers: &mut Vec>) -> String { + let buf = buffers.pop().expect("Cannot resume w/o buffer"); + buf.concat() +} + +/// Get the last chunk of current buffer. +fn buf_tail_slice(buffers: &mut [Vec]) -> Option<&String> { + let tail = buf_tail(buffers); + tail.last() +} + +/// Get the mutable last chunk of current buffer. +fn buf_tail_mut(buffers: &mut [Vec]) -> &mut Vec { + buffers + .last_mut() + .expect("at least one buffer should exist") +} + +/// Get the current buffer. +fn buf_tail(buffers: &mut [Vec]) -> &Vec { + buffers.last().expect("at least one buffer should exist") +} + +/// Add a line ending. +fn line_ending(buffers: &mut [Vec]) { + let tail = buf_tail_mut(buffers); + // To do: use inferred line ending style. + // lastWasTag = false + tail.push("\n".to_string()); +} + +/// Add a line ending if needed (as in, there’s no eol/eof already). 
+fn line_ending_if_needed(buffers: &mut [Vec<String>]) {
+    let slice = buf_tail_slice(buffers);
+    let last_char = if let Some(x) = slice {
+        x.chars().last()
+    } else {
+        None
+    };
+    let mut add = true;
+
+    if let Some(x) = last_char {
+        if x == '\n' || x == '\r' {
+            add = false;
+        }
+    } else {
+        add = false;
+    }
+
+    if add {
+        line_ending(buffers);
+    }
+}
diff --git a/src/constant.rs b/src/constant.rs
new file mode 100644
index 0000000..332fdaf
--- /dev/null
+++ b/src/constant.rs
@@ -0,0 +1,2561 @@
+//! Constants needed to parse markdown.
+//!
+//! Most of these constants are magic numbers, such as the number of markers
+//! needed to parse [code (fenced)][code_fenced]
+//! ([`CODE_FENCED_SEQUENCE_SIZE_MIN`][]) or the max number of allowed markers
+//! in a [heading (atx)][heading_atx]
+//! ([`HEADING_ATX_OPENING_FENCE_SIZE_MAX`][]).
+//!
+//! Some constants are instead lists of things, such as the list of tag names
+//! considered in the **raw** production of [HTML (flow)][html_flow]
+//! ([`HTML_RAW_NAMES`][]), or the list of allowed named character references
+//! ([`CHARACTER_REFERENCE_NAMES`][]).
+//!
+//! [code_fenced]: crate::construct::code_fenced
+//! [heading_atx]: crate::construct::heading_atx
+//! [html_flow]: crate::construct::html_flow
+
+/// The number of characters that form a tab stop.
+///
+/// This relates to the number of whitespace characters needed to form certain
+/// constructs in markdown, most notably the whitespace required to form
+/// [code (indented)][code_indented].
+///
+/// [code_indented]: crate::construct::code_indented
+pub const TAB_SIZE: usize = 4;
+
+/// The number of markers needed for a [thematic break][thematic_break] to form.
+///
+/// Like many things in markdown, the number is `3`.
+///
+/// [thematic_break]: crate::construct::thematic_break
+pub const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3;
+
+/// The max number of markers allowed to form a [heading (atx)][heading_atx].
+///
+/// This limitation is imposed by HTML, which imposes a max heading rank of
+/// `6`.
+///
+/// [heading_atx]: crate::construct::heading_atx
+pub const HEADING_ATX_OPENING_FENCE_SIZE_MAX: usize = 6;
+
+/// The number of markers needed for [code (fenced)][code_fenced] to form.
+///
+/// Like many things in markdown, the number is `3`.
+///
+/// [code_fenced]: crate::construct::code_fenced
+pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
+
+/// List of HTML tag names that form the **raw** production of
+/// [HTML (flow)][html_flow].
+///
+/// The **raw** production allows blank lines and thus no interleaving with
+/// markdown.
+/// Tag name matching must be performed insensitive to case, and thus this list
+/// includes lowercase tag names.
+///
+/// The number of the longest tag name is also stored as a constant in
+/// [`HTML_RAW_SIZE_MAX`][].
+///
+/// > 👉 **Note**: `textarea` was added in `CommonMark@0.30`.
+///
+/// ## References
+///
+/// * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+///
+/// [html_flow]: crate::construct::html_flow
+pub const HTML_RAW_NAMES: [&str; 4] = ["pre", "script", "style", "textarea"];
+
+/// The number of the longest tag name in [`HTML_RAW_NAMES`][].
+///
+/// This is currently the size of `textarea`.
+pub const HTML_RAW_SIZE_MAX: usize = 8;
+
+/// List of HTML tag names that form the **basic** production of
+/// [HTML (flow)][html_flow].
+///
+/// The **basic** production allows interleaving HTML and markdown with blank lines
+/// and allows flow (block) elements to interrupt content.
+/// Tag name matching must be performed insensitive to case, and thus this list
+/// includes lowercase tag names.
+///
+/// Tag names not on this list result in the **complete** production.
+///
+/// > 👉 **Note**: `source` was removed on `main` of the `CommonMark` spec and
+/// > is slated to be released in `CommonMark@0.31`.
+///
+/// ## References
+///
+/// * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+/// * [*Remove source element as HTML block start condition* as `commonmark/commonmark-spec#710`](https://github.com/commonmark/commonmark-spec/pull/710)
+///
+/// [html_flow]: crate::construct::html_flow
+pub const HTML_BLOCK_NAMES: [&str; 61] = [
+    "address",
+    "article",
+    "aside",
+    "base",
+    "basefont",
+    "blockquote",
+    "body",
+    "caption",
+    "center",
+    "col",
+    "colgroup",
+    "dd",
+    "details",
+    "dialog",
+    "dir",
+    "div",
+    "dl",
+    "dt",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
+    "frame",
+    "frameset",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "head",
+    "header",
+    "hr",
+    "html",
+    "iframe",
+    "legend",
+    "li",
+    "link",
+    "main",
+    "menu",
+    "menuitem",
+    "nav",
+    "noframes",
+    "ol",
+    "optgroup",
+    "option",
+    "p",
+    "param",
+    "section",
+    "summary",
+    "table",
+    "tbody",
+    "td",
+    "tfoot",
+    "th",
+    "thead",
+    "title",
+    "tr",
+    "track",
+    "ul",
+];
+
+/// The max number of characters in a hexadecimal numeric
+/// [character reference][character_reference].
+///
+/// To illustrate, this allows `&#xff9999;` and disallows `&#xff99999;`.
+/// This limit is imposed because all bigger numbers are invalid.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX: usize = 6;
+
+/// The max number of characters in a decimal numeric
+/// [character reference][character_reference].
+///
+/// To illustrate, this allows `&#9999999;` and disallows `&#99999999;`.
+/// This limit is imposed because all bigger numbers are invalid.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_DECIMAL_SIZE_MAX: usize = 7;
+
+/// The max number of characters in a named
+/// [character reference][character_reference].
+///
+/// This is the number of the longest name in [`CHARACTER_REFERENCE_NAMES`][].
+/// It allows `&CounterClockwiseContourIntegral;` and prevents the parser from
+/// continuing for eons.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_NAMED_SIZE_MAX: usize = 31;
+
+/// List of names that can form a named
+/// [character reference][character_reference].
+///
+/// This list is sensitive to casing.
+///
+/// The number of the longest name (`CounterClockwiseContourIntegral`) is also
+/// stored as a constant in [`CHARACTER_REFERENCE_NAMED_SIZE_MAX`][].
+///
+/// The corresponding values of this list are stored in
+/// [`CHARACTER_REFERENCE_VALUES`][].
+/// They correspond through their index.
+/// +/// ## References +/// +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_NAMES: [&str; 2222] = [ + "AEli", + "AElig", + "AM", + "AMP", + "Aacut", + "Aacute", + "Abreve", + "Acir", + "Acirc", + "Acy", + "Afr", + "Agrav", + "Agrave", + "Alpha", + "Amacr", + "And", + "Aogon", + "Aopf", + "ApplyFunction", + "Arin", + "Aring", + "Ascr", + "Assign", + "Atild", + "Atilde", + "Aum", + "Auml", + "Backslash", + "Barv", + "Barwed", + "Bcy", + "Because", + "Bernoullis", + "Beta", + "Bfr", + "Bopf", + "Breve", + "Bscr", + "Bumpeq", + "CHcy", + "COP", + "COPY", + "Cacute", + "Cap", + "CapitalDifferentialD", + "Cayleys", + "Ccaron", + "Ccedi", + "Ccedil", + "Ccirc", + "Cconint", + "Cdot", + "Cedilla", + "CenterDot", + "Cfr", + "Chi", + "CircleDot", + "CircleMinus", + "CirclePlus", + "CircleTimes", + "ClockwiseContourIntegral", + "CloseCurlyDoubleQuote", + "CloseCurlyQuote", + "Colon", + "Colone", + "Congruent", + "Conint", + "ContourIntegral", + "Copf", + "Coproduct", + "CounterClockwiseContourIntegral", + "Cross", + "Cscr", + "Cup", + "CupCap", + "DD", + "DDotrahd", + "DJcy", + "DScy", + "DZcy", + "Dagger", + "Darr", + "Dashv", + "Dcaron", + "Dcy", + "Del", + "Delta", + "Dfr", + "DiacriticalAcute", + "DiacriticalDot", + "DiacriticalDoubleAcute", + "DiacriticalGrave", + "DiacriticalTilde", + "Diamond", + "DifferentialD", + "Dopf", + "Dot", + "DotDot", + "DotEqual", + "DoubleContourIntegral", + "DoubleDot", + "DoubleDownArrow", + "DoubleLeftArrow", + "DoubleLeftRightArrow", + "DoubleLeftTee", + "DoubleLongLeftArrow", + "DoubleLongLeftRightArrow", + "DoubleLongRightArrow", + "DoubleRightArrow", + "DoubleRightTee", + "DoubleUpArrow", + "DoubleUpDownArrow", + "DoubleVerticalBar", + "DownArrow", + "DownArrowBar", + "DownArrowUpArrow", + "DownBreve", + "DownLeftRightVector", + "DownLeftTeeVector", + "DownLeftVector", + "DownLeftVectorBar", + "DownRightTeeVector", + "DownRightVector", + "DownRightVectorBar", + "DownTee", + "DownTeeArrow", + "Downarrow", + "Dscr", + "Dstrok", + "ENG", + "ET", + "ETH", + "Eacut", + "Eacute", + "Ecaron", + "Ecir", + "Ecirc", + "Ecy", + "Edot", + "Efr", + "Egrav", + "Egrave", + "Element", + "Emacr", + "EmptySmallSquare", + "EmptyVerySmallSquare", + "Eogon", + "Eopf", + "Epsilon", + "Equal", + "EqualTilde", + "Equilibrium", + "Escr", + "Esim", + "Eta", + "Eum", + "Euml", + "Exists", + "ExponentialE", + "Fcy", + "Ffr", + "FilledSmallSquare", + "FilledVerySmallSquare", + "Fopf", + "ForAll", + "Fouriertrf", + "Fscr", + "GJcy", + "G", + "GT", + "Gamma", + "Gammad", + "Gbreve", + "Gcedil", + "Gcirc", + "Gcy", + "Gdot", + "Gfr", + "Gg", + "Gopf", + "GreaterEqual", + "GreaterEqualLess", + "GreaterFullEqual", + "GreaterGreater", + "GreaterLess", + "GreaterSlantEqual", + "GreaterTilde", + "Gscr", + "Gt", + "HARDcy", + "Hacek", + "Hat", + "Hcirc", + "Hfr", + "HilbertSpace", + "Hopf", + "HorizontalLine", + "Hscr", + "Hstrok", + "HumpDownHump", + "HumpEqual", + "IEcy", + "IJlig", + "IOcy", + "Iacut", + "Iacute", + "Icir", + "Icirc", + "Icy", + "Idot", + "Ifr", + "Igrav", + "Igrave", + "Im", + "Imacr", + "ImaginaryI", + "Implies", + "Int", + "Integral", + "Intersection", + "InvisibleComma", + "InvisibleTimes", + "Iogon", + "Iopf", + "Iota", + "Iscr", + "Itilde", + "Iukcy", + "Ium", + "Iuml", + "Jcirc", + "Jcy", + "Jfr", + "Jopf", + "Jscr", + "Jsercy", + "Jukcy", + "KHcy", + "KJcy", + "Kappa", 
+ "Kcedil", + "Kcy", + "Kfr", + "Kopf", + "Kscr", + "LJcy", + "L", + "LT", + "Lacute", + "Lambda", + "Lang", + "Laplacetrf", + "Larr", + "Lcaron", + "Lcedil", + "Lcy", + "LeftAngleBracket", + "LeftArrow", + "LeftArrowBar", + "LeftArrowRightArrow", + "LeftCeiling", + "LeftDoubleBracket", + "LeftDownTeeVector", + "LeftDownVector", + "LeftDownVectorBar", + "LeftFloor", + "LeftRightArrow", + "LeftRightVector", + "LeftTee", + "LeftTeeArrow", + "LeftTeeVector", + "LeftTriangle", + "LeftTriangleBar", + "LeftTriangleEqual", + "LeftUpDownVector", + "LeftUpTeeVector", + "LeftUpVector", + "LeftUpVectorBar", + "LeftVector", + "LeftVectorBar", + "Leftarrow", + "Leftrightarrow", + "LessEqualGreater", + "LessFullEqual", + "LessGreater", + "LessLess", + "LessSlantEqual", + "LessTilde", + "Lfr", + "Ll", + "Lleftarrow", + "Lmidot", + "LongLeftArrow", + "LongLeftRightArrow", + "LongRightArrow", + "Longleftarrow", + "Longleftrightarrow", + "Longrightarrow", + "Lopf", + "LowerLeftArrow", + "LowerRightArrow", + "Lscr", + "Lsh", + "Lstrok", + "Lt", + "Map", + "Mcy", + "MediumSpace", + "Mellintrf", + "Mfr", + "MinusPlus", + "Mopf", + "Mscr", + "Mu", + "NJcy", + "Nacute", + "Ncaron", + "Ncedil", + "Ncy", + "NegativeMediumSpace", + "NegativeThickSpace", + "NegativeThinSpace", + "NegativeVeryThinSpace", + "NestedGreaterGreater", + "NestedLessLess", + "NewLine", + "Nfr", + "NoBreak", + "NonBreakingSpace", + "Nopf", + "Not", + "NotCongruent", + "NotCupCap", + "NotDoubleVerticalBar", + "NotElement", + "NotEqual", + "NotEqualTilde", + "NotExists", + "NotGreater", + "NotGreaterEqual", + "NotGreaterFullEqual", + "NotGreaterGreater", + "NotGreaterLess", + "NotGreaterSlantEqual", + "NotGreaterTilde", + "NotHumpDownHump", + "NotHumpEqual", + "NotLeftTriangle", + "NotLeftTriangleBar", + "NotLeftTriangleEqual", + "NotLess", + "NotLessEqual", + "NotLessGreater", + "NotLessLess", + "NotLessSlantEqual", + "NotLessTilde", + "NotNestedGreaterGreater", + "NotNestedLessLess", + "NotPrecedes", + "NotPrecedesEqual", + "NotPrecedesSlantEqual", + "NotReverseElement", + "NotRightTriangle", + "NotRightTriangleBar", + "NotRightTriangleEqual", + "NotSquareSubset", + "NotSquareSubsetEqual", + "NotSquareSuperset", + "NotSquareSupersetEqual", + "NotSubset", + "NotSubsetEqual", + "NotSucceeds", + "NotSucceedsEqual", + "NotSucceedsSlantEqual", + "NotSucceedsTilde", + "NotSuperset", + "NotSupersetEqual", + "NotTilde", + "NotTildeEqual", + "NotTildeFullEqual", + "NotTildeTilde", + "NotVerticalBar", + "Nscr", + "Ntild", + "Ntilde", + "Nu", + "OElig", + "Oacut", + "Oacute", + "Ocir", + "Ocirc", + "Ocy", + "Odblac", + "Ofr", + "Ograv", + "Ograve", + "Omacr", + "Omega", + "Omicron", + "Oopf", + "OpenCurlyDoubleQuote", + "OpenCurlyQuote", + "Or", + "Oscr", + "Oslas", + "Oslash", + "Otild", + "Otilde", + "Otimes", + "Oum", + "Ouml", + "OverBar", + "OverBrace", + "OverBracket", + "OverParenthesis", + "PartialD", + "Pcy", + "Pfr", + "Phi", + "Pi", + "PlusMinus", + "Poincareplane", + "Popf", + "Pr", + "Precedes", + "PrecedesEqual", + "PrecedesSlantEqual", + "PrecedesTilde", + "Prime", + "Product", + "Proportion", + "Proportional", + "Pscr", + "Psi", + "QUO", + "QUOT", + "Qfr", + "Qopf", + "Qscr", + "RBarr", + "RE", + "REG", + "Racute", + "Rang", + "Rarr", + "Rarrtl", + "Rcaron", + "Rcedil", + "Rcy", + "Re", + "ReverseElement", + "ReverseEquilibrium", + "ReverseUpEquilibrium", + "Rfr", + "Rho", + "RightAngleBracket", + "RightArrow", + "RightArrowBar", + "RightArrowLeftArrow", + "RightCeiling", + "RightDoubleBracket", + "RightDownTeeVector", + 
"RightDownVector", + "RightDownVectorBar", + "RightFloor", + "RightTee", + "RightTeeArrow", + "RightTeeVector", + "RightTriangle", + "RightTriangleBar", + "RightTriangleEqual", + "RightUpDownVector", + "RightUpTeeVector", + "RightUpVector", + "RightUpVectorBar", + "RightVector", + "RightVectorBar", + "Rightarrow", + "Ropf", + "RoundImplies", + "Rrightarrow", + "Rscr", + "Rsh", + "RuleDelayed", + "SHCHcy", + "SHcy", + "SOFTcy", + "Sacute", + "Sc", + "Scaron", + "Scedil", + "Scirc", + "Scy", + "Sfr", + "ShortDownArrow", + "ShortLeftArrow", + "ShortRightArrow", + "ShortUpArrow", + "Sigma", + "SmallCircle", + "Sopf", + "Sqrt", + "Square", + "SquareIntersection", + "SquareSubset", + "SquareSubsetEqual", + "SquareSuperset", + "SquareSupersetEqual", + "SquareUnion", + "Sscr", + "Star", + "Sub", + "Subset", + "SubsetEqual", + "Succeeds", + "SucceedsEqual", + "SucceedsSlantEqual", + "SucceedsTilde", + "SuchThat", + "Sum", + "Sup", + "Superset", + "SupersetEqual", + "Supset", + "THOR", + "THORN", + "TRADE", + "TSHcy", + "TScy", + "Tab", + "Tau", + "Tcaron", + "Tcedil", + "Tcy", + "Tfr", + "Therefore", + "Theta", + "ThickSpace", + "ThinSpace", + "Tilde", + "TildeEqual", + "TildeFullEqual", + "TildeTilde", + "Topf", + "TripleDot", + "Tscr", + "Tstrok", + "Uacut", + "Uacute", + "Uarr", + "Uarrocir", + "Ubrcy", + "Ubreve", + "Ucir", + "Ucirc", + "Ucy", + "Udblac", + "Ufr", + "Ugrav", + "Ugrave", + "Umacr", + "UnderBar", + "UnderBrace", + "UnderBracket", + "UnderParenthesis", + "Union", + "UnionPlus", + "Uogon", + "Uopf", + "UpArrow", + "UpArrowBar", + "UpArrowDownArrow", + "UpDownArrow", + "UpEquilibrium", + "UpTee", + "UpTeeArrow", + "Uparrow", + "Updownarrow", + "UpperLeftArrow", + "UpperRightArrow", + "Upsi", + "Upsilon", + "Uring", + "Uscr", + "Utilde", + "Uum", + "Uuml", + "VDash", + "Vbar", + "Vcy", + "Vdash", + "Vdashl", + "Vee", + "Verbar", + "Vert", + "VerticalBar", + "VerticalLine", + "VerticalSeparator", + "VerticalTilde", + "VeryThinSpace", + "Vfr", + "Vopf", + "Vscr", + "Vvdash", + "Wcirc", + "Wedge", + "Wfr", + "Wopf", + "Wscr", + "Xfr", + "Xi", + "Xopf", + "Xscr", + "YAcy", + "YIcy", + "YUcy", + "Yacut", + "Yacute", + "Ycirc", + "Ycy", + "Yfr", + "Yopf", + "Yscr", + "Yuml", + "ZHcy", + "Zacute", + "Zcaron", + "Zcy", + "Zdot", + "ZeroWidthSpace", + "Zeta", + "Zfr", + "Zopf", + "Zscr", + "aacut", + "aacute", + "abreve", + "ac", + "acE", + "acd", + "acir", + "acirc", + "acut", + "acute", + "acy", + "aeli", + "aelig", + "af", + "afr", + "agrav", + "agrave", + "alefsym", + "aleph", + "alpha", + "amacr", + "amalg", + "am", + "amp", + "and", + "andand", + "andd", + "andslope", + "andv", + "ang", + "ange", + "angle", + "angmsd", + "angmsdaa", + "angmsdab", + "angmsdac", + "angmsdad", + "angmsdae", + "angmsdaf", + "angmsdag", + "angmsdah", + "angrt", + "angrtvb", + "angrtvbd", + "angsph", + "angst", + "angzarr", + "aogon", + "aopf", + "ap", + "apE", + "apacir", + "ape", + "apid", + "apos", + "approx", + "approxeq", + "arin", + "aring", + "ascr", + "ast", + "asymp", + "asympeq", + "atild", + "atilde", + "aum", + "auml", + "awconint", + "awint", + "bNot", + "backcong", + "backepsilon", + "backprime", + "backsim", + "backsimeq", + "barvee", + "barwed", + "barwedge", + "bbrk", + "bbrktbrk", + "bcong", + "bcy", + "bdquo", + "becaus", + "because", + "bemptyv", + "bepsi", + "bernou", + "beta", + "beth", + "between", + "bfr", + "bigcap", + "bigcirc", + "bigcup", + "bigodot", + "bigoplus", + "bigotimes", + "bigsqcup", + "bigstar", + "bigtriangledown", + "bigtriangleup", + "biguplus", + "bigvee", + 
"bigwedge", + "bkarow", + "blacklozenge", + "blacksquare", + "blacktriangle", + "blacktriangledown", + "blacktriangleleft", + "blacktriangleright", + "blank", + "blk12", + "blk14", + "blk34", + "block", + "bne", + "bnequiv", + "bnot", + "bopf", + "bot", + "bottom", + "bowtie", + "boxDL", + "boxDR", + "boxDl", + "boxDr", + "boxH", + "boxHD", + "boxHU", + "boxHd", + "boxHu", + "boxUL", + "boxUR", + "boxUl", + "boxUr", + "boxV", + "boxVH", + "boxVL", + "boxVR", + "boxVh", + "boxVl", + "boxVr", + "boxbox", + "boxdL", + "boxdR", + "boxdl", + "boxdr", + "boxh", + "boxhD", + "boxhU", + "boxhd", + "boxhu", + "boxminus", + "boxplus", + "boxtimes", + "boxuL", + "boxuR", + "boxul", + "boxur", + "boxv", + "boxvH", + "boxvL", + "boxvR", + "boxvh", + "boxvl", + "boxvr", + "bprime", + "breve", + "brvba", + "brvbar", + "bscr", + "bsemi", + "bsim", + "bsime", + "bsol", + "bsolb", + "bsolhsub", + "bull", + "bullet", + "bump", + "bumpE", + "bumpe", + "bumpeq", + "cacute", + "cap", + "capand", + "capbrcup", + "capcap", + "capcup", + "capdot", + "caps", + "caret", + "caron", + "ccaps", + "ccaron", + "ccedi", + "ccedil", + "ccirc", + "ccups", + "ccupssm", + "cdot", + "cedi", + "cedil", + "cemptyv", + "cen", + "cent", + "centerdot", + "cfr", + "chcy", + "check", + "checkmark", + "chi", + "cir", + "cirE", + "circ", + "circeq", + "circlearrowleft", + "circlearrowright", + "circledR", + "circledS", + "circledast", + "circledcirc", + "circleddash", + "cire", + "cirfnint", + "cirmid", + "cirscir", + "clubs", + "clubsuit", + "colon", + "colone", + "coloneq", + "comma", + "commat", + "comp", + "compfn", + "complement", + "complexes", + "cong", + "congdot", + "conint", + "copf", + "coprod", + "cop", + "copy", + "copysr", + "crarr", + "cross", + "cscr", + "csub", + "csube", + "csup", + "csupe", + "ctdot", + "cudarrl", + "cudarrr", + "cuepr", + "cuesc", + "cularr", + "cularrp", + "cup", + "cupbrcap", + "cupcap", + "cupcup", + "cupdot", + "cupor", + "cups", + "curarr", + "curarrm", + "curlyeqprec", + "curlyeqsucc", + "curlyvee", + "curlywedge", + "curre", + "curren", + "curvearrowleft", + "curvearrowright", + "cuvee", + "cuwed", + "cwconint", + "cwint", + "cylcty", + "dArr", + "dHar", + "dagger", + "daleth", + "darr", + "dash", + "dashv", + "dbkarow", + "dblac", + "dcaron", + "dcy", + "dd", + "ddagger", + "ddarr", + "ddotseq", + "de", + "deg", + "delta", + "demptyv", + "dfisht", + "dfr", + "dharl", + "dharr", + "diam", + "diamond", + "diamondsuit", + "diams", + "die", + "digamma", + "disin", + "div", + "divid", + "divide", + "divideontimes", + "divonx", + "djcy", + "dlcorn", + "dlcrop", + "dollar", + "dopf", + "dot", + "doteq", + "doteqdot", + "dotminus", + "dotplus", + "dotsquare", + "doublebarwedge", + "downarrow", + "downdownarrows", + "downharpoonleft", + "downharpoonright", + "drbkarow", + "drcorn", + "drcrop", + "dscr", + "dscy", + "dsol", + "dstrok", + "dtdot", + "dtri", + "dtrif", + "duarr", + "duhar", + "dwangle", + "dzcy", + "dzigrarr", + "eDDot", + "eDot", + "eacut", + "eacute", + "easter", + "ecaron", + "ecir", + "ecirc", + "ecolon", + "ecy", + "edot", + "ee", + "efDot", + "efr", + "eg", + "egrav", + "egrave", + "egs", + "egsdot", + "el", + "elinters", + "ell", + "els", + "elsdot", + "emacr", + "empty", + "emptyset", + "emptyv", + "emsp13", + "emsp14", + "emsp", + "eng", + "ensp", + "eogon", + "eopf", + "epar", + "eparsl", + "eplus", + "epsi", + "epsilon", + "epsiv", + "eqcirc", + "eqcolon", + "eqsim", + "eqslantgtr", + "eqslantless", + "equals", + "equest", + "equiv", + "equivDD", + "eqvparsl", + "erDot", + 
"erarr", + "escr", + "esdot", + "esim", + "eta", + "et", + "eth", + "eum", + "euml", + "euro", + "excl", + "exist", + "expectation", + "exponentiale", + "fallingdotseq", + "fcy", + "female", + "ffilig", + "fflig", + "ffllig", + "ffr", + "filig", + "fjlig", + "flat", + "fllig", + "fltns", + "fnof", + "fopf", + "forall", + "fork", + "forkv", + "fpartint", + "frac1", + "frac12", + "frac13", + "frac14", + "frac15", + "frac16", + "frac18", + "frac23", + "frac25", + "frac3", + "frac34", + "frac35", + "frac38", + "frac45", + "frac56", + "frac58", + "frac78", + "frasl", + "frown", + "fscr", + "gE", + "gEl", + "gacute", + "gamma", + "gammad", + "gap", + "gbreve", + "gcirc", + "gcy", + "gdot", + "ge", + "gel", + "geq", + "geqq", + "geqslant", + "ges", + "gescc", + "gesdot", + "gesdoto", + "gesdotol", + "gesl", + "gesles", + "gfr", + "gg", + "ggg", + "gimel", + "gjcy", + "gl", + "glE", + "gla", + "glj", + "gnE", + "gnap", + "gnapprox", + "gne", + "gneq", + "gneqq", + "gnsim", + "gopf", + "grave", + "gscr", + "gsim", + "gsime", + "gsiml", + "g", + "gt", + "gtcc", + "gtcir", + "gtdot", + "gtlPar", + "gtquest", + "gtrapprox", + "gtrarr", + "gtrdot", + "gtreqless", + "gtreqqless", + "gtrless", + "gtrsim", + "gvertneqq", + "gvnE", + "hArr", + "hairsp", + "half", + "hamilt", + "hardcy", + "harr", + "harrcir", + "harrw", + "hbar", + "hcirc", + "hearts", + "heartsuit", + "hellip", + "hercon", + "hfr", + "hksearow", + "hkswarow", + "hoarr", + "homtht", + "hookleftarrow", + "hookrightarrow", + "hopf", + "horbar", + "hscr", + "hslash", + "hstrok", + "hybull", + "hyphen", + "iacut", + "iacute", + "ic", + "icir", + "icirc", + "icy", + "iecy", + "iexc", + "iexcl", + "iff", + "ifr", + "igrav", + "igrave", + "ii", + "iiiint", + "iiint", + "iinfin", + "iiota", + "ijlig", + "imacr", + "image", + "imagline", + "imagpart", + "imath", + "imof", + "imped", + "in", + "incare", + "infin", + "infintie", + "inodot", + "int", + "intcal", + "integers", + "intercal", + "intlarhk", + "intprod", + "iocy", + "iogon", + "iopf", + "iota", + "iprod", + "iques", + "iquest", + "iscr", + "isin", + "isinE", + "isindot", + "isins", + "isinsv", + "isinv", + "it", + "itilde", + "iukcy", + "ium", + "iuml", + "jcirc", + "jcy", + "jfr", + "jmath", + "jopf", + "jscr", + "jsercy", + "jukcy", + "kappa", + "kappav", + "kcedil", + "kcy", + "kfr", + "kgreen", + "khcy", + "kjcy", + "kopf", + "kscr", + "lAarr", + "lArr", + "lAtail", + "lBarr", + "lE", + "lEg", + "lHar", + "lacute", + "laemptyv", + "lagran", + "lambda", + "lang", + "langd", + "langle", + "lap", + "laqu", + "laquo", + "larr", + "larrb", + "larrbfs", + "larrfs", + "larrhk", + "larrlp", + "larrpl", + "larrsim", + "larrtl", + "lat", + "latail", + "late", + "lates", + "lbarr", + "lbbrk", + "lbrace", + "lbrack", + "lbrke", + "lbrksld", + "lbrkslu", + "lcaron", + "lcedil", + "lceil", + "lcub", + "lcy", + "ldca", + "ldquo", + "ldquor", + "ldrdhar", + "ldrushar", + "ldsh", + "le", + "leftarrow", + "leftarrowtail", + "leftharpoondown", + "leftharpoonup", + "leftleftarrows", + "leftrightarrow", + "leftrightarrows", + "leftrightharpoons", + "leftrightsquigarrow", + "leftthreetimes", + "leg", + "leq", + "leqq", + "leqslant", + "les", + "lescc", + "lesdot", + "lesdoto", + "lesdotor", + "lesg", + "lesges", + "lessapprox", + "lessdot", + "lesseqgtr", + "lesseqqgtr", + "lessgtr", + "lesssim", + "lfisht", + "lfloor", + "lfr", + "lg", + "lgE", + "lhard", + "lharu", + "lharul", + "lhblk", + "ljcy", + "ll", + "llarr", + "llcorner", + "llhard", + "lltri", + "lmidot", + "lmoust", + "lmoustache", + "lnE", + 
"lnap", + "lnapprox", + "lne", + "lneq", + "lneqq", + "lnsim", + "loang", + "loarr", + "lobrk", + "longleftarrow", + "longleftrightarrow", + "longmapsto", + "longrightarrow", + "looparrowleft", + "looparrowright", + "lopar", + "lopf", + "loplus", + "lotimes", + "lowast", + "lowbar", + "loz", + "lozenge", + "lozf", + "lpar", + "lparlt", + "lrarr", + "lrcorner", + "lrhar", + "lrhard", + "lrm", + "lrtri", + "lsaquo", + "lscr", + "lsh", + "lsim", + "lsime", + "lsimg", + "lsqb", + "lsquo", + "lsquor", + "lstrok", + "l", + "lt", + "ltcc", + "ltcir", + "ltdot", + "lthree", + "ltimes", + "ltlarr", + "ltquest", + "ltrPar", + "ltri", + "ltrie", + "ltrif", + "lurdshar", + "luruhar", + "lvertneqq", + "lvnE", + "mDDot", + "mac", + "macr", + "male", + "malt", + "maltese", + "map", + "mapsto", + "mapstodown", + "mapstoleft", + "mapstoup", + "marker", + "mcomma", + "mcy", + "mdash", + "measuredangle", + "mfr", + "mho", + "micr", + "micro", + "mid", + "midast", + "midcir", + "middo", + "middot", + "minus", + "minusb", + "minusd", + "minusdu", + "mlcp", + "mldr", + "mnplus", + "models", + "mopf", + "mp", + "mscr", + "mstpos", + "mu", + "multimap", + "mumap", + "nGg", + "nGt", + "nGtv", + "nLeftarrow", + "nLeftrightarrow", + "nLl", + "nLt", + "nLtv", + "nRightarrow", + "nVDash", + "nVdash", + "nabla", + "nacute", + "nang", + "nap", + "napE", + "napid", + "napos", + "napprox", + "natur", + "natural", + "naturals", + "nbs", + "nbsp", + "nbump", + "nbumpe", + "ncap", + "ncaron", + "ncedil", + "ncong", + "ncongdot", + "ncup", + "ncy", + "ndash", + "ne", + "neArr", + "nearhk", + "nearr", + "nearrow", + "nedot", + "nequiv", + "nesear", + "nesim", + "nexist", + "nexists", + "nfr", + "ngE", + "nge", + "ngeq", + "ngeqq", + "ngeqslant", + "nges", + "ngsim", + "ngt", + "ngtr", + "nhArr", + "nharr", + "nhpar", + "ni", + "nis", + "nisd", + "niv", + "njcy", + "nlArr", + "nlE", + "nlarr", + "nldr", + "nle", + "nleftarrow", + "nleftrightarrow", + "nleq", + "nleqq", + "nleqslant", + "nles", + "nless", + "nlsim", + "nlt", + "nltri", + "nltrie", + "nmid", + "nopf", + "no", + "not", + "notin", + "notinE", + "notindot", + "notinva", + "notinvb", + "notinvc", + "notni", + "notniva", + "notnivb", + "notnivc", + "npar", + "nparallel", + "nparsl", + "npart", + "npolint", + "npr", + "nprcue", + "npre", + "nprec", + "npreceq", + "nrArr", + "nrarr", + "nrarrc", + "nrarrw", + "nrightarrow", + "nrtri", + "nrtrie", + "nsc", + "nsccue", + "nsce", + "nscr", + "nshortmid", + "nshortparallel", + "nsim", + "nsime", + "nsimeq", + "nsmid", + "nspar", + "nsqsube", + "nsqsupe", + "nsub", + "nsubE", + "nsube", + "nsubset", + "nsubseteq", + "nsubseteqq", + "nsucc", + "nsucceq", + "nsup", + "nsupE", + "nsupe", + "nsupset", + "nsupseteq", + "nsupseteqq", + "ntgl", + "ntild", + "ntilde", + "ntlg", + "ntriangleleft", + "ntrianglelefteq", + "ntriangleright", + "ntrianglerighteq", + "nu", + "num", + "numero", + "numsp", + "nvDash", + "nvHarr", + "nvap", + "nvdash", + "nvge", + "nvgt", + "nvinfin", + "nvlArr", + "nvle", + "nvlt", + "nvltrie", + "nvrArr", + "nvrtrie", + "nvsim", + "nwArr", + "nwarhk", + "nwarr", + "nwarrow", + "nwnear", + "oS", + "oacut", + "oacute", + "oast", + "ocir", + "ocirc", + "ocy", + "odash", + "odblac", + "odiv", + "odot", + "odsold", + "oelig", + "ofcir", + "ofr", + "ogon", + "ograv", + "ograve", + "ogt", + "ohbar", + "ohm", + "oint", + "olarr", + "olcir", + "olcross", + "oline", + "olt", + "omacr", + "omega", + "omicron", + "omid", + "ominus", + "oopf", + "opar", + "operp", + "oplus", + "or", + "orarr", + "ord", + "order", + 
"orderof", + "ordf", + "ordm", + "origof", + "oror", + "orslope", + "orv", + "oscr", + "oslas", + "oslash", + "osol", + "otild", + "otilde", + "otimes", + "otimesas", + "oum", + "ouml", + "ovbar", + "par", + "para", + "parallel", + "parsim", + "parsl", + "part", + "pcy", + "percnt", + "period", + "permil", + "perp", + "pertenk", + "pfr", + "phi", + "phiv", + "phmmat", + "phone", + "pi", + "pitchfork", + "piv", + "planck", + "planckh", + "plankv", + "plus", + "plusacir", + "plusb", + "pluscir", + "plusdo", + "plusdu", + "pluse", + "plusm", + "plusmn", + "plussim", + "plustwo", + "pm", + "pointint", + "popf", + "poun", + "pound", + "pr", + "prE", + "prap", + "prcue", + "pre", + "prec", + "precapprox", + "preccurlyeq", + "preceq", + "precnapprox", + "precneqq", + "precnsim", + "precsim", + "prime", + "primes", + "prnE", + "prnap", + "prnsim", + "prod", + "profalar", + "profline", + "profsurf", + "prop", + "propto", + "prsim", + "prurel", + "pscr", + "psi", + "puncsp", + "qfr", + "qint", + "qopf", + "qprime", + "qscr", + "quaternions", + "quatint", + "quest", + "questeq", + "quo", + "quot", + "rAarr", + "rArr", + "rAtail", + "rBarr", + "rHar", + "race", + "racute", + "radic", + "raemptyv", + "rang", + "rangd", + "range", + "rangle", + "raqu", + "raquo", + "rarr", + "rarrap", + "rarrb", + "rarrbfs", + "rarrc", + "rarrfs", + "rarrhk", + "rarrlp", + "rarrpl", + "rarrsim", + "rarrtl", + "rarrw", + "ratail", + "ratio", + "rationals", + "rbarr", + "rbbrk", + "rbrace", + "rbrack", + "rbrke", + "rbrksld", + "rbrkslu", + "rcaron", + "rcedil", + "rceil", + "rcub", + "rcy", + "rdca", + "rdldhar", + "rdquo", + "rdquor", + "rdsh", + "real", + "realine", + "realpart", + "reals", + "rect", + "re", + "reg", + "rfisht", + "rfloor", + "rfr", + "rhard", + "rharu", + "rharul", + "rho", + "rhov", + "rightarrow", + "rightarrowtail", + "rightharpoondown", + "rightharpoonup", + "rightleftarrows", + "rightleftharpoons", + "rightrightarrows", + "rightsquigarrow", + "rightthreetimes", + "ring", + "risingdotseq", + "rlarr", + "rlhar", + "rlm", + "rmoust", + "rmoustache", + "rnmid", + "roang", + "roarr", + "robrk", + "ropar", + "ropf", + "roplus", + "rotimes", + "rpar", + "rpargt", + "rppolint", + "rrarr", + "rsaquo", + "rscr", + "rsh", + "rsqb", + "rsquo", + "rsquor", + "rthree", + "rtimes", + "rtri", + "rtrie", + "rtrif", + "rtriltri", + "ruluhar", + "rx", + "sacute", + "sbquo", + "sc", + "scE", + "scap", + "scaron", + "sccue", + "sce", + "scedil", + "scirc", + "scnE", + "scnap", + "scnsim", + "scpolint", + "scsim", + "scy", + "sdot", + "sdotb", + "sdote", + "seArr", + "searhk", + "searr", + "searrow", + "sec", + "sect", + "semi", + "seswar", + "setminus", + "setmn", + "sext", + "sfr", + "sfrown", + "sharp", + "shchcy", + "shcy", + "shortmid", + "shortparallel", + "sh", + "shy", + "sigma", + "sigmaf", + "sigmav", + "sim", + "simdot", + "sime", + "simeq", + "simg", + "simgE", + "siml", + "simlE", + "simne", + "simplus", + "simrarr", + "slarr", + "smallsetminus", + "smashp", + "smeparsl", + "smid", + "smile", + "smt", + "smte", + "smtes", + "softcy", + "sol", + "solb", + "solbar", + "sopf", + "spades", + "spadesuit", + "spar", + "sqcap", + "sqcaps", + "sqcup", + "sqcups", + "sqsub", + "sqsube", + "sqsubset", + "sqsubseteq", + "sqsup", + "sqsupe", + "sqsupset", + "sqsupseteq", + "squ", + "square", + "squarf", + "squf", + "srarr", + "sscr", + "ssetmn", + "ssmile", + "sstarf", + "star", + "starf", + "straightepsilon", + "straightphi", + "strns", + "sub", + "subE", + "subdot", + "sube", + "subedot", + "submult", + "subnE", + 
"subne", + "subplus", + "subrarr", + "subset", + "subseteq", + "subseteqq", + "subsetneq", + "subsetneqq", + "subsim", + "subsub", + "subsup", + "succ", + "succapprox", + "succcurlyeq", + "succeq", + "succnapprox", + "succneqq", + "succnsim", + "succsim", + "sum", + "sung", + "sup", + "sup1", + "sup2", + "sup3", + "supE", + "supdot", + "supdsub", + "supe", + "supedot", + "suphsol", + "suphsub", + "suplarr", + "supmult", + "supnE", + "supne", + "supplus", + "supset", + "supseteq", + "supseteqq", + "supsetneq", + "supsetneqq", + "supsim", + "supsub", + "supsup", + "swArr", + "swarhk", + "swarr", + "swarrow", + "swnwar", + "szli", + "szlig", + "target", + "tau", + "tbrk", + "tcaron", + "tcedil", + "tcy", + "tdot", + "telrec", + "tfr", + "there4", + "therefore", + "theta", + "thetasym", + "thetav", + "thickapprox", + "thicksim", + "thinsp", + "thkap", + "thksim", + "thor", + "thorn", + "tilde", + "time", + "times", + "timesb", + "timesbar", + "timesd", + "tint", + "toea", + "top", + "topbot", + "topcir", + "topf", + "topfork", + "tosa", + "tprime", + "trade", + "triangle", + "triangledown", + "triangleleft", + "trianglelefteq", + "triangleq", + "triangleright", + "trianglerighteq", + "tridot", + "trie", + "triminus", + "triplus", + "trisb", + "tritime", + "trpezium", + "tscr", + "tscy", + "tshcy", + "tstrok", + "twixt", + "twoheadleftarrow", + "twoheadrightarrow", + "uArr", + "uHar", + "uacut", + "uacute", + "uarr", + "ubrcy", + "ubreve", + "ucir", + "ucirc", + "ucy", + "udarr", + "udblac", + "udhar", + "ufisht", + "ufr", + "ugrav", + "ugrave", + "uharl", + "uharr", + "uhblk", + "ulcorn", + "ulcorner", + "ulcrop", + "ultri", + "umacr", + "um", + "uml", + "uogon", + "uopf", + "uparrow", + "updownarrow", + "upharpoonleft", + "upharpoonright", + "uplus", + "upsi", + "upsih", + "upsilon", + "upuparrows", + "urcorn", + "urcorner", + "urcrop", + "uring", + "urtri", + "uscr", + "utdot", + "utilde", + "utri", + "utrif", + "uuarr", + "uum", + "uuml", + "uwangle", + "vArr", + "vBar", + "vBarv", + "vDash", + "vangrt", + "varepsilon", + "varkappa", + "varnothing", + "varphi", + "varpi", + "varpropto", + "varr", + "varrho", + "varsigma", + "varsubsetneq", + "varsubsetneqq", + "varsupsetneq", + "varsupsetneqq", + "vartheta", + "vartriangleleft", + "vartriangleright", + "vcy", + "vdash", + "vee", + "veebar", + "veeeq", + "vellip", + "verbar", + "vert", + "vfr", + "vltri", + "vnsub", + "vnsup", + "vopf", + "vprop", + "vrtri", + "vscr", + "vsubnE", + "vsubne", + "vsupnE", + "vsupne", + "vzigzag", + "wcirc", + "wedbar", + "wedge", + "wedgeq", + "weierp", + "wfr", + "wopf", + "wp", + "wr", + "wreath", + "wscr", + "xcap", + "xcirc", + "xcup", + "xdtri", + "xfr", + "xhArr", + "xharr", + "xi", + "xlArr", + "xlarr", + "xmap", + "xnis", + "xodot", + "xopf", + "xoplus", + "xotime", + "xrArr", + "xrarr", + "xscr", + "xsqcup", + "xuplus", + "xutri", + "xvee", + "xwedge", + "yacut", + "yacute", + "yacy", + "ycirc", + "ycy", + "ye", + "yen", + "yfr", + "yicy", + "yopf", + "yscr", + "yucy", + "yum", + "yuml", + "zacute", + "zcaron", + "zcy", + "zdot", + "zeetrf", + "zeta", + "zfr", + "zhcy", + "zigrarr", + "zopf", + "zscr", + "zwj", + "zwnj", +]; + +/// List of values corresponding to names of named +/// [character references][character_reference]. +/// +/// The corresponding names of this list are stored in +/// [`CHARACTER_REFERENCE_NAMES`][]. +/// They correspond through their index. 
+/// +/// ## References +/// +/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_VALUES: [&str; 2222] = [ + "Æ", "Æ", "&", "&", "Á", "Á", "Ă", "Â", "Â", "А", "𝔄", "À", "À", "Α", "Ā", "⩓", "Ą", "𝔸", "⁡", + "Å", "Å", "𝒜", "≔", "Ã", "Ã", "Ä", "Ä", "∖", "⫧", "⌆", "Б", "∵", "ℬ", "Β", "𝔅", "𝔹", "˘", "ℬ", + "≎", "Ч", "©", "©", "Ć", "⋒", "ⅅ", "ℭ", "Č", "Ç", "Ç", "Ĉ", "∰", "Ċ", "¸", "·", "ℭ", "Χ", "⊙", + "⊖", "⊕", "⊗", "∲", "”", "’", "∷", "⩴", "≡", "∯", "∮", "ℂ", "∐", "∳", "⨯", "𝒞", "⋓", "≍", "ⅅ", + "⤑", "Ђ", "Ѕ", "Џ", "‡", "↡", "⫤", "Ď", "Д", "∇", "Δ", "𝔇", "´", "˙", "˝", "`", "˜", "⋄", "ⅆ", + "𝔻", "¨", "⃜", "≐", "∯", "¨", "⇓", "⇐", "⇔", "⫤", "⟸", "⟺", "⟹", "⇒", "⊨", "⇑", "⇕", "∥", "↓", + "⤓", "⇵", "̑", "⥐", "⥞", "↽", "⥖", "⥟", "⇁", "⥗", "⊤", "↧", "⇓", "𝒟", "Đ", "Ŋ", "Ð", "Ð", "É", + "É", "Ě", "Ê", "Ê", "Э", "Ė", "𝔈", "È", "È", "∈", "Ē", "◻", "▫", "Ę", "𝔼", "Ε", "⩵", "≂", "⇌", + "ℰ", "⩳", "Η", "Ë", "Ë", "∃", "ⅇ", "Ф", "𝔉", "◼", "▪", "𝔽", "∀", "ℱ", "ℱ", "Ѓ", ">", ">", "Γ", + "Ϝ", "Ğ", "Ģ", "Ĝ", "Г", "Ġ", "𝔊", "⋙", "𝔾", "≥", "⋛", "≧", "⪢", "≷", "⩾", "≳", "𝒢", "≫", "Ъ", + "ˇ", "^", "Ĥ", "ℌ", "ℋ", "ℍ", "─", "ℋ", "Ħ", "≎", "≏", "Е", "IJ", "Ё", "Í", "Í", "Î", "Î", "И", + "İ", "ℑ", "Ì", "Ì", "ℑ", "Ī", "ⅈ", "⇒", "∬", "∫", "⋂", "⁣", "⁢", "Į", "𝕀", "Ι", "ℐ", "Ĩ", "І", + "Ï", "Ï", "Ĵ", "Й", "𝔍", "𝕁", "𝒥", "Ј", "Є", "Х", "Ќ", "Κ", "Ķ", "К", "𝔎", "𝕂", "𝒦", "Љ", "<", + "<", "Ĺ", "Λ", "⟪", "ℒ", "↞", "Ľ", "Ļ", "Л", "⟨", "←", "⇤", "⇆", "⌈", "⟦", "⥡", "⇃", "⥙", "⌊", + "↔", "⥎", "⊣", "↤", "⥚", "⊲", "⧏", "⊴", "⥑", "⥠", "↿", "⥘", "↼", "⥒", "⇐", "⇔", "⋚", "≦", "≶", + "⪡", "⩽", "≲", "𝔏", "⋘", "⇚", "Ŀ", "⟵", "⟷", "⟶", "⟸", "⟺", "⟹", "𝕃", "↙", "↘", "ℒ", "↰", "Ł", + "≪", "⤅", "М", " ", "ℳ", "𝔐", "∓", "𝕄", "ℳ", "Μ", "Њ", "Ń", "Ň", "Ņ", "Н", "\u{200B}", + "\u{200B}", "\u{200B}", "\u{200B}", "≫", "≪", "\n", "𝔑", "\u{2060}", " ", "ℕ", "⫬", "≢", "≭", + "∦", "∉", "≠", "≂̸", "∄", "≯", "≱", "≧̸", "≫̸", "≹", "⩾̸", "≵", "≎̸", "≏̸", "⋪", "⧏̸", "⋬", "≮", "≰", + "≸", "≪̸", "⩽̸", "≴", "⪢̸", "⪡̸", "⊀", "⪯̸", "⋠", "∌", "⋫", "⧐̸", "⋭", "⊏̸", "⋢", "⊐̸", "⋣", "⊂⃒", "⊈", + "⊁", "⪰̸", "⋡", "≿̸", "⊃⃒", "⊉", "≁", "≄", "≇", "≉", "∤", "𝒩", "Ñ", "Ñ", "Ν", "Œ", "Ó", "Ó", "Ô", + "Ô", "О", "Ő", "𝔒", "Ò", "Ò", "Ō", "Ω", "Ο", "𝕆", "“", "‘", "⩔", "𝒪", "Ø", "Ø", "Õ", "Õ", "⨷", + "Ö", "Ö", "‾", "⏞", "⎴", "⏜", "∂", "П", "𝔓", "Φ", "Π", "±", "ℌ", "ℙ", "⪻", "≺", "⪯", "≼", "≾", + "″", "∏", "∷", "∝", "𝒫", "Ψ", "\"", "\"", "𝔔", "ℚ", "𝒬", "⤐", "®", "®", "Ŕ", "⟫", "↠", "⤖", + "Ř", "Ŗ", "Р", "ℜ", "∋", "⇋", "⥯", "ℜ", "Ρ", "⟩", "→", "⇥", "⇄", "⌉", "⟧", "⥝", "⇂", "⥕", "⌋", + "⊢", "↦", "⥛", "⊳", "⧐", "⊵", "⥏", "⥜", "↾", "⥔", "⇀", "⥓", "⇒", "ℝ", "⥰", "⇛", "ℛ", "↱", "⧴", + "Щ", "Ш", "Ь", "Ś", "⪼", "Š", "Ş", "Ŝ", "С", "𝔖", "↓", "←", "→", "↑", "Σ", "∘", "𝕊", "√", "□", + "⊓", "⊏", "⊑", "⊐", "⊒", "⊔", "𝒮", "⋆", "⋐", "⋐", "⊆", "≻", "⪰", "≽", "≿", "∋", "∑", "⋑", "⊃", + "⊇", "⋑", "Þ", "Þ", "™", "Ћ", "Ц", "\t", "Τ", "Ť", "Ţ", "Т", "𝔗", "∴", "Θ", "  ", " ", "∼", + "≃", "≅", "≈", "𝕋", "⃛", "𝒯", "Ŧ", "Ú", "Ú", "↟", "⥉", "Ў", "Ŭ", "Û", "Û", "У", "Ű", "𝔘", "Ù", + "Ù", "Ū", "_", "⏟", "⎵", "⏝", "⋃", "⊎", "Ų", "𝕌", "↑", "⤒", "⇅", "↕", "⥮", "⊥", "↥", "⇑", "⇕", + "↖", "↗", "ϒ", "Υ", "Ů", "𝒰", "Ũ", "Ü", "Ü", "⊫", "⫫", "В", "⊩", "⫦", "⋁", "‖", "‖", "∣", "|", + "❘", "≀", " ", "𝔙", "𝕍", "𝒱", "⊪", "Ŵ", "⋀", "𝔚", "𝕎", "𝒲", "𝔛", "Ξ", "𝕏", "𝒳", "Я", "Ї", "Ю", + "Ý", "Ý", "Ŷ", "Ы", "𝔜", "𝕐", "𝒴", "Ÿ", 
"Ж", "Ź", "Ž", "З", "Ż", "\u{200B}", "Ζ", "ℨ", "ℤ", + "𝒵", "á", "á", "ă", "∾", "∾̳", "∿", "â", "â", "´", "´", "а", "æ", "æ", "⁡", "𝔞", "à", "à", "ℵ", + "ℵ", "α", "ā", "⨿", "&", "&", "∧", "⩕", "⩜", "⩘", "⩚", "∠", "⦤", "∠", "∡", "⦨", "⦩", "⦪", "⦫", + "⦬", "⦭", "⦮", "⦯", "∟", "⊾", "⦝", "∢", "Å", "⍼", "ą", "𝕒", "≈", "⩰", "⩯", "≊", "≋", "'", "≈", + "≊", "å", "å", "𝒶", "*", "≈", "≍", "ã", "ã", "ä", "ä", "∳", "⨑", "⫭", "≌", "϶", "‵", "∽", "⋍", + "⊽", "⌅", "⌅", "⎵", "⎶", "≌", "б", "„", "∵", "∵", "⦰", "϶", "ℬ", "β", "ℶ", "≬", "𝔟", "⋂", "◯", + "⋃", "⨀", "⨁", "⨂", "⨆", "★", "▽", "△", "⨄", "⋁", "⋀", "⤍", "⧫", "▪", "▴", "▾", "◂", "▸", "␣", + "▒", "░", "▓", "█", "=⃥", "≡⃥", "⌐", "𝕓", "⊥", "⊥", "⋈", "╗", "╔", "╖", "╓", "═", "╦", "╩", "╤", + "╧", "╝", "╚", "╜", "╙", "║", "╬", "╣", "╠", "╫", "╢", "╟", "⧉", "╕", "╒", "┐", "┌", "─", "╥", + "╨", "┬", "┴", "⊟", "⊞", "⊠", "╛", "╘", "┘", "└", "│", "╪", "╡", "╞", "┼", "┤", "├", "‵", "˘", + "¦", "¦", "𝒷", "⁏", "∽", "⋍", "\\", "⧅", "⟈", "•", "•", "≎", "⪮", "≏", "≏", "ć", "∩", "⩄", "⩉", + "⩋", "⩇", "⩀", "∩︀", "⁁", "ˇ", "⩍", "č", "ç", "ç", "ĉ", "⩌", "⩐", "ċ", "¸", "¸", "⦲", "¢", "¢", + "·", "𝔠", "ч", "✓", "✓", "χ", "○", "⧃", "ˆ", "≗", "↺", "↻", "®", "Ⓢ", "⊛", "⊚", "⊝", "≗", "⨐", + "⫯", "⧂", "♣", "♣", ":", "≔", "≔", ",", "@", "∁", "∘", "∁", "ℂ", "≅", "⩭", "∮", "𝕔", "∐", "©", + "©", "℗", "↵", "✗", "𝒸", "⫏", "⫑", "⫐", "⫒", "⋯", "⤸", "⤵", "⋞", "⋟", "↶", "⤽", "∪", "⩈", "⩆", + "⩊", "⊍", "⩅", "∪︀", "↷", "⤼", "⋞", "⋟", "⋎", "⋏", "¤", "¤", "↶", "↷", "⋎", "⋏", "∲", "∱", "⌭", + "⇓", "⥥", "†", "ℸ", "↓", "‐", "⊣", "⤏", "˝", "ď", "д", "ⅆ", "‡", "⇊", "⩷", "°", "°", "δ", "⦱", + "⥿", "𝔡", "⇃", "⇂", "⋄", "⋄", "♦", "♦", "¨", "ϝ", "⋲", "÷", "÷", "÷", "⋇", "⋇", "ђ", "⌞", "⌍", + "$", "𝕕", "˙", "≐", "≑", "∸", "∔", "⊡", "⌆", "↓", "⇊", "⇃", "⇂", "⤐", "⌟", "⌌", "𝒹", "ѕ", "⧶", + "đ", "⋱", "▿", "▾", "⇵", "⥯", "⦦", "џ", "⟿", "⩷", "≑", "é", "é", "⩮", "ě", "ê", "ê", "≕", "э", + "ė", "ⅇ", "≒", "𝔢", "⪚", "è", "è", "⪖", "⪘", "⪙", "⏧", "ℓ", "⪕", "⪗", "ē", "∅", "∅", "∅", " ", + " ", " ", "ŋ", " ", "ę", "𝕖", "⋕", "⧣", "⩱", "ε", "ε", "ϵ", "≖", "≕", "≂", "⪖", "⪕", "=", "≟", + "≡", "⩸", "⧥", "≓", "⥱", "ℯ", "≐", "≂", "η", "ð", "ð", "ë", "ë", "€", "!", "∃", "ℰ", "ⅇ", "≒", + "ф", "♀", "ffi", "ff", "ffl", "𝔣", "fi", "fj", "♭", "fl", "▱", "ƒ", "𝕗", "∀", "⋔", "⫙", "⨍", "¼", "½", + "⅓", "¼", "⅕", "⅙", "⅛", "⅔", "⅖", "¾", "¾", "⅗", "⅜", "⅘", "⅚", "⅝", "⅞", "⁄", "⌢", "𝒻", "≧", + "⪌", "ǵ", "γ", "ϝ", "⪆", "ğ", "ĝ", "г", "ġ", "≥", "⋛", "≥", "≧", "⩾", "⩾", "⪩", "⪀", "⪂", "⪄", + "⋛︀", "⪔", "𝔤", "≫", "⋙", "ℷ", "ѓ", "≷", "⪒", "⪥", "⪤", "≩", "⪊", "⪊", "⪈", "⪈", "≩", "⋧", "𝕘", + "`", "ℊ", "≳", "⪎", "⪐", ">", ">", "⪧", "⩺", "⋗", "⦕", "⩼", "⪆", "⥸", "⋗", "⋛", "⪌", "≷", "≳", + "≩︀", "≩︀", "⇔", " ", "½", "ℋ", "ъ", "↔", "⥈", "↭", "ℏ", "ĥ", "♥", "♥", "…", "⊹", "𝔥", "⤥", "⤦", + "⇿", "∻", "↩", "↪", "𝕙", "―", "𝒽", "ℏ", "ħ", "⁃", "‐", "í", "í", "⁣", "î", "î", "и", "е", "¡", + "¡", "⇔", "𝔦", "ì", "ì", "ⅈ", "⨌", "∭", "⧜", "℩", "ij", "ī", "ℑ", "ℐ", "ℑ", "ı", "⊷", "Ƶ", "∈", + "℅", "∞", "⧝", "ı", "∫", "⊺", "ℤ", "⊺", "⨗", "⨼", "ё", "į", "𝕚", "ι", "⨼", "¿", "¿", "𝒾", "∈", + "⋹", "⋵", "⋴", "⋳", "∈", "⁢", "ĩ", "і", "ï", "ï", "ĵ", "й", "𝔧", "ȷ", "𝕛", "𝒿", "ј", "є", "κ", + "ϰ", "ķ", "к", "𝔨", "ĸ", "х", "ќ", "𝕜", "𝓀", "⇚", "⇐", "⤛", "⤎", "≦", "⪋", "⥢", "ĺ", "⦴", "ℒ", + "λ", "⟨", "⦑", "⟨", "⪅", "«", "«", "←", "⇤", "⤟", "⤝", "↩", "↫", "⤹", "⥳", "↢", "⪫", "⤙", "⪭", + "⪭︀", "⤌", "❲", "{", "[", "⦋", "⦏", "⦍", "ľ", "ļ", "⌈", "{", "л", "⤶", "“", "„", "⥧", "⥋", "↲", + "≤", "←", "↢", "↽", "↼", "⇇", "↔", "⇆", "⇋", "↭", "⋋", "⋚", "≤", "≦", "⩽", "⩽", "⪨", 
"⩿", "⪁", + "⪃", "⋚︀", "⪓", "⪅", "⋖", "⋚", "⪋", "≶", "≲", "⥼", "⌊", "𝔩", "≶", "⪑", "↽", "↼", "⥪", "▄", "љ", + "≪", "⇇", "⌞", "⥫", "◺", "ŀ", "⎰", "⎰", "≨", "⪉", "⪉", "⪇", "⪇", "≨", "⋦", "⟬", "⇽", "⟦", "⟵", + "⟷", "⟼", "⟶", "↫", "↬", "⦅", "𝕝", "⨭", "⨴", "∗", "_", "◊", "◊", "⧫", "(", "⦓", "⇆", "⌟", "⇋", + "⥭", "‎", "⊿", "‹", "𝓁", "↰", "≲", "⪍", "⪏", "[", "‘", "‚", "ł", "<", "<", "⪦", "⩹", "⋖", "⋋", + "⋉", "⥶", "⩻", "⦖", "◃", "⊴", "◂", "⥊", "⥦", "≨︀", "≨︀", "∺", "¯", "¯", "♂", "✠", "✠", "↦", "↦", + "↧", "↤", "↥", "▮", "⨩", "м", "—", "∡", "𝔪", "℧", "µ", "µ", "∣", "*", "⫰", "·", "·", "−", "⊟", + "∸", "⨪", "⫛", "…", "∓", "⊧", "𝕞", "∓", "𝓂", "∾", "μ", "⊸", "⊸", "⋙̸", "≫⃒", "≫̸", "⇍", "⇎", "⋘̸", + "≪⃒", "≪̸", "⇏", "⊯", "⊮", "∇", "ń", "∠⃒", "≉", "⩰̸", "≋̸", "ʼn", "≉", "♮", "♮", "ℕ", " ", " ", "≎̸", + "≏̸", "⩃", "ň", "ņ", "≇", "⩭̸", "⩂", "н", "–", "≠", "⇗", "⤤", "↗", "↗", "≐̸", "≢", "⤨", "≂̸", "∄", + "∄", "𝔫", "≧̸", "≱", "≱", "≧̸", "⩾̸", "⩾̸", "≵", "≯", "≯", "⇎", "↮", "⫲", "∋", "⋼", "⋺", "∋", "њ", + "⇍", "≦̸", "↚", "‥", "≰", "↚", "↮", "≰", "≦̸", "⩽̸", "⩽̸", "≮", "≴", "≮", "⋪", "⋬", "∤", "𝕟", "¬", + "¬", "∉", "⋹̸", "⋵̸", "∉", "⋷", "⋶", "∌", "∌", "⋾", "⋽", "∦", "∦", "⫽⃥", "∂̸", "⨔", "⊀", "⋠", "⪯̸", + "⊀", "⪯̸", "⇏", "↛", "⤳̸", "↝̸", "↛", "⋫", "⋭", "⊁", "⋡", "⪰̸", "𝓃", "∤", "∦", "≁", "≄", "≄", "∤", + "∦", "⋢", "⋣", "⊄", "⫅̸", "⊈", "⊂⃒", "⊈", "⫅̸", "⊁", "⪰̸", "⊅", "⫆̸", "⊉", "⊃⃒", "⊉", "⫆̸", "≹", "ñ", + "ñ", "≸", "⋪", "⋬", "⋫", "⋭", "ν", "#", "№", " ", "⊭", "⤄", "≍⃒", "⊬", "≥⃒", ">⃒", "⧞", "⤂", "≤⃒", + "<⃒", "⊴⃒", "⤃", "⊵⃒", "∼⃒", "⇖", "⤣", "↖", "↖", "⤧", "Ⓢ", "ó", "ó", "⊛", "ô", "ô", "о", "⊝", "ő", + "⨸", "⊙", "⦼", "œ", "⦿", "𝔬", "˛", "ò", "ò", "⧁", "⦵", "Ω", "∮", "↺", "⦾", "⦻", "‾", "⧀", "ō", + "ω", "ο", "⦶", "⊖", "𝕠", "⦷", "⦹", "⊕", "∨", "↻", "º", "ℴ", "ℴ", "ª", "º", "⊶", "⩖", "⩗", "⩛", + "ℴ", "ø", "ø", "⊘", "õ", "õ", "⊗", "⨶", "ö", "ö", "⌽", "¶", "¶", "∥", "⫳", "⫽", "∂", "п", "%", + ".", "‰", "⊥", "‱", "𝔭", "φ", "ϕ", "ℳ", "☎", "π", "⋔", "ϖ", "ℏ", "ℎ", "ℏ", "+", "⨣", "⊞", "⨢", + "∔", "⨥", "⩲", "±", "±", "⨦", "⨧", "±", "⨕", "𝕡", "£", "£", "≺", "⪳", "⪷", "≼", "⪯", "≺", "⪷", + "≼", "⪯", "⪹", "⪵", "⋨", "≾", "′", "ℙ", "⪵", "⪹", "⋨", "∏", "⌮", "⌒", "⌓", "∝", "∝", "≾", "⊰", + "𝓅", "ψ", " ", "𝔮", "⨌", "𝕢", "⁗", "𝓆", "ℍ", "⨖", "?", "≟", "\"", "\"", "⇛", "⇒", "⤜", "⤏", + "⥤", "∽̱", "ŕ", "√", "⦳", "⟩", "⦒", "⦥", "⟩", "»", "»", "→", "⥵", "⇥", "⤠", "⤳", "⤞", "↪", "↬", + "⥅", "⥴", "↣", "↝", "⤚", "∶", "ℚ", "⤍", "❳", "}", "]", "⦌", "⦎", "⦐", "ř", "ŗ", "⌉", "}", "р", + "⤷", "⥩", "”", "”", "↳", "ℜ", "ℛ", "ℜ", "ℝ", "▭", "®", "®", "⥽", "⌋", "𝔯", "⇁", "⇀", "⥬", "ρ", + "ϱ", "→", "↣", "⇁", "⇀", "⇄", "⇌", "⇉", "↝", "⋌", "˚", "≓", "⇄", "⇌", "‏", "⎱", "⎱", "⫮", "⟭", + "⇾", "⟧", "⦆", "𝕣", "⨮", "⨵", ")", "⦔", "⨒", "⇉", "›", "𝓇", "↱", "]", "’", "’", "⋌", "⋊", "▹", + "⊵", "▸", "⧎", "⥨", "℞", "ś", "‚", "≻", "⪴", "⪸", "š", "≽", "⪰", "ş", "ŝ", "⪶", "⪺", "⋩", "⨓", + "≿", "с", "⋅", "⊡", "⩦", "⇘", "⤥", "↘", "↘", "§", "§", ";", "⤩", "∖", "∖", "✶", "𝔰", "⌢", "♯", + "щ", "ш", "∣", "∥", "\u{AD}", "\u{AD}", "σ", "ς", "ς", "∼", "⩪", "≃", "≃", "⪞", "⪠", "⪝", "⪟", + "≆", "⨤", "⥲", "←", "∖", "⨳", "⧤", "∣", "⌣", "⪪", "⪬", "⪬︀", "ь", "/", "⧄", "⌿", "𝕤", "♠", "♠", + "∥", "⊓", "⊓︀", "⊔", "⊔︀", "⊏", "⊑", "⊏", "⊑", "⊐", "⊒", "⊐", "⊒", "□", "□", "▪", "▪", "→", "𝓈", + "∖", "⌣", "⋆", "☆", "★", "ϵ", "ϕ", "¯", "⊂", "⫅", "⪽", "⊆", "⫃", "⫁", "⫋", "⊊", "⪿", "⥹", "⊂", + "⊆", "⫅", "⊊", "⫋", "⫇", "⫕", "⫓", "≻", "⪸", "≽", "⪰", "⪺", "⪶", "⋩", "≿", "∑", "♪", "⊃", "¹", + "²", "³", "⫆", "⪾", "⫘", "⊇", "⫄", "⟉", "⫗", "⥻", "⫂", "⫌", "⊋", "⫀", "⊃", "⊇", "⫆", "⊋", "⫌", + 
"⫈", "⫔", "⫖", "⇙", "⤦", "↙", "↙", "⤪", "ß", "ß", "⌖", "τ", "⎴", "ť", "ţ", "т", "⃛", "⌕", "𝔱", + "∴", "∴", "θ", "ϑ", "ϑ", "≈", "∼", " ", "≈", "∼", "þ", "þ", "˜", "×", "×", "⊠", "⨱", "⨰", "∭", + "⤨", "⊤", "⌶", "⫱", "𝕥", "⫚", "⤩", "‴", "™", "▵", "▿", "◃", "⊴", "≜", "▹", "⊵", "◬", "≜", "⨺", + "⨹", "⧍", "⨻", "⏢", "𝓉", "ц", "ћ", "ŧ", "≬", "↞", "↠", "⇑", "⥣", "ú", "ú", "↑", "ў", "ŭ", "û", + "û", "у", "⇅", "ű", "⥮", "⥾", "𝔲", "ù", "ù", "↿", "↾", "▀", "⌜", "⌜", "⌏", "◸", "ū", "¨", "¨", + "ų", "𝕦", "↑", "↕", "↿", "↾", "⊎", "υ", "ϒ", "υ", "⇈", "⌝", "⌝", "⌎", "ů", "◹", "𝓊", "⋰", "ũ", + "▵", "▴", "⇈", "ü", "ü", "⦧", "⇕", "⫨", "⫩", "⊨", "⦜", "ϵ", "ϰ", "∅", "ϕ", "ϖ", "∝", "↕", "ϱ", + "ς", "⊊︀", "⫋︀", "⊋︀", "⫌︀", "ϑ", "⊲", "⊳", "в", "⊢", "∨", "⊻", "≚", "⋮", "|", "|", "𝔳", "⊲", "⊂⃒", + "⊃⃒", "𝕧", "∝", "⊳", "𝓋", "⫋︀", "⊊︀", "⫌︀", "⊋︀", "⦚", "ŵ", "⩟", "∧", "≙", "℘", "𝔴", "𝕨", "℘", "≀", + "≀", "𝓌", "⋂", "◯", "⋃", "▽", "𝔵", "⟺", "⟷", "ξ", "⟸", "⟵", "⟼", "⋻", "⨀", "𝕩", "⨁", "⨂", "⟹", + "⟶", "𝓍", "⨆", "⨄", "△", "⋁", "⋀", "ý", "ý", "я", "ŷ", "ы", "¥", "¥", "𝔶", "ї", "𝕪", "𝓎", "ю", + "ÿ", "ÿ", "ź", "ž", "з", "ż", "ℨ", "ζ", "𝔷", "ж", "⇝", "𝕫", "𝓏", "‍", "‌", +]; diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs new file mode 100644 index 0000000..7b7962b --- /dev/null +++ b/src/construct/blank_line.rs @@ -0,0 +1,61 @@ +//! Blank lines are a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! blank_line ::= *(' ' '\t') +//! ``` +//! +//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! from another paragraph. +//! In several cases, blank lines are not needed between flow constructs, +//! such as between two headings. +//! Sometimes, whether blank lines are present, changes the behavior of how +//! HTML is rendered, such as whether blank lines are present between list +//! items in a list. +//! More than one blank line is never needed in `CommonMark`. +//! +//! Because blank lines can be empty (line endings are not considered part of +//! it), and events cannot be empty, blank lines are not present as a token. +//! +//! ## References +//! +//! * [`blank-line.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/blank-line.js) +//! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) +//! +//! + +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a blank line. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace), + |_ok| Box::new(after), + )(tokenizer, code) +} + +/// After zero or more spaces or tabs, before a line ending or EOF. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +fn after(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs new file mode 100644 index 0000000..5ea995e --- /dev/null +++ b/src/construct/character_escape.rs @@ -0,0 +1,69 @@ +//! 
Character escapes are a construct that occurs in the string and text +//! content types. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! character_escape ::= '\\' ascii_punctuation +//! ``` +//! +//! Like much of markdown, there are no “invalid” character escapes: just a +//! slash, or a slash followed by anything other than an ASCII punctuation +//! character, is exactly that: just a slash. +//! To escape (most) arbitrary characters, use a +//! [character reference][] instead +//! (as in, `&`, `{`, or say ` `). +//! It is also possible to escape a line ending in text with a similar +//! construct: a backslash followed by a line ending (that is part of the +//! construct instead of ending it). +//! +//! ## References +//! +//! * [`character-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-escape.js) +//! * [*§ 2.4 Backslash escapes* in `CommonMark`](https://spec.commonmark.org/0.30/#backslash-escapes) +//! +//! [character reference]: crate::construct::character_reference +//! +//! + +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a character escape. +/// +/// ```markdown +/// a|\*b +/// a|\b +/// a|\ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('\\') => { + tokenizer.enter(TokenType::CharacterEscape); + tokenizer.enter(TokenType::CharacterEscapeMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterEscapeMarker); + (State::Fn(Box::new(inside)), None) + } + _ => (State::Nok, None), + } +} + +/// Inside a character escape, after `\`. +/// +/// ```markdown +/// a\|*b +/// a\|b +/// a\| b +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char.is_ascii_punctuation() => { + tokenizer.enter(TokenType::CharacterEscapeValue); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterEscapeValue); + tokenizer.exit(TokenType::CharacterEscape); + (State::Ok, None) + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs new file mode 100644 index 0000000..27275d5 --- /dev/null +++ b/src/construct/character_reference.rs @@ -0,0 +1,237 @@ +//! Character references are a construct that occurs in the string and text +//! content types. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! character_reference ::= '&' (numeric | named) ';' +//! +//! numeric ::= '#' (hexadecimal | decimal) +//! ; Note: Limit of `6` imposed as all bigger numbers are invalid: +//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit) +//! ; Note: Limit of `7` imposed as all bigger numbers are invalid: +//! decimal ::= 1*7(ascii_digit) +//! ; Note: Limit of `31` imposed by `CounterClockwiseContourIntegral`: +//! ; Note: Limited to any known named character reference (see `constants.rs`) +//! named ::= 1*31(ascii_alphanumeric) +//! ``` +//! +//! Like much of markdown, there are no “invalid” character references. +//! However, for security reasons, several numeric character references parse +//! fine but are not rendered as their corresponding character and they are +//! instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`). +//! See [`decode_numeric_character_reference`][decode_numeric] for more info. +//! +//! To escape ASCII punctuation characters, use the terser +//! [character escape][character_escape] construct instead (as in, `\&`). +//! +//! 
Character references in markdown are not the same as character references +//! in HTML. +//! Notably, HTML allows several character references without a closing +//! semicolon. +//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info. +//! +//! Character references are parsed insensitive to casing. +//! The casing of hexadecimal numeric character references has no effect. +//! The casing of named character references does not matter when parsing them, +//! but does affect whether they match. +//! Depending on the name, one or more cases are allowed, such as that `AMP` +//! and `amp` are both allowed but other cases are not. +//! See [`CHARACTER_REFERENCE_NAMES`][character_reference_names] for which +//! names match. +//! +//! ## References +//! +//! * [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js) +//! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +//! +//! [character_escape]: crate::construct::character_reference +//! [decode_numeric]: crate::util::decode_numeric_character_reference +//! [character_reference_names]: crate::constant::CHARACTER_REFERENCE_NAMES +//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state +//! +//! + +use crate::constant::{ + CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + CHARACTER_REFERENCE_NAMED_SIZE_MAX, CHARACTER_REFERENCE_NAMES, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of a character reference. +#[derive(Debug, Clone)] +pub enum Kind { + /// Numeric decimal character reference (` `). + Decimal, + /// Numeric hexadecimal character reference (`{`). + Hexadecimal, + /// Named character reference (`&`). + Named, +} + +/// State needed to parse character references. +#[derive(Debug, Clone)] +struct Info { + /// All parsed characters. + buffer: Vec, + /// Kind of character reference. + kind: Kind, +} + +/// Start of a character reference. +/// +/// ```markdown +/// a|&b +/// a|{b +/// a| b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char('&') => { + tokenizer.enter(TokenType::CharacterReference); + tokenizer.enter(TokenType::CharacterReferenceMarker); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarker); + (State::Fn(Box::new(open)), None) + } + _ => (State::Nok, None), + } +} + +/// Inside a character reference, after `&`, before `#` for numeric references +/// or an alphanumeric for named references. +/// +/// ```markdown +/// a&|amp;b +/// a&|#123;b +/// a&|#x9;b +/// ``` +fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::Char('#') = code { + tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric); + (State::Fn(Box::new(numeric)), None) + } else { + tokenizer.enter(TokenType::CharacterReferenceValue); + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Named, + }, + ) + } +} + +/// Inside a numeric character reference, right before `x` for hexadecimals, +/// or a digit for decimals. 
+/// +/// ```markdown +/// a&#|123;b +/// a&#|x9;b +/// ``` +fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::Char(char) if char == 'x' || char == 'X' => { + tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal); + tokenizer.enter(TokenType::CharacterReferenceValue); + + ( + State::Fn(Box::new(|tokenizer, code| { + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Hexadecimal, + }, + ) + })), + None, + ) + } + _ => { + tokenizer.enter(TokenType::CharacterReferenceValue); + + value( + tokenizer, + code, + Info { + buffer: vec![], + kind: Kind::Decimal, + }, + ) + } + } +} + +/// Inside a character reference value, after the markers (`&#x`, `&#`, or +/// `&`) that define its kind, but before the `;`. +/// The character reference kind defines what and how many characters are +/// allowed. +/// +/// ```markdown +/// a&a|mp;b +/// a|23;b +/// a&#x|9;b +/// ``` +fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { + match code { + Code::Char(';') if !info.buffer.is_empty() => { + tokenizer.exit(TokenType::CharacterReferenceValue); + let value = info.buffer.iter().collect::(); + + if let Kind::Named = info.kind { + if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) { + return (State::Nok, Some(vec![code])); + } + } + + tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); + tokenizer.exit(TokenType::CharacterReference); + (State::Ok, None) + } + Code::Char(char) => { + let len = info.buffer.len(); + + let cont = match info.kind { + Kind::Hexadecimal + if char.is_ascii_hexdigit() + && len < CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX => + { + true + } + Kind::Decimal + if char.is_ascii_digit() && len < CHARACTER_REFERENCE_DECIMAL_SIZE_MAX => + { + true + } + Kind::Named + if char.is_ascii_alphanumeric() && len < CHARACTER_REFERENCE_NAMED_SIZE_MAX => + { + true + } + _ => false, + }; + + if cont { + let mut clone = info; + clone.buffer.push(char); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| value(tokenizer, code, clone))), + None, + ) + } else { + (State::Nok, None) + } + } + _ => (State::Nok, None), + } +} diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs new file mode 100644 index 0000000..2068a62 --- /dev/null +++ b/src/construct/code_fenced.rs @@ -0,0 +1,581 @@ +//! Code (fenced) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ] +//! +//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab +//! ; Restriction: the number of markers in the closing fence sequence must be +//! ; equal to or greater than the number of markers in the opening fence +//! ; sequence. +//! ; Restriction: the marker in the closing fence sequence must match the +//! ; marker in the opening fence sequence +//! fence_close ::= sequence *space_or_tab +//! sequence ::= 3*'`' | 3*'~' +//! info ::= 1*text +//! meta ::= 1*text *( *space_or_tab 1*text ) +//! +//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the +//! ; marker of the opening fence sequence. +//! text ::= code - eol - space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! code ::= . 
; any unicode code point (other than line endings). +//! ``` +//! +//! The above grammar does not show how whitespace is handled. +//! To parse code (fenced), let `X` be the number of whitespace characters +//! before the opening fence sequence. +//! Each line of content is then allowed (not required) to be indented with up +//! to `X` spaces or tabs, which are then ignored as an indent instead of being +//! considered as part of the code. +//! This indent does not affect the closing fence. +//! It can be indented up to a separate 3 spaces or tabs. +//! A bigger indent makes it part of the code instead of a fence. +//! +//! Code (fenced) relates to both the `
<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! The optional `meta` part is ignored: it is not used when parsing or
+//! rendering.
+//! The optional `info` part is used and is expected to specify the programming
+//! language that the code is in.
+//! Which value it holds depends on what your syntax highlighter supports, if
+//! one is used.
+//! The `info` is, when rendering to HTML, typically exposed as a class.
+//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
+//! element*][html-code]).
+//! For example:
+//!
+//! ```markdown
+//! ~~~css
+//! * { color: tomato }
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code class="language-css">* { color: tomato }
+//! </code></pre>
+//! ``` +//! +//! The `info` and `meta` parts are interpreted as the string content type. +//! That means that character escapes and character reference are allowed. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! It is also possible to create code with the +//! [code (indented)][code-indented] construct. +//! That construct is less explicit, different from code (text), and has no +//! support for specifying the programming language, so it is recommended to +//! use code (fenced) instead of code (indented). +//! +//! ## References +//! +//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) +//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) +//! +//! [code-indented]: crate::construct::code_indented +//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! + +use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::get_span; + +/// Kind of fences. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { + /// Grave accent (tick) code. + GraveAccent, + /// Tilde code. + Tilde, +} + +/// State needed to parse code (fenced). +#[derive(Debug, Clone)] +struct Info { + /// Number of markers on the opening fence sequence. + size: usize, + /// Number of tabs or spaces of indentation before the opening fence + /// sequence. + prefix: usize, + /// Kind of fences. + kind: Kind, +} + +/// Start of fenced code. +/// +/// ```markdown +/// | ~~~js +/// console.log(1); +/// ~~~ +/// ``` +/// +/// Parsing note: normally, the prefix is already stripped. +/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need +/// it. +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFenced); + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(before_sequence_open), + )(tokenizer, code) +} + +/// Inside the opening fence, after an optional prefix, before a sequence. +/// +/// ```markdown +/// |~~~js +/// console.log(1); +/// ~~~ +/// ``` +fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + match code { + Code::Char(char) if char == '`' || char == '~' => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + sequence_open( + tokenizer, + Info { + prefix, + size: 0, + kind: if char == '`' { + Kind::GraveAccent + } else { + Kind::Tilde + }, + }, + code, + ) + } + _ => (State::Nok, None), + } +} + +/// Inside the opening fence sequence. 
+/// +/// ```markdown +/// ~|~~js +/// console.log(1); +/// ~~~ +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + let mut info = info; + info.size += 1; + sequence_open(tokenizer, info, code) + })), + None, + ) + } + _ => { + if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN { + (State::Nok, None) + } else { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| { + whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace) + }, + |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)), + )(tokenizer, code) + } + } + } +} + +/// Inside the opening fence, after the sequence (and optional whitespace), before the info. +/// +/// ```markdown +/// ~~~|js +/// console.log(1); +/// ~~~ +/// ``` +fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceInfo); + tokenizer.enter(TokenType::ChunkString); + info_inside(tokenizer, info, code, vec![]) + } + } +} + +/// Inside the opening fence info. +/// +/// ```markdown +/// ~~~j|s +/// console.log(1); +/// ~~~ +/// ``` +fn info_inside( + tokenizer: &mut Tokenizer, + info: Info, + code: Code, + codes: Vec, +) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + println!("to do: subtokenize: {:?}", codes); + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceInfo); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)), + )(tokenizer, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + Code::Char(_) => { + let mut codes = codes; + codes.push(code); + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + info_inside(tokenizer, info, code, codes) + })), + None, + ) + } + } +} + +/// Inside the opening fence, after the info and whitespace, before the meta. +/// +/// ```markdown +/// ~~~js |eval +/// console.log(1); +/// ~~~ +/// ``` +fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.enter(TokenType::CodeFencedFenceMeta); + tokenizer.enter(TokenType::ChunkString); + meta(tokenizer, info, code) + } + } +} + +/// Inside the opening fence meta. 
+/// +/// ```markdown +/// ~~~js e|val +/// console.log(1); +/// ~~~ +/// ``` +fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ChunkString); + tokenizer.exit(TokenType::CodeFencedFenceMeta); + tokenizer.exit(TokenType::CodeFencedFence); + at_break(tokenizer, info, code) + } + Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))), + None, + ) + } + } +} + +/// At an eol/eof in code, before a closing fence or before content. +/// +/// ```markdown +/// ~~~js| +/// aa| +/// ~~~ +/// ``` +fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let clone = info.clone(); + + match code { + Code::None => after(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt( + |tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + close_before(tokenizer, info, code) + })), + None, + ) + }, + |ok| { + if ok { + Box::new(after) + } else { + Box::new(|tokenizer, code| { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + ( + State::Fn(Box::new(|tokenizer, code| { + content_start(tokenizer, clone, code) + })), + None, + ) + }) + } + }, + )(tokenizer, code), + _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), + } +} + +/// Before a closing fence, before optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// | ~~~ +/// ``` +fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + tokenizer.enter(TokenType::CodeFencedFence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)), + )(tokenizer, code) +} + +/// In a closing fence, after optional whitespace, before sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// |~~~ +/// ``` +fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + // To do: 4+ should be okay if code (indented) is turned off! + if prefix >= TAB_SIZE { + return (State::Nok, None); + } + + match code { + Code::Char(char) if char == marker => { + tokenizer.enter(TokenType::CodeFencedFenceSequence); + close_sequence(tokenizer, info, code, 0) + } + _ => (State::Nok, None), + } +} + +/// In the closing fence sequence. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~|~~ +/// ``` +fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult { + let marker = if info.kind == Kind::GraveAccent { + '`' + } else { + '~' + }; + + match code { + Code::Char(char) if char == marker => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + close_sequence(tokenizer, info, code, size + 1) + })), + None, + ) + } + _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { + tokenizer.exit(TokenType::CodeFencedFenceSequence); + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), + |_ok| Box::new(close_whitespace_after), + )(tokenizer, code) + } + _ => (State::Nok, None), + } +} + +/// After the closing fence sequence after optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~ | +/// ``` +fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFencedFence); + (State::Ok, Some(vec![code])) + } + _ => (State::Nok, None), + } +} + +/// Before code content, definitely not before a closing fence. +/// +/// ```markdown +/// ~~~js +/// |aa +/// ~~~ +/// ``` +fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + at_break(tokenizer, info, code) + } + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => { + tokenizer.enter(TokenType::Whitespace); + content_prefix(tokenizer, info, 0, code) + } + _ => { + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// Before code content, in a prefix. +/// +/// ```markdown +/// ~~~js +/// | aa +/// ~~~ +/// ``` +fn content_prefix( + tokenizer: &mut Tokenizer, + info: Info, + prefix: usize, + code: Code, +) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + content_prefix(tokenizer, info, prefix + 1, code) + })), + None, + ) + } + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::Whitespace); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.exit(TokenType::Whitespace); + tokenizer.enter(TokenType::CodeFlowChunk); + content_continue(tokenizer, info, code) + } + } +} + +/// In code content. +/// +/// ```markdown +/// ~~~js +/// |ab +/// a|b +/// ab| +/// ~~~ +/// ``` +fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::CodeFlowChunk); + at_break(tokenizer, info, code) + } + _ => { + tokenizer.consume(code); + ( + State::Fn(Box::new(|tokenizer, code| { + content_continue(tokenizer, info, code) + })), + None, + ) + } + } +} + +/// After fenced code. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::CodeFenced); + (State::Ok, Some(vec![code])) +} diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs new file mode 100644 index 0000000..6bf089b --- /dev/null +++ b/src/construct/code_indented.rs @@ -0,0 +1,190 @@ +//! Code (indented) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line ) +//! +//! ; Restriction: at least one `code` must not be whitespace. +//! indented_filled_line ::= 4space_or_tab *code +//! blank_line ::= *space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Code (indented) relates to both the `
<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! In markdown, it is also possible to use code (text) in the text content
+//! type.
+//! It is also possible to create code with the [code (fenced)][code-fenced]
+//! construct.
+//! That construct is more explicit, more similar to code (text), and has
+//! support for specifying the programming language that the code is in, so it
+//! is recommended to use that instead of indented code.
+//!
+//! ## References
+//!
+//! *   [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js)
+//! *   [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks)
+//!
+//! [code-fenced]: crate::construct::code_fenced
+//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+//!
+//! 
+
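For a feel of what this construct produces end to end, a test-style sketch (assuming the crate-root `micromark` function that the test files use; expected HTML per CommonMark § 4.4):

```rust
#[test]
fn code_indented_basic() {
    // A line indented with four spaces becomes indented code.
    assert_eq!(micromark("    asd"), "<pre><code>asd\n</code></pre>");
}
```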
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of code (indented).
+///
+/// ```markdown
+/// |    asd
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.enter(TokenType::CodeIndented);
+            tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+            indent(tokenizer, code, 0)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
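Since `start` accepts both spaces and tabs (with `Code::VirtualSpace` presumably covering expanded tab columns), a single tab already satisfies the four-column requirement. A hedged sketch, same `micromark` assumption as above:

```rust
#[test]
fn code_indented_tab() {
    // A tab alone spans the required four columns (CommonMark § 2.2).
    assert_eq!(micromark("\tasd"), "<pre><code>asd\n</code></pre>");
}
```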
+/// Inside the initial whitespace.
+///
+/// ```markdown
+///  |   asd
+///   |  asd
+///    | asd
+///     |asd
+/// ```
+///
+/// > **Parsing note**: there is no need to check whether this first line is a
+/// > filled line (one containing a non-whitespace character), because blank
+/// > lines are parsed before this construct, so we never run into that case.
+fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        _ if size == TAB_SIZE => {
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            at_break(tokenizer, code)
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    indent(tokenizer, code, size + 1)
+                })),
+                None,
+            )
+        }
+        _ => (State::Nok, None),
+    }
+}
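Because `indent` stops consuming at exactly `TAB_SIZE` markers, whitespace beyond the prefix is handed to the content states and survives in the output. Sketch, same `micromark` assumption:

```rust
#[test]
fn code_indented_extra_whitespace() {
    // Four of the six spaces form the prefix; the other two are content.
    assert_eq!(micromark("      asd"), "<pre><code>  asd\n</code></pre>");
}
```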
+
+/// At a break.
+///
+/// ```markdown
+///     |asd
+///     asd|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => after(tokenizer, code),
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
+            .attempt(further_start, |ok| {
+                Box::new(if ok { at_break } else { after })
+            })(tokenizer, code),
+        _ => {
+            tokenizer.enter(TokenType::CodeFlowChunk);
+            content(tokenizer, code)
+        }
+    }
+}
+
+/// Inside code content.
+///
+/// ```markdown
+///     |ab
+///     a|b
+///     ab|
+/// ```
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::CodeFlowChunk);
+            at_break(tokenizer, code)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(content)), None)
+        }
+    }
+}
+
+/// After indented code.
+///
+/// ```markdown
+///     ab|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::CodeIndented);
+    (State::Ok, Some(vec![code]))
+}
+
+/// Right at a line ending, trying to parse another indent.
+///
+/// ```markdown
+///     ab|
+///     cd
+/// ```
+fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // To do: `nok` if lazy line.
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (State::Fn(Box::new(further_start)), None)
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+            further_indent(tokenizer, code, 0)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// Inside further whitespace.
+///
+/// ```markdown
+///     asd
+///   |  asd
+/// ```
+fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+    match code {
+        _ if size == TAB_SIZE => {
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            (State::Ok, Some(vec![code]))
+        }
+        Code::VirtualSpace | Code::Char(' ' | '\t') => {
+            tokenizer.consume(code);
+            (
+                State::Fn(Box::new(move |tokenizer, code| {
+                    further_indent(tokenizer, code, size + 1)
+                })),
+                None,
+            )
+        }
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+            further_start(tokenizer, code)
+        }
+        _ => (State::Nok, None),
+    }
+}
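The `further_start`/`further_indent` pair is what keeps blank lines inside a single code block instead of splitting it, matching the `*( eol *( blank_line eol ) indented_filled_line )` part of the grammar. Sketch, same `micromark` assumption:

```rust
#[test]
fn code_indented_blank_line() {
    // The blank line stays inside the single code block.
    assert_eq!(micromark("    a\n\n    b"), "<pre><code>a\n\nb\n</code></pre>");
}
```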
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
new file mode 100644
index 0000000..b3aef1b
--- /dev/null
+++ b/src/construct/heading_atx.rs
@@ -0,0 +1,175 @@
+//! Heading (atx) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab
+//!
+//! code ::= . ; any unicode code point (other than line endings).
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
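Concretely, the grammar above admits ranks one through six and an optional closing sequence (examples assume the crate-root `micromark` function; expected HTML per CommonMark § 4.2):

```rust
#[test]
fn heading_atx_basics() {
    assert_eq!(micromark("## alpha"), "<h2>alpha</h2>");
    // An optional closing sequence is allowed and dropped from the output.
    assert_eq!(micromark("## alpha ##"), "<h2>alpha</h2>");
    // Seven markers exceed the maximum rank of six, so this is a paragraph.
    assert_eq!(micromark("####### alpha"), "<p>####### alpha</p>");
}
```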
+//! Headings in markdown relate to the `<h1>` through `<h6>
` elements in HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! `CommonMark` introduced the requirement on whitespace existing after the +//! opening sequence and before text. +//! In older markdown versions, this was not required, and headings would form +//! without it. +//! +//! In markdown, it is also possible to create headings with the setext heading +//! construct. +//! The benefit of setext headings is that their text can include line endings. +//! However, their limit is that they cannot form `<h3>` through `<h6>
` +//! headings. +//! Due to this limitation, it is recommended to use atx headings. +//! +//! > 🏛 **Background**: the word *setext* originates from a small markup +//! > language by Ian Feldman from 1991. +//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > The word *atx* originates from a tiny markup language by Aaron Swartz +//! > from 2002. +//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for +//! > more info. +//! +//! ## References +//! +//! * [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js) +//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings) +//! +//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [atx]: http://www.aaronsw.com/2002/atx/ +//! +//! + +use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a heading (atx). +/// +/// ```markdown +/// |## alpha +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if Code::Char('#') == code { + tokenizer.enter(TokenType::AtxHeading); + tokenizer.enter(TokenType::AtxHeadingSequence); + sequence_open(tokenizer, code, 0) + } else { + (State::Nok, None) + } +} + +/// In the opening sequence. +/// +/// ```markdown +/// #|# alpha +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult { + match code { + Code::None + | Code::CarriageReturnLineFeed + | Code::VirtualSpace + | Code::Char('\t' | '\n' | '\r' | ' ') + if rank > 0 => + { + tokenizer.exit(TokenType::AtxHeadingSequence); + at_break(tokenizer, code) + } + Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |tokenizer, code| { + sequence_open(tokenizer, code, rank + 1) + })), + None, + ) + } + _ => (State::Nok, None), + } +} + +/// After something but before something else. +/// +/// ```markdown +/// ## |alpha +/// ## alpha| bravo +/// ## alpha |bravo +/// ## alpha bravo|## +/// ## alpha bravo ##| +/// ``` +fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::AtxHeading); + (State::Ok, Some(vec![code])) + } + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.enter(TokenType::AtxHeadingWhitespace); + whitespace(tokenizer, code) + } + Code::Char('#') => { + tokenizer.enter(TokenType::AtxHeadingSequence); + further_sequence(tokenizer, code) + } + Code::Char(_) => { + tokenizer.enter(TokenType::AtxHeadingText); + data(tokenizer, code) + } + } +} + +/// In a further sequence (after whitespace). +/// Could be normal “visible” hashes in the heading or a final sequence. +/// +/// ```markdown +/// ## alpha #|# +/// ``` +fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::Char('#') = code { + tokenizer.consume(code); + (State::Fn(Box::new(further_sequence)), None) + } else { + tokenizer.exit(TokenType::AtxHeadingSequence); + at_break(tokenizer, code) + } +} + +/// In whitespace. 
+/// +/// ```markdown +/// ## alpha | bravo +/// ``` +fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::VirtualSpace | Code::Char('\t' | ' ') => { + tokenizer.consume(code); + (State::Fn(Box::new(whitespace)), None) + } + _ => { + tokenizer.exit(TokenType::AtxHeadingWhitespace); + at_break(tokenizer, code) + } + } +} + +/// In text. +/// +/// ```markdown +/// ## al|pha +/// ``` +fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { + tokenizer.exit(TokenType::AtxHeadingText); + at_break(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(data)), None) + } + } +} diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs new file mode 100644 index 0000000..b7d5570 --- /dev/null +++ b/src/construct/html_flow.rs @@ -0,0 +1,1068 @@ +//! HTML (flow) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete +//! +//! ; Note: closing tag name need to match opening tag name. +//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '' *line | *line *( eol *line ) [ '-->' *line ] ] +//! instruction ::= '' *line | *line *( eol *line ) [ '?>' *line ] ] +//! declaration ::= '' *line ] +//! cdata ::= '' *line ] +//! basic ::= '< [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ] +//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional ) +//! +//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive. +//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive. +//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>' +//! closing_tag ::= '' +//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) +//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] +//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) +//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`') +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ space_or_tab ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! The grammar for HTML in markdown does not resemble the rules of parsing +//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML +//! spec][html-parsing]. +//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?) +//! attempt to parse an XML-like language. +//! By extension, another notable property of the grammar is that it can +//! result in invalid HTML, in that it allows things that wouldn’t work or +//! wouldn’t work well in HTML, such as mismatched tags. +//! +//! Because the **basic** and **complete** productions in the grammar form with +//! a tag, followed by more stuff, and stop at a blank line, it is possible to +//! interleave (a word for switching between languages) markdown and HTML +//! together, by placing the opening and closing tags on their own lines, +//! 
with blank lines between them and markdown. +//! For example: +//! +//! ```markdown +//!
<div>This is a div but *this* is not emphasis.</div>
+//! +//! <div>
+//! +//! This is a paragraph in a `div` and *this* is emphasis. +//! +//! </div>
+//! ``` +//! +//! The **complete** production of HTML (flow) is not allowed to interrupt +//! content. +//! That means that a blank line is needed between a paragraph and it. +//! However, HTML (text) has a similar production, which will typically kick-in +//! instead. +//! +//! The list of tag names allowed in the **raw** production are defined in +//! [`HTML_RAW_NAMES`][html_raw_names]. +//! This production exists because there are a few cases where markdown +//! *inside* some elements, and hence interleaving, does not make sense. +//! +//! The list of tag names allowed in the **basic** production are defined in +//! [`HTML_BLOCK_NAMES`][html_block_names]. +//! This production exists because there are a few cases where we can decide +//! early that something is going to be a flow (block) element instead of a +//! phrasing (inline) element. +//! We *can* interrupt and don’t have to care too much about it being +//! well-formed. +//! +//! ## References +//! +//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js) +//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) +//! +//! [html_raw_names]: crate::constant::HTML_RAW_NAMES +//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES +//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +//! +//! + +use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; +use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of HTML (flow). +#[derive(Debug, Clone, PartialEq)] +enum Kind { + /// Not yet known. + Unknown, + /// Symbol for ` +okay", + DANGER + ), + " +

okay

", + "should support raw script tags" + ); + + assert_eq!( + micromark_with_options( + " +okay", + DANGER + ), + " +

okay

", + "should support raw style tags" + ); + + assert_eq!( + micromark_with_options("\n\nfoo", DANGER), + "\n\nfoo", + "should support raw tags w/o ending" + ); + + // To do: phrasing. + // assert_eq!( + // micromark_with_options("\n*foo*", DANGER), + // "\n

foo

", + // "should support raw tags w/ start and end on a single line" + // ); + + assert_eq!( + micromark_with_options("1. *bar*", DANGER), + "1. *bar*", + "should support raw tags w/ more data on ending line" + ); + + assert_eq!( + micromark_with_options("</script\nmore

", + // "should not support a raw closing tag" + // ); + + assert_eq!( + micromark_with_options("", DANGER), + "", + "should support blank lines in raw" + ); + + // To do: block quote. + // assert_eq!( + // micromark_with_options(">