diff options
Diffstat (limited to '')
34 files changed, 8997 insertions, 0 deletions
| diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..201f7b7 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.rs] +indent_size = 4 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..cbee315 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,24 @@ +name: main +on: +  - pull_request +  - push +jobs: +  main: +    name: ${{matrix.rust}} +    runs-on: ubuntu-latest +    steps: +      - uses: actions/checkout@v2 +      - uses: actions-rs/toolchain@v1 +        with: +          toolchain: ${{matrix.rust}} +          components: rustfmt, clippy +      - run: cargo clippy -- -W clippy::pedantic +      - run: cargo fmt --all -- --check +      - run: cargo test +      - run: cargo install cargo-tarpaulin && cargo tarpaulin --out Xml +      - uses: codecov/codecov-action@v1 +    strategy: +      matrix: +        rust: +          - stable +          - beta diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..32a28f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.DS_Store +*.log +*.lock +coverage/ +target diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..96f23d7 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "micromark" +version = "0.0.0" +authors = ["Titus Wormer <tituswormer@gmail.com>"] +edition = "2015" +rust-version = "1.56" +description = "small commonmark compliant markdown parser with positional info and concrete tokens" +homepage = "https://github.com/micromark/micromark-rs" +repository = "https://github.com/micromark/micromark-rs" +license = "MIT" +keywords = ["commonmark", "markdown", "parse", "render", "tokenize"] +categories = ["compilers", "encoding", "parser-implementations", "parsing", "text-processing"] +include = ["src/", "license"] +publish = false 
+ +[dependencies] +log = "0.4" +env_logger = "0.9" diff --git a/Untitled.txt b/Untitled.txt new file mode 100644 index 0000000..cc1576f --- /dev/null +++ b/Untitled.txt @@ -0,0 +1 @@ +micromark.js: unquoted: is `completeAttributeValueUnquoted`s case for `completeAttributeNameAfter` missing a `/`?. I’ve added it here. diff --git a/examples/lib.rs b/examples/lib.rs new file mode 100644 index 0000000..4d01161 --- /dev/null +++ b/examples/lib.rs @@ -0,0 +1,22 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, CompileOptions}; + +fn main() { +    // Turn on debugging. +    // You can show it with `RUST_LOG=debug cargo run --example lib` +    env_logger::init(); + +    // Safely turn (untrusted?) markdown into HTML. +    println!("{:?}", micromark("# Hello, world!")); + +    // Turn trusted markdown into HTML. +    println!( +        "{:?}", +        micromark_with_options( +            "<div style=\"color: tomato\">\n\n# Hello, tomato!\n\n</div>", +            &CompileOptions { +                allow_dangerous_html: true +            } +        ) +    ); +} diff --git a/funding.yml b/funding.yml new file mode 100644 index 0000000..dee132d --- /dev/null +++ b/funding.yml @@ -0,0 +1 @@ +github: wooorm @@ -0,0 +1,22 @@ +(The MIT License) + +Copyright (c) 2022 Titus Wormer <tituswormer@gmail.com> + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +'Software'), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..8892183 --- /dev/null +++ b/readme.md @@ -0,0 +1,183 @@ +# micromark-rs + +Here be dragons! +🐉 +There’s a lot to do. +Some major to dos are described here, more smaller ones are in the code. + +## Some useful scripts for now + +Run examples: + +```sh +RUST_BACKTRACE=1 RUST_LOG=debug cargo run --example lib +``` + +Format: + +```sh +cargo fmt --all +``` + +Lint: + +```sh +cargo fmt --all -- --check && cargo clippy -- -W clippy::pedantic +``` + +Tests: + +```sh +RUST_BACKTRACE=1 cargo test +``` + +Docs: + +```sh +cargo doc --document-private-items +``` + +(add `--open` to open them in a browser) + +## To do + +### Some major obstacles + +- [ ] (8) Subtokenization: figure out a good, fast way to deal with constructs in +      one content type that also are another content type +- [ ] (1) Setext headings: can they be solved in content, or do they have to be +      solved in flow somehow +- [ ] (8) Can content (and to a lesser extent string and text) operate more +      performantly than checking whether other flow constructs start a line, +      before exiting and actually attempting flow constructs? 
+- [ ] (5) Figure out definitions and sharing those identifiers, and references +      before definitions +- [ ] (3) Interrupting: sometimes flow can or cannot start depending on the +      previous construct (typically paragraph) +- [ ] (5) Containers: this will be rather messy, and depends a lot on how +      subtokenization is solved +- [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into by +      containers +- [ ] (3) Lazy lines, in containers, in flow and content in a paragraph, a line +      does not need to be indented +- [ ] (5) There’s a lot of rust-related choosing whether to pass (mutable) +      references or whatever around that should be refactored +- [ ] (5) Figure out extensions +- [ ] (1) Support turning off constructs + +### Small things + +- [ ] (3) Clean compiler +- [ ] (1) Optionally remove dangerous protocols when compiling +- [ ] (1) Use preferred line ending style in markdown +- [ ] (1) Handle BOM at start +- [ ] (1) Make sure tabs are handled properly and that positional info is perfect +- [ ] (1) Make sure crlf/cr/lf are working perfectly +- [ ] (3) Figure out lifetimes of things (see `life time` in source) +- [ ] (3) Use `commonmark` tests +- [ ] (3) Share a bunch of tests with `micromark-js` +- [ ] (5) Do some research on rust best practices for APIs, e.g., what to accept, +      how to integrate with streams or so? +- [ ] (1) Go through clippy rules, and such, to add strict code styles +- [ ] (1) Make sure that rust character groups match CM character groups (e.g., is +      `unicode_whitespace` or so the same?) +- [ ] (1) Any special handling of surrogates? +- [ ] (1) Make sure debugging is useful for other folks +- [ ] (3) Add some benchmarks, do some perf testing +- [ ] (3) Write comparison to other parsers +- [ ] (3) Add node/etc bindings? 
+- [ ] (8) After all extensions, including MDX, are done, see if we can integrate +      this with SWC to compile MDX +- [ ] (3) Bunch of docs +- [ ] (5) Site + +### Constructs + +- [ ] (5) attention (strong, emphasis) (text) +- [ ] (1) autolink +- [x] blank line +- [ ] (5) block quote +- [x] character escape +- [x] character reference +- [x] code (fenced) +- [x] code (indented) +- [ ] (1) code (text) +- [ ] (3) content +- [ ] (3) definition +- [ ] (1) hard break escape +- [x] heading (atx) +- [ ] (1) heading (setext) +- [x] html (flow) +- [ ] html (text) +- [ ] (3) label end +- [ ] (3) label start (image) +- [ ] (3) label start (link) +- [ ] (8) list +- [ ] (1) paragraph +- [x] thematic break + +### Content types + +- [ ] (8) container +  - [ ] block quote +  - [ ] list +- [ ] (1) flow +  - [x] blank line +  - [x] code (fenced) +  - [x] code (indented) +  - [ ] content +  - [x] heading (atx) +  - [x] html (flow) +  - [x] thematic break +- [ ] (3) content +  - [ ] definition +  - [ ] heading (setext) +  - [ ] paragraph +- [ ] (5) text +  - [ ] attention (strong, emphasis) (text) +  - [ ] autolink +  - [x] character escape +  - [x] character reference +  - [ ] code (text) +  - [ ] hard break escape +  - [ ] html (text) +  - [ ] label end +  - [ ] label start (image) +  - [ ] label start (link) +- [x] string +  - [x] character escape +  - [x] character reference + +### Extensions + +The main thing here is is to figure out if folks could extend from the outside +with their own code, or if we need to maintain it all here. +Regardless, it is essential for the launch of `micromark-rs` that extensions +are theoretically or practically possible. +The extensions below are listed from top to bottom from more important to less +important. 
+ +- [ ] (1) frontmatter (yaml, toml) (flow) +      — [`micromark-extension-frontmatter`](https://github.com/micromark/micromark-extension-frontmatter) +- [ ] (3) autolink literal (GFM) (text) +      — [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal) +- [ ] (3) footnote (GFM) (content, text) +      — [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote) +- [ ] (3) strikethrough (GFM) (text) +      — [`micromark-extension-gfm-strikethrough`](https://github.com/micromark/micromark-extension-gfm-strikethrough) +- [ ] (5) table (GFM) (flow) +      — [`micromark-extension-gfm-table`](https://github.com/micromark/micromark-extension-gfm-table) +- [ ] (1) task list item (GFM) (text) +      — [`micromark-extension-gfm-task-list-item`](https://github.com/micromark/micromark-extension-gfm-task-list-item) +- [ ] (3) math (flow, text) +      — [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math) +- [ ] (8) directive (flow, text) +      — [`micromark-extension-directive`](https://github.com/micromark/micromark-extension-directive) +- [ ] (8) expression (MDX) (flow, text) +      — [`micromark-extension-mdx-expression`](https://github.com/micromark/micromark-extension-mdx-expression) +- [ ] (5) JSX (MDX) (flow, text) +      — [`micromark-extension-mdx-jsx`](https://github.com/micromark/micromark-extension-mdx-jsx) +- [ ] (3) ESM (MDX) (flow) +      — [`micromark-extension-mdxjs-esm`](https://github.com/micromark/micromark-extension-mdxjs-esm) +- [ ] (1) tagfilter (GFM) (n/a, renderer) +      — [`micromark-extension-gfm-tagfilter`](https://github.com/micromark/micromark-extension-gfm-tagfilter) diff --git a/src/compiler.rs b/src/compiler.rs new file mode 100644 index 0000000..166950e --- /dev/null +++ b/src/compiler.rs @@ -0,0 +1,367 @@ +//! Turn events into a string of HTML. 
+use crate::construct::character_reference::Kind as CharacterReferenceKind; +use crate::tokenizer::{Code, Event, EventType, TokenType}; +use crate::util::{ +    decode_named_character_reference, decode_numeric_character_reference, encode, get_span, +    slice_serialize, +}; + +/// Configuration (optional). +#[derive(Default, Debug)] +pub struct CompileOptions { +    /// Whether to allow (dangerous) HTML. +    /// The default is `false`, you can turn it on to `true` for trusted +    /// content. +    pub allow_dangerous_html: bool, +} + +/// Turn events and codes into a string of HTML. +#[allow(clippy::too_many_lines)] +pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String { +    let mut index = 0; +    // let mut last_was_tag = false; +    let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]]; +    let mut atx_opening_sequence_size: Option<usize> = None; +    let mut atx_heading_buffer: Option<String> = None; +    let mut code_flow_seen_data: Option<bool> = None; +    let mut code_fenced_fences_count: Option<usize> = None; +    let mut slurp_one_line_ending = false; +    let mut ignore_encode = false; +    let mut character_reference_kind: Option<CharacterReferenceKind> = None; +    // let mut slurp_all_line_endings = false; + +    println!("events: {:#?}", events); + +    while index < events.len() { +        let event = &events[index]; +        let token_type = &event.token_type; + +        match event.event_type { +            EventType::Enter => match token_type { +                TokenType::Content => { +                    buf_tail_mut(buffers).push("<p>".to_string()); +                } +                TokenType::CodeIndented => { +                    code_flow_seen_data = Some(false); +                    line_ending_if_needed(buffers); +                    buf_tail_mut(buffers).push("<pre><code>".to_string()); +                } +                TokenType::CodeFenced => { +                    code_flow_seen_data = 
Some(false); +                    line_ending_if_needed(buffers); +                    // Note: no `>`, which is added later. +                    buf_tail_mut(buffers).push("<pre><code".to_string()); +                    code_fenced_fences_count = Some(0); +                } +                TokenType::CodeFencedFenceInfo | TokenType::CodeFencedFenceMeta => { +                    buffer(buffers); +                } +                TokenType::HtmlFlow => { +                    line_ending_if_needed(buffers); +                    if options.allow_dangerous_html { +                        ignore_encode = true; +                    } +                } +                TokenType::ContentPhrasing +                | TokenType::AtxHeading +                | TokenType::AtxHeadingSequence +                | TokenType::AtxHeadingWhitespace +                | TokenType::AtxHeadingText +                | TokenType::LineEnding +                | TokenType::ThematicBreak +                | TokenType::ThematicBreakSequence +                | TokenType::ThematicBreakWhitespace +                | TokenType::CodeIndentedPrefixWhitespace +                | TokenType::CodeFlowChunk +                | TokenType::BlankLineEnding +                | TokenType::BlankLineWhitespace +                | TokenType::Whitespace +                | TokenType::HtmlFlowData +                | TokenType::CodeFencedFence +                | TokenType::CodeFencedFenceSequence +                | TokenType::ChunkString +                | TokenType::CodeFencedFenceWhitespace +                | TokenType::Data +                | TokenType::CharacterEscape +                | TokenType::CharacterEscapeMarker +                | TokenType::CharacterEscapeValue +                | TokenType::CharacterReference +                | TokenType::CharacterReferenceMarker +                | TokenType::CharacterReferenceMarkerNumeric +                | TokenType::CharacterReferenceMarkerHexadecimal +                | 
TokenType::CharacterReferenceMarkerSemi +                | TokenType::CharacterReferenceValue => {} +                #[allow(unreachable_patterns)] +                _ => { +                    unreachable!("unhandled `enter` of TokenType {:?}", token_type) +                } +            }, +            EventType::Exit => match token_type { +                TokenType::ThematicBreakSequence +                | TokenType::ThematicBreakWhitespace +                | TokenType::CodeIndentedPrefixWhitespace +                | TokenType::BlankLineEnding +                | TokenType::BlankLineWhitespace +                | TokenType::Whitespace +                | TokenType::CodeFencedFenceSequence +                | TokenType::CodeFencedFenceWhitespace +                | TokenType::CharacterEscape +                | TokenType::CharacterEscapeMarker +                | TokenType::CharacterReference +                | TokenType::CharacterReferenceMarkerSemi => {} +                TokenType::HtmlFlow => { +                    ignore_encode = false; +                } +                TokenType::HtmlFlowData => { +                    let slice = slice_serialize(codes, &get_span(events, index), false); + +                    let res = if ignore_encode { slice } else { encode(&slice) }; + +                    // last_was_tag = false; +                    buf_tail_mut(buffers).push(res); +                } +                TokenType::Content => { +                    buf_tail_mut(buffers).push("</p>".to_string()); +                } +                TokenType::CodeIndented | TokenType::CodeFenced => { +                    let seen_data = +                        code_flow_seen_data.expect("`code_flow_seen_data` must be defined"); + +                    // To do: containers. +                    // One special case is if we are inside a container, and the fenced code was +                    // not closed (meaning it runs to the end). 
+                    // In that case, the following line ending, is considered *outside* the +                    // fenced code and block quote by micromark, but CM wants to treat that +                    // ending as part of the code. +                    // if fenced_count != None && fenced_count < 2 && tightStack.length > 0 && !last_was_tag { +                    //     line_ending(); +                    // } + +                    // But in most cases, it’s simpler: when we’ve seen some data, emit an extra +                    // line ending when needed. +                    if seen_data { +                        line_ending_if_needed(buffers); +                    } + +                    buf_tail_mut(buffers).push("</code></pre>".to_string()); + +                    if let Some(count) = code_fenced_fences_count { +                        if count < 2 { +                            line_ending_if_needed(buffers); +                        } +                    } + +                    code_flow_seen_data = None; +                    code_fenced_fences_count = None; +                    slurp_one_line_ending = false; +                } +                TokenType::CodeFencedFence => { +                    let count = if let Some(count) = code_fenced_fences_count { +                        count +                    } else { +                        0 +                    }; + +                    if count == 0 { +                        buf_tail_mut(buffers).push(">".to_string()); +                        // tag = true; +                        slurp_one_line_ending = true; +                    } + +                    code_fenced_fences_count = Some(count + 1); +                } +                TokenType::CodeFencedFenceInfo => { +                    let value = resume(buffers); +                    buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value)); +                    // tag = true; +                } +                
TokenType::CodeFencedFenceMeta => { +                    resume(buffers); +                } +                TokenType::CodeFlowChunk => { +                    code_flow_seen_data = Some(true); +                    buf_tail_mut(buffers).push(encode(&slice_serialize( +                        codes, +                        &get_span(events, index), +                        false, +                    ))); +                } +                // `AtxHeadingWhitespace` is ignored after the opening sequence, +                // before the closing sequence, and after the closing sequence. +                // But it is used around intermediate sequences. +                // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`. +                // `AtxHeadingSequence` is ignored as the opening and closing sequence, +                // but not when intermediate. +                TokenType::AtxHeadingWhitespace | TokenType::AtxHeadingSequence => { +                    if let Some(buf) = atx_heading_buffer { +                        atx_heading_buffer = Some( +                            buf.to_string() +                                + &encode(&slice_serialize(codes, &get_span(events, index), false)), +                        ); +                    } + +                    // First fence we see. 
+                    if None == atx_opening_sequence_size { +                        let rank = slice_serialize(codes, &get_span(events, index), false).len(); +                        atx_opening_sequence_size = Some(rank); +                        buf_tail_mut(buffers).push(format!("<h{}>", rank)); +                    } +                } +                TokenType::AtxHeadingText => { +                    println!("text: {:?}", atx_heading_buffer); +                    if let Some(ref buf) = atx_heading_buffer { +                        if !buf.is_empty() { +                            buf_tail_mut(buffers).push(encode(buf)); +                            atx_heading_buffer = Some("".to_string()); +                        } +                    } else { +                        atx_heading_buffer = Some("".to_string()); +                    } + +                    let slice = encode(&slice_serialize(codes, &get_span(events, index), false)); +                    println!("slice: {:?}", slice); +                    buf_tail_mut(buffers).push(slice); +                } +                TokenType::AtxHeading => { +                    let rank = atx_opening_sequence_size +                        .expect("`atx_opening_sequence_size` must be set in headings"); +                    buf_tail_mut(buffers).push(format!("</h{}>", rank)); +                    atx_opening_sequence_size = None; +                    atx_heading_buffer = None; +                } +                TokenType::ThematicBreak => { +                    buf_tail_mut(buffers).push("<hr />".to_string()); +                } +                TokenType::LineEnding => { +                    // if slurp_all_line_endings { +                    //     // Empty. 
+                    // } else +                    if slurp_one_line_ending { +                        slurp_one_line_ending = false; +                    // } else if code_text_inside { +                    //     buf_tail_mut(buffers).push(" ".to_string()); +                    } else { +                        buf_tail_mut(buffers).push(encode(&slice_serialize( +                            codes, +                            &get_span(events, index), +                            false, +                        ))); +                    } +                } +                TokenType::CharacterReferenceMarker => { +                    character_reference_kind = Some(CharacterReferenceKind::Named); +                } +                TokenType::CharacterReferenceMarkerNumeric => { +                    character_reference_kind = Some(CharacterReferenceKind::Decimal); +                } +                TokenType::CharacterReferenceMarkerHexadecimal => { +                    character_reference_kind = Some(CharacterReferenceKind::Hexadecimal); +                } +                TokenType::CharacterReferenceValue => { +                    let kind = character_reference_kind +                        .expect("expected `character_reference_kind` to be set"); +                    let reference = slice_serialize(codes, &get_span(events, index), false); +                    let ref_string = reference.as_str(); +                    let value = match kind { +                        CharacterReferenceKind::Decimal => { +                            decode_numeric_character_reference(ref_string, 10).to_string() +                        } +                        CharacterReferenceKind::Hexadecimal => { +                            decode_numeric_character_reference(ref_string, 16).to_string() +                        } +                        CharacterReferenceKind::Named => { +                            decode_named_character_reference(ref_string) +                        
} +                    }; + +                    buf_tail_mut(buffers).push(value); + +                    character_reference_kind = None; +                } +                // To do: `ContentPhrasing` should be parsed as phrasing first. +                // This branch below currently acts as the resulting `data` tokens. +                TokenType::ContentPhrasing +                // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported. +                | TokenType::ChunkString +                | TokenType::Data +                | TokenType::CharacterEscapeValue => { +                    // last_was_tag = false; +                    buf_tail_mut(buffers).push(encode(&slice_serialize( +                        codes, +                        &get_span(events, index), +                        false, +                    ))); +                } +                #[allow(unreachable_patterns)] +                _ => { +                    unreachable!("unhandled `exit` of TokenType {:?}", token_type) +                } +            }, +        } + +        index += 1; +    } + +    assert!(buffers.len() == 1, "expected 1 final buffer"); +    buffers.get(0).expect("expected 1 final buffer").concat() +} + +/// Push a buffer. +fn buffer(buffers: &mut Vec<Vec<String>>) { +    buffers.push(vec![]); +} + +/// Pop a buffer, returning its value. +fn resume(buffers: &mut Vec<Vec<String>>) -> String { +    let buf = buffers.pop().expect("Cannot resume w/o buffer"); +    buf.concat() +} + +/// Get the last chunk of current buffer. +fn buf_tail_slice(buffers: &mut [Vec<String>]) -> Option<&String> { +    let tail = buf_tail(buffers); +    tail.last() +} + +/// Get the mutable last chunk of current buffer. +fn buf_tail_mut(buffers: &mut [Vec<String>]) -> &mut Vec<String> { +    buffers +        .last_mut() +        .expect("at least one buffer should exist") +} + +/// Get the current buffer. 
+fn buf_tail(buffers: &mut [Vec<String>]) -> &Vec<String> { +    buffers.last().expect("at least one buffer should exist") +} + +/// Add a line ending. +fn line_ending(buffers: &mut [Vec<String>]) { +    let tail = buf_tail_mut(buffers); +    // To do: use inferred line ending style. +    // lastWasTag = false +    tail.push("\n".to_string()); +} + +/// Add a line ending if needed (as in, there’s no eol/eof already). +fn line_ending_if_needed(buffers: &mut [Vec<String>]) { +    let slice = buf_tail_slice(buffers); +    let last_char = if let Some(x) = slice { +        x.chars().last() +    } else { +        None +    }; +    let mut add = true; + +    if let Some(x) = last_char { +        if x == '\n' || x == '\r' { +            add = false; +        } +    } else { +        add = false; +    } + +    if add { +        line_ending(buffers); +    } +} diff --git a/src/constant.rs b/src/constant.rs new file mode 100644 index 0000000..332fdaf --- /dev/null +++ b/src/constant.rs @@ -0,0 +1,2561 @@ +//! Constants needed to parse markdown. +//! +//! Most of these constants are magic numbers, such as the number of markers +//! needed to parse [code (fenced)][code_fenced] +//! ([`CODE_FENCED_SEQUENCE_SIZE_MIN`][]) or the max number of allowed markers +//! in a [heading (atx)][heading_atx] +//! ([`HEADING_ATX_OPENING_FENCE_SIZE_MAX`][]). +//! +//! Some constants are instead lists of things, such as the list of tag names +//! considered in the **raw** production of [HTML (flow)][html_flow] +//! ([`HTML_RAW_NAMES`][]), or the list of allowed named character references +//! ([`CHARACTER_REFERENCE_NAMES`][]). +//! +//! [code_fenced]: crate::construct::code_fenced +//! [heading_atx]: crate::construct::heading_atx +//! [html_flow]: crate::construct::html_flow + +/// The number of characters that form a tab stop. 
+/// +/// This relates to the number of whitespace characters needed to form certain +/// constructs in markdown, most notable the whitespace required to form +/// [code (indented)][code_indented]. +/// +/// <!-- To do: link to somewhere that discusses virtual spaces. --> +/// <!-- Ref: https://github.com/syntax-tree/mdast-util-to-markdown/issues/51 --> +/// +/// [code_indented]: crate::construct::code_indented +pub const TAB_SIZE: usize = 4; + +/// The number of markers needed for a [thematic break][thematic_break] to form. +/// +/// Like many things in markdown, the number is `3`. +/// +/// [thematic_break]: crate::construct::thematic_break +pub const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3; + +/// The max number of markers allowed to form a [heading (atx)][heading_atx]. +/// +/// This limitation is imposed by HTML, which imposes a max heading rank of +/// `6`. +/// +/// [heading_atx]: crate::construct::heading_atx +pub const HEADING_ATX_OPENING_FENCE_SIZE_MAX: usize = 6; + +/// The number of markers needed for [code (fenced)][code_fenced] to form. +/// +/// Like many things in markdown, the number is `3`. +/// +/// [code_fenced]: crate::construct::code_fenced +pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3; + +/// List of HTML tag names that form the **raw** production of +/// [HTML (flow)][html_flow]. +/// +/// The **raw** production allows blank lines and thus no interleaving with +/// markdown. +/// Tag name matching must be performed insensitive to case, and thus this list +/// includes lowercase tag names. +/// +/// The number of the longest tag name is also stored as a constant in +/// [`HTML_RAW_SIZE_MAX`][]. +/// +/// > 👉 **Note**: `textarea` was added in `CommonMark@0.30`. 
+/// +/// ## References +/// +/// *   [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) +/// +/// [html_flow]: crate::construct::html_flow +pub const HTML_RAW_NAMES: [&str; 4] = ["pre", "script", "style", "textarea"]; + +/// The number of the longest tag name in [`HTML_RAW_NAMES`][]. +/// +/// This is currently the size of `textarea`. +pub const HTML_RAW_SIZE_MAX: usize = 8; + +/// List of HTML tag names that form the **basic** production of +/// [HTML (flow)][html_flow]. +/// +/// The **basic** production allows interleaving HTML and markdown with blank lines +/// and allows flow (block) elements to interrupt content. +/// Tag name matching must be performed insensitive to case, and thus this list +/// includes lowercase tag names. +/// +/// Tag names not on this list result in the **complete** production. +/// +/// > 👉 **Note**: `source` was removed on `main` of the `CommonMark` spec and +/// > is slated to be released in `CommonMark@0.31`. +/// +/// ## References +/// +/// *   [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) +/// *   [*Remove source element as HTML block start condition* as `commonmark/commonmark-spec#710`](https://github.com/commonmark/commonmark-spec/pull/710) +/// +/// [html_flow]: crate::construct::html_flow +pub const HTML_BLOCK_NAMES: [&str; 61] = [ +    "address", +    "article", +    "aside", +    "base", +    "basefont", +    "blockquote", +    "body", +    "caption", +    "center", +    "col", +    "colgroup", +    "dd", +    "details", +    "dialog", +    "dir", +    "div", +    "dl", +    "dt", +    "fieldset", +    "figcaption", +    "figure", +    "footer", +    "form", +    "frame", +    "frameset", +    "h1", +    "h2", +    "h3", +    "h4", +    "h5", +    "h6", +    "head", +    "header", +    "hr", +    "html", +    "iframe", +    "legend", +    "li", +    "link", +    "main", +    "menu", +    "menuitem", +    "nav", +    "noframes", +    "ol", +    
"optgroup", +    "option", +    "p", +    "param", +    "section", +    "summary", +    "table", +    "tbody", +    "td", +    "tfoot", +    "th", +    "thead", +    "title", +    "tr", +    "track", +    "ul", +]; + +/// The max number of characters in a hexadecimal numeric +/// [character reference][character_reference]. +/// +/// To illustrate, this allows `�` and disallows `�`. +/// This limit is imposed because all bigger numbers are invalid. +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX: usize = 6; + +/// The max number of characters in a decimal numeric +/// [character reference][character_reference]. +/// +/// To illustrate, this allows `�` and disallows `�`. +/// This limit is imposed because all bigger numbers are invalid. +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_DECIMAL_SIZE_MAX: usize = 7; + +/// The max number of characters in a named +/// [character reference][character_reference]. +/// +/// This is the number of the longest name in [`CHARACTER_REFERENCE_NAMES`][]. +/// It allows `∳` and prevents the parser from +/// continuing for eons. +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_NAMED_SIZE_MAX: usize = 31; + +/// List of names that can form a named +/// [character reference][character_reference]. +/// +/// This list is sensitive to casing. +/// +/// The number of the longest name (`CounterClockwiseContourIntegral`) is also +/// stored as a constant in [`CHARACTER_REFERENCE_NAMED_SIZE_MAX`][]. +/// +/// The corresponding values of this list are stored in +/// [`CHARACTER_REFERENCE_VALUES`][]. +/// They correspond through their index. 
+/// +/// ## References +/// +/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_NAMES: [&str; 2222] = [ +    "AEli", +    "AElig", +    "AM", +    "AMP", +    "Aacut", +    "Aacute", +    "Abreve", +    "Acir", +    "Acirc", +    "Acy", +    "Afr", +    "Agrav", +    "Agrave", +    "Alpha", +    "Amacr", +    "And", +    "Aogon", +    "Aopf", +    "ApplyFunction", +    "Arin", +    "Aring", +    "Ascr", +    "Assign", +    "Atild", +    "Atilde", +    "Aum", +    "Auml", +    "Backslash", +    "Barv", +    "Barwed", +    "Bcy", +    "Because", +    "Bernoullis", +    "Beta", +    "Bfr", +    "Bopf", +    "Breve", +    "Bscr", +    "Bumpeq", +    "CHcy", +    "COP", +    "COPY", +    "Cacute", +    "Cap", +    "CapitalDifferentialD", +    "Cayleys", +    "Ccaron", +    "Ccedi", +    "Ccedil", +    "Ccirc", +    "Cconint", +    "Cdot", +    "Cedilla", +    "CenterDot", +    "Cfr", +    "Chi", +    "CircleDot", +    "CircleMinus", +    "CirclePlus", +    "CircleTimes", +    "ClockwiseContourIntegral", +    "CloseCurlyDoubleQuote", +    "CloseCurlyQuote", +    "Colon", +    "Colone", +    "Congruent", +    "Conint", +    "ContourIntegral", +    "Copf", +    "Coproduct", +    "CounterClockwiseContourIntegral", +    "Cross", +    "Cscr", +    "Cup", +    "CupCap", +    "DD", +    "DDotrahd", +    "DJcy", +    "DScy", +    "DZcy", +    "Dagger", +    "Darr", +    "Dashv", +    "Dcaron", +    "Dcy", +    "Del", +    "Delta", +    "Dfr", +    "DiacriticalAcute", +    "DiacriticalDot", +    "DiacriticalDoubleAcute", +    "DiacriticalGrave", +    "DiacriticalTilde", +    "Diamond", +    "DifferentialD", +    "Dopf", +    "Dot", +    "DotDot", +    "DotEqual", +    "DoubleContourIntegral", +    "DoubleDot", +    "DoubleDownArrow", +    "DoubleLeftArrow", +    
"DoubleLeftRightArrow", +    "DoubleLeftTee", +    "DoubleLongLeftArrow", +    "DoubleLongLeftRightArrow", +    "DoubleLongRightArrow", +    "DoubleRightArrow", +    "DoubleRightTee", +    "DoubleUpArrow", +    "DoubleUpDownArrow", +    "DoubleVerticalBar", +    "DownArrow", +    "DownArrowBar", +    "DownArrowUpArrow", +    "DownBreve", +    "DownLeftRightVector", +    "DownLeftTeeVector", +    "DownLeftVector", +    "DownLeftVectorBar", +    "DownRightTeeVector", +    "DownRightVector", +    "DownRightVectorBar", +    "DownTee", +    "DownTeeArrow", +    "Downarrow", +    "Dscr", +    "Dstrok", +    "ENG", +    "ET", +    "ETH", +    "Eacut", +    "Eacute", +    "Ecaron", +    "Ecir", +    "Ecirc", +    "Ecy", +    "Edot", +    "Efr", +    "Egrav", +    "Egrave", +    "Element", +    "Emacr", +    "EmptySmallSquare", +    "EmptyVerySmallSquare", +    "Eogon", +    "Eopf", +    "Epsilon", +    "Equal", +    "EqualTilde", +    "Equilibrium", +    "Escr", +    "Esim", +    "Eta", +    "Eum", +    "Euml", +    "Exists", +    "ExponentialE", +    "Fcy", +    "Ffr", +    "FilledSmallSquare", +    "FilledVerySmallSquare", +    "Fopf", +    "ForAll", +    "Fouriertrf", +    "Fscr", +    "GJcy", +    "G", +    "GT", +    "Gamma", +    "Gammad", +    "Gbreve", +    "Gcedil", +    "Gcirc", +    "Gcy", +    "Gdot", +    "Gfr", +    "Gg", +    "Gopf", +    "GreaterEqual", +    "GreaterEqualLess", +    "GreaterFullEqual", +    "GreaterGreater", +    "GreaterLess", +    "GreaterSlantEqual", +    "GreaterTilde", +    "Gscr", +    "Gt", +    "HARDcy", +    "Hacek", +    "Hat", +    "Hcirc", +    "Hfr", +    "HilbertSpace", +    "Hopf", +    "HorizontalLine", +    "Hscr", +    "Hstrok", +    "HumpDownHump", +    "HumpEqual", +    "IEcy", +    "IJlig", +    "IOcy", +    "Iacut", +    "Iacute", +    "Icir", +    "Icirc", +    "Icy", +    "Idot", +    "Ifr", +    "Igrav", +    "Igrave", +    "Im", +    "Imacr", +    "ImaginaryI", +    "Implies", +    "Int", +    "Integral", +    
"Intersection", +    "InvisibleComma", +    "InvisibleTimes", +    "Iogon", +    "Iopf", +    "Iota", +    "Iscr", +    "Itilde", +    "Iukcy", +    "Ium", +    "Iuml", +    "Jcirc", +    "Jcy", +    "Jfr", +    "Jopf", +    "Jscr", +    "Jsercy", +    "Jukcy", +    "KHcy", +    "KJcy", +    "Kappa", +    "Kcedil", +    "Kcy", +    "Kfr", +    "Kopf", +    "Kscr", +    "LJcy", +    "L", +    "LT", +    "Lacute", +    "Lambda", +    "Lang", +    "Laplacetrf", +    "Larr", +    "Lcaron", +    "Lcedil", +    "Lcy", +    "LeftAngleBracket", +    "LeftArrow", +    "LeftArrowBar", +    "LeftArrowRightArrow", +    "LeftCeiling", +    "LeftDoubleBracket", +    "LeftDownTeeVector", +    "LeftDownVector", +    "LeftDownVectorBar", +    "LeftFloor", +    "LeftRightArrow", +    "LeftRightVector", +    "LeftTee", +    "LeftTeeArrow", +    "LeftTeeVector", +    "LeftTriangle", +    "LeftTriangleBar", +    "LeftTriangleEqual", +    "LeftUpDownVector", +    "LeftUpTeeVector", +    "LeftUpVector", +    "LeftUpVectorBar", +    "LeftVector", +    "LeftVectorBar", +    "Leftarrow", +    "Leftrightarrow", +    "LessEqualGreater", +    "LessFullEqual", +    "LessGreater", +    "LessLess", +    "LessSlantEqual", +    "LessTilde", +    "Lfr", +    "Ll", +    "Lleftarrow", +    "Lmidot", +    "LongLeftArrow", +    "LongLeftRightArrow", +    "LongRightArrow", +    "Longleftarrow", +    "Longleftrightarrow", +    "Longrightarrow", +    "Lopf", +    "LowerLeftArrow", +    "LowerRightArrow", +    "Lscr", +    "Lsh", +    "Lstrok", +    "Lt", +    "Map", +    "Mcy", +    "MediumSpace", +    "Mellintrf", +    "Mfr", +    "MinusPlus", +    "Mopf", +    "Mscr", +    "Mu", +    "NJcy", +    "Nacute", +    "Ncaron", +    "Ncedil", +    "Ncy", +    "NegativeMediumSpace", +    "NegativeThickSpace", +    "NegativeThinSpace", +    "NegativeVeryThinSpace", +    "NestedGreaterGreater", +    "NestedLessLess", +    "NewLine", +    "Nfr", +    "NoBreak", +    "NonBreakingSpace", +    "Nopf", +    "Not", +    
"NotCongruent", +    "NotCupCap", +    "NotDoubleVerticalBar", +    "NotElement", +    "NotEqual", +    "NotEqualTilde", +    "NotExists", +    "NotGreater", +    "NotGreaterEqual", +    "NotGreaterFullEqual", +    "NotGreaterGreater", +    "NotGreaterLess", +    "NotGreaterSlantEqual", +    "NotGreaterTilde", +    "NotHumpDownHump", +    "NotHumpEqual", +    "NotLeftTriangle", +    "NotLeftTriangleBar", +    "NotLeftTriangleEqual", +    "NotLess", +    "NotLessEqual", +    "NotLessGreater", +    "NotLessLess", +    "NotLessSlantEqual", +    "NotLessTilde", +    "NotNestedGreaterGreater", +    "NotNestedLessLess", +    "NotPrecedes", +    "NotPrecedesEqual", +    "NotPrecedesSlantEqual", +    "NotReverseElement", +    "NotRightTriangle", +    "NotRightTriangleBar", +    "NotRightTriangleEqual", +    "NotSquareSubset", +    "NotSquareSubsetEqual", +    "NotSquareSuperset", +    "NotSquareSupersetEqual", +    "NotSubset", +    "NotSubsetEqual", +    "NotSucceeds", +    "NotSucceedsEqual", +    "NotSucceedsSlantEqual", +    "NotSucceedsTilde", +    "NotSuperset", +    "NotSupersetEqual", +    "NotTilde", +    "NotTildeEqual", +    "NotTildeFullEqual", +    "NotTildeTilde", +    "NotVerticalBar", +    "Nscr", +    "Ntild", +    "Ntilde", +    "Nu", +    "OElig", +    "Oacut", +    "Oacute", +    "Ocir", +    "Ocirc", +    "Ocy", +    "Odblac", +    "Ofr", +    "Ograv", +    "Ograve", +    "Omacr", +    "Omega", +    "Omicron", +    "Oopf", +    "OpenCurlyDoubleQuote", +    "OpenCurlyQuote", +    "Or", +    "Oscr", +    "Oslas", +    "Oslash", +    "Otild", +    "Otilde", +    "Otimes", +    "Oum", +    "Ouml", +    "OverBar", +    "OverBrace", +    "OverBracket", +    "OverParenthesis", +    "PartialD", +    "Pcy", +    "Pfr", +    "Phi", +    "Pi", +    "PlusMinus", +    "Poincareplane", +    "Popf", +    "Pr", +    "Precedes", +    "PrecedesEqual", +    "PrecedesSlantEqual", +    "PrecedesTilde", +    "Prime", +    "Product", +    "Proportion", +    "Proportional", + 
   "Pscr", +    "Psi", +    "QUO", +    "QUOT", +    "Qfr", +    "Qopf", +    "Qscr", +    "RBarr", +    "RE", +    "REG", +    "Racute", +    "Rang", +    "Rarr", +    "Rarrtl", +    "Rcaron", +    "Rcedil", +    "Rcy", +    "Re", +    "ReverseElement", +    "ReverseEquilibrium", +    "ReverseUpEquilibrium", +    "Rfr", +    "Rho", +    "RightAngleBracket", +    "RightArrow", +    "RightArrowBar", +    "RightArrowLeftArrow", +    "RightCeiling", +    "RightDoubleBracket", +    "RightDownTeeVector", +    "RightDownVector", +    "RightDownVectorBar", +    "RightFloor", +    "RightTee", +    "RightTeeArrow", +    "RightTeeVector", +    "RightTriangle", +    "RightTriangleBar", +    "RightTriangleEqual", +    "RightUpDownVector", +    "RightUpTeeVector", +    "RightUpVector", +    "RightUpVectorBar", +    "RightVector", +    "RightVectorBar", +    "Rightarrow", +    "Ropf", +    "RoundImplies", +    "Rrightarrow", +    "Rscr", +    "Rsh", +    "RuleDelayed", +    "SHCHcy", +    "SHcy", +    "SOFTcy", +    "Sacute", +    "Sc", +    "Scaron", +    "Scedil", +    "Scirc", +    "Scy", +    "Sfr", +    "ShortDownArrow", +    "ShortLeftArrow", +    "ShortRightArrow", +    "ShortUpArrow", +    "Sigma", +    "SmallCircle", +    "Sopf", +    "Sqrt", +    "Square", +    "SquareIntersection", +    "SquareSubset", +    "SquareSubsetEqual", +    "SquareSuperset", +    "SquareSupersetEqual", +    "SquareUnion", +    "Sscr", +    "Star", +    "Sub", +    "Subset", +    "SubsetEqual", +    "Succeeds", +    "SucceedsEqual", +    "SucceedsSlantEqual", +    "SucceedsTilde", +    "SuchThat", +    "Sum", +    "Sup", +    "Superset", +    "SupersetEqual", +    "Supset", +    "THOR", +    "THORN", +    "TRADE", +    "TSHcy", +    "TScy", +    "Tab", +    "Tau", +    "Tcaron", +    "Tcedil", +    "Tcy", +    "Tfr", +    "Therefore", +    "Theta", +    "ThickSpace", +    "ThinSpace", +    "Tilde", +    "TildeEqual", +    "TildeFullEqual", +    "TildeTilde", +    "Topf", +    "TripleDot", +    
"Tscr", +    "Tstrok", +    "Uacut", +    "Uacute", +    "Uarr", +    "Uarrocir", +    "Ubrcy", +    "Ubreve", +    "Ucir", +    "Ucirc", +    "Ucy", +    "Udblac", +    "Ufr", +    "Ugrav", +    "Ugrave", +    "Umacr", +    "UnderBar", +    "UnderBrace", +    "UnderBracket", +    "UnderParenthesis", +    "Union", +    "UnionPlus", +    "Uogon", +    "Uopf", +    "UpArrow", +    "UpArrowBar", +    "UpArrowDownArrow", +    "UpDownArrow", +    "UpEquilibrium", +    "UpTee", +    "UpTeeArrow", +    "Uparrow", +    "Updownarrow", +    "UpperLeftArrow", +    "UpperRightArrow", +    "Upsi", +    "Upsilon", +    "Uring", +    "Uscr", +    "Utilde", +    "Uum", +    "Uuml", +    "VDash", +    "Vbar", +    "Vcy", +    "Vdash", +    "Vdashl", +    "Vee", +    "Verbar", +    "Vert", +    "VerticalBar", +    "VerticalLine", +    "VerticalSeparator", +    "VerticalTilde", +    "VeryThinSpace", +    "Vfr", +    "Vopf", +    "Vscr", +    "Vvdash", +    "Wcirc", +    "Wedge", +    "Wfr", +    "Wopf", +    "Wscr", +    "Xfr", +    "Xi", +    "Xopf", +    "Xscr", +    "YAcy", +    "YIcy", +    "YUcy", +    "Yacut", +    "Yacute", +    "Ycirc", +    "Ycy", +    "Yfr", +    "Yopf", +    "Yscr", +    "Yuml", +    "ZHcy", +    "Zacute", +    "Zcaron", +    "Zcy", +    "Zdot", +    "ZeroWidthSpace", +    "Zeta", +    "Zfr", +    "Zopf", +    "Zscr", +    "aacut", +    "aacute", +    "abreve", +    "ac", +    "acE", +    "acd", +    "acir", +    "acirc", +    "acut", +    "acute", +    "acy", +    "aeli", +    "aelig", +    "af", +    "afr", +    "agrav", +    "agrave", +    "alefsym", +    "aleph", +    "alpha", +    "amacr", +    "amalg", +    "am", +    "amp", +    "and", +    "andand", +    "andd", +    "andslope", +    "andv", +    "ang", +    "ange", +    "angle", +    "angmsd", +    "angmsdaa", +    "angmsdab", +    "angmsdac", +    "angmsdad", +    "angmsdae", +    "angmsdaf", +    "angmsdag", +    "angmsdah", +    "angrt", +    "angrtvb", +    "angrtvbd", +    "angsph", +    
"angst", +    "angzarr", +    "aogon", +    "aopf", +    "ap", +    "apE", +    "apacir", +    "ape", +    "apid", +    "apos", +    "approx", +    "approxeq", +    "arin", +    "aring", +    "ascr", +    "ast", +    "asymp", +    "asympeq", +    "atild", +    "atilde", +    "aum", +    "auml", +    "awconint", +    "awint", +    "bNot", +    "backcong", +    "backepsilon", +    "backprime", +    "backsim", +    "backsimeq", +    "barvee", +    "barwed", +    "barwedge", +    "bbrk", +    "bbrktbrk", +    "bcong", +    "bcy", +    "bdquo", +    "becaus", +    "because", +    "bemptyv", +    "bepsi", +    "bernou", +    "beta", +    "beth", +    "between", +    "bfr", +    "bigcap", +    "bigcirc", +    "bigcup", +    "bigodot", +    "bigoplus", +    "bigotimes", +    "bigsqcup", +    "bigstar", +    "bigtriangledown", +    "bigtriangleup", +    "biguplus", +    "bigvee", +    "bigwedge", +    "bkarow", +    "blacklozenge", +    "blacksquare", +    "blacktriangle", +    "blacktriangledown", +    "blacktriangleleft", +    "blacktriangleright", +    "blank", +    "blk12", +    "blk14", +    "blk34", +    "block", +    "bne", +    "bnequiv", +    "bnot", +    "bopf", +    "bot", +    "bottom", +    "bowtie", +    "boxDL", +    "boxDR", +    "boxDl", +    "boxDr", +    "boxH", +    "boxHD", +    "boxHU", +    "boxHd", +    "boxHu", +    "boxUL", +    "boxUR", +    "boxUl", +    "boxUr", +    "boxV", +    "boxVH", +    "boxVL", +    "boxVR", +    "boxVh", +    "boxVl", +    "boxVr", +    "boxbox", +    "boxdL", +    "boxdR", +    "boxdl", +    "boxdr", +    "boxh", +    "boxhD", +    "boxhU", +    "boxhd", +    "boxhu", +    "boxminus", +    "boxplus", +    "boxtimes", +    "boxuL", +    "boxuR", +    "boxul", +    "boxur", +    "boxv", +    "boxvH", +    "boxvL", +    "boxvR", +    "boxvh", +    "boxvl", +    "boxvr", +    "bprime", +    "breve", +    "brvba", +    "brvbar", +    "bscr", +    "bsemi", +    "bsim", +    "bsime", +    "bsol", +    "bsolb", +    
"bsolhsub", +    "bull", +    "bullet", +    "bump", +    "bumpE", +    "bumpe", +    "bumpeq", +    "cacute", +    "cap", +    "capand", +    "capbrcup", +    "capcap", +    "capcup", +    "capdot", +    "caps", +    "caret", +    "caron", +    "ccaps", +    "ccaron", +    "ccedi", +    "ccedil", +    "ccirc", +    "ccups", +    "ccupssm", +    "cdot", +    "cedi", +    "cedil", +    "cemptyv", +    "cen", +    "cent", +    "centerdot", +    "cfr", +    "chcy", +    "check", +    "checkmark", +    "chi", +    "cir", +    "cirE", +    "circ", +    "circeq", +    "circlearrowleft", +    "circlearrowright", +    "circledR", +    "circledS", +    "circledast", +    "circledcirc", +    "circleddash", +    "cire", +    "cirfnint", +    "cirmid", +    "cirscir", +    "clubs", +    "clubsuit", +    "colon", +    "colone", +    "coloneq", +    "comma", +    "commat", +    "comp", +    "compfn", +    "complement", +    "complexes", +    "cong", +    "congdot", +    "conint", +    "copf", +    "coprod", +    "cop", +    "copy", +    "copysr", +    "crarr", +    "cross", +    "cscr", +    "csub", +    "csube", +    "csup", +    "csupe", +    "ctdot", +    "cudarrl", +    "cudarrr", +    "cuepr", +    "cuesc", +    "cularr", +    "cularrp", +    "cup", +    "cupbrcap", +    "cupcap", +    "cupcup", +    "cupdot", +    "cupor", +    "cups", +    "curarr", +    "curarrm", +    "curlyeqprec", +    "curlyeqsucc", +    "curlyvee", +    "curlywedge", +    "curre", +    "curren", +    "curvearrowleft", +    "curvearrowright", +    "cuvee", +    "cuwed", +    "cwconint", +    "cwint", +    "cylcty", +    "dArr", +    "dHar", +    "dagger", +    "daleth", +    "darr", +    "dash", +    "dashv", +    "dbkarow", +    "dblac", +    "dcaron", +    "dcy", +    "dd", +    "ddagger", +    "ddarr", +    "ddotseq", +    "de", +    "deg", +    "delta", +    "demptyv", +    "dfisht", +    "dfr", +    "dharl", +    "dharr", +    "diam", +    "diamond", +    "diamondsuit", +    "diams", +    "die", 
+    "digamma", +    "disin", +    "div", +    "divid", +    "divide", +    "divideontimes", +    "divonx", +    "djcy", +    "dlcorn", +    "dlcrop", +    "dollar", +    "dopf", +    "dot", +    "doteq", +    "doteqdot", +    "dotminus", +    "dotplus", +    "dotsquare", +    "doublebarwedge", +    "downarrow", +    "downdownarrows", +    "downharpoonleft", +    "downharpoonright", +    "drbkarow", +    "drcorn", +    "drcrop", +    "dscr", +    "dscy", +    "dsol", +    "dstrok", +    "dtdot", +    "dtri", +    "dtrif", +    "duarr", +    "duhar", +    "dwangle", +    "dzcy", +    "dzigrarr", +    "eDDot", +    "eDot", +    "eacut", +    "eacute", +    "easter", +    "ecaron", +    "ecir", +    "ecirc", +    "ecolon", +    "ecy", +    "edot", +    "ee", +    "efDot", +    "efr", +    "eg", +    "egrav", +    "egrave", +    "egs", +    "egsdot", +    "el", +    "elinters", +    "ell", +    "els", +    "elsdot", +    "emacr", +    "empty", +    "emptyset", +    "emptyv", +    "emsp13", +    "emsp14", +    "emsp", +    "eng", +    "ensp", +    "eogon", +    "eopf", +    "epar", +    "eparsl", +    "eplus", +    "epsi", +    "epsilon", +    "epsiv", +    "eqcirc", +    "eqcolon", +    "eqsim", +    "eqslantgtr", +    "eqslantless", +    "equals", +    "equest", +    "equiv", +    "equivDD", +    "eqvparsl", +    "erDot", +    "erarr", +    "escr", +    "esdot", +    "esim", +    "eta", +    "et", +    "eth", +    "eum", +    "euml", +    "euro", +    "excl", +    "exist", +    "expectation", +    "exponentiale", +    "fallingdotseq", +    "fcy", +    "female", +    "ffilig", +    "fflig", +    "ffllig", +    "ffr", +    "filig", +    "fjlig", +    "flat", +    "fllig", +    "fltns", +    "fnof", +    "fopf", +    "forall", +    "fork", +    "forkv", +    "fpartint", +    "frac1", +    "frac12", +    "frac13", +    "frac14", +    "frac15", +    "frac16", +    "frac18", +    "frac23", +    "frac25", +    "frac3", +    "frac34", +    "frac35", +    "frac38", +    
"frac45", +    "frac56", +    "frac58", +    "frac78", +    "frasl", +    "frown", +    "fscr", +    "gE", +    "gEl", +    "gacute", +    "gamma", +    "gammad", +    "gap", +    "gbreve", +    "gcirc", +    "gcy", +    "gdot", +    "ge", +    "gel", +    "geq", +    "geqq", +    "geqslant", +    "ges", +    "gescc", +    "gesdot", +    "gesdoto", +    "gesdotol", +    "gesl", +    "gesles", +    "gfr", +    "gg", +    "ggg", +    "gimel", +    "gjcy", +    "gl", +    "glE", +    "gla", +    "glj", +    "gnE", +    "gnap", +    "gnapprox", +    "gne", +    "gneq", +    "gneqq", +    "gnsim", +    "gopf", +    "grave", +    "gscr", +    "gsim", +    "gsime", +    "gsiml", +    "g", +    "gt", +    "gtcc", +    "gtcir", +    "gtdot", +    "gtlPar", +    "gtquest", +    "gtrapprox", +    "gtrarr", +    "gtrdot", +    "gtreqless", +    "gtreqqless", +    "gtrless", +    "gtrsim", +    "gvertneqq", +    "gvnE", +    "hArr", +    "hairsp", +    "half", +    "hamilt", +    "hardcy", +    "harr", +    "harrcir", +    "harrw", +    "hbar", +    "hcirc", +    "hearts", +    "heartsuit", +    "hellip", +    "hercon", +    "hfr", +    "hksearow", +    "hkswarow", +    "hoarr", +    "homtht", +    "hookleftarrow", +    "hookrightarrow", +    "hopf", +    "horbar", +    "hscr", +    "hslash", +    "hstrok", +    "hybull", +    "hyphen", +    "iacut", +    "iacute", +    "ic", +    "icir", +    "icirc", +    "icy", +    "iecy", +    "iexc", +    "iexcl", +    "iff", +    "ifr", +    "igrav", +    "igrave", +    "ii", +    "iiiint", +    "iiint", +    "iinfin", +    "iiota", +    "ijlig", +    "imacr", +    "image", +    "imagline", +    "imagpart", +    "imath", +    "imof", +    "imped", +    "in", +    "incare", +    "infin", +    "infintie", +    "inodot", +    "int", +    "intcal", +    "integers", +    "intercal", +    "intlarhk", +    "intprod", +    "iocy", +    "iogon", +    "iopf", +    "iota", +    "iprod", +    "iques", +    "iquest", +    "iscr", +    "isin", +    
"isinE", +    "isindot", +    "isins", +    "isinsv", +    "isinv", +    "it", +    "itilde", +    "iukcy", +    "ium", +    "iuml", +    "jcirc", +    "jcy", +    "jfr", +    "jmath", +    "jopf", +    "jscr", +    "jsercy", +    "jukcy", +    "kappa", +    "kappav", +    "kcedil", +    "kcy", +    "kfr", +    "kgreen", +    "khcy", +    "kjcy", +    "kopf", +    "kscr", +    "lAarr", +    "lArr", +    "lAtail", +    "lBarr", +    "lE", +    "lEg", +    "lHar", +    "lacute", +    "laemptyv", +    "lagran", +    "lambda", +    "lang", +    "langd", +    "langle", +    "lap", +    "laqu", +    "laquo", +    "larr", +    "larrb", +    "larrbfs", +    "larrfs", +    "larrhk", +    "larrlp", +    "larrpl", +    "larrsim", +    "larrtl", +    "lat", +    "latail", +    "late", +    "lates", +    "lbarr", +    "lbbrk", +    "lbrace", +    "lbrack", +    "lbrke", +    "lbrksld", +    "lbrkslu", +    "lcaron", +    "lcedil", +    "lceil", +    "lcub", +    "lcy", +    "ldca", +    "ldquo", +    "ldquor", +    "ldrdhar", +    "ldrushar", +    "ldsh", +    "le", +    "leftarrow", +    "leftarrowtail", +    "leftharpoondown", +    "leftharpoonup", +    "leftleftarrows", +    "leftrightarrow", +    "leftrightarrows", +    "leftrightharpoons", +    "leftrightsquigarrow", +    "leftthreetimes", +    "leg", +    "leq", +    "leqq", +    "leqslant", +    "les", +    "lescc", +    "lesdot", +    "lesdoto", +    "lesdotor", +    "lesg", +    "lesges", +    "lessapprox", +    "lessdot", +    "lesseqgtr", +    "lesseqqgtr", +    "lessgtr", +    "lesssim", +    "lfisht", +    "lfloor", +    "lfr", +    "lg", +    "lgE", +    "lhard", +    "lharu", +    "lharul", +    "lhblk", +    "ljcy", +    "ll", +    "llarr", +    "llcorner", +    "llhard", +    "lltri", +    "lmidot", +    "lmoust", +    "lmoustache", +    "lnE", +    "lnap", +    "lnapprox", +    "lne", +    "lneq", +    "lneqq", +    "lnsim", +    "loang", +    "loarr", +    "lobrk", +    "longleftarrow", +    
"longleftrightarrow", +    "longmapsto", +    "longrightarrow", +    "looparrowleft", +    "looparrowright", +    "lopar", +    "lopf", +    "loplus", +    "lotimes", +    "lowast", +    "lowbar", +    "loz", +    "lozenge", +    "lozf", +    "lpar", +    "lparlt", +    "lrarr", +    "lrcorner", +    "lrhar", +    "lrhard", +    "lrm", +    "lrtri", +    "lsaquo", +    "lscr", +    "lsh", +    "lsim", +    "lsime", +    "lsimg", +    "lsqb", +    "lsquo", +    "lsquor", +    "lstrok", +    "l", +    "lt", +    "ltcc", +    "ltcir", +    "ltdot", +    "lthree", +    "ltimes", +    "ltlarr", +    "ltquest", +    "ltrPar", +    "ltri", +    "ltrie", +    "ltrif", +    "lurdshar", +    "luruhar", +    "lvertneqq", +    "lvnE", +    "mDDot", +    "mac", +    "macr", +    "male", +    "malt", +    "maltese", +    "map", +    "mapsto", +    "mapstodown", +    "mapstoleft", +    "mapstoup", +    "marker", +    "mcomma", +    "mcy", +    "mdash", +    "measuredangle", +    "mfr", +    "mho", +    "micr", +    "micro", +    "mid", +    "midast", +    "midcir", +    "middo", +    "middot", +    "minus", +    "minusb", +    "minusd", +    "minusdu", +    "mlcp", +    "mldr", +    "mnplus", +    "models", +    "mopf", +    "mp", +    "mscr", +    "mstpos", +    "mu", +    "multimap", +    "mumap", +    "nGg", +    "nGt", +    "nGtv", +    "nLeftarrow", +    "nLeftrightarrow", +    "nLl", +    "nLt", +    "nLtv", +    "nRightarrow", +    "nVDash", +    "nVdash", +    "nabla", +    "nacute", +    "nang", +    "nap", +    "napE", +    "napid", +    "napos", +    "napprox", +    "natur", +    "natural", +    "naturals", +    "nbs", +    "nbsp", +    "nbump", +    "nbumpe", +    "ncap", +    "ncaron", +    "ncedil", +    "ncong", +    "ncongdot", +    "ncup", +    "ncy", +    "ndash", +    "ne", +    "neArr", +    "nearhk", +    "nearr", +    "nearrow", +    "nedot", +    "nequiv", +    "nesear", +    "nesim", +    "nexist", +    "nexists", +    "nfr", +    "ngE", +    "nge", +    
"ngeq", +    "ngeqq", +    "ngeqslant", +    "nges", +    "ngsim", +    "ngt", +    "ngtr", +    "nhArr", +    "nharr", +    "nhpar", +    "ni", +    "nis", +    "nisd", +    "niv", +    "njcy", +    "nlArr", +    "nlE", +    "nlarr", +    "nldr", +    "nle", +    "nleftarrow", +    "nleftrightarrow", +    "nleq", +    "nleqq", +    "nleqslant", +    "nles", +    "nless", +    "nlsim", +    "nlt", +    "nltri", +    "nltrie", +    "nmid", +    "nopf", +    "no", +    "not", +    "notin", +    "notinE", +    "notindot", +    "notinva", +    "notinvb", +    "notinvc", +    "notni", +    "notniva", +    "notnivb", +    "notnivc", +    "npar", +    "nparallel", +    "nparsl", +    "npart", +    "npolint", +    "npr", +    "nprcue", +    "npre", +    "nprec", +    "npreceq", +    "nrArr", +    "nrarr", +    "nrarrc", +    "nrarrw", +    "nrightarrow", +    "nrtri", +    "nrtrie", +    "nsc", +    "nsccue", +    "nsce", +    "nscr", +    "nshortmid", +    "nshortparallel", +    "nsim", +    "nsime", +    "nsimeq", +    "nsmid", +    "nspar", +    "nsqsube", +    "nsqsupe", +    "nsub", +    "nsubE", +    "nsube", +    "nsubset", +    "nsubseteq", +    "nsubseteqq", +    "nsucc", +    "nsucceq", +    "nsup", +    "nsupE", +    "nsupe", +    "nsupset", +    "nsupseteq", +    "nsupseteqq", +    "ntgl", +    "ntild", +    "ntilde", +    "ntlg", +    "ntriangleleft", +    "ntrianglelefteq", +    "ntriangleright", +    "ntrianglerighteq", +    "nu", +    "num", +    "numero", +    "numsp", +    "nvDash", +    "nvHarr", +    "nvap", +    "nvdash", +    "nvge", +    "nvgt", +    "nvinfin", +    "nvlArr", +    "nvle", +    "nvlt", +    "nvltrie", +    "nvrArr", +    "nvrtrie", +    "nvsim", +    "nwArr", +    "nwarhk", +    "nwarr", +    "nwarrow", +    "nwnear", +    "oS", +    "oacut", +    "oacute", +    "oast", +    "ocir", +    "ocirc", +    "ocy", +    "odash", +    "odblac", +    "odiv", +    "odot", +    "odsold", +    "oelig", +    "ofcir", +    "ofr", +    "ogon", +    
"ograv", +    "ograve", +    "ogt", +    "ohbar", +    "ohm", +    "oint", +    "olarr", +    "olcir", +    "olcross", +    "oline", +    "olt", +    "omacr", +    "omega", +    "omicron", +    "omid", +    "ominus", +    "oopf", +    "opar", +    "operp", +    "oplus", +    "or", +    "orarr", +    "ord", +    "order", +    "orderof", +    "ordf", +    "ordm", +    "origof", +    "oror", +    "orslope", +    "orv", +    "oscr", +    "oslas", +    "oslash", +    "osol", +    "otild", +    "otilde", +    "otimes", +    "otimesas", +    "oum", +    "ouml", +    "ovbar", +    "par", +    "para", +    "parallel", +    "parsim", +    "parsl", +    "part", +    "pcy", +    "percnt", +    "period", +    "permil", +    "perp", +    "pertenk", +    "pfr", +    "phi", +    "phiv", +    "phmmat", +    "phone", +    "pi", +    "pitchfork", +    "piv", +    "planck", +    "planckh", +    "plankv", +    "plus", +    "plusacir", +    "plusb", +    "pluscir", +    "plusdo", +    "plusdu", +    "pluse", +    "plusm", +    "plusmn", +    "plussim", +    "plustwo", +    "pm", +    "pointint", +    "popf", +    "poun", +    "pound", +    "pr", +    "prE", +    "prap", +    "prcue", +    "pre", +    "prec", +    "precapprox", +    "preccurlyeq", +    "preceq", +    "precnapprox", +    "precneqq", +    "precnsim", +    "precsim", +    "prime", +    "primes", +    "prnE", +    "prnap", +    "prnsim", +    "prod", +    "profalar", +    "profline", +    "profsurf", +    "prop", +    "propto", +    "prsim", +    "prurel", +    "pscr", +    "psi", +    "puncsp", +    "qfr", +    "qint", +    "qopf", +    "qprime", +    "qscr", +    "quaternions", +    "quatint", +    "quest", +    "questeq", +    "quo", +    "quot", +    "rAarr", +    "rArr", +    "rAtail", +    "rBarr", +    "rHar", +    "race", +    "racute", +    "radic", +    "raemptyv", +    "rang", +    "rangd", +    "range", +    "rangle", +    "raqu", +    "raquo", +    "rarr", +    "rarrap", +    "rarrb", +    "rarrbfs", +    
"rarrc", +    "rarrfs", +    "rarrhk", +    "rarrlp", +    "rarrpl", +    "rarrsim", +    "rarrtl", +    "rarrw", +    "ratail", +    "ratio", +    "rationals", +    "rbarr", +    "rbbrk", +    "rbrace", +    "rbrack", +    "rbrke", +    "rbrksld", +    "rbrkslu", +    "rcaron", +    "rcedil", +    "rceil", +    "rcub", +    "rcy", +    "rdca", +    "rdldhar", +    "rdquo", +    "rdquor", +    "rdsh", +    "real", +    "realine", +    "realpart", +    "reals", +    "rect", +    "re", +    "reg", +    "rfisht", +    "rfloor", +    "rfr", +    "rhard", +    "rharu", +    "rharul", +    "rho", +    "rhov", +    "rightarrow", +    "rightarrowtail", +    "rightharpoondown", +    "rightharpoonup", +    "rightleftarrows", +    "rightleftharpoons", +    "rightrightarrows", +    "rightsquigarrow", +    "rightthreetimes", +    "ring", +    "risingdotseq", +    "rlarr", +    "rlhar", +    "rlm", +    "rmoust", +    "rmoustache", +    "rnmid", +    "roang", +    "roarr", +    "robrk", +    "ropar", +    "ropf", +    "roplus", +    "rotimes", +    "rpar", +    "rpargt", +    "rppolint", +    "rrarr", +    "rsaquo", +    "rscr", +    "rsh", +    "rsqb", +    "rsquo", +    "rsquor", +    "rthree", +    "rtimes", +    "rtri", +    "rtrie", +    "rtrif", +    "rtriltri", +    "ruluhar", +    "rx", +    "sacute", +    "sbquo", +    "sc", +    "scE", +    "scap", +    "scaron", +    "sccue", +    "sce", +    "scedil", +    "scirc", +    "scnE", +    "scnap", +    "scnsim", +    "scpolint", +    "scsim", +    "scy", +    "sdot", +    "sdotb", +    "sdote", +    "seArr", +    "searhk", +    "searr", +    "searrow", +    "sec", +    "sect", +    "semi", +    "seswar", +    "setminus", +    "setmn", +    "sext", +    "sfr", +    "sfrown", +    "sharp", +    "shchcy", +    "shcy", +    "shortmid", +    "shortparallel", +    "sh", +    "shy", +    "sigma", +    "sigmaf", +    "sigmav", +    "sim", +    "simdot", +    "sime", +    "simeq", +    "simg", +    "simgE", +    "siml", +    
"simlE", +    "simne", +    "simplus", +    "simrarr", +    "slarr", +    "smallsetminus", +    "smashp", +    "smeparsl", +    "smid", +    "smile", +    "smt", +    "smte", +    "smtes", +    "softcy", +    "sol", +    "solb", +    "solbar", +    "sopf", +    "spades", +    "spadesuit", +    "spar", +    "sqcap", +    "sqcaps", +    "sqcup", +    "sqcups", +    "sqsub", +    "sqsube", +    "sqsubset", +    "sqsubseteq", +    "sqsup", +    "sqsupe", +    "sqsupset", +    "sqsupseteq", +    "squ", +    "square", +    "squarf", +    "squf", +    "srarr", +    "sscr", +    "ssetmn", +    "ssmile", +    "sstarf", +    "star", +    "starf", +    "straightepsilon", +    "straightphi", +    "strns", +    "sub", +    "subE", +    "subdot", +    "sube", +    "subedot", +    "submult", +    "subnE", +    "subne", +    "subplus", +    "subrarr", +    "subset", +    "subseteq", +    "subseteqq", +    "subsetneq", +    "subsetneqq", +    "subsim", +    "subsub", +    "subsup", +    "succ", +    "succapprox", +    "succcurlyeq", +    "succeq", +    "succnapprox", +    "succneqq", +    "succnsim", +    "succsim", +    "sum", +    "sung", +    "sup", +    "sup1", +    "sup2", +    "sup3", +    "supE", +    "supdot", +    "supdsub", +    "supe", +    "supedot", +    "suphsol", +    "suphsub", +    "suplarr", +    "supmult", +    "supnE", +    "supne", +    "supplus", +    "supset", +    "supseteq", +    "supseteqq", +    "supsetneq", +    "supsetneqq", +    "supsim", +    "supsub", +    "supsup", +    "swArr", +    "swarhk", +    "swarr", +    "swarrow", +    "swnwar", +    "szli", +    "szlig", +    "target", +    "tau", +    "tbrk", +    "tcaron", +    "tcedil", +    "tcy", +    "tdot", +    "telrec", +    "tfr", +    "there4", +    "therefore", +    "theta", +    "thetasym", +    "thetav", +    "thickapprox", +    "thicksim", +    "thinsp", +    "thkap", +    "thksim", +    "thor", +    "thorn", +    "tilde", +    "time", +    "times", +    "timesb", +    "timesbar", +    
"timesd", +    "tint", +    "toea", +    "top", +    "topbot", +    "topcir", +    "topf", +    "topfork", +    "tosa", +    "tprime", +    "trade", +    "triangle", +    "triangledown", +    "triangleleft", +    "trianglelefteq", +    "triangleq", +    "triangleright", +    "trianglerighteq", +    "tridot", +    "trie", +    "triminus", +    "triplus", +    "trisb", +    "tritime", +    "trpezium", +    "tscr", +    "tscy", +    "tshcy", +    "tstrok", +    "twixt", +    "twoheadleftarrow", +    "twoheadrightarrow", +    "uArr", +    "uHar", +    "uacut", +    "uacute", +    "uarr", +    "ubrcy", +    "ubreve", +    "ucir", +    "ucirc", +    "ucy", +    "udarr", +    "udblac", +    "udhar", +    "ufisht", +    "ufr", +    "ugrav", +    "ugrave", +    "uharl", +    "uharr", +    "uhblk", +    "ulcorn", +    "ulcorner", +    "ulcrop", +    "ultri", +    "umacr", +    "um", +    "uml", +    "uogon", +    "uopf", +    "uparrow", +    "updownarrow", +    "upharpoonleft", +    "upharpoonright", +    "uplus", +    "upsi", +    "upsih", +    "upsilon", +    "upuparrows", +    "urcorn", +    "urcorner", +    "urcrop", +    "uring", +    "urtri", +    "uscr", +    "utdot", +    "utilde", +    "utri", +    "utrif", +    "uuarr", +    "uum", +    "uuml", +    "uwangle", +    "vArr", +    "vBar", +    "vBarv", +    "vDash", +    "vangrt", +    "varepsilon", +    "varkappa", +    "varnothing", +    "varphi", +    "varpi", +    "varpropto", +    "varr", +    "varrho", +    "varsigma", +    "varsubsetneq", +    "varsubsetneqq", +    "varsupsetneq", +    "varsupsetneqq", +    "vartheta", +    "vartriangleleft", +    "vartriangleright", +    "vcy", +    "vdash", +    "vee", +    "veebar", +    "veeeq", +    "vellip", +    "verbar", +    "vert", +    "vfr", +    "vltri", +    "vnsub", +    "vnsup", +    "vopf", +    "vprop", +    "vrtri", +    "vscr", +    "vsubnE", +    "vsubne", +    "vsupnE", +    "vsupne", +    "vzigzag", +    "wcirc", +    "wedbar", +    "wedge", +    
"wedgeq", +    "weierp", +    "wfr", +    "wopf", +    "wp", +    "wr", +    "wreath", +    "wscr", +    "xcap", +    "xcirc", +    "xcup", +    "xdtri", +    "xfr", +    "xhArr", +    "xharr", +    "xi", +    "xlArr", +    "xlarr", +    "xmap", +    "xnis", +    "xodot", +    "xopf", +    "xoplus", +    "xotime", +    "xrArr", +    "xrarr", +    "xscr", +    "xsqcup", +    "xuplus", +    "xutri", +    "xvee", +    "xwedge", +    "yacut", +    "yacute", +    "yacy", +    "ycirc", +    "ycy", +    "ye", +    "yen", +    "yfr", +    "yicy", +    "yopf", +    "yscr", +    "yucy", +    "yum", +    "yuml", +    "zacute", +    "zcaron", +    "zcy", +    "zdot", +    "zeetrf", +    "zeta", +    "zfr", +    "zhcy", +    "zigrarr", +    "zopf", +    "zscr", +    "zwj", +    "zwnj", +]; + +/// List of values corresponding to names of named +/// [character references][character_reference]. +/// +/// The corresponding names of this list are stored in +/// [`CHARACTER_REFERENCE_NAMES`][]. +/// They correspond through their index. 
+/// +/// ## References +/// +/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +/// +/// [character_reference]: crate::construct::character_reference +pub const CHARACTER_REFERENCE_VALUES: [&str; 2222] = [ +    "Æ", "Æ", "&", "&", "Á", "Á", "Ă", "Â", "Â", "А", "𝔄", "À", "À", "Α", "Ā", "⩓", "Ą", "𝔸", "", +    "Å", "Å", "𝒜", "≔", "Ã", "Ã", "Ä", "Ä", "∖", "⫧", "⌆", "Б", "∵", "ℬ", "Β", "𝔅", "𝔹", "˘", "ℬ", +    "≎", "Ч", "©", "©", "Ć", "⋒", "ⅅ", "ℭ", "Č", "Ç", "Ç", "Ĉ", "∰", "Ċ", "¸", "·", "ℭ", "Χ", "⊙", +    "⊖", "⊕", "⊗", "∲", "”", "’", "∷", "⩴", "≡", "∯", "∮", "ℂ", "∐", "∳", "⨯", "𝒞", "⋓", "≍", "ⅅ", +    "⤑", "Ђ", "Ѕ", "Џ", "‡", "↡", "⫤", "Ď", "Д", "∇", "Δ", "𝔇", "´", "˙", "˝", "`", "˜", "⋄", "ⅆ", +    "𝔻", "¨", "⃜", "≐", "∯", "¨", "⇓", "⇐", "⇔", "⫤", "⟸", "⟺", "⟹", "⇒", "⊨", "⇑", "⇕", "∥", "↓", +    "⤓", "⇵", "̑", "⥐", "⥞", "↽", "⥖", "⥟", "⇁", "⥗", "⊤", "↧", "⇓", "𝒟", "Đ", "Ŋ", "Ð", "Ð", "É", +    "É", "Ě", "Ê", "Ê", "Э", "Ė", "𝔈", "È", "È", "∈", "Ē", "◻", "▫", "Ę", "𝔼", "Ε", "⩵", "≂", "⇌", +    "ℰ", "⩳", "Η", "Ë", "Ë", "∃", "ⅇ", "Ф", "𝔉", "◼", "▪", "𝔽", "∀", "ℱ", "ℱ", "Ѓ", ">", ">", "Γ", +    "Ϝ", "Ğ", "Ģ", "Ĝ", "Г", "Ġ", "𝔊", "⋙", "𝔾", "≥", "⋛", "≧", "⪢", "≷", "⩾", "≳", "𝒢", "≫", "Ъ", +    "ˇ", "^", "Ĥ", "ℌ", "ℋ", "ℍ", "─", "ℋ", "Ħ", "≎", "≏", "Е", "IJ", "Ё", "Í", "Í", "Î", "Î", "И", +    "İ", "ℑ", "Ì", "Ì", "ℑ", "Ī", "ⅈ", "⇒", "∬", "∫", "⋂", "", "", "Į", "𝕀", "Ι", "ℐ", "Ĩ", "І", +    "Ï", "Ï", "Ĵ", "Й", "𝔍", "𝕁", "𝒥", "Ј", "Є", "Х", "Ќ", "Κ", "Ķ", "К", "𝔎", "𝕂", "𝒦", "Љ", "<", +    "<", "Ĺ", "Λ", "⟪", "ℒ", "↞", "Ľ", "Ļ", "Л", "⟨", "←", "⇤", "⇆", "⌈", "⟦", "⥡", "⇃", "⥙", "⌊", +    "↔", "⥎", "⊣", "↤", "⥚", "⊲", "⧏", "⊴", "⥑", "⥠", "↿", "⥘", "↼", "⥒", "⇐", "⇔", "⋚", "≦", "≶", +    "⪡", "⩽", "≲", "𝔏", "⋘", "⇚", "Ŀ", "⟵", "⟷", "⟶", "⟸", "⟺", "⟹", "𝕃", "↙", "↘", "ℒ", "↰", "Ł", +    "≪", "⤅", "М", " ", "ℳ", "𝔐", "∓", "𝕄", "ℳ", "Μ", "Њ", "Ń", "Ň", "Ņ", "Н", "\u{200B}", +   
 "\u{200B}", "\u{200B}", "\u{200B}", "≫", "≪", "\n", "𝔑", "\u{2060}", " ", "ℕ", "⫬", "≢", "≭", +    "∦", "∉", "≠", "≂̸", "∄", "≯", "≱", "≧̸", "≫̸", "≹", "⩾̸", "≵", "≎̸", "≏̸", "⋪", "⧏̸", "⋬", "≮", "≰", +    "≸", "≪̸", "⩽̸", "≴", "⪢̸", "⪡̸", "⊀", "⪯̸", "⋠", "∌", "⋫", "⧐̸", "⋭", "⊏̸", "⋢", "⊐̸", "⋣", "⊂⃒", "⊈", +    "⊁", "⪰̸", "⋡", "≿̸", "⊃⃒", "⊉", "≁", "≄", "≇", "≉", "∤", "𝒩", "Ñ", "Ñ", "Ν", "Œ", "Ó", "Ó", "Ô", +    "Ô", "О", "Ő", "𝔒", "Ò", "Ò", "Ō", "Ω", "Ο", "𝕆", "“", "‘", "⩔", "𝒪", "Ø", "Ø", "Õ", "Õ", "⨷", +    "Ö", "Ö", "‾", "⏞", "⎴", "⏜", "∂", "П", "𝔓", "Φ", "Π", "±", "ℌ", "ℙ", "⪻", "≺", "⪯", "≼", "≾", +    "″", "∏", "∷", "∝", "𝒫", "Ψ", "\"", "\"", "𝔔", "ℚ", "𝒬", "⤐", "®", "®", "Ŕ", "⟫", "↠", "⤖", +    "Ř", "Ŗ", "Р", "ℜ", "∋", "⇋", "⥯", "ℜ", "Ρ", "⟩", "→", "⇥", "⇄", "⌉", "⟧", "⥝", "⇂", "⥕", "⌋", +    "⊢", "↦", "⥛", "⊳", "⧐", "⊵", "⥏", "⥜", "↾", "⥔", "⇀", "⥓", "⇒", "ℝ", "⥰", "⇛", "ℛ", "↱", "⧴", +    "Щ", "Ш", "Ь", "Ś", "⪼", "Š", "Ş", "Ŝ", "С", "𝔖", "↓", "←", "→", "↑", "Σ", "∘", "𝕊", "√", "□", +    "⊓", "⊏", "⊑", "⊐", "⊒", "⊔", "𝒮", "⋆", "⋐", "⋐", "⊆", "≻", "⪰", "≽", "≿", "∋", "∑", "⋑", "⊃", +    "⊇", "⋑", "Þ", "Þ", "™", "Ћ", "Ц", "\t", "Τ", "Ť", "Ţ", "Т", "𝔗", "∴", "Θ", "  ", " ", "∼", +    "≃", "≅", "≈", "𝕋", "⃛", "𝒯", "Ŧ", "Ú", "Ú", "↟", "⥉", "Ў", "Ŭ", "Û", "Û", "У", "Ű", "𝔘", "Ù", +    "Ù", "Ū", "_", "⏟", "⎵", "⏝", "⋃", "⊎", "Ų", "𝕌", "↑", "⤒", "⇅", "↕", "⥮", "⊥", "↥", "⇑", "⇕", +    "↖", "↗", "ϒ", "Υ", "Ů", "𝒰", "Ũ", "Ü", "Ü", "⊫", "⫫", "В", "⊩", "⫦", "⋁", "‖", "‖", "∣", "|", +    "❘", "≀", " ", "𝔙", "𝕍", "𝒱", "⊪", "Ŵ", "⋀", "𝔚", "𝕎", "𝒲", "𝔛", "Ξ", "𝕏", "𝒳", "Я", "Ї", "Ю", +    "Ý", "Ý", "Ŷ", "Ы", "𝔜", "𝕐", "𝒴", "Ÿ", "Ж", "Ź", "Ž", "З", "Ż", "\u{200B}", "Ζ", "ℨ", "ℤ", +    "𝒵", "á", "á", "ă", "∾", "∾̳", "∿", "â", "â", "´", "´", "а", "æ", "æ", "", "𝔞", "à", "à", "ℵ", +    "ℵ", "α", "ā", "⨿", "&", "&", "∧", "⩕", "⩜", "⩘", "⩚", "∠", "⦤", "∠", "∡", "⦨", "⦩", "⦪", "⦫", +    "⦬", "⦭", "⦮", "⦯", "∟", "⊾", "⦝", "∢", "Å", "⍼", "ą", "𝕒", "≈", "⩰", "⩯", "≊", "≋", "'", 
"≈", +    "≊", "å", "å", "𝒶", "*", "≈", "≍", "ã", "ã", "ä", "ä", "∳", "⨑", "⫭", "≌", "϶", "‵", "∽", "⋍", +    "⊽", "⌅", "⌅", "⎵", "⎶", "≌", "б", "„", "∵", "∵", "⦰", "϶", "ℬ", "β", "ℶ", "≬", "𝔟", "⋂", "◯", +    "⋃", "⨀", "⨁", "⨂", "⨆", "★", "▽", "△", "⨄", "⋁", "⋀", "⤍", "⧫", "▪", "▴", "▾", "◂", "▸", "␣", +    "▒", "░", "▓", "█", "=⃥", "≡⃥", "⌐", "𝕓", "⊥", "⊥", "⋈", "╗", "╔", "╖", "╓", "═", "╦", "╩", "╤", +    "╧", "╝", "╚", "╜", "╙", "║", "╬", "╣", "╠", "╫", "╢", "╟", "⧉", "╕", "╒", "┐", "┌", "─", "╥", +    "╨", "┬", "┴", "⊟", "⊞", "⊠", "╛", "╘", "┘", "└", "│", "╪", "╡", "╞", "┼", "┤", "├", "‵", "˘", +    "¦", "¦", "𝒷", "⁏", "∽", "⋍", "\\", "⧅", "⟈", "•", "•", "≎", "⪮", "≏", "≏", "ć", "∩", "⩄", "⩉", +    "⩋", "⩇", "⩀", "∩︀", "⁁", "ˇ", "⩍", "č", "ç", "ç", "ĉ", "⩌", "⩐", "ċ", "¸", "¸", "⦲", "¢", "¢", +    "·", "𝔠", "ч", "✓", "✓", "χ", "○", "⧃", "ˆ", "≗", "↺", "↻", "®", "Ⓢ", "⊛", "⊚", "⊝", "≗", "⨐", +    "⫯", "⧂", "♣", "♣", ":", "≔", "≔", ",", "@", "∁", "∘", "∁", "ℂ", "≅", "⩭", "∮", "𝕔", "∐", "©", +    "©", "℗", "↵", "✗", "𝒸", "⫏", "⫑", "⫐", "⫒", "⋯", "⤸", "⤵", "⋞", "⋟", "↶", "⤽", "∪", "⩈", "⩆", +    "⩊", "⊍", "⩅", "∪︀", "↷", "⤼", "⋞", "⋟", "⋎", "⋏", "¤", "¤", "↶", "↷", "⋎", "⋏", "∲", "∱", "⌭", +    "⇓", "⥥", "†", "ℸ", "↓", "‐", "⊣", "⤏", "˝", "ď", "д", "ⅆ", "‡", "⇊", "⩷", "°", "°", "δ", "⦱", +    "⥿", "𝔡", "⇃", "⇂", "⋄", "⋄", "♦", "♦", "¨", "ϝ", "⋲", "÷", "÷", "÷", "⋇", "⋇", "ђ", "⌞", "⌍", +    "$", "𝕕", "˙", "≐", "≑", "∸", "∔", "⊡", "⌆", "↓", "⇊", "⇃", "⇂", "⤐", "⌟", "⌌", "𝒹", "ѕ", "⧶", +    "đ", "⋱", "▿", "▾", "⇵", "⥯", "⦦", "џ", "⟿", "⩷", "≑", "é", "é", "⩮", "ě", "ê", "ê", "≕", "э", +    "ė", "ⅇ", "≒", "𝔢", "⪚", "è", "è", "⪖", "⪘", "⪙", "⏧", "ℓ", "⪕", "⪗", "ē", "∅", "∅", "∅", " ", +    " ", " ", "ŋ", " ", "ę", "𝕖", "⋕", "⧣", "⩱", "ε", "ε", "ϵ", "≖", "≕", "≂", "⪖", "⪕", "=", "≟", +    "≡", "⩸", "⧥", "≓", "⥱", "ℯ", "≐", "≂", "η", "ð", "ð", "ë", "ë", "€", "!", "∃", "ℰ", "ⅇ", "≒", +    "ф", "♀", "ffi", "ff", "ffl", "𝔣", "fi", "fj", "♭", "fl", "▱", "ƒ", "𝕗", "∀", "⋔", 
"⫙", "⨍", "¼", "½", +    "⅓", "¼", "⅕", "⅙", "⅛", "⅔", "⅖", "¾", "¾", "⅗", "⅜", "⅘", "⅚", "⅝", "⅞", "⁄", "⌢", "𝒻", "≧", +    "⪌", "ǵ", "γ", "ϝ", "⪆", "ğ", "ĝ", "г", "ġ", "≥", "⋛", "≥", "≧", "⩾", "⩾", "⪩", "⪀", "⪂", "⪄", +    "⋛︀", "⪔", "𝔤", "≫", "⋙", "ℷ", "ѓ", "≷", "⪒", "⪥", "⪤", "≩", "⪊", "⪊", "⪈", "⪈", "≩", "⋧", "𝕘", +    "`", "ℊ", "≳", "⪎", "⪐", ">", ">", "⪧", "⩺", "⋗", "⦕", "⩼", "⪆", "⥸", "⋗", "⋛", "⪌", "≷", "≳", +    "≩︀", "≩︀", "⇔", " ", "½", "ℋ", "ъ", "↔", "⥈", "↭", "ℏ", "ĥ", "♥", "♥", "…", "⊹", "𝔥", "⤥", "⤦", +    "⇿", "∻", "↩", "↪", "𝕙", "―", "𝒽", "ℏ", "ħ", "⁃", "‐", "í", "í", "", "î", "î", "и", "е", "¡", +    "¡", "⇔", "𝔦", "ì", "ì", "ⅈ", "⨌", "∭", "⧜", "℩", "ij", "ī", "ℑ", "ℐ", "ℑ", "ı", "⊷", "Ƶ", "∈", +    "℅", "∞", "⧝", "ı", "∫", "⊺", "ℤ", "⊺", "⨗", "⨼", "ё", "į", "𝕚", "ι", "⨼", "¿", "¿", "𝒾", "∈", +    "⋹", "⋵", "⋴", "⋳", "∈", "", "ĩ", "і", "ï", "ï", "ĵ", "й", "𝔧", "ȷ", "𝕛", "𝒿", "ј", "є", "κ", +    "ϰ", "ķ", "к", "𝔨", "ĸ", "х", "ќ", "𝕜", "𝓀", "⇚", "⇐", "⤛", "⤎", "≦", "⪋", "⥢", "ĺ", "⦴", "ℒ", +    "λ", "⟨", "⦑", "⟨", "⪅", "«", "«", "←", "⇤", "⤟", "⤝", "↩", "↫", "⤹", "⥳", "↢", "⪫", "⤙", "⪭", +    "⪭︀", "⤌", "❲", "{", "[", "⦋", "⦏", "⦍", "ľ", "ļ", "⌈", "{", "л", "⤶", "“", "„", "⥧", "⥋", "↲", +    "≤", "←", "↢", "↽", "↼", "⇇", "↔", "⇆", "⇋", "↭", "⋋", "⋚", "≤", "≦", "⩽", "⩽", "⪨", "⩿", "⪁", +    "⪃", "⋚︀", "⪓", "⪅", "⋖", "⋚", "⪋", "≶", "≲", "⥼", "⌊", "𝔩", "≶", "⪑", "↽", "↼", "⥪", "▄", "љ", +    "≪", "⇇", "⌞", "⥫", "◺", "ŀ", "⎰", "⎰", "≨", "⪉", "⪉", "⪇", "⪇", "≨", "⋦", "⟬", "⇽", "⟦", "⟵", +    "⟷", "⟼", "⟶", "↫", "↬", "⦅", "𝕝", "⨭", "⨴", "∗", "_", "◊", "◊", "⧫", "(", "⦓", "⇆", "⌟", "⇋", +    "⥭", "", "⊿", "‹", "𝓁", "↰", "≲", "⪍", "⪏", "[", "‘", "‚", "ł", "<", "<", "⪦", "⩹", "⋖", "⋋", +    "⋉", "⥶", "⩻", "⦖", "◃", "⊴", "◂", "⥊", "⥦", "≨︀", "≨︀", "∺", "¯", "¯", "♂", "✠", "✠", "↦", "↦", +    "↧", "↤", "↥", "▮", "⨩", "м", "—", "∡", "𝔪", "℧", "µ", "µ", "∣", "*", "⫰", "·", "·", "−", "⊟", +    "∸", "⨪", "⫛", "…", "∓", "⊧", "𝕞", "∓", "𝓂", "∾", "μ", "⊸", "⊸", 
"⋙̸", "≫⃒", "≫̸", "⇍", "⇎", "⋘̸", +    "≪⃒", "≪̸", "⇏", "⊯", "⊮", "∇", "ń", "∠⃒", "≉", "⩰̸", "≋̸", "ʼn", "≉", "♮", "♮", "ℕ", " ", " ", "≎̸", +    "≏̸", "⩃", "ň", "ņ", "≇", "⩭̸", "⩂", "н", "–", "≠", "⇗", "⤤", "↗", "↗", "≐̸", "≢", "⤨", "≂̸", "∄", +    "∄", "𝔫", "≧̸", "≱", "≱", "≧̸", "⩾̸", "⩾̸", "≵", "≯", "≯", "⇎", "↮", "⫲", "∋", "⋼", "⋺", "∋", "њ", +    "⇍", "≦̸", "↚", "‥", "≰", "↚", "↮", "≰", "≦̸", "⩽̸", "⩽̸", "≮", "≴", "≮", "⋪", "⋬", "∤", "𝕟", "¬", +    "¬", "∉", "⋹̸", "⋵̸", "∉", "⋷", "⋶", "∌", "∌", "⋾", "⋽", "∦", "∦", "⫽⃥", "∂̸", "⨔", "⊀", "⋠", "⪯̸", +    "⊀", "⪯̸", "⇏", "↛", "⤳̸", "↝̸", "↛", "⋫", "⋭", "⊁", "⋡", "⪰̸", "𝓃", "∤", "∦", "≁", "≄", "≄", "∤", +    "∦", "⋢", "⋣", "⊄", "⫅̸", "⊈", "⊂⃒", "⊈", "⫅̸", "⊁", "⪰̸", "⊅", "⫆̸", "⊉", "⊃⃒", "⊉", "⫆̸", "≹", "ñ", +    "ñ", "≸", "⋪", "⋬", "⋫", "⋭", "ν", "#", "№", " ", "⊭", "⤄", "≍⃒", "⊬", "≥⃒", ">⃒", "⧞", "⤂", "≤⃒", +    "<⃒", "⊴⃒", "⤃", "⊵⃒", "∼⃒", "⇖", "⤣", "↖", "↖", "⤧", "Ⓢ", "ó", "ó", "⊛", "ô", "ô", "о", "⊝", "ő", +    "⨸", "⊙", "⦼", "œ", "⦿", "𝔬", "˛", "ò", "ò", "⧁", "⦵", "Ω", "∮", "↺", "⦾", "⦻", "‾", "⧀", "ō", +    "ω", "ο", "⦶", "⊖", "𝕠", "⦷", "⦹", "⊕", "∨", "↻", "º", "ℴ", "ℴ", "ª", "º", "⊶", "⩖", "⩗", "⩛", +    "ℴ", "ø", "ø", "⊘", "õ", "õ", "⊗", "⨶", "ö", "ö", "⌽", "¶", "¶", "∥", "⫳", "⫽", "∂", "п", "%", +    ".", "‰", "⊥", "‱", "𝔭", "φ", "ϕ", "ℳ", "☎", "π", "⋔", "ϖ", "ℏ", "ℎ", "ℏ", "+", "⨣", "⊞", "⨢", +    "∔", "⨥", "⩲", "±", "±", "⨦", "⨧", "±", "⨕", "𝕡", "£", "£", "≺", "⪳", "⪷", "≼", "⪯", "≺", "⪷", +    "≼", "⪯", "⪹", "⪵", "⋨", "≾", "′", "ℙ", "⪵", "⪹", "⋨", "∏", "⌮", "⌒", "⌓", "∝", "∝", "≾", "⊰", +    "𝓅", "ψ", " ", "𝔮", "⨌", "𝕢", "⁗", "𝓆", "ℍ", "⨖", "?", "≟", "\"", "\"", "⇛", "⇒", "⤜", "⤏", +    "⥤", "∽̱", "ŕ", "√", "⦳", "⟩", "⦒", "⦥", "⟩", "»", "»", "→", "⥵", "⇥", "⤠", "⤳", "⤞", "↪", "↬", +    "⥅", "⥴", "↣", "↝", "⤚", "∶", "ℚ", "⤍", "❳", "}", "]", "⦌", "⦎", "⦐", "ř", "ŗ", "⌉", "}", "р", +    "⤷", "⥩", "”", "”", "↳", "ℜ", "ℛ", "ℜ", "ℝ", "▭", "®", "®", "⥽", "⌋", "𝔯", "⇁", "⇀", "⥬", "ρ", +    "ϱ", "→", "↣", "⇁", 
"⇀", "⇄", "⇌", "⇉", "↝", "⋌", "˚", "≓", "⇄", "⇌", "", "⎱", "⎱", "⫮", "⟭", +    "⇾", "⟧", "⦆", "𝕣", "⨮", "⨵", ")", "⦔", "⨒", "⇉", "›", "𝓇", "↱", "]", "’", "’", "⋌", "⋊", "▹", +    "⊵", "▸", "⧎", "⥨", "℞", "ś", "‚", "≻", "⪴", "⪸", "š", "≽", "⪰", "ş", "ŝ", "⪶", "⪺", "⋩", "⨓", +    "≿", "с", "⋅", "⊡", "⩦", "⇘", "⤥", "↘", "↘", "§", "§", ";", "⤩", "∖", "∖", "✶", "𝔰", "⌢", "♯", +    "щ", "ш", "∣", "∥", "\u{AD}", "\u{AD}", "σ", "ς", "ς", "∼", "⩪", "≃", "≃", "⪞", "⪠", "⪝", "⪟", +    "≆", "⨤", "⥲", "←", "∖", "⨳", "⧤", "∣", "⌣", "⪪", "⪬", "⪬︀", "ь", "/", "⧄", "⌿", "𝕤", "♠", "♠", +    "∥", "⊓", "⊓︀", "⊔", "⊔︀", "⊏", "⊑", "⊏", "⊑", "⊐", "⊒", "⊐", "⊒", "□", "□", "▪", "▪", "→", "𝓈", +    "∖", "⌣", "⋆", "☆", "★", "ϵ", "ϕ", "¯", "⊂", "⫅", "⪽", "⊆", "⫃", "⫁", "⫋", "⊊", "⪿", "⥹", "⊂", +    "⊆", "⫅", "⊊", "⫋", "⫇", "⫕", "⫓", "≻", "⪸", "≽", "⪰", "⪺", "⪶", "⋩", "≿", "∑", "♪", "⊃", "¹", +    "²", "³", "⫆", "⪾", "⫘", "⊇", "⫄", "⟉", "⫗", "⥻", "⫂", "⫌", "⊋", "⫀", "⊃", "⊇", "⫆", "⊋", "⫌", +    "⫈", "⫔", "⫖", "⇙", "⤦", "↙", "↙", "⤪", "ß", "ß", "⌖", "τ", "⎴", "ť", "ţ", "т", "⃛", "⌕", "𝔱", +    "∴", "∴", "θ", "ϑ", "ϑ", "≈", "∼", " ", "≈", "∼", "þ", "þ", "˜", "×", "×", "⊠", "⨱", "⨰", "∭", +    "⤨", "⊤", "⌶", "⫱", "𝕥", "⫚", "⤩", "‴", "™", "▵", "▿", "◃", "⊴", "≜", "▹", "⊵", "◬", "≜", "⨺", +    "⨹", "⧍", "⨻", "⏢", "𝓉", "ц", "ћ", "ŧ", "≬", "↞", "↠", "⇑", "⥣", "ú", "ú", "↑", "ў", "ŭ", "û", +    "û", "у", "⇅", "ű", "⥮", "⥾", "𝔲", "ù", "ù", "↿", "↾", "▀", "⌜", "⌜", "⌏", "◸", "ū", "¨", "¨", +    "ų", "𝕦", "↑", "↕", "↿", "↾", "⊎", "υ", "ϒ", "υ", "⇈", "⌝", "⌝", "⌎", "ů", "◹", "𝓊", "⋰", "ũ", +    "▵", "▴", "⇈", "ü", "ü", "⦧", "⇕", "⫨", "⫩", "⊨", "⦜", "ϵ", "ϰ", "∅", "ϕ", "ϖ", "∝", "↕", "ϱ", +    "ς", "⊊︀", "⫋︀", "⊋︀", "⫌︀", "ϑ", "⊲", "⊳", "в", "⊢", "∨", "⊻", "≚", "⋮", "|", "|", "𝔳", "⊲", "⊂⃒", +    "⊃⃒", "𝕧", "∝", "⊳", "𝓋", "⫋︀", "⊊︀", "⫌︀", "⊋︀", "⦚", "ŵ", "⩟", "∧", "≙", "℘", "𝔴", "𝕨", "℘", "≀", +    "≀", "𝓌", "⋂", "◯", "⋃", "▽", "𝔵", "⟺", "⟷", "ξ", "⟸", "⟵", "⟼", "⋻", "⨀", "𝕩", "⨁", "⨂", "⟹", +    "⟶", 
"𝓍", "⨆", "⨄", "△", "⋁", "⋀", "ý", "ý", "я", "ŷ", "ы", "¥", "¥", "𝔶", "ї", "𝕪", "𝓎", "ю", +    "ÿ", "ÿ", "ź", "ž", "з", "ż", "ℨ", "ζ", "𝔷", "ж", "⇝", "𝕫", "𝓏", "", "", +]; diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs new file mode 100644 index 0000000..7b7962b --- /dev/null +++ b/src/construct/blank_line.rs @@ -0,0 +1,61 @@ +//! Blank lines are a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! blank_line ::= *(' ' '\t') +//! ``` +//! +//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! from another paragraph. +//! In several cases, blank lines are not needed between flow constructs, +//! such as between two headings. +//! Sometimes, whether blank lines are present, changes the behavior of how +//! HTML is rendered, such as whether blank lines are present between list +//! items in a list. +//! More than one blank line is never needed in `CommonMark`. +//! +//! Because blank lines can be empty (line endings are not considered part of +//! it), and events cannot be empty, blank lines are not present as a token. +//! +//! ## References +//! +//! *   [`blank-line.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/blank-line.js) +//! *   [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) +//! +//! <!-- To do: link `flow`, `heading`, `list`, `paragraph` --> + +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a blank line. +/// +/// Note: `␠` represents a space character. 
+/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.attempt( +        |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace), +        |_ok| Box::new(after), +    )(tokenizer, code) +} + +/// After zero or more spaces or tabs, before a line ending or EOF. +/// +/// Note: `␠` represents a space character. +/// +/// ```markdown +/// |␠␠ +/// | +/// ``` +fn after(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            (State::Ok, Some(vec![code])) +        } +        _ => (State::Nok, None), +    } +} diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs new file mode 100644 index 0000000..5ea995e --- /dev/null +++ b/src/construct/character_escape.rs @@ -0,0 +1,69 @@ +//! Character escapes are a construct that occurs in the string and text +//! content types. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! character_escape ::= '\\' ascii_punctuation +//! ``` +//! +//! Like much of markdown, there are no “invalid” character escapes: just a +//! slash, or a slash followed by anything other than an ASCII punctuation +//! character, is exactly that: just a slash. +//! To escape (most) arbitrary characters, use a +//! [character reference][] instead +//! (as in, `&`, `{`, or say `	`). +//! It is also possible to escape a line ending in text with a similar +//! construct: a backslash followed by a line ending (that is part of the +//! construct instead of ending it). +//! +//! ## References +//! +//! *   [`character-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-escape.js) +//! *   [*§ 2.4 Backslash escapes* in `CommonMark`](https://spec.commonmark.org/0.30/#backslash-escapes) +//! +//! 
[character reference]: crate::construct::character_reference +//! +//! <!-- To do: link `hard_break_escape`, `string`, `text` --> + +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a character escape. +/// +/// ```markdown +/// a|\*b +/// a|\b +/// a|\ b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::Char('\\') => { +            tokenizer.enter(TokenType::CharacterEscape); +            tokenizer.enter(TokenType::CharacterEscapeMarker); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::CharacterEscapeMarker); +            (State::Fn(Box::new(inside)), None) +        } +        _ => (State::Nok, None), +    } +} + +/// Inside a character escape, after `\`. +/// +/// ```markdown +/// a\|*b +/// a\|b +/// a\| b +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::Char(char) if char.is_ascii_punctuation() => { +            tokenizer.enter(TokenType::CharacterEscapeValue); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::CharacterEscapeValue); +            tokenizer.exit(TokenType::CharacterEscape); +            (State::Ok, None) +        } +        _ => (State::Nok, None), +    } +} diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs new file mode 100644 index 0000000..27275d5 --- /dev/null +++ b/src/construct/character_reference.rs @@ -0,0 +1,237 @@ +//! Character references are a construct that occurs in the string and text +//! content types. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! character_reference ::= '&' (numeric | named) ';' +//! +//! numeric ::= '#' (hexadecimal | decimal) +//! ; Note: Limit of `6` imposed as all bigger numbers are invalid: +//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit) +//! ; Note: Limit of `7` imposed as all bigger numbers are invalid: +//! 
decimal ::= 1*7(ascii_digit) +//! ; Note: Limit of `31` imposed by `CounterClockwiseContourIntegral`: +//! ; Note: Limited to any known named character reference (see `constants.rs`) +//! named ::= 1*31(ascii_alphanumeric) +//! ``` +//! +//! Like much of markdown, there are no “invalid” character references. +//! However, for security reasons, several numeric character references parse +//! fine but are not rendered as their corresponding character and they are +//! instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`). +//! See [`decode_numeric_character_reference`][decode_numeric] for more info. +//! +//! To escape ASCII punctuation characters, use the terser +//! [character escape][character_escape] construct instead (as in, `\&`). +//! +//! Character references in markdown are not the same as character references +//! in HTML. +//! Notably, HTML allows several character references without a closing +//! semicolon. +//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info. +//! +//! Character references are parsed insensitive to casing. +//! The casing of hexadecimal numeric character references has no effect. +//! The casing of named character references does not matter when parsing them, +//! but does affect whether they match. +//! Depending on the name, one or more cases are allowed, such as that `AMP` +//! and `amp` are both allowed but other cases are not. +//! See [`CHARACTER_REFERENCE_NAMES`][character_reference_names] for which +//! names match. +//! +//! ## References +//! +//! *   [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js) +//! *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +//! +//! [character_escape]: crate::construct::character_reference +//! 
[decode_numeric]: crate::util::decode_numeric_character_reference +//! [character_reference_names]: crate::constant::CHARACTER_REFERENCE_NAMES +//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state +//! +//! <!-- To do: link `string`, `text` --> + +use crate::constant::{ +    CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, +    CHARACTER_REFERENCE_NAMED_SIZE_MAX, CHARACTER_REFERENCE_NAMES, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of a character reference. +#[derive(Debug, Clone)] +pub enum Kind { +    /// Numeric decimal character reference (`	`). +    Decimal, +    /// Numeric hexadecimal character reference (`{`). +    Hexadecimal, +    /// Named character reference (`&`). +    Named, +} + +/// State needed to parse character references. +#[derive(Debug, Clone)] +struct Info { +    /// All parsed characters. +    buffer: Vec<char>, +    /// Kind of character reference. +    kind: Kind, +} + +/// Start of a character reference. +/// +/// ```markdown +/// a|&b +/// a|{b +/// a|	b +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::Char('&') => { +            tokenizer.enter(TokenType::CharacterReference); +            tokenizer.enter(TokenType::CharacterReferenceMarker); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::CharacterReferenceMarker); +            (State::Fn(Box::new(open)), None) +        } +        _ => (State::Nok, None), +    } +} + +/// Inside a character reference, after `&`, before `#` for numeric references +/// or an alphanumeric for named references. 
+/// +/// ```markdown +/// a&|amp;b +/// a&|#123;b +/// a&|#x9;b +/// ``` +fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    if let Code::Char('#') = code { +        tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric); +        tokenizer.consume(code); +        tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric); +        (State::Fn(Box::new(numeric)), None) +    } else { +        tokenizer.enter(TokenType::CharacterReferenceValue); +        value( +            tokenizer, +            code, +            Info { +                buffer: vec![], +                kind: Kind::Named, +            }, +        ) +    } +} + +/// Inside a numeric character reference, right before `x` for hexadecimals, +/// or a digit for decimals. +/// +/// ```markdown +/// a&#|123;b +/// a&#|x9;b +/// ``` +fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::Char(char) if char == 'x' || char == 'X' => { +            tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal); +            tokenizer.enter(TokenType::CharacterReferenceValue); + +            ( +                State::Fn(Box::new(|tokenizer, code| { +                    value( +                        tokenizer, +                        code, +                        Info { +                            buffer: vec![], +                            kind: Kind::Hexadecimal, +                        }, +                    ) +                })), +                None, +            ) +        } +        _ => { +            tokenizer.enter(TokenType::CharacterReferenceValue); + +            value( +                tokenizer, +                code, +                Info { +                    buffer: vec![], +                    kind: Kind::Decimal, +                }, +            ) +        } +    } +} + +/// Inside a character 
reference value, after the markers (`&#x`, `&#`, or +/// `&`) that define its kind, but before the `;`. +/// The character reference kind defines what and how many characters are +/// allowed. +/// +/// ```markdown +/// a&a|mp;b +/// a|23;b +/// a&#x|9;b +/// ``` +fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { +    match code { +        Code::Char(';') if !info.buffer.is_empty() => { +            tokenizer.exit(TokenType::CharacterReferenceValue); +            let value = info.buffer.iter().collect::<String>(); + +            if let Kind::Named = info.kind { +                if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) { +                    return (State::Nok, Some(vec![code])); +                } +            } + +            tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); +            tokenizer.exit(TokenType::CharacterReference); +            (State::Ok, None) +        } +        Code::Char(char) => { +            let len = info.buffer.len(); + +            let cont = match info.kind { +                Kind::Hexadecimal +                    if char.is_ascii_hexdigit() +                        && len < CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX => +                { +                    true +                } +                Kind::Decimal +                    if char.is_ascii_digit() && len < CHARACTER_REFERENCE_DECIMAL_SIZE_MAX => +                { +                    true +                } +                Kind::Named +                    if char.is_ascii_alphanumeric() && len < CHARACTER_REFERENCE_NAMED_SIZE_MAX => +                { +                    true +                } +                _ => false, +            }; + +            if cont { +                let mut clone = info; +                clone.buffer.push(char); +                tokenizer.consume(code); +                ( +            
        State::Fn(Box::new(|tokenizer, code| value(tokenizer, code, clone))), +                    None, +                ) +            } else { +                (State::Nok, None) +            } +        } +        _ => (State::Nok, None), +    } +} diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs new file mode 100644 index 0000000..2068a62 --- /dev/null +++ b/src/construct/code_fenced.rs @@ -0,0 +1,581 @@ +//! Code (fenced) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ] +//! +//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab +//! ; Restriction: the number of markers in the closing fence sequence must be +//! ; equal to or greater than the number of markers in the opening fence +//! ; sequence. +//! ; Restriction: the marker in the closing fence sequence must match the +//! ; marker in the opening fence sequence +//! fence_close ::= sequence *space_or_tab +//! sequence ::= 3*'`' | 3*'~' +//! info ::= 1*text +//! meta ::= 1*text *( *space_or_tab 1*text ) +//! +//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the +//! ; marker of the opening fence sequence. +//! text ::= code - eol - space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! code ::= . ; any unicode code point (other than line endings). +//! ``` +//! +//! The above grammar does not show how whitespace is handled. +//! To parse code (fenced), let `X` be the number of whitespace characters +//! before the opening fence sequence. +//! Each line of content is then allowed (not required) to be indented with up +//! to `X` spaces or tabs, which are then ignored as an indent instead of being +//! considered as part of the code. +//! This indent does not affect the closing fence. +//! It can be indented up to a separate 3 spaces or tabs. +//! 
A bigger indent makes it part of the code instead of a fence. +//! +//! Code (fenced) relates to both the `<pre>` and the `<code>` elements in +//! HTML. +//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` +//! element*][html-code] in the HTML spec for more info. +//! +//! The optional `meta` part is ignored: it is not used when parsing or +//! rendering. +//! The optional `info` part is used and is expected to specify the programming +//! language that the code is in. +//! Which value it holds depends on what your syntax highlighter supports, if +//! one is used. +//! The `info` is, when rendering to HTML, typically exposed as a class. +//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code` +//! element*][html-code]). +//! For example: +//! +//! ```markdown +//! ~~~css +//! * { color: tomato } +//! ~~~ +//! ``` +//! +//! Yields: +//! +//! ```html +//! <pre><code class="language-css">* { color: tomato } +//! </code></pre> +//! ``` +//! +//! The `info` and `meta` parts are interpreted as the string content type. +//! That means that character escapes and character reference are allowed. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! It is also possible to create code with the +//! [code (indented)][code-indented] construct. +//! That construct is less explicit, different from code (text), and has no +//! support for specifying the programming language, so it is recommended to +//! use code (fenced) instead of code (indented). +//! +//! ## References +//! +//! *   [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js) +//! *   [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks) +//! +//! [code-indented]: crate::construct::code_indented +//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! 
[html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! <!-- To do: link `flow`, `text`, `code_text`, `string` --> + +use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE}; +use crate::construct::partial_whitespace::start as whitespace; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::get_span; + +/// Kind of fences. +#[derive(Debug, Clone, PartialEq)] +pub enum Kind { +    /// Grave accent (tick) code. +    GraveAccent, +    /// Tilde code. +    Tilde, +} + +/// State needed to parse code (fenced). +#[derive(Debug, Clone)] +struct Info { +    /// Number of markers on the opening fence sequence. +    size: usize, +    /// Number of tabs or spaces of indentation before the opening fence +    /// sequence. +    prefix: usize, +    /// Kind of fences. +    kind: Kind, +} + +/// Start of fenced code. +/// +/// ```markdown +/// | ~~~js +///  console.log(1); +///  ~~~ +/// ``` +/// +/// Parsing note: normally, the prefix is already stripped. +/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need +/// it. +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.enter(TokenType::CodeFenced); +    tokenizer.enter(TokenType::CodeFencedFence); +    tokenizer.attempt( +        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), +        |_ok| Box::new(before_sequence_open), +    )(tokenizer, code) +} + +/// Inside the opening fence, after an optional prefix, before a sequence. 
+/// +/// ```markdown +/// |~~~js +/// console.log(1); +/// ~~~ +/// ``` +fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    let tail = tokenizer.events.last(); +    let mut prefix = 0; + +    if let Some(event) = tail { +        if event.token_type == TokenType::Whitespace { +            let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); +            prefix = span.end_index - span.start_index; +        } +    } + +    match code { +        Code::Char(char) if char == '`' || char == '~' => { +            tokenizer.enter(TokenType::CodeFencedFenceSequence); +            sequence_open( +                tokenizer, +                Info { +                    prefix, +                    size: 0, +                    kind: if char == '`' { +                        Kind::GraveAccent +                    } else { +                        Kind::Tilde +                    }, +                }, +                code, +            ) +        } +        _ => (State::Nok, None), +    } +} + +/// Inside the opening fence sequence. 
+/// +/// ```markdown +/// ~|~~js +/// console.log(1); +/// ~~~ +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    let marker = if info.kind == Kind::GraveAccent { +        '`' +    } else { +        '~' +    }; + +    match code { +        Code::Char(char) if char == marker => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(|tokenizer, code| { +                    let mut info = info; +                    info.size += 1; +                    sequence_open(tokenizer, info, code) +                })), +                None, +            ) +        } +        _ => { +            if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN { +                (State::Nok, None) +            } else { +                tokenizer.exit(TokenType::CodeFencedFenceSequence); +                tokenizer.attempt( +                    |tokenizer, code| { +                        whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace) +                    }, +                    |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)), +                )(tokenizer, code) +            } +        } +    } +} + +/// Inside the opening fence, after the sequence (and optional whitespace), before the info. +/// +/// ```markdown +/// ~~~|js +/// console.log(1); +/// ~~~ +/// ``` +fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::CodeFencedFence); +            at_break(tokenizer, info, code) +        } +        _ => { +            tokenizer.enter(TokenType::CodeFencedFenceInfo); +            tokenizer.enter(TokenType::ChunkString); +            info_inside(tokenizer, info, code, vec![]) +        } +    } +} + +/// Inside the opening fence info. 
+/// +/// ```markdown +/// ~~~j|s +/// console.log(1); +/// ~~~ +/// ``` +fn info_inside( +    tokenizer: &mut Tokenizer, +    info: Info, +    code: Code, +    codes: Vec<Code>, +) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            println!("to do: subtokenize: {:?}", codes); +            tokenizer.exit(TokenType::ChunkString); +            tokenizer.exit(TokenType::CodeFencedFenceInfo); +            tokenizer.exit(TokenType::CodeFencedFence); +            at_break(tokenizer, info, code) +        } +        Code::VirtualSpace | Code::Char('\t' | ' ') => { +            println!("to do: subtokenize: {:?}", codes); +            tokenizer.exit(TokenType::ChunkString); +            tokenizer.exit(TokenType::CodeFencedFenceInfo); +            tokenizer.attempt( +                |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), +                |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)), +            )(tokenizer, code) +        } +        Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), +        Code::Char(_) => { +            let mut codes = codes; +            codes.push(code); +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(|tokenizer, code| { +                    info_inside(tokenizer, info, code, codes) +                })), +                None, +            ) +        } +    } +} + +/// Inside the opening fence, after the info and whitespace, before the meta. 
+/// +/// ```markdown +/// ~~~js |eval +/// console.log(1); +/// ~~~ +/// ``` +fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::CodeFencedFence); +            at_break(tokenizer, info, code) +        } +        _ => { +            tokenizer.enter(TokenType::CodeFencedFenceMeta); +            tokenizer.enter(TokenType::ChunkString); +            meta(tokenizer, info, code) +        } +    } +} + +/// Inside the opening fence meta. +/// +/// ```markdown +/// ~~~js e|val +/// console.log(1); +/// ~~~ +/// ``` +fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::ChunkString); +            tokenizer.exit(TokenType::CodeFencedFenceMeta); +            tokenizer.exit(TokenType::CodeFencedFence); +            at_break(tokenizer, info, code) +        } +        Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), +        _ => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))), +                None, +            ) +        } +    } +} + +/// At an eol/eof in code, before a closing fence or before content. 
+/// +/// ```markdown +/// ~~~js| +/// aa| +/// ~~~ +/// ``` +fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    let clone = info.clone(); + +    match code { +        Code::None => after(tokenizer, code), +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt( +            |tokenizer, code| { +                tokenizer.enter(TokenType::LineEnding); +                tokenizer.consume(code); +                tokenizer.exit(TokenType::LineEnding); +                ( +                    State::Fn(Box::new(|tokenizer, code| { +                        close_before(tokenizer, info, code) +                    })), +                    None, +                ) +            }, +            |ok| { +                if ok { +                    Box::new(after) +                } else { +                    Box::new(|tokenizer, code| { +                        tokenizer.enter(TokenType::LineEnding); +                        tokenizer.consume(code); +                        tokenizer.exit(TokenType::LineEnding); +                        ( +                            State::Fn(Box::new(|tokenizer, code| { +                                content_start(tokenizer, clone, code) +                            })), +                            None, +                        ) +                    }) +                } +            }, +        )(tokenizer, code), +        _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), +    } +} + +/// Before a closing fence, before optional whitespace. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +/// |  ~~~ +/// ``` +fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    tokenizer.enter(TokenType::CodeFencedFence); +    tokenizer.attempt( +        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), +        |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)), +    )(tokenizer, code) +} + +/// In a closing fence, after optional whitespace, before sequence. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// |~~~ +/// +/// ~~~js +/// console.log('1') +///   |~~~ +/// ``` +fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    let tail = tokenizer.events.last(); +    let mut prefix = 0; +    let marker = if info.kind == Kind::GraveAccent { +        '`' +    } else { +        '~' +    }; + +    if let Some(event) = tail { +        if event.token_type == TokenType::Whitespace { +            let span = get_span(&tokenizer.events, tokenizer.events.len() - 1); +            prefix = span.end_index - span.start_index; +        } +    } + +    // To do: 4+ should be okay if code (indented) is turned off! +    if prefix >= TAB_SIZE { +        return (State::Nok, None); +    } + +    match code { +        Code::Char(char) if char == marker => { +            tokenizer.enter(TokenType::CodeFencedFenceSequence); +            close_sequence(tokenizer, info, code, 0) +        } +        _ => (State::Nok, None), +    } +} + +/// In the closing fence sequence. 
+/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~|~~ +/// ``` +fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult { +    let marker = if info.kind == Kind::GraveAccent { +        '`' +    } else { +        '~' +    }; + +    match code { +        Code::Char(char) if char == marker => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |tokenizer, code| { +                    close_sequence(tokenizer, info, code, size + 1) +                })), +                None, +            ) +        } +        _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => { +            tokenizer.exit(TokenType::CodeFencedFenceSequence); +            tokenizer.attempt( +                |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace), +                |_ok| Box::new(close_whitespace_after), +            )(tokenizer, code) +        } +        _ => (State::Nok, None), +    } +} + +/// After the closing fence sequence after optional whitespace. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~ | +/// ``` +fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::CodeFencedFence); +            (State::Ok, Some(vec![code])) +        } +        _ => (State::Nok, None), +    } +} + +/// Before code content, definitely not before a closing fence. 
+/// +/// ```markdown +/// ~~~js +/// |aa +/// ~~~ +/// ``` +fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            at_break(tokenizer, info, code) +        } +        Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => { +            tokenizer.enter(TokenType::Whitespace); +            content_prefix(tokenizer, info, 0, code) +        } +        _ => { +            tokenizer.enter(TokenType::CodeFlowChunk); +            content_continue(tokenizer, info, code) +        } +    } +} + +/// Before code content, in a prefix. +/// +/// ```markdown +///   ~~~js +///  | aa +///   ~~~ +/// ``` +fn content_prefix( +    tokenizer: &mut Tokenizer, +    info: Info, +    prefix: usize, +    code: Code, +) -> StateFnResult { +    match code { +        Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |tokenizer, code| { +                    content_prefix(tokenizer, info, prefix + 1, code) +                })), +                None, +            ) +        } +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::Whitespace); +            at_break(tokenizer, info, code) +        } +        _ => { +            tokenizer.exit(TokenType::Whitespace); +            tokenizer.enter(TokenType::CodeFlowChunk); +            content_continue(tokenizer, info, code) +        } +    } +} + +/// In code content. 
+/// +/// ```markdown +/// ~~~js +/// |ab +/// a|b +/// ab| +/// ~~~ +/// ``` +fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::CodeFlowChunk); +            at_break(tokenizer, info, code) +        } +        _ => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(|tokenizer, code| { +                    content_continue(tokenizer, info, code) +                })), +                None, +            ) +        } +    } +} + +/// After fenced code. +/// +/// ```markdown +/// ~~~js +/// console.log('1') +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.exit(TokenType::CodeFenced); +    (State::Ok, Some(vec![code])) +} diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs new file mode 100644 index 0000000..6bf089b --- /dev/null +++ b/src/construct/code_indented.rs @@ -0,0 +1,190 @@ +//! Code (indented) is a construct that occurs in the flow content type. +//! +//! It forms with the following BNF: +//! +//! ```bnf +//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line ) +//! +//! ; Restriction: at least one `code` must not be whitespace. +//! indented_filled_line ::= 4space_or_tab *code +//! blank_line ::= *space_or_tab +//! eol ::= '\r' | '\r\n' | '\n' +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Code (indented) relates to both the `<pre>` and the `<code>` elements in +//! HTML. +//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` +//! element*][html-code] in the HTML spec for more info. +//! +//! In markdown, it is also possible to use code (text) in the text content +//! type. +//! 
It is also possible to create code with the [code (fenced)][code-fenced] +//! construct. +//! That construct is more explicit, more similar to code (text), and has +//! support for specifying the programming language that the code is in, so it +//! is recommended to use that instead of indented code. +//! +//! ## References +//! +//! *   [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js) +//! *   [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks) +//! +//! [code-fenced]: crate::construct::code_fenced +//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element +//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! +//! <!-- To do: link `flow`, `code_text` --> + +use crate::constant::TAB_SIZE; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of code (indented). +/// +/// ```markdown +/// |    asd +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::VirtualSpace | Code::Char(' ' | '\t') => { +            tokenizer.enter(TokenType::CodeIndented); +            tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace); +            indent(tokenizer, code, 0) +        } +        _ => (State::Nok, None), +    } +} + +/// Inside the initial whitespace. +/// +/// ```markdown +///  |   asd +///   |  asd +///    | asd +///     |asd +/// ``` +/// +/// > **Parsing note**: it is not needed to check if this first line is a +/// > filled line (that it has a non-whitespace character), because blank lines +/// > are parsed already, so we never run into that. 
+fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { +    match code { +        _ if size == TAB_SIZE => { +            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); +            at_break(tokenizer, code) +        } +        Code::VirtualSpace | Code::Char(' ' | '\t') => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |tokenizer, code| { +                    indent(tokenizer, code, size + 1) +                })), +                None, +            ) +        } +        _ => (State::Nok, None), +    } +} + +/// At a break. +/// +/// ```markdown +///     |asd +///     asd| +/// ``` +fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => after(tokenizer, code), +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer +            .attempt(further_start, |ok| { +                Box::new(if ok { at_break } else { after }) +            })(tokenizer, code), +        _ => { +            tokenizer.enter(TokenType::CodeFlowChunk); +            content(tokenizer, code) +        } +    } +} + +/// Inside code content. +/// +/// ```markdown +///     |ab +///     a|b +///     ab| +/// ``` +fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::CodeFlowChunk); +            at_break(tokenizer, code) +        } +        _ => { +            tokenizer.consume(code); +            (State::Fn(Box::new(content)), None) +        } +    } +} + +/// After indented code. +/// +/// ```markdown +///     ab| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.exit(TokenType::CodeIndented); +    (State::Ok, Some(vec![code])) +} + +/// Right at a line ending, trying to parse another indent. 
+/// +/// ```markdown +///     ab| +///     cd +/// ``` +fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    // To do: `nok` if lazy line. +    match code { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.enter(TokenType::LineEnding); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::LineEnding); +            (State::Fn(Box::new(further_start)), None) +        } +        Code::VirtualSpace | Code::Char(' ' | '\t') => { +            tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace); +            further_indent(tokenizer, code, 0) +        } +        _ => (State::Nok, None), +    } +} + +/// Inside further whitespace. +/// +/// ```markdown +///     asd +///   |  asd +/// ``` +fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { +    match code { +        _ if size == TAB_SIZE => { +            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); +            (State::Ok, Some(vec![code])) +        } +        Code::VirtualSpace | Code::Char(' ' | '\t') => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |tokenizer, code| { +                    further_indent(tokenizer, code, size + 1) +                })), +                None, +            ) +        } +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace); +            further_start(tokenizer, code) +        } +        _ => (State::Nok, None), +    } +} diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs new file mode 100644 index 0000000..b3aef1b --- /dev/null +++ b/src/construct/heading_atx.rs @@ -0,0 +1,175 @@ +//! Heading (atx) is a construct that occurs in the flow content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! 
heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab +//! +//! code ::= . ; any unicode code point (other than line endings). +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! `CommonMark` introduced the requirement on whitespace existing after the +//! opening sequence and before text. +//! In older markdown versions, this was not required, and headings would form +//! without it. +//! +//! In markdown, it is also possible to create headings with the setext heading +//! construct. +//! The benefit of setext headings is that their text can include line endings. +//! However, their limit is that they cannot form `<h3>` through `<h6>` +//! headings. +//! Due to this limitation, it is recommended to use atx headings. +//! +//! > 🏛 **Background**: the word *setext* originates from a small markup +//! > language by Ian Feldman from 1991. +//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > The word *atx* originates from a tiny markup language by Aaron Swartz +//! > from 2002. +//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for +//! > more info. +//! +//! ## References +//! +//! *   [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js) +//! *   [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings) +//! +//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements +//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [atx]: http://www.aaronsw.com/2002/atx/ +//! +//! 
<!-- To do: link `flow`, `setext` --> + +use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Start of a heading (atx). +/// +/// ```markdown +/// |## alpha +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    if Code::Char('#') == code { +        tokenizer.enter(TokenType::AtxHeading); +        tokenizer.enter(TokenType::AtxHeadingSequence); +        sequence_open(tokenizer, code, 0) +    } else { +        (State::Nok, None) +    } +} + +/// In the opening sequence. +/// +/// ```markdown +/// #|# alpha +/// ``` +fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult { +    match code { +        Code::None +        | Code::CarriageReturnLineFeed +        | Code::VirtualSpace +        | Code::Char('\t' | '\n' | '\r' | ' ') +            if rank > 0 => +        { +            tokenizer.exit(TokenType::AtxHeadingSequence); +            at_break(tokenizer, code) +        } +        Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |tokenizer, code| { +                    sequence_open(tokenizer, code, rank + 1) +                })), +                None, +            ) +        } +        _ => (State::Nok, None), +    } +} + +/// After something but before something else. 
+/// +/// ```markdown +/// ## |alpha +/// ## alpha| bravo +/// ## alpha |bravo +/// ## alpha bravo|## +/// ## alpha bravo ##| +/// ``` +fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::AtxHeading); +            (State::Ok, Some(vec![code])) +        } +        Code::VirtualSpace | Code::Char('\t' | ' ') => { +            tokenizer.enter(TokenType::AtxHeadingWhitespace); +            whitespace(tokenizer, code) +        } +        Code::Char('#') => { +            tokenizer.enter(TokenType::AtxHeadingSequence); +            further_sequence(tokenizer, code) +        } +        Code::Char(_) => { +            tokenizer.enter(TokenType::AtxHeadingText); +            data(tokenizer, code) +        } +    } +} + +/// In a further sequence (after whitespace). +/// Could be normal “visible” hashes in the heading or a final sequence. +/// +/// ```markdown +/// ## alpha #|# +/// ``` +fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    if let Code::Char('#') = code { +        tokenizer.consume(code); +        (State::Fn(Box::new(further_sequence)), None) +    } else { +        tokenizer.exit(TokenType::AtxHeadingSequence); +        at_break(tokenizer, code) +    } +} + +/// In whitespace. +/// +/// ```markdown +/// ## alpha | bravo +/// ``` +fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::VirtualSpace | Code::Char('\t' | ' ') => { +            tokenizer.consume(code); +            (State::Fn(Box::new(whitespace)), None) +        } +        _ => { +            tokenizer.exit(TokenType::AtxHeadingWhitespace); +            at_break(tokenizer, code) +        } +    } +} + +/// In text. 
+/// 
+/// ```markdown
+/// ## al|pha
+/// ```
+fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
+            tokenizer.exit(TokenType::AtxHeadingText);
+            at_break(tokenizer, code)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(data)), None)
+        }
+    }
+}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
new file mode 100644
index 0000000..b7d5570
--- /dev/null
+++ b/src/construct/html_flow.rs
@@ -0,0 +1,1068 @@
+//! HTML (flow) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete
+//!
+//! ; Note: closing tag name needs to match opening tag name.
+//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ]
+//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ]
+//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ]
+//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ]
+//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ]
+//! basic ::= '<' [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ]
+//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional )
+//!
+//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive.
+//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive.
+//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>'
+//! closing_tag ::= '</' tag_name whitespace_optional '>'
+//! 
tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) +//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] +//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) +//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" )  "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`') +//! +//! whitespace ::= 1*space_or_tab +//! whitespace_optional ::= [ space_or_tab ] +//! line ::= code - eol +//! eol ::= '\r' | '\r\n' | '\n' +//! space_or_tab ::= ' ' | '\t' +//! ``` +//! +//! The grammar for HTML in markdown does not resemble the rules of parsing +//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML +//! spec][html-parsing]. +//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?) +//! attempt to parse an XML-like language. +//! By extension, another notable property of the grammar is that it can +//! result in invalid HTML, in that it allows things that wouldn’t work or +//! wouldn’t work well in HTML, such as mismatched tags. +//! +//! Because the **basic** and **complete** productions in the grammar form with +//! a tag, followed by more stuff, and stop at a blank line, it is possible to +//! interleave (a word for switching between languages) markdown and HTML +//! together, by placing the opening and closing tags on their own lines, +//! with blank lines between them and markdown. +//! For example: +//! +//! ```markdown +//! <div>This is a <code>div</code> but *this* is not emphasis.</div> +//! +//! <div> +//! +//! This is a paragraph in a `div` and *this* is emphasis. +//! +//! </div> +//! ``` +//! +//! The **complete** production of HTML (flow) is not allowed to interrupt +//! content. +//! That means that a blank line is needed between a paragraph and it. +//! However, HTML (text) has a similar production, which will typically kick-in +//! instead. +//! +//! 
The list of tag names allowed in the **raw** production are defined in +//! [`HTML_RAW_NAMES`][html_raw_names]. +//! This production exists because there are a few cases where markdown +//! *inside* some elements, and hence interleaving, does not make sense. +//! +//! The list of tag names allowed in the **basic** production are defined in +//! [`HTML_BLOCK_NAMES`][html_block_names]. +//! This production exists because there are a few cases where we can decide +//! early that something is going to be a flow (block) element instead of a +//! phrasing (inline) element. +//! We *can* interrupt and don’t have to care too much about it being +//! well-formed. +//! +//! ## References +//! +//! *   [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js) +//! *   [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks) +//! +//! [html_raw_names]: crate::constant::HTML_RAW_NAMES +//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES +//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +//! +//! <!-- To do: link stuff --> + +use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX}; +use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Kind of HTML (flow). +#[derive(Debug, Clone, PartialEq)] +enum Kind { +    /// Not yet known. +    Unknown, +    /// Symbol for `<script>` (condition 1). +    Raw, +    /// Symbol for `<!---->` (condition 2). +    Comment, +    /// Symbol for `<?php?>` (condition 3). +    Instruction, +    /// Symbol for `<!doctype>` (condition 4). +    Declaration, +    /// Symbol for `<![CDATA[]]>` (condition 5). +    Cdata, +    /// Symbol for `<div` (condition 6). +    Basic, +    /// Symbol for `<x>` (condition 7). 
+    Complete,
+}
+
+/// Type of quote, if we’re in an attribute, in complete (condition 7).
+#[derive(Debug, Clone, PartialEq)]
+enum QuoteKind {
+    /// Not in a quoted attribute.
+    None,
+    /// In a double quoted (`"`) attribute.
+    Double,
+    /// In a single quoted (`'`) attribute.
+    Single,
+}
+
+/// State needed to parse HTML (flow).
+#[derive(Debug, Clone)]
+struct Info {
+    /// Kind of HTML (flow).
+    kind: Kind,
+    /// Whether this is a start tag (`<` not followed by `/`).
+    start_tag: bool,
+    /// Used depending on `kind` to either collect all parsed characters, or to
+    /// store expected characters.
+    buffer: Vec<char>,
+    /// `index` into `buffer` when expecting certain characters.
+    index: usize,
+    /// Current quote, when in a double or single quoted attribute value.
+    quote: QuoteKind,
+}
+
+// To do: mark as concrete (block quotes or lists can’t “pierce” into HTML).
+
+/// Start of HTML (flow), before optional whitespace.
+/// 
+/// ```markdown
+/// |<x />
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.enter(TokenType::HtmlFlow);
+    tokenizer.enter(TokenType::HtmlFlowData);
+    tokenizer.attempt(
+        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+        |_ok| Box::new(before),
+    )(tokenizer, code)
+}
+
+/// After optional whitespace, before `<`. 
///
/// ```markdown
/// |<x />
/// ```
fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    if Code::Char('<') == code {
        tokenizer.consume(code);
        (
            State::Fn(Box::new(|tokenizer, code| {
                // Fresh `Info`: the kind is decided by what follows `<`.
                open(
                    tokenizer,
                    Info {
                        kind: Kind::Unknown,
                        start_tag: false,
                        buffer: vec![],
                        index: 0,
                        quote: QuoteKind::None,
                    },
                    code,
                )
            })),
            None,
        )
    } else {
        (State::Nok, None)
    }
}

/// After `<`, before a tag name or other stuff.
///
/// ```markdown
/// <|x />
/// <|!doctype />
/// <|!--xxx--/>
/// ```
fn open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('!') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    declaration_start(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::Char('/') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    tag_close_start(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::Char('?') => {
            // To do: life times.
            let mut clone = info;
            clone.kind = Kind::Instruction;
            tokenizer.consume(code);
            // While we’re in an instruction instead of a declaration, we’re on a `?`
            // right now, so we do need to search for `>`, similar to declarations.
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_declaration_inside(tokenizer, clone, code)
                })),
                None,
            )
        }
        Code::Char(char) if char.is_ascii_alphabetic() => {
            // To do: life times.
            let mut clone = info;
            clone.start_tag = true;
            tag_name(tokenizer, clone, code)
        }
        _ => (State::Nok, None),
    }
}

/// After `<!`, so inside a declaration, comment, or CDATA.
///
/// ```markdown
/// <!|doctype />
/// <!|--xxx--/>
/// <!|[CDATA[>&<]]>
/// ```
fn declaration_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('-') => {
            tokenizer.consume(code);
            let mut clone = info;
            clone.kind = Kind::Comment;
            (
                State::Fn(Box::new(|tokenizer, code| {
                    comment_open_inside(tokenizer, clone, code)
                })),
                None,
            )
        }
        Code::Char('[') => {
            tokenizer.consume(code);
            let mut clone = info;
            clone.kind = Kind::Cdata;
            // For CDATA the buffer holds the characters we still *expect*;
            // `cdata_open_inside` walks `index` through it.
            clone.buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
            clone.index = 0;
            (
                State::Fn(Box::new(|tokenizer, code| {
                    cdata_open_inside(tokenizer, clone, code)
                })),
                None,
            )
        }
        Code::Char(char) if char.is_ascii_alphabetic() => {
            tokenizer.consume(code);
            // To do: life times.
            let mut clone = info;
            clone.kind = Kind::Declaration;
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_declaration_inside(tokenizer, clone, code)
                })),
                None,
            )
        }
        _ => (State::Nok, None),
    }
}

/// After `<!-`, inside a comment, before another `-`.
///
/// ```markdown
/// <!-|-xxx--/>
/// ```
fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('-') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_declaration_inside(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => (State::Nok, None),
    }
}

/// After `<![`, inside CDATA, expecting `CDATA[`.
///
/// ```markdown
/// <![|CDATA[>&<]]>
/// <![CD|ATA[>&<]]>
/// <![CDA|TA[>&<]]>
/// <![CDAT|A[>&<]]>
/// <![CDATA|[>&<]]>
/// ```
fn cdata_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char(char) if char == info.buffer[info.index] => {
            let mut clone = info;
            clone.index += 1;
            tokenizer.consume(code);

            if clone.index == clone.buffer.len() {
                // All of `CDATA[` matched: switch to the generic continuation.
                clone.buffer.clear();
                (
                    State::Fn(Box::new(|tokenizer, code| {
                        continuation(tokenizer, clone, code)
                    })),
                    None,
                )
            } else {
                (
                    State::Fn(Box::new(|tokenizer, code| {
                        cdata_open_inside(tokenizer, clone, code)
                    })),
                    None,
                )
            }
        }
        _ => (State::Nok, None),
    }
}

/// After `</`, in a closing tag, before a tag name.
///
/// ```markdown
/// </|x>
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char(char) if char.is_ascii_alphabetic() => {
            tokenizer.consume(code);
            // To do: life times.
            let mut clone = info;
            clone.buffer.push(char);
            (
                State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))),
                None,
            )
        }
        _ => (State::Nok, None),
    }
}

/// In a tag name.
///
/// Here `info.buffer` collects the tag name seen so far; when the name ends,
/// it is compared (case-insensitively) against the known raw and basic tag
/// name lists to decide the kind.
///
/// ```markdown
/// <a|b>
/// </a|b>
/// ```
fn tag_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::None
        | Code::CarriageReturnLineFeed
        | Code::VirtualSpace
        | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
            let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
            let name = tag_name_buffer.as_str();
            let slash = if let Code::Char(char) = code {
                char == '/'
            } else {
                false
            };

            if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) {
                // To do: life times.
                let mut clone = info;
                clone.kind = Kind::Raw;
                clone.buffer.clear();
                continuation(tokenizer, clone, code)
            } else if HTML_BLOCK_NAMES.contains(&name) {
                // To do: life times.
                let mut clone = info;
                clone.kind = Kind::Basic;
                clone.buffer.clear();

                if slash {
                    tokenizer.consume(code);
                    (
                        State::Fn(Box::new(|tokenizer, code| {
                            basic_self_closing(tokenizer, clone, code)
                        })),
                        None,
                    )
                } else {
                    continuation(tokenizer, clone, code)
                }
            } else {
                // To do: life times.
                let mut clone = info;
                clone.kind = Kind::Complete;

                // To do: do not support complete HTML when interrupting.
                if clone.start_tag {
                    complete_attribute_name_before(tokenizer, clone, code)
                } else {
                    complete_closing_tag_after(tokenizer, clone, code)
                }
            }
        }
        Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
            tokenizer.consume(code);
            let mut clone = info;
            clone.buffer.push(char);
            (
                State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))),
                None,
            )
        }
        Code::Char(_) => (State::Nok, None),
    }
}

/// After a closing slash of a basic tag name.
///
/// ```markdown
/// <div/|>
/// ```
fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('>') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => (State::Nok, None),
    }
}

/// After a closing slash of a complete tag name.
///
/// ```markdown
/// <x/|>
/// </x/|>
/// ```
fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_closing_tag_after(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => complete_end(tokenizer, info, code),
    }
}

/// At a place where an attribute name would be valid.
///
/// At first, this state is used after a complete tag name, after whitespace,
/// where it expects optional attributes or the end of the tag.
/// It is also reused after attributes, when expecting more optional
/// attributes.
///
/// ```markdown
/// <x |/>
/// <x |:asd>
/// <x |_asd>
/// <x |asd>
/// <x | >
/// <x |>
/// ```
fn complete_attribute_name_before(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    match code {
        Code::Char('/') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_end(tokenizer, info, code)
                })),
                None,
            )
        }
        // Attribute names may start with `:`, `_`, or an ASCII letter.
        Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_name(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_name_before(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => complete_end(tokenizer, info, code),
    }
}

/// In an attribute name.
///
/// ```markdown
/// <x :|>
/// <x _|>
/// <x a|>
/// ```
fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char(char)
            if char == '-'
                || char == '.'
                || char == ':'
                || char == '_'
                || char.is_ascii_alphanumeric() =>
        {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_name(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => complete_attribute_name_after(tokenizer, info, code),
    }
}

/// After an attribute name, before an attribute initializer, the end of the
/// tag, or whitespace.
///
/// ```markdown
/// <x a|>
/// <x a|=b>
/// <x a|="c">
/// ```
fn complete_attribute_name_after(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    match code {
        Code::Char('=') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_value_before(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_name_after(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => complete_attribute_name_before(tokenizer, info, code),
    }
}

/// Before an unquoted, double quoted, or single quoted attribute value,
/// allowing whitespace.
///
/// ```markdown
/// <x a=|b>
/// <x a=|"c">
/// ```
fn complete_attribute_value_before(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    match code {
        // These characters can’t start any attribute value.
        Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
        Code::Char(char) if char == '"' || char == '\'' => {
            tokenizer.consume(code);
            // To do: life times.
            let mut clone = info;
            // Remember which quote opened the value so the matching one
            // closes it.
            clone.quote = if char == '"' {
                QuoteKind::Double
            } else {
                QuoteKind::Single
            };

            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_value_quoted(tokenizer, clone, code)
                })),
                None,
            )
        }
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_value_before(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => complete_attribute_value_unquoted(tokenizer, info, code),
    }
}

/// In a double or single quoted attribute value.
///
/// ```markdown
/// <x a="|">
/// <x a='|'>
/// ```
fn complete_attribute_value_quoted(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    // The closing marker must match the quote that opened the value.
    let marker = if info.quote == QuoteKind::Double {
        '"'
    } else {
        '\''
    };

    match code {
        // A quoted value can’t span a line ending or run to eof.
        Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None),
        Code::Char(char) if char == marker => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_value_quoted_after(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_value_quoted(tokenizer, info, code)
                })),
                None,
            )
        }
    }
}

/// In an unquoted attribute value.
///
/// ```markdown
/// <x a=b|c>
/// ```
fn complete_attribute_value_unquoted(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    match code {
        Code::None
        | Code::CarriageReturnLineFeed
        | Code::VirtualSpace
        | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
            complete_attribute_name_after(tokenizer, info, code)
        }
        Code::Char(_) => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_attribute_value_unquoted(tokenizer, info, code)
                })),
                None,
            )
        }
    }
}

/// After a double or single quoted attribute value, before whitespace or the
/// end of the tag.
///
/// ```markdown
/// <x a="b"|>
/// ```
fn complete_attribute_value_quoted_after(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    match code {
        // Note: `/` and `>` are not consumed here; they are handled by
        // `complete_attribute_name_before`.
        Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
            complete_attribute_name_before(tokenizer, info, code)
        }
        _ => (State::Nok, None),
    }
}

/// In certain circumstances of a complete tag where only an `>` is allowed.
///
/// ```markdown
/// <x a="b"|>
/// ```
fn complete_end(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('>') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_after(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => (State::Nok, None),
    }
}

/// After `>` in a complete tag.
///
/// Only whitespace may follow on the same line; anything else invalidates the
/// complete (condition 7) construct.
///
/// ```markdown
/// <x>|
/// ```
fn complete_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
            continuation(tokenizer, info, code)
        }
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    complete_after(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::Char(_) => (State::Nok, None),
    }
}

/// Inside continuation of any HTML kind.
///
/// ```markdown
/// <!--x|xx-->
/// ```
fn continuation(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('-') if info.kind == Kind::Comment => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_comment_inside(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::Char('<') if info.kind == Kind::Raw => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_raw_tag_open(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::Char('>') if info.kind == Kind::Declaration => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_close(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::Char('?') if info.kind == Kind::Instruction => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_declaration_inside(tokenizer, info, code)
                })),
                None,
            )
        }
        Code::Char(']') if info.kind == Kind::Cdata => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_character_data_inside(tokenizer, info, code)
                })),
                None,
            )
        }
        // Basic and complete HTML end at a blank line, so a line ending here
        // triggers a lookahead check.
        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
            if info.kind == Kind::Basic || info.kind == Kind::Complete =>
        {
            let clone = info;

            tokenizer.check(blank_line_before, |ok| {
                if ok {
                    Box::new(|tokenizer, code| continuation_close(tokenizer, clone, code))
                } else {
                    Box::new(|tokenizer, code| continuation_at_line_ending(tokenizer, clone, code))
                }
            })(tokenizer, code)
        }
        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
            continuation_at_line_ending(tokenizer, info, code)
        }
        _ => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation(tokenizer, info, code)
                })),
                None,
            )
        }
    }
}

/// In continuation, before an eol or eof.
///
/// ```markdown
/// <x>|
/// ```
fn continuation_at_line_ending(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    tokenizer.exit(TokenType::HtmlFlowData);
    html_continue_start(tokenizer, info, code)
}

/// In continuation, after an eol (or at eof, which closes the whole flow).
///
/// ```markdown
/// <x>|
/// asd
/// ```
fn html_continue_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::None => {
            tokenizer.exit(TokenType::HtmlFlow);
            (State::Ok, Some(vec![code]))
        }
        // To do: do not allow lazy lines.
        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
            tokenizer.enter(TokenType::LineEnding);
            tokenizer.consume(code);
            tokenizer.exit(TokenType::LineEnding);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    html_continue_start(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => {
            tokenizer.enter(TokenType::HtmlFlowData);
            continuation(tokenizer, info, code)
        }
    }
}

/// In comment continuation, after one `-`, expecting another.
///
/// ```markdown
/// <!--xxx-|->
/// ```
fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('-') if info.kind == Kind::Comment => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_declaration_inside(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => continuation(tokenizer, info, code),
    }
}

/// In raw continuation, after `<`, expecting a `/`.
///
/// ```markdown
/// <script>console.log(1)<|/script>
/// ```
fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('/') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_raw_end_tag(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => continuation(tokenizer, info, code),
    }
}

/// In raw continuation, after `</`, expecting or inside a raw tag name.
///
/// `info.buffer` collects the closing tag name (capped at
/// `HTML_RAW_SIZE_MAX`) so it can be compared against the raw tag names.
///
/// ```markdown
/// <script>console.log(1)</|script>
/// <script>console.log(1)</s|cript>
/// <script>console.log(1)</script|>
/// ```
fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::Char('>') => {
            let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
            // To do: life times.
            let mut clone = info;
            clone.buffer.clear();

            if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) {
                tokenizer.consume(code);
                (
                    State::Fn(Box::new(|tokenizer, code| {
                        continuation_close(tokenizer, clone, code)
                    })),
                    None,
                )
            } else {
                continuation(tokenizer, clone, code)
            }
        }
        Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => {
            tokenizer.consume(code);
            // To do: life times.
            let mut clone = info;
            clone.buffer.push(char);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_raw_end_tag(tokenizer, clone, code)
                })),
                None,
            )
        }
        _ => continuation(tokenizer, info, code),
    }
}

/// In cdata continuation, after `]`, expecting `]>`.
///
/// ```markdown
/// <![CDATA[>&<]|]>
/// ```
fn continuation_character_data_inside(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    match code {
        Code::Char(']') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_declaration_inside(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => continuation(tokenizer, info, code),
    }
}

/// In declaration or instruction continuation, waiting for `>` to close it.
///
/// ```markdown
/// <!--|>
/// <?ab?|>
/// <?|>
/// <!q|>
/// <!--ab--|>
/// <!--ab--|->
/// <!--ab---|>
/// <![CDATA[>&<]]|>
/// ```
fn continuation_declaration_inside(
    tokenizer: &mut Tokenizer,
    info: Info,
    code: Code,
) -> StateFnResult {
    match code {
        Code::Char('>') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_close(tokenizer, info, code)
                })),
                None,
            )
        }
        // Comments allow any number of dashes before the final `->`.
        Code::Char('-') if info.kind == Kind::Comment => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_declaration_inside(tokenizer, info, code)
                })),
                None,
            )
        }
        _ => continuation(tokenizer, info, code),
    }
}

/// In closed continuation: everything we get until the eol/eof is part of it.
///
/// ```markdown
/// <!doctype>|
/// ```
fn continuation_close(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
    match code {
        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
            tokenizer.exit(TokenType::HtmlFlowData);
            tokenizer.exit(TokenType::HtmlFlow);
            (State::Ok, Some(vec![code]))
        }
        _ => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    continuation_close(tokenizer, info, code)
                })),
                None,
            )
        }
    }
}

/// Before a line ending, expecting a blank line.
///
/// ```markdown
/// <div>|
///
/// ```
fn blank_line_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    tokenizer.enter(TokenType::LineEnding);
    tokenizer.consume(code);
    tokenizer.exit(TokenType::LineEnding);
    (State::Fn(Box::new(blank_line)), None)
}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
new file mode 100644
index 0000000..d671db6
--- /dev/null
+++ b/src/construct/mod.rs
@@ -0,0 +1,11 @@
//! Constructs found in markdown.

pub mod blank_line;
pub mod character_escape;
pub mod character_reference;
pub mod code_fenced;
pub mod code_indented;
pub mod heading_atx;
pub mod html_flow;
pub mod partial_whitespace;
pub mod thematic_break;
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
new file mode 100644
index 0000000..dd0d2b5
--- /dev/null
+++ b/src/construct/partial_whitespace.rs
@@ -0,0 +1,66 @@
//! A little helper to parse `space_or_tab`
//!
//! They’re formed with the following BNF:
//!
//! ```bnf
//! space_or_tab ::= 1*(' ' '\t')
//! ```
//!
//! Depending on where whitespace can occur, it can be optional (or not),
//! and present in the rendered result (or not).
//!
//! ## References
//!
//! *   [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js)
//!
//! <!-- To do: link stuff -->

use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};

// To do: should `token_type` be a `Some`, with `None` defaulting to something?
// To do: should `max: Some(usize)` be added?

/// Before whitespace.
///
/// The caller picks the `token_type` to use for the whole whitespace run.
///
/// ```markdown
/// alpha| bravo
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
    match code {
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            // To do: lifetimes.
            let clone = token_type.clone();
            tokenizer.enter(token_type);
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| inside(tokenizer, code, clone))),
                None,
            )
        }
        _ => (State::Nok, None),
    }
}

/// In whitespace.
///
/// ```markdown
/// alpha |bravo
/// alpha | bravo
/// ```
fn inside(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
    match code {
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(|tokenizer, code| {
                    inside(tokenizer, code, token_type)
                })),
                None,
            )
        }
        _ => {
            tokenizer.exit(token_type);
            (State::Ok, Some(vec![code]))
        }
    }
}
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
new file mode 100644
index 0000000..15ebac7
--- /dev/null
+++ b/src/construct/thematic_break.rs
@@ -0,0 +1,137 @@
//! Thematic breaks, sometimes called horizontal rules, are a construct that
//! occurs in the flow content type.
//!
//! They’re formed with the following BNF:
//!
//! ```bnf
//! ; Restriction: all markers must be identical.
//! ; Restriction: at least 3 markers must be used.
//! thematic_break ::= *space_or_tab 1*(1*marker *space_or_tab)
//!
//! space_or_tab ::= ' ' | '\t'
//! marker ::= '*' | '-' | '_'
//! ```
//!
//! Thematic breaks in markdown typically relate to the HTML element `<hr>`.
//! See [*§ 4.4.2 The `hr` element* in the HTML spec][html] for more info.
//!
//! It is recommended to use exactly three asterisks without whitespace when
//! writing markdown.
//! As using more than three markers has no effect other than wasting space,
//! it is recommended to use exactly three markers.
//! Thematic breaks formed with asterisks or dashes can interfere with lists
//! if there is whitespace between them: `* * *` and `- - -`.
//! For these reasons, it is recommended to not use spaces or tabs between the
//! markers.
//! Thematic breaks formed with dashes (without whitespace) can also form
//! setext headings.
//! As dashes and underscores frequently occur in natural language and URLs, it
//! is recommended to use asterisks for thematic breaks to distinguish from
//! such use.
//! Because asterisks can be used to form the most markdown constructs, using
//! them has the added benefit of making it easier to gloss over markdown: you
//! can look for asterisks to find syntax while not worrying about other
//! characters.
//!
//! ## References
//!
//! *   [`thematic-break.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/thematic-break.js)
//! *   [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks)
//!
//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element
//!
//! <!-- To do: link `flow` -->

use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};

/// Start of a thematic break.
///
/// The first marker fixes which of `*`, `-`, or `_` all further markers must
/// match.
///
/// ```markdown
/// |***
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    match code {
        Code::Char(char) if char == '*' || char == '-' || char == '_' => {
            tokenizer.enter(TokenType::ThematicBreak);
            at_break(tokenizer, code, char, 0)
        }
        _ => (State::Nok, None),
    }
}

/// After something but before something else.
///
/// ```markdown
/// |***
/// *| * *
/// * |* *
/// ```
fn at_break(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
    match code {
        Code::Char(char) if char == marker => {
            tokenizer.enter(TokenType::ThematicBreakSequence);
            sequence(tokenizer, code, marker, size)
        }
        Code::VirtualSpace | Code::Char('\t' | ' ') => {
            tokenizer.enter(TokenType::ThematicBreakWhitespace);
            whitespace(tokenizer, code, marker, size)
        }
        // The break is only valid at eol/eof with at least the minimum number
        // of markers.
        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
            if size >= THEMATIC_BREAK_MARKER_COUNT_MIN =>
        {
            tokenizer.exit(TokenType::ThematicBreak);
            (State::Ok, Some(vec![code]))
        }
        _ => (State::Nok, None),
    }
}

/// In a sequence of markers.
///
/// ```markdown
/// |***
/// *|**
/// **|*
/// ```
fn sequence(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
    match code {
        Code::Char(char) if char == marker => {
            tokenizer.consume(code);
            (
                State::Fn(Box::new(move |tokenizer, code| {
                    sequence(tokenizer, code, marker, size + 1)
                })),
                None,
            )
        }
        _ => {
            tokenizer.exit(TokenType::ThematicBreakSequence);
            at_break(tokenizer, code, marker, size)
        }
    }
}

/// In whitespace.
+/// +/// ```markdown +/// * |* * +/// * | * * +/// ``` +fn whitespace(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult { +    match code { +        Code::VirtualSpace | Code::Char('\t' | ' ') => { +            tokenizer.consume(code); +            ( +                State::Fn(Box::new(move |tokenizer, code| { +                    whitespace(tokenizer, code, marker, size) +                })), +                None, +            ) +        } +        _ => { +            tokenizer.exit(TokenType::ThematicBreakWhitespace); +            at_break(tokenizer, code, marker, size) +        } +    } +} diff --git a/src/content/flow.rs b/src/content/flow.rs new file mode 100644 index 0000000..21c5721 --- /dev/null +++ b/src/content/flow.rs @@ -0,0 +1,258 @@ +//! The flow content type. +//! +//! **Flow** represents the sections, such as headings, code, and content, which +//! is parsed per line. +//! An example is HTML, which has a certain starting condition (such as +//! `<script>` on its own line), then continues for a while, until an end +//! condition is found (such as `</style>`). +//! If that line with an end condition is never found, that flow goes until +//! the end. +//! +//! The constructs found in flow are: +//! +//! *   [Blank line][crate::construct::blank_line] +//! *   [Code (fenced)][crate::construct::code_fenced] +//! *   [Code (indented)][crate::construct::code_indented] +//! *   [Heading (atx)][crate::construct::heading_atx] +//! *   [HTML (flow)][crate::construct::html_flow] +//! *   [Thematic break][crate::construct::thematic_break] +//! +//! <!-- To do: `setext` in content? Link to content. 
--> + +use crate::construct::{ +    blank_line::start as blank_line, code_fenced::start as code_fenced, +    code_indented::start as code_indented, heading_atx::start as heading_atx, +    html_flow::start as html_flow, partial_whitespace::start as whitespace, +    thematic_break::start as thematic_break, +}; +use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer}; + +/// Turn `codes` as the flow content type into events. +// To do: remove this `allow` when all the content types are glued together. +#[allow(dead_code)] +pub fn flow(codes: Vec<Code>) -> Vec<Event> { +    let mut tokenizer = Tokenizer::new(); +    let (state, remainder) = tokenizer.feed(codes, Box::new(start), true); + +    if let Some(ref x) = remainder { +        if !x.is_empty() { +            unreachable!("expected no final remainder {:?}", x); +        } +    } + +    match state { +        State::Ok => {} +        _ => unreachable!("expected final state to be `State::Ok`"), +    } + +    tokenizer.events +} + +/// Before flow. +/// +/// First we assume a blank line. +// +/// ```markdown +/// | +/// |## alpha +/// |    bravo +/// |*** +/// ``` +fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => (State::Ok, None), +        _ => tokenizer.attempt(blank_line, |ok| { +            Box::new(if ok { blank_line_after } else { initial_before }) +        })(tokenizer, code), +    } +} + +/// After a blank line. +/// +/// Move to `start` afterwards. 
+/// +/// ```markdown +/// ␠␠| +/// ``` +fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => (State::Ok, None), +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.enter(TokenType::BlankLineEnding); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::BlankLineEnding); +            (State::Fn(Box::new(start)), None) +        } +        _ => unreachable!("expected eol/eof after blank line `{:?}`", code), +    } +} + +/// Before flow (initial). +/// +/// “Initial” flow means unprefixed flow, so right at the start of a line. +/// Interestingly, the only flow (initial) construct is indented code. +/// Move to `before` afterwards. +/// +/// ```markdown +/// |qwe +/// |    asd +/// ``` +fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => (State::Ok, None), +        _ => tokenizer.attempt(code_indented, |ok| { +            Box::new(if ok { +                after +            } else { +                initial_before_not_code_indented +            }) +        })(tokenizer, code), +    } +} + +/// After a flow construct. +/// +/// ```markdown +/// ## alpha| +/// | +/// ~~~js +/// asd +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => (State::Ok, None), +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.enter(TokenType::LineEnding); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::LineEnding); +            (State::Fn(Box::new(start)), None) +        } +        _ => unreachable!("unexpected non-eol/eof after flow `{:?}`", code), +    } +} + +/// Before flow (initial), but not at code (indented). 
+/// +/// ```markdown +/// |qwe +/// ``` +fn initial_before_not_code_indented(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => (State::Ok, None), +        _ => tokenizer.attempt(code_fenced, |ok| { +            Box::new(if ok { +                after +            } else { +                initial_before_not_code_fenced +            }) +        })(tokenizer, code), +    } +} + +/// Before flow (initial), but not at code (fenced). +/// +/// ```markdown +/// |qwe +/// ``` +fn initial_before_not_code_fenced(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => (State::Ok, None), +        _ => tokenizer.attempt(html_flow, |ok| Box::new(if ok { after } else { before }))( +            tokenizer, code, +        ), +    } +} + +/// Before flow, but not at code (indented) or code (fenced). +/// +/// Compared to flow (initial), normal flow can be arbitrarily prefixed. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.attempt( +        |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), +        |_ok| Box::new(before_after_prefix), +    )(tokenizer, code) +} + +/// Before flow, after potential whitespace. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.attempt(heading_atx, |ok| { +        Box::new(if ok { after } else { before_not_heading_atx }) +    })(tokenizer, code) +} + +/// Before flow, but not before a heading (atx) +/// +/// ```markdown +/// |qwe +/// ``` +pub fn before_not_heading_atx(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.attempt(thematic_break, |ok| { +        Box::new(if ok { after } else { before_not_thematic_break }) +    })(tokenizer, code) +} + +/// Before flow, but not before a heading (atx) or thematic break. 
+/// +/// ```markdown +/// |qwe +/// ``` +pub fn before_not_thematic_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.attempt(html_flow, |ok| { +        Box::new(if ok { after } else { content_before }) +    })(tokenizer, code) +} + +/// Before flow, but not before a heading (atx) or thematic break. +/// +/// At this point, we’re at content (zero or more definitions and zero or one +/// paragraph/setext heading). +/// +/// ```markdown +/// |qwe +/// ``` +// To do: currently only parses a single line. +// To do: +// - Multiline +// - One or more definitions. +// - Setext heading. +fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            after(tokenizer, code) +        } +        _ => { +            tokenizer.enter(TokenType::Content); +            tokenizer.enter(TokenType::ContentPhrasing); +            tokenizer.consume(code); +            (State::Fn(Box::new(content)), None) +        } +    } +} +/// In content. +/// +/// ```markdown +/// al|pha +/// ``` +// To do: lift limitations as documented above. +fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::Char('\n' | '\r') => { +            tokenizer.exit(TokenType::ContentPhrasing); +            tokenizer.exit(TokenType::Content); +            after(tokenizer, code) +        } +        _ => { +            tokenizer.consume(code); +            (State::Fn(Box::new(content)), None) +        } +    } +} diff --git a/src/content/mod.rs b/src/content/mod.rs new file mode 100644 index 0000000..d5771a3 --- /dev/null +++ b/src/content/mod.rs @@ -0,0 +1,4 @@ +//! Content types found in markdown. + +pub mod flow; +pub mod string; diff --git a/src/content/string.rs b/src/content/string.rs new file mode 100644 index 0000000..a8a81b2 --- /dev/null +++ b/src/content/string.rs @@ -0,0 +1,120 @@ +//! 
The string content type.
+//!
+//! **String** is a limited **text** like content type which only allows
+//! character escapes and character references.
+//! It exists in things such as identifiers (media references, definitions),
+//! titles, URLs, code (fenced) info and meta parts.
+//!
+//! The constructs found in string are:
+//!
+//! *   [Character escape][crate::construct::character_escape]
+//! *   [Character reference][crate::construct::character_reference]
+
+use crate::construct::{
+    character_escape::start as character_escape, character_reference::start as character_reference,
+};
+use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+
+/// Turn `codes` as the string content type into events.
+// To do: remove this `allow` when all the content types are glued together.
+#[allow(dead_code)]
+pub fn string(codes: Vec<Code>) -> Vec<Event> {
+    let mut tokenizer = Tokenizer::new();
+    let (state, remainder) = tokenizer.feed(codes, Box::new(before), true);
+
+    if let Some(ref x) = remainder {
+        if !x.is_empty() {
+            unreachable!("expected no final remainder {:?}", x);
+        }
+    }
+
+    match state {
+        State::Ok => {}
+        _ => unreachable!("expected final state to be `State::Ok`"),
+    }
+
+    tokenizer.events
+}
+
+/// Before string.
+///
+/// First we assume character reference.
+///
+/// ```markdown
+/// |&
+/// |\&
+/// |qwe
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => (State::Ok, None),
+        _ => tokenizer.attempt(character_reference, |ok| {
+            Box::new(if ok {
+                before
+            } else {
+                before_not_character_reference
+            })
+        })(tokenizer, code),
+    }
+}
+
+/// Before string, not at a character reference.
+///
+/// Assume character escape.
+/// +/// ```markdown +/// |\& +/// |qwe +/// ``` +fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => (State::Ok, None), +        _ => tokenizer.attempt(character_escape, |ok| { +            Box::new(if ok { +                before +            } else { +                before_not_character_escape +            }) +        })(tokenizer, code), +    } +} + +/// Before string, not at a character reference or character escape. +/// +/// We’re at data. +/// +/// ```markdown +/// |qwe +/// ``` +fn before_not_character_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    if let Code::None = code { +        (State::Ok, None) +    } else { +        tokenizer.enter(TokenType::Data); +        tokenizer.consume(code); +        (State::Fn(Box::new(in_data)), None) +    } +} + +/// In data. +/// +/// ```markdown +/// q|w|e +/// ``` +fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => { +            tokenizer.exit(TokenType::Data); +            (State::Ok, None) +        } +        // To do: somehow get these markers from constructs. +        Code::Char('&' | '\\') => { +            tokenizer.exit(TokenType::Data); +            before(tokenizer, code) +        } +        _ => { +            tokenizer.consume(code); +            (State::Fn(Box::new(in_data)), None) +        } +    } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..1624a22 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,52 @@ +//! Public API of micromark. +//! +//! This module exposes [`micromark`][] (and [`micromark_with_options`][]). +//! `micromark` is a safe way to transform (untrusted?) markdown into HTML. +//! `micromark_with_options` allows you to configure how markdown is turned into +//! HTML, such as by allowing dangerous HTML when you trust it. 
+mod compiler; +mod constant; +mod construct; +mod content; +mod parser; +mod tokenizer; +mod util; + +use crate::compiler::compile; +pub use crate::compiler::CompileOptions; +use crate::parser::parse; + +/// Turn markdown into HTML. +/// +/// ## Examples +/// +/// ```rust +/// use micromark::micromark; +/// +/// let result = micromark("# Hello, world!"); +/// +/// assert_eq!(result, "<h1>Hello, world!</h1>"); +/// ``` +#[must_use] +pub fn micromark(value: &str) -> String { +    micromark_with_options(value, &CompileOptions::default()) +} + +/// Turn markdown into HTML, with configuration. +/// +/// ## Examples +/// +/// ```rust +/// use micromark::{micromark_with_options, CompileOptions}; +/// +/// let result = micromark_with_options("<div>\n\n# Hello, world!\n\n</div>", &CompileOptions { +///     allow_dangerous_html: true, +/// }); +/// +/// assert_eq!(result, "<div>\n<h1>Hello, world!</h1>\n</div>"); +/// ``` +#[must_use] +pub fn micromark_with_options(value: &str, options: &CompileOptions) -> String { +    let (events, codes) = parse(value); +    compile(&events, &codes, options) +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..10c6e7a --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,14 @@ +//! Turn a string of markdown into events. +// To do: this should start with `containers`, when they’re done. +// To do: definitions and such will mean more data has to be passed around. +use crate::content::flow::flow; +use crate::tokenizer::{as_codes, Code, Event}; + +/// Turn a string of markdown into events. +/// Passes the codes back so the compiler can access the source. +pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) { +    let codes = as_codes(value); +    // To do: pass a reference to this around, and slices in the (back)feeding. Might be tough. 
+    let events = flow(codes.clone()); +    (events, codes) +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs new file mode 100644 index 0000000..c8b1440 --- /dev/null +++ b/src/tokenizer.rs @@ -0,0 +1,580 @@ +//! The tokenizer glues states from the state machine together. +//! +//! It facilitates everything needed to turn codes into tokens and events with +//! a state machine. +//! It also enables logic needed for parsing markdown, such as an [`attempt`][] +//! to parse something, which can succeed or, when unsuccessful, revert the +//! attempt. +//! Similarly, a [`check`][] exists, which does the same as an `attempt` but +//! reverts even if successful. +//! +//! [`attempt`]: Tokenizer::attempt +//! [`check`]: Tokenizer::check + +use crate::constant::TAB_SIZE; + +/// Semantic label of a span. +// To do: figure out how to share this so extensions can add their own stuff, +// though perhaps that’s impossible and we should inline all extensions? +// To do: document each variant. +#[derive(Debug, Clone, PartialEq)] +pub enum TokenType { +    AtxHeading, +    AtxHeadingSequence, +    AtxHeadingWhitespace, +    AtxHeadingText, + +    CharacterEscape, +    CharacterEscapeMarker, +    CharacterEscapeValue, + +    CharacterReference, +    CharacterReferenceMarker, +    CharacterReferenceMarkerNumeric, +    CharacterReferenceMarkerHexadecimal, +    CharacterReferenceMarkerSemi, +    CharacterReferenceValue, + +    CodeFenced, +    CodeFencedFence, +    CodeFencedFenceSequence, +    CodeFencedFenceWhitespace, +    CodeFencedFenceInfo, +    CodeFencedFenceMeta, + +    CodeIndented, +    CodeIndentedPrefixWhitespace, + +    CodeFlowChunk, + +    Data, + +    HtmlFlow, +    HtmlFlowData, + +    ThematicBreak, +    ThematicBreakSequence, +    ThematicBreakWhitespace, + +    Whitespace, +    LineEnding, +    BlankLineEnding, +    BlankLineWhitespace, + +    Content, +    ContentPhrasing, +    ChunkString, +} + +/// Enum representing a character code. 
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum Code {
+    /// End of the input stream (called eof).
+    None,
+    /// Used to make parsing line endings easier as it represents both
+    /// `Code::Char('\r')` and `Code::Char('\n')` combined.
+    CarriageReturnLineFeed,
+    /// The expansion of a tab (`Code::Char('\t')`), depending on where the tab
+    /// occurred, it’s followed by 0 to 3 (both inclusive) `Code::VirtualSpace`s.
+    VirtualSpace,
+    /// The most frequent variant of this enum is `Code::Char(char)`, which just
+    /// represents a char, but micromark adds meaning to certain other values.
+    Char(char),
+}
+
+/// A location in the document (`line`/`column`/`offset`).
+///
+/// The interface for the location in the document comes from unist `Point`:
+/// <https://github.com/syntax-tree/unist#point>.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Point {
+    /// 1-indexed line number.
+    pub line: usize,
+    /// 1-indexed column number.
+    /// Note that this increases up to a tab stop for tabs.
+    /// Some editors count tabs as 1 character, so this position is not always
+    /// the same as editors.
+    pub column: usize,
+    /// 0-indexed position in the document.
+    pub offset: usize,
+}
+
+/// Possible event types.
+#[derive(Debug, PartialEq)]
+pub enum EventType {
+    /// The start of something.
+    Enter,
+    /// The end of something.
+    Exit,
+}
+
+/// Something semantic happening somewhere.
+#[derive(Debug)]
+pub struct Event {
+    pub event_type: EventType,
+    pub token_type: TokenType,
+    pub point: Point,
+    pub index: usize,
+}
+
+/// The essence of the state machine is functions: `StateFn`.
+/// It’s responsible for dealing with that single passed [`Code`][].
+/// It yields a [`StateFnResult`][].
+pub type StateFn = dyn FnOnce(&mut Tokenizer, Code) -> StateFnResult;
+/// Each [`StateFn`][] yields something back: primarily the state.
+/// In certain cases, it can also yield back up parsed codes that were passed down. +pub type StateFnResult = (State, Option<Vec<Code>>); + +/// The result of a state. +pub enum State { +    /// There is a future state: a boxed [`StateFn`][] to pass the next code to. +    Fn(Box<StateFn>), +    /// The state is successful. +    Ok, +    /// The state is not successful. +    Nok, +} + +/// The internal state of a tokenizer, not to be confused with states from the +/// state machine, this instead is all the information about where we currently +/// are and what’s going on. +#[derive(Debug, Clone)] +struct InternalState { +    /// Length of `events`. We only add to events, so reverting will just pop stuff off. +    events_len: usize, +    /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt. +    stack_len: usize, +    /// Current code. +    current: Code, +    /// `index` in codes of the current code. +    index: usize, +    /// Current relative and absolute position in the file. +    point: Point, +} + +/// A tokenizer itself. +#[derive(Debug)] +pub struct Tokenizer { +    /// Track whether a character is expected to be consumed, and whether it’s +    /// actually consumed +    /// +    /// Tracked to make sure everything’s valid. +    consumed: bool, +    /// Semantic labels of one or more codes in `codes`. +    pub events: Vec<Event>, +    /// Hierarchy of semantic labels. +    /// +    /// Tracked to make sure everything’s valid. +    stack: Vec<TokenType>, +    /// Current character code. +    current: Code, +    /// `index` in codes of the current code. +    index: usize, +    /// Current relative and absolute place in the file. +    point: Point, +} + +impl Tokenizer { +    /// Create a new tokenizer. 
+    pub fn new() -> Tokenizer { +        Tokenizer { +            current: Code::None, +            index: 0, +            consumed: true, +            point: Point { +                line: 1, +                column: 1, +                offset: 0, +            }, +            stack: vec![], +            events: vec![], +        } +    } + +    /// Prepare for a next code to get consumed. +    fn expect(&mut self, code: Code) { +        assert!(self.consumed, "expected previous character to be consumed"); +        self.consumed = false; +        self.current = code; +    } + +    /// Consume the current character. +    /// Each [`StateFn`][] is expected to call this to signal that this code is +    /// used, or call a next `StateFn`. +    pub fn consume(&mut self, code: Code) { +        assert_eq!( +            code, self.current, +            "expected given code to equal expected code" +        ); +        log::debug!("consume: `{:?}` ({:?})", code, self.point); +        assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned"); + +        match code { +            Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +                self.point.line += 1; +                self.point.column = 1; +                self.point.offset += if code == Code::CarriageReturnLineFeed { +                    2 +                } else { +                    1 +                }; +                // To do: accountForPotentialSkip() +                log::debug!("position: after eol: `{:?}`", self.point); +            } +            Code::VirtualSpace => { +                // Empty. +            } +            _ => { +                self.point.column += 1; +                self.point.offset += 1; +            } +        } + +        self.index += 1; +        // Mark as consumed. +        self.consumed = true; +    } + +    /// Mark the start of a semantic label. 
+    pub fn enter(&mut self, token_type: TokenType) { +        log::debug!("enter `{:?}` ({:?})", token_type, self.point); +        let event = Event { +            event_type: EventType::Enter, +            token_type: token_type.clone(), +            point: self.point.clone(), +            index: self.index, +        }; + +        self.events.push(event); +        self.stack.push(token_type); +    } + +    /// Mark the end of a semantic label. +    pub fn exit(&mut self, token_type: TokenType) { +        let token_on_stack = self.stack.pop().expect("cannot close w/o open tokens"); + +        assert_eq!( +            token_on_stack, token_type, +            "expected exit TokenType to match current TokenType" +        ); + +        let ev = self.events.last().expect("cannot close w/o open event"); + +        let point = self.point.clone(); + +        assert!( +            token_on_stack != ev.token_type || ev.point != point, +            "expected non-empty TokenType" +        ); + +        log::debug!("exit `{:?}` ({:?})", token_type, self.point); +        let event = Event { +            event_type: EventType::Exit, +            token_type, +            point, +            index: self.index, +        }; + +        self.events.push(event); +    } + +    /// Capture the internal state. +    fn capture(&mut self) -> InternalState { +        InternalState { +            index: self.index, +            current: self.current, +            point: self.point.clone(), +            events_len: self.events.len(), +            stack_len: self.stack.len(), +        } +    } + +    /// Apply the internal state. 
+    fn free(&mut self, previous: InternalState) { +        self.index = previous.index; +        self.current = previous.current; +        self.point = previous.point; +        assert!( +            self.events.len() >= previous.events_len, +            "expected to restore less events than before" +        ); +        self.events.truncate(previous.events_len); +        assert!( +            self.stack.len() >= previous.stack_len, +            "expected to restore less stack items than before" +        ); +        self.stack.truncate(previous.stack_len); +    } + +    /// Check if `state` and its future states are successful or not. +    /// +    /// This captures the current state of the tokenizer, returns a wrapped +    /// state that captures all codes and feeds them to `state` and its future +    /// states until it yields [`State::Ok`][] or [`State::Nok`][]. +    /// It then applies the captured state, calls `done`, and feeds all +    /// captured codes to its future states. +    pub fn check( +        &mut self, +        state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, +        done: impl FnOnce(bool) -> Box<StateFn> + 'static, +    ) -> Box<StateFn> { +        let previous = self.capture(); + +        attempt_impl( +            state, +            vec![], +            |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| { +                let codes = result.0; +                tokenizer.free(previous); +                log::debug!( +                    "check: {:?}, codes: {:?}, at {:?}", +                    ok, +                    codes, +                    tokenizer.point +                ); +                let result = done(ok); +                tokenizer.feed(codes, result, false) +            }, +        ) +    } + +    /// Attempt to parse with `state` and its future states, reverting if +    /// unsuccessful. 
+    /// +    /// This captures the current state of the tokenizer, returns a wrapped +    /// state that captures all codes and feeds them to `state` and its future +    /// states until it yields [`State::Ok`][], at which point it calls `done` +    /// and yields its result. +    /// If instead [`State::Nok`][] was yielded, the captured state is applied, +    /// `done` is called, and all captured codes are fed to its future states. +    pub fn attempt( +        &mut self, +        state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, +        done: impl FnOnce(bool) -> Box<StateFn> + 'static, +    ) -> Box<StateFn> { +        let previous = self.capture(); + +        attempt_impl( +            state, +            vec![], +            |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| { +                let codes = if ok { +                    result.1 +                } else { +                    tokenizer.free(previous); +                    result.0 +                }; + +                log::debug!( +                    "attempt: {:?}, codes: {:?}, at {:?}", +                    ok, +                    codes, +                    tokenizer.point +                ); +                let result = done(ok); +                tokenizer.feed(codes, result, false) +            }, +        ) +    } + +    /// Feed a list of `codes` into `start`. +    /// +    /// This is set up to support repeatedly calling `feed`, and thus streaming +    /// markdown into the state machine, and normally pauses after feeding. +    /// When `done: true` is passed, the EOF is fed. 
+    pub fn feed( +        &mut self, +        codes: Vec<Code>, +        start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, +        drain: bool, +    ) -> StateFnResult { +        let mut codes = codes; +        let mut state = State::Fn(Box::new(start)); +        let mut index = 0; + +        self.consumed = true; + +        while index < codes.len() { +            let code = codes[index]; + +            match state { +                State::Nok | State::Ok => { +                    break; +                } +                State::Fn(func) => { +                    log::debug!("main: passing `{:?}`", code); +                    self.expect(code); +                    let (next, remainder) = check_statefn_result(func(self, code)); +                    state = next; +                    index = index + 1 +                        - (if let Some(ref x) = remainder { +                            x.len() +                        } else { +                            0 +                        }); +                } +            } +        } + +        // Yield to a higher loop if we shouldn’t feed EOFs. +        if !drain { +            return (state, Some(codes.split_off(index))); +        } + +        loop { +            // Feed EOF. +            match state { +                State::Ok | State::Nok => break, +                State::Fn(func) => { +                    let code = Code::None; +                    log::debug!("main: passing eof"); +                    self.expect(code); +                    let (next, remainder) = check_statefn_result(func(self, code)); + +                    if let Some(ref x) = remainder { +                        if !x.is_empty() { +                            // To do: handle? 
+                            unreachable!("drain:remainder {:?}", x); +                        } +                    } + +                    state = next; +                } +            } +        } + +        check_statefn_result((state, None)) +    } +} + +/// Internal utility to wrap states to also capture codes. +/// +/// Recurses into itself. +/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and  [`Tokenizer::check`][Tokenizer::check]. +fn attempt_impl( +    state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static, +    codes: Vec<Code>, +    done: impl FnOnce((Vec<Code>, Vec<Code>), bool, &mut Tokenizer) -> StateFnResult + 'static, +) -> Box<StateFn> { +    Box::new(|tokenizer, code| { +        let mut codes = codes; + +        let (next, remainder) = check_statefn_result(state(tokenizer, code)); + +        match code { +            Code::None => {} +            _ => { +                codes.push(code); +            } +        } + +        // To do: `remainder` must never be bigger than codes I guess? +        // To do: `remainder` probably has to be taken *from* `codes`, in a similar vain to the `Ok` handling below. +        match next { +            State::Ok => { +                let remaining = if let Some(x) = remainder { x } else { vec![] }; +                check_statefn_result(done((codes, remaining), true, tokenizer)) +            } +            State::Nok => check_statefn_result(done((codes, vec![]), false, tokenizer)), +            State::Fn(func) => { +                check_statefn_result((State::Fn(attempt_impl(func, codes, done)), None)) +            } +        } +    }) +} + +/// Turn a string into codes. +// To do: handle BOM at start? +pub fn as_codes(value: &str) -> Vec<Code> { +    let mut codes: Vec<Code> = vec![]; +    let mut at_carriage_return = false; +    let mut column = 1; + +    for char in value.chars() { +        // Send a CRLF. 
+        if at_carriage_return && '\n' == char { +            at_carriage_return = false; +            codes.push(Code::CarriageReturnLineFeed); +        } else { +            // Send the previous CR: we’re not at a next `\n`. +            if at_carriage_return { +                at_carriage_return = false; +                codes.push(Code::Char('\r')); +            } + +            match char { +                // Send a replacement character. +                '\0' => { +                    column += 1; +                    codes.push(Code::Char('�')); +                } +                // Send a tab and virtual spaces. +                '\t' => { +                    // To do: is this correct? +                    let virtual_spaces = TAB_SIZE - (column % TAB_SIZE); +                    println!("tabs, expand {:?}, {:?}", column, virtual_spaces); +                    codes.push(Code::Char(char)); +                    column += 1; +                    let mut index = 0; +                    while index < virtual_spaces { +                        codes.push(Code::VirtualSpace); +                        column += 1; +                        index += 1; +                    } +                } +                // Send an LF. +                '\n' => { +                    column = 1; +                    codes.push(Code::Char(char)); +                } +                // Don’t send anything yet. +                '\r' => { +                    column = 1; +                    at_carriage_return = true; +                } +                // Send the char. +                _ => { +                    column += 1; +                    codes.push(Code::Char(char)); +                } +            } +        }; +    } + +    // To do: handle a final CR? + +    codes +} + +/// Check a [`StateFnResult`][], make sure its valid (that there are no bugs), +/// and clean a final eof passed back in `remainder`. 
+fn check_statefn_result(result: StateFnResult) -> StateFnResult { +    let (state, mut remainder) = result; + +    match state { +        State::Nok | State::Fn(_) => { +            if let Some(ref x) = remainder { +                assert_eq!( +                    x.len(), +                    0, +                    "expected `None` to be passed back as remainder from `State::Nok`, `State::Fn`" +                ); +            } +        } +        State::Ok => {} +    } + +    // Remove an eof. +    // For convencience, feeding back an eof is allowed, but cleaned here. +    // Most states handle eof and eol in the same branch, and hence pass +    // all back. +    // This might not be needed, because if EOF is passed back, we’re at the EOF. +    // But they’re not supposed to be in codes, so here we remove them. +    if let Some(ref mut list) = remainder { +        if Some(&Code::None) == list.last() { +            list.pop(); +        } +    } + +    (state, remainder) +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..47359a3 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,241 @@ +//! Some utilities helpful when parsing and compiling markdown. + +use crate::constant::{CHARACTER_REFERENCE_NAMES, CHARACTER_REFERENCE_VALUES}; +use crate::tokenizer::{Code, Event, EventType}; + +/// Encode dangerous html characters. +/// +/// This ensures that certain characters which have special meaning in HTML are +/// dealt with. +/// Technically, we can skip `>` and `"` in many cases, but CM includes them. +/// +/// This behavior is not explained in prose in `CommonMark` but can be inferred +/// from the input/output test cases. 
/// ## Examples
///
/// ```rust ignore
/// use micromark::util::encode;
///
/// assert_eq!(encode("I <3 🦀"), "I &lt;3 🦀");
/// ```
///
/// ## References
///
/// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
pub fn encode(value: &str) -> String {
    // The replacement strings here were entity-garbled in the rendered dump
    // (`"&amp;"` shown as `"&"`, etc.), turning the function into a no-op;
    // restore the actual escape sequences.
    // `&` must be replaced first so the ampersands of entities inserted by
    // the later replacements are not escaped twice.
    value
        .replace('&', "&amp;")
        .replace('"', "&quot;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
}
/// ## References
///
/// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
pub fn decode_numeric_character_reference(value: &str, radix: u32) -> char {
    let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int");

    // Disallowed in HTML character data: C0 controls (except ASCII
    // whitespace), DEL and the C1 controls, lone surrogates, noncharacters,
    // and anything beyond the Unicode range.  Those all map to U+FFFD.
    let forbidden = code < 0x09
        || code == 0x0B
        || (0x0E..0x20).contains(&code)
        || (0x7F..0xA0).contains(&code)
        || (0xD800..0xE000).contains(&code)
        || (0xFDD0..0xFDF0).contains(&code)
        || matches!(code & 0xFFFF, 0xFFFE | 0xFFFF)
        || code > 0x0010_FFFF;

    if forbidden {
        '�'
    } else {
        char::from_u32(code).expect("expected valid `code`")
    }
}
+/// +/// ## Examples +/// +/// ```rust ignore +/// use micromark::util::decode_named_character_reference; +/// +/// assert_eq!(decode_named_character_reference("amp"), "&"); +/// assert_eq!(decode_named_character_reference("AElig"), "Æ"); +/// assert_eq!(decode_named_character_reference("aelig"), "æ"); +/// ``` +/// +/// ## Panics +/// +/// This function panics if a name not in [`CHARACTER_REFERENCE_NAMES`][] is +/// given. +/// It is expected that figuring out whether a name is allowed is handled in +/// the parser. +/// When `micromark` is used, this function never panics. +/// +/// ## References +/// +/// *   [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference) +/// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) +pub fn decode_named_character_reference(value: &str) -> String { +    let position = CHARACTER_REFERENCE_NAMES.iter().position(|&x| x == value); +    if let Some(index) = position { +        CHARACTER_REFERENCE_VALUES[index].to_string() +    } else { +        unreachable!("expected valid `name`") +    } +} + +/// A struct representing the span of an opening and closing event of a token. +#[derive(Debug)] +pub struct Span { +    // To do: probably needed in the future. +    // start: Point, +    /// Absolute offset (and `index` in `codes`) of where this span starts. +    pub start_index: usize, +    // To do: probably needed in the future. +    // end: Point, +    /// Absolute offset (and `index` in `codes`) of where this span ends. +    pub end_index: usize, +    // To do: probably needed in the future. +    // token_type: TokenType, +} + +/// Get a span from an event. +/// +/// Get the span of an `exit` event, by looking backwards through the events to +/// find the corresponding `enter` event. +/// This assumes that tokens with the same are not nested. 
+/// +/// ## Panics +/// +/// This function panics if an enter event is given. +/// When `micromark` is used, this function never panics. +pub fn get_span(events: &[Event], index: usize) -> Span { +    let exit = &events[index]; +    // let end = exit.point.clone(); +    let end_index = exit.index; +    let token_type = exit.token_type.clone(); +    // To do: support `enter` events if needed and walk forwards? +    assert_eq!( +        exit.event_type, +        EventType::Exit, +        "expected get_span to be called on `exit` event" +    ); +    let mut start_index = index - 1; + +    loop { +        let enter = &events[start_index]; +        if enter.event_type == EventType::Enter && enter.token_type == token_type { +            return Span { +                // start: enter.point.clone(), +                start_index: enter.index, +                // end, +                end_index, +                // token_type, +            }; +        } + +        start_index -= 1; +    } +} + +/// Serialize a span, optionally expanding tabs. +pub fn slice_serialize(codes: &[Code], span: &Span, expand_tabs: bool) -> String { +    serialize_chunks(slice_codes(codes, span), expand_tabs) +} + +/// Get a slice of codes from a span. +pub fn slice_codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] { +    &codes[span.start_index..span.end_index] +} + +/// Serialize a slice of codes, optionally expanding tabs. 
/// Serialize a slice of codes, optionally expanding tabs.
pub fn serialize_chunks(codes: &[Code], expand_tabs: bool) -> String {
    // Build directly into a `String` instead of a `Vec<char>` that is
    // collected at the end: same output, one fewer intermediate allocation.
    let mut value = String::with_capacity(codes.len());
    // Whether the previous code was a tab (or a virtual space swallowed
    // right after one): when tabs are *not* expanded, the virtual spaces
    // that follow a tab are skipped, because the tab character itself
    // already covers those columns.
    let mut at_tab = false;

    for &code in codes {
        match code {
            Code::CarriageReturnLineFeed => {
                value.push_str("\r\n");
                at_tab = false;
            }
            Code::Char('\t') => {
                value.push(if expand_tabs { ' ' } else { '\t' });
                at_tab = true;
            }
            Code::VirtualSpace => {
                if expand_tabs || !at_tab {
                    value.push(' ');
                    at_tab = false;
                }
                // Otherwise: swallow the virtual space and stay “at” the tab
                // so any further virtual spaces are swallowed too.
            }
            Code::Char(char) => {
                value.push(char);
                at_tab = false;
            }
            Code::None => {
                unreachable!("unexpected EOF code in codes");
            }
        }
    }

    value
}

diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
new file mode 100644
index 0000000..46fa9cb
--- /dev/null
+++ b/tests/code_fenced.rs
@@ -0,0 +1,266 @@
extern crate micromark;
use micromark::micromark;

#[test]
fn code_fenced() {
    assert_eq!(
        micromark("```\n<\n >\n```"),
        "<pre><code>&lt;\n &gt;\n</code></pre>",
        "should support fenced code w/ grave accents"
    );

    assert_eq!(
        micromark("~~~\n<\n >\n~~~"),
        "<pre><code>&lt;\n &gt;\n</code></pre>",
        "should support fenced code w/ tildes"
    );

    // To do: code (text).
+    // assert_eq!( +    //     micromark("``\nfoo\n``"), +    //     "<p><code>foo</code></p>", +    //     "should not support fenced code w/ less than three markers" +    // ); + +    assert_eq!( +        micromark("```\naaa\n~~~\n```"), +        "<pre><code>aaa\n~~~\n</code></pre>", +        "should not support a tilde closing sequence for a grave accent opening sequence" +    ); + +    assert_eq!( +        micromark("~~~\naaa\n```\n~~~"), +        "<pre><code>aaa\n```\n</code></pre>", +        "should not support a grave accent closing sequence for a tilde opening sequence" +    ); + +    assert_eq!( +        micromark("````\naaa\n```\n``````"), +        "<pre><code>aaa\n```\n</code></pre>", +        "should support a closing sequence longer, but not shorter than, the opening" +    ); + +    assert_eq!( +        micromark("~~~~\naaa\n~~~\n~~~~"), +        "<pre><code>aaa\n~~~\n</code></pre>", +        "should support a closing sequence equal to, but not shorter than, the opening" +    ); + +    assert_eq!( +        micromark("```"), +        "<pre><code></code></pre>\n", +        "should support an eof right after an opening sequence" +    ); + +    assert_eq!( +        micromark("`````\n\n```\naaa\n"), +        "<pre><code>\n```\naaa\n</code></pre>\n", +        "should support an eof somewhere in content" +    ); + +    // To do: blockquote. 
+    // assert_eq!( +    //     micromark("> ```\n> aaa\n\nbbb"), +    //     "<blockquote>\n<pre><code>aaa\n</code></pre>\n</blockquote>\n<p>bbb</p>", +    //     "should support no closing sequence in a block quote" +    // ); + +    assert_eq!( +        micromark("```\n\n  \n```"), +        "<pre><code>\n  \n</code></pre>", +        "should support blank lines in fenced code" +    ); + +    assert_eq!( +        micromark("```\n```"), +        "<pre><code></code></pre>", +        "should support empty fenced code" +    ); + +    assert_eq!( +      micromark(" ```\n aaa\naaa\n```"), +      "<pre><code>aaa\naaa\n</code></pre>", +      "should remove up to one space from the content if the opening sequence is indented w/ 1 space" +    ); + +    assert_eq!( +      micromark("  ```\naaa\n  aaa\naaa\n  ```"), +      "<pre><code>aaa\naaa\naaa\n</code></pre>", +      "should remove up to two space from the content if the opening sequence is indented w/ 2 spaces" +    ); + +    assert_eq!( +      micromark("   ```\n   aaa\n    aaa\n  aaa\n   ```"), +      "<pre><code>aaa\n aaa\naaa\n</code></pre>", +      "should remove up to three space from the content if the opening sequence is indented w/ 3 spaces" +    ); + +    assert_eq!( +        micromark("    ```\n    aaa\n    ```"), +        "<pre><code>```\naaa\n```\n</code></pre>", +        "should not support indenteding the opening sequence w/ 4 spaces" +    ); + +    assert_eq!( +        micromark("```\naaa\n  ```"), +        "<pre><code>aaa\n</code></pre>", +        "should support an indented closing sequence" +    ); + +    assert_eq!( +        micromark("   ```\naaa\n  ```"), +        "<pre><code>aaa\n</code></pre>", +        "should support a differently indented closing sequence than the opening sequence" +    ); + +    assert_eq!( +        micromark("```\naaa\n    ```\n"), +        "<pre><code>aaa\n    ```\n</code></pre>\n", +        "should not support an indented closing sequence w/ 4 spaces" +    ); + +    // To 
do: code (text). +    // assert_eq!( +    //     micromark("``` ```\naaa"), +    //     "<p><code> </code>\naaa</p>", +    //     "should not support grave accents in the opening fence after the opening sequence" +    // ); + +    assert_eq!( +        micromark("~~~~~~\naaa\n~~~ ~~\n"), +        "<pre><code>aaa\n~~~ ~~\n</code></pre>\n", +        "should not support spaces in the closing sequence" +    ); + +    assert_eq!( +        micromark("foo\n```\nbar\n```\nbaz"), +        "<p>foo</p>\n<pre><code>bar\n</code></pre>\n<p>baz</p>", +        "should support interrupting paragraphs" +    ); + +    // To do: setext. +    // assert_eq!( +    //     micromark("foo\n---\n~~~\nbar\n~~~\n# baz"), +    //     "<h2>foo</h2>\n<pre><code>bar\n</code></pre>\n<h1>baz</h1>", +    //     "should support interrupting other content" +    // ); + +    assert_eq!( +        micromark("```ruby\ndef foo(x)\n  return 3\nend\n```"), +        "<pre><code class=\"language-ruby\">def foo(x)\n  return 3\nend\n</code></pre>", +        "should support the info string as a `language-` class (1)" +    ); + +    assert_eq!( +        micromark("````;\n````"), +        "<pre><code class=\"language-;\"></code></pre>", +        "should support the info string as a `language-` class (2)" +    ); + +    assert_eq!( +        micromark("~~~~    ruby startline=3 $%@#$\ndef foo(x)\n  return 3\nend\n~~~~~~~"), +        "<pre><code class=\"language-ruby\">def foo(x)\n  return 3\nend\n</code></pre>", +        "should support the info string as a `language-` class, but not the meta string" +    ); + +    // To do: code (text). 
+    // assert_eq!( +    //     micromark("``` aa ```\nfoo"), +    //     "<p><code>aa</code>\nfoo</p>", +    //     "should not support grave accents in the meta string" +    // ); + +    assert_eq!( +        micromark("~~~ aa ``` ~~~\nfoo\n~~~"), +        "<pre><code class=\"language-aa\">foo\n</code></pre>", +        "should support grave accents and tildes in the meta string of tilde fenced code" +    ); + +    assert_eq!( +        micromark("```\n``` aaa\n```"), +        "<pre><code>``` aaa\n</code></pre>", +        "should not support info string on closing sequences" +    ); + +    // Our own: +    assert_eq!( +        micromark("```  "), +        "<pre><code></code></pre>\n", +        "should support an eof after whitespace, after the start fence sequence" +    ); + +    assert_eq!( +        micromark("```  js\nalert(1)\n```"), +        "<pre><code class=\"language-js\">alert(1)\n</code></pre>", +        "should support whitespace between the sequence and the info string" +    ); + +    assert_eq!( +        micromark("```js"), +        "<pre><code class=\"language-js\"></code></pre>\n", +        "should support an eof after the info string" +    ); + +    assert_eq!( +        micromark("```  js \nalert(1)\n```"), +        "<pre><code class=\"language-js\">alert(1)\n</code></pre>", +        "should support whitespace after the info string" +    ); + +    assert_eq!( +        micromark("```\n  "), +        "<pre><code>  \n</code></pre>\n", +        "should support an eof after whitespace in content" +    ); + +    assert_eq!( +        micromark("  ```\n "), +        "<pre><code></code></pre>\n", +        "should support an eof in the prefix, in content" +    ); + +    // To do: strings. 
+    // assert_eq!( +    //     micromark("```j\\+s©"), +    //     "<pre><code class=\"language-j+s©\"></code></pre>\n", +    //     "should support character escapes and character references in info strings" +    // ); + +    assert_eq!( +      micromark("   ```\naaa\n    ```"), +      "<pre><code>aaa\n ```\n</code></pre>\n", +      "should not support a closing sequence w/ too much indent, regardless of opening sequence (1)" +    ); + +    // To do: blockquote. +    //     assert_eq!( +    //   micromark("> ```\n>\n>\n>\n\na"), +    //   "<blockquote>\n<pre><code>\n\n\n</code></pre>\n</blockquote>\n<p>a</p>", +    //   "should not support a closing sequence w/ too much indent, regardless of opening sequence (2)" +    // ); + +    //     assert_eq!( +    //         micromark("> ```a\nb"), +    //         "<blockquote>\n<pre><code class=\"language-a\"></code></pre>\n</blockquote>\n<p>b</p>", +    //         "should not support lazyness (1)" +    //     ); + +    //     assert_eq!( +    //         micromark("> a\n```b"), +    //         "<blockquote>\n<p>a</p>\n</blockquote>\n<pre><code class=\"language-b\"></code></pre>\n", +    //         "should not support lazyness (2)" +    //     ); + +    //     assert_eq!( +    //   micromark("> ```a\n```"), +    //   "<blockquote>\n<pre><code class=\"language-a\"></code></pre>\n</blockquote>\n<pre><code></code></pre>\n", +    //   "should not support lazyness (3)" +    // ); + +    // To do: extensions. 
+    // assert_eq!( +    //   micromark("```", {extensions: [{disable: {null: ["codeFenced"]}}]}), +    //   "<p>```</p>", +    //   "should support turning off code (fenced)" +    // ); +} diff --git a/tests/code_indented.rs b/tests/code_indented.rs new file mode 100644 index 0000000..f5926c0 --- /dev/null +++ b/tests/code_indented.rs @@ -0,0 +1,196 @@ +extern crate micromark; +use micromark::micromark; + +#[test] +fn code_indented() { +    assert_eq!( +        micromark("    a simple\n      indented code block"), +        "<pre><code>a simple\n  indented code block\n</code></pre>", +        "should support indented code" +    ); + +    // To do: list. +    // assert_eq!( +    //     micromark("  - foo\n\n    bar"), +    //     "<ul>\n<li>\n<p>foo</p>\n<p>bar</p>\n</li>\n</ul>", +    //     "should prefer list item content over indented code (1)" +    // ); + +    //     assert_eq!( +    //         micromark("1.  foo\n\n    - bar"), +    //         "<ol>\n<li>\n<p>foo</p>\n<ul>\n<li>bar</li>\n</ul>\n</li>\n</ol>", +    //         "should prefer list item content over indented code (2)" +    //     ); + +    assert_eq!( +        micromark("    <a/>\n    *hi*\n\n    - one"), +        "<pre><code><a/>\n*hi*\n\n- one\n</code></pre>", +        "should support blank lines in indented code (1)" +    ); + +    assert_eq!( +        micromark("    chunk1\n\n    chunk2\n  \n \n \n    chunk3"), +        "<pre><code>chunk1\n\nchunk2\n\n\n\nchunk3\n</code></pre>", +        "should support blank lines in indented code (2)" +    ); + +    assert_eq!( +        micromark("    chunk1\n      \n      chunk2"), +        "<pre><code>chunk1\n  \n  chunk2\n</code></pre>", +        "should support blank lines in indented code (3)" +    ); + +    // To do: paragraphs. +    // assert_eq!( +    //     micromark("Foo\n    bar"), +    //     "<p>Foo\nbar</p>", +    //     "should not support interrupting paragraphs" +    // ); + +    // To do: paragraphs. 
+    // assert_eq!( +    //     micromark("    foo\nbar"), +    //     "<pre><code>foo\n</code></pre>\n<p>bar</p>", +    //     "should support paragraphs directly after indented code" +    // ); + +    // To do: setext. +    // assert_eq!( +    //   micromark("# Heading\n    foo\nHeading\n------\n    foo\n----"), +    //   "<h1>Heading</h1>\n<pre><code>foo\n</code></pre>\n<h2>Heading</h2>\n<pre><code>foo\n</code></pre>\n<hr />", +    //   "should mix w/ other content" +    // ); + +    assert_eq!( +        micromark("        foo\n    bar"), +        "<pre><code>    foo\nbar\n</code></pre>", +        "should support extra whitespace on the first line" +    ); + +    assert_eq!( +        micromark("\n    \n    foo\n    "), +        "<pre><code>foo\n</code></pre>", +        "should not support initial blank lines" +    ); + +    assert_eq!( +        micromark("    foo  "), +        "<pre><code>foo  \n</code></pre>", +        "should support trailing whitespace" +    ); + +    // To do: blockquote. 
+    //     assert_eq!( +    //         micromark(">     a\nb"), +    //         "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<p>b</p>", +    //         "should not support lazyness (1)" +    //     ); + +    //     assert_eq!( +    //         micromark("> a\n    b"), +    //         "<blockquote>\n<p>a\nb</p>\n</blockquote>", +    //         "should not support lazyness (2)" +    //     ); + +    //     assert_eq!( +    //         micromark("> a\n     b"), +    //         "<blockquote>\n<p>a\nb</p>\n</blockquote>", +    //         "should not support lazyness (3)" +    //     ); + +    //     assert_eq!( +    //         micromark("> a\n      b"), +    //         "<blockquote>\n<p>a\nb</p>\n</blockquote>", +    //         "should not support lazyness (4)" +    //     ); + +    //     assert_eq!( +    //         micromark(">     a\n    b"), +    //         "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code>b\n</code></pre>", +    //         "should not support lazyness (5)" +    //     ); + +    //     assert_eq!( +    //         micromark(">     a\n     b"), +    //         "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code> b\n</code></pre>", +    //         "should not support lazyness (6)" +    //     ); + +    //     assert_eq!( +    //         micromark(">     a\n      b"), +    //         "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code>  b\n</code></pre>", +    //         "should not support lazyness (7)" +    //     ); + +    // To do: extensions. 
+    // assert_eq!( +    //   micromark("   a", {extensions: [{disable: {null: ["codeIndented"]}}]}), +    //   "<p>a</p>", +    //   "should support turning off code (indented, 1)" +    // ); + +    // assert_eq!( +    //   micromark("> a\n    b", { +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<blockquote>\n<p>a\nb</p>\n</blockquote>", +    //   "should support turning off code (indented, 2)" +    // ); + +    // assert_eq!( +    //   micromark("- a\n    b", { +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<ul>\n<li>a\nb</li>\n</ul>", +    //   "should support turning off code (indented, 3)" +    // ); + +    // assert_eq!( +    //   micromark("- a\n    - b", { +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<ul>\n<li>a\n<ul>\n<li>b</li>\n</ul>\n</li>\n</ul>", +    //   "should support turning off code (indented, 4)" +    // ); + +    // assert_eq!( +    //   micromark("- a\n    - b", { +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<ul>\n<li>a\n<ul>\n<li>b</li>\n</ul>\n</li>\n</ul>", +    //   "should support turning off code (indented, 5)" +    // ); + +    // assert_eq!( +    //   micromark("```\na\n    ```", { +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<pre><code>a\n</code></pre>", +    //   "should support turning off code (indented, 6)" +    // ); + +    // assert_eq!( +    //   micromark("a <?\n    ?>", { +    //     allowDangerousHtml: true, +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<p>a <?\n?></p>", +    //   "should support turning off code (indented, 7)" +    // ); + +    // assert_eq!( +    //   micromark("- Foo\n---", { +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<ul>\n<li>Foo</li>\n</ul>\n<hr />", +    //   "should support turning off code (indented, 8)" +    // 
); + +    // assert_eq!( +    //   micromark("- Foo\n     ---", { +    //     extensions: [{disable: {null: ["codeIndented"]}}] +    //   }), +    //   "<ul>\n<li>\n<h2>Foo</h2>\n</li>\n</ul>", +    //   "should support turning off code (indented, 9)" +    // ); +} diff --git a/tests/heading_atx.rs b/tests/heading_atx.rs new file mode 100644 index 0000000..b75d058 --- /dev/null +++ b/tests/heading_atx.rs @@ -0,0 +1,208 @@ +extern crate micromark; +use micromark::micromark; +#[test] +fn heading_atx() { +    assert_eq!( +        micromark("# foo"), +        "<h1>foo</h1>", +        "should support a heading w/ rank 1" +    ); + +    assert_eq!( +        micromark("## foo"), +        "<h2>foo</h2>", +        "should support a heading w/ rank 2" +    ); + +    assert_eq!( +        micromark("### foo"), +        "<h3>foo</h3>", +        "should support a heading w/ rank 3" +    ); + +    assert_eq!( +        micromark("#### foo"), +        "<h4>foo</h4>", +        "should support a heading w/ rank 4" +    ); + +    assert_eq!( +        micromark("##### foo"), +        "<h5>foo</h5>", +        "should support a heading w/ rank 5" +    ); + +    assert_eq!( +        micromark("###### foo"), +        "<h6>foo</h6>", +        "should support a heading w/ rank 6" +    ); + +    assert_eq!( +        micromark("####### foo"), +        "<p>####### foo</p>", +        "should not support a heading w/ rank 7" +    ); + +    assert_eq!( +        micromark("#5 bolt"), +        "<p>#5 bolt</p>", +        "should not support a heading for a number sign not followed by whitespace (1)" +    ); + +    assert_eq!( +        micromark("#hashtag"), +        "<p>#hashtag</p>", +        "should not support a heading for a number sign not followed by whitespace (2)" +    ); + +    // To do: phrasing. 
+    // assert_eq!( +    //     micromark("\\## foo"), +    //     "<p>## foo</p>", +    //     "should not support a heading for an escaped number sign" +    // ); + +    // assert_eq!( +    //     micromark("# foo *bar* \\*baz\\*"), +    //     "<h1>foo <em>bar</em> *baz*</h1>", +    //     "should support text content in headings" +    // ); + +    assert_eq!( +        micromark("#                  foo                     "), +        "<h1>foo</h1>", +        "should support arbitrary initial and final whitespace" +    ); + +    assert_eq!( +        micromark(" ### foo"), +        "<h3>foo</h3>", +        "should support an initial space" +    ); + +    assert_eq!( +        micromark("  ## foo"), +        "<h2>foo</h2>", +        "should support two initial spaces" +    ); + +    assert_eq!( +        micromark("   # foo"), +        "<h1>foo</h1>", +        "should support three initial spaces" +    ); + +    assert_eq!( +        micromark("    # foo"), +        "<pre><code># foo\n</code></pre>", +        "should not support four initial spaces" +    ); + +    // To do: lazy. 
+    // assert_eq!( +    //     micromark("foo\n    # bar"), +    //     "<p>foo\n# bar</p>", +    //     "should not support four initial spaces when interrupting" +    // ); + +    assert_eq!( +        micromark("## foo ##"), +        "<h2>foo</h2>", +        "should support a closing sequence (1)" +    ); + +    assert_eq!( +        micromark("  ###   bar    ###"), +        "<h3>bar</h3>", +        "should support a closing sequence (2)" +    ); + +    assert_eq!( +        micromark("# foo ##################################"), +        "<h1>foo</h1>", +        "should support a closing sequence w/ an arbitrary number of number signs (1)" +    ); + +    assert_eq!( +        micromark("##### foo ##"), +        "<h5>foo</h5>", +        "should support a closing sequence w/ an arbitrary number of number signs (2)" +    ); + +    assert_eq!( +        micromark("### foo ###     "), +        "<h3>foo</h3>", +        "should support trailing whitespace after a closing sequence" +    ); + +    assert_eq!( +        micromark("### foo ### b"), +        "<h3>foo ### b</h3>", +        "should not support other content after a closing sequence" +    ); + +    assert_eq!( +        micromark("# foo#"), +        "<h1>foo#</h1>", +        "should not support a closing sequence w/o whitespace before it" +    ); + +    // Phrasing. 
+    // assert_eq!( +    //     micromark("### foo \\###"), +    //     "<h3>foo ###</h3>", +    //     "should not support an “escaped” closing sequence (1)" +    // ); + +    // assert_eq!( +    //     micromark("## foo #\\##"), +    //     "<h2>foo ###</h2>", +    //     "should not support an “escaped” closing sequence (2)" +    // ); + +    // assert_eq!( +    //     micromark("# foo \\#"), +    //     "<h1>foo #</h1>", +    //     "should not support an “escaped” closing sequence (3)" +    // ); + +    assert_eq!( +        micromark("****\n## foo\n****"), +        "<hr />\n<h2>foo</h2>\n<hr />", +        "should support atx headings when not surrounded by blank lines" +    ); + +    assert_eq!( +        micromark("Foo bar\n# baz\nBar foo"), +        "<p>Foo bar</p>\n<h1>baz</h1>\n<p>Bar foo</p>", +        "should support atx headings interrupting paragraphs" +    ); + +    // Line endings. +    assert_eq!( +        micromark("## \n#\n### ###"), +        "<h2></h2>\n<h1></h1>\n<h3></h3>", +        "should support empty atx headings" +    ); + +    // To do: block quote. 
+    // assert_eq!( +    //     micromark("> #\na"), +    //     "<blockquote>\n<h1></h1>\n</blockquote>\n<p>a</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark("> a\n#"), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<h1></h1>", +    //     "should not support lazyness (2)" +    // ); + +    // Extensions: +    // assert_eq!( +    //   micromark("# a", {extensions: [{disable: {null: ["headingAtx"]}}]}), +    //   "<p># a</p>", +    //   "should support turning off heading (atx)" +    // ); +} diff --git a/tests/html_flow.rs b/tests/html_flow.rs new file mode 100644 index 0000000..51d1a2a --- /dev/null +++ b/tests/html_flow.rs @@ -0,0 +1,1058 @@ +extern crate micromark; +use micromark::{micromark, micromark_with_options, CompileOptions}; + +const DANGER: &CompileOptions = &CompileOptions { +    allow_dangerous_html: true, +}; + +#[test] +fn html_flow() { +    assert_eq!( +        micromark("<!-- asd -->"), +        "<!-- asd -->", +        "should support a heading w/ rank 1" +    ); + +    assert_eq!( +        micromark_with_options("<!-- asd -->", DANGER), +        "<!-- asd -->", +        "should support a heading w/ rank 1" +    ); + +    // To do: extensions. 
+    // assert_eq!( +    //   micromark_with_options("<x>", {extensions: [{disable: {null: ["htmlFlow"]}}]}), +    //   "<p><x></p>", +    //   "should support turning off html (flow)" +    // ); +} + +#[test] +fn html_flow_1_raw() { +    assert_eq!( +        micromark_with_options( +            "<pre language=\"haskell\"><code> +import Text.HTML.TagSoup + +main :: IO () +main = print $ parseTags tags +</code></pre> +okay", +            DANGER +        ), +        "<pre language=\"haskell\"><code> +import Text.HTML.TagSoup + +main :: IO () +main = print $ parseTags tags +</code></pre> +<p>okay</p>", +        "should support raw pre tags (type 1)" +    ); + +    assert_eq!( +        micromark_with_options( +            "<script type=\"text/javascript\"> +// JavaScript example + +document.getElementById(\"demo\").innerHTML = \"Hello JavaScript!\"; +</script> +okay", +            DANGER +        ), +        "<script type=\"text/javascript\"> +// JavaScript example + +document.getElementById(\"demo\").innerHTML = \"Hello JavaScript!\"; +</script> +<p>okay</p>", +        "should support raw script tags" +    ); + +    assert_eq!( +        micromark_with_options( +            "<style +  type=\"text/css\"> +h1 {color:red;} + +p {color:blue;} +</style> +okay", +            DANGER +        ), +        "<style +  type=\"text/css\"> +h1 {color:red;} + +p {color:blue;} +</style> +<p>okay</p>", +        "should support raw style tags" +    ); + +    assert_eq!( +        micromark_with_options("<style\n  type=\"text/css\">\n\nfoo", DANGER), +        "<style\n  type=\"text/css\">\n\nfoo", +        "should support raw tags w/o ending" +    ); + +    // To do: phrasing. 
+    // assert_eq!( +    //     micromark_with_options("<style>p{color:red;}</style>\n*foo*", DANGER), +    //     "<style>p{color:red;}</style>\n<p><em>foo</em></p>", +    //     "should support raw tags w/ start and end on a single line" +    // ); + +    assert_eq!( +        micromark_with_options("<script>\nfoo\n</script>1. *bar*", DANGER), +        "<script>\nfoo\n</script>1. *bar*", +        "should support raw tags w/ more data on ending line" +    ); + +    assert_eq!( +        micromark_with_options("<script", DANGER), +        "<script", +        "should support an eof directly after a raw tag name" +    ); + +    // To do: paragraphs. +    // assert_eq!( +    //     micromark_with_options("</script\nmore", DANGER), +    //     "<p></script\nmore</p>", +    //     "should not support a raw closing tag" +    // ); + +    assert_eq!( +        micromark_with_options("<script/", DANGER), +        "<p><script/</p>", +        "should not support an eof after a self-closing slash" +    ); + +    // To do: phrasing. +    // assert_eq!( +    //     micromark_with_options("<script/\n*asd*", DANGER), +    //     "<p><script/\n<em>asd</em></p>", +    //     "should not support a line ending after a self-closing slash" +    // ); + +    assert_eq!( +        micromark_with_options("<script/>", DANGER), +        "<script/>", +        "should support an eof after a self-closing tag" +    ); + +    assert_eq!( +        micromark_with_options("<script/>\na", DANGER), +        "<script/>\na", +        "should support a line ending after a self-closing tag" +    ); + +    // To do: html (text). +    // assert_eq!( +    //     micromark_with_options("<script/>a", DANGER), +    //     "<p><script/>a</p>", +    //     "should not support other characters after a self-closing tag" +    // ); + +    assert_eq!( +        micromark_with_options("<script>a", DANGER), +        "<script>a", +        "should support other characters after a raw opening tag" +    ); + +    // Extra. 
+    assert_eq!( +        micromark_with_options("Foo\n<script", DANGER), +        "<p>Foo</p>\n<script", +        "should support interrupting paragraphs w/ raw tags" +    ); + +    assert_eq!( +        micromark_with_options("<script>\n  \n  \n</script>", DANGER), +        "<script>\n  \n  \n</script>", +        "should support blank lines in raw" +    ); + +    // To do: block quote. +    // assert_eq!( +    //     micromark_with_options("> <script>\na", DANGER), +    //     "<blockquote>\n<script>\n</blockquote>\n<p>a</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> a\n<script>", DANGER), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<script>", +    //     "should not support lazyness (2)" +    // ); +} + +#[test] +fn html_flow_2_comment() { +    assert_eq!( +        micromark_with_options("<!-- Foo\n\nbar\n   baz -->\nokay", DANGER), +        "<!-- Foo\n\nbar\n   baz -->\n<p>okay</p>", +        "should support comments (type 2)" +    ); + +    // To do: phrasing. 
+    // assert_eq!( +    //     micromark_with_options("<!-- foo -->*bar*\n*baz*", DANGER), +    //     "<!-- foo -->*bar*\n<p><em>baz</em></p>", +    //     "should support comments w/ start and end on a single line" +    // ); + +    assert_eq!( +        micromark_with_options("<!-asd-->", DANGER), +        "<p><!-asd--></p>", +        "should not support a single dash to start comments" +    ); + +    assert_eq!( +        micromark_with_options("<!-->", DANGER), +        "<!-->", +        "should support comments where the start dashes are the end dashes (1)" +    ); + +    assert_eq!( +        micromark_with_options("<!--->", DANGER), +        "<!--->", +        "should support comments where the start dashes are the end dashes (2)" +    ); + +    assert_eq!( +        micromark_with_options("<!---->", DANGER), +        "<!---->", +        "should support empty comments" +    ); + +    // If the `\"` is encoded, we’re in text. If it remains, we’re in HTML. +    assert_eq!( +        micromark_with_options("<!--\n->\n\"", DANGER), +        "<!--\n->\n\"", +        "should not end a comment at one dash (`->`)" +    ); +    assert_eq!( +        micromark_with_options("<!--\n-->\n\"", DANGER), +        "<!--\n-->\n<p>"</p>", +        "should end a comment at two dashes (`-->`)" +    ); +    assert_eq!( +        micromark_with_options("<!--\n--->\n\"", DANGER), +        "<!--\n--->\n<p>"</p>", +        "should end a comment at three dashes (`--->`)" +    ); +    assert_eq!( +        micromark_with_options("<!--\n---->\n\"", DANGER), +        "<!--\n---->\n<p>"</p>", +        "should end a comment at four dashes (`---->`)" +    ); + +    assert_eq!( +        micromark_with_options("  <!-- foo -->", DANGER), +        "  <!-- foo -->", +        "should support comments w/ indent" +    ); + +    assert_eq!( +        micromark_with_options("    <!-- foo -->", DANGER), +        "<pre><code><!-- foo -->\n</code></pre>", +        "should not support comments w/ a 4 character 
indent" +    ); + +    // Extra. +    assert_eq!( +        micromark_with_options("Foo\n<!--", DANGER), +        "<p>Foo</p>\n<!--", +        "should support interrupting paragraphs w/ comments" +    ); + +    assert_eq!( +        micromark_with_options("<!--\n  \n  \n-->", DANGER), +        "<!--\n  \n  \n-->", +        "should support blank lines in comments" +    ); + +    // To do: blockquote. +    // assert_eq!( +    //     micromark_with_options("> <!--\na", DANGER), +    //     "<blockquote>\n<!--\n</blockquote>\n<p>a</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> a\n<!--", DANGER), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<!--", +    //     "should not support lazyness (2)" +    // ); +} + +#[test] +fn html_flow_3_instruction() { +    assert_eq!( +        micromark_with_options("<?php\n\n  echo \">\";\n\n?>\nokay", DANGER), +        "<?php\n\n  echo \">\";\n\n?>\n<p>okay</p>", +        "should support instructions (type 3)" +    ); + +    assert_eq!( +        micromark_with_options("<?>", DANGER), +        "<?>", +        "should support empty instructions where the `?` is part of both the start and the end" +    ); + +    assert_eq!( +        micromark_with_options("<??>", DANGER), +        "<??>", +        "should support empty instructions" +    ); + +    // Extra. +    assert_eq!( +        micromark_with_options("Foo\n<?", DANGER), +        "<p>Foo</p>\n<?", +        "should support interrupting paragraphs w/ instructions" +    ); + +    assert_eq!( +        micromark_with_options("<?\n  \n  \n?>", DANGER), +        "<?\n  \n  \n?>", +        "should support blank lines in instructions" +    ); + +    // To do: blockquote. 
+    // assert_eq!( +    //     micromark_with_options("> <?\na", DANGER), +    //     "<blockquote>\n<?\n</blockquote>\n<p>a</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> a\n<?", DANGER), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<?", +    //     "should not support lazyness (2)" +    // ); +} + +#[test] +fn html_flow_4_declaration() { +    assert_eq!( +        micromark_with_options("<!DOCTYPE html>", DANGER), +        "<!DOCTYPE html>", +        "should support declarations (type 4)" +    ); + +    assert_eq!( +        micromark_with_options("<!123>", DANGER), +        "<p><!123></p>", +        "should not support declarations that start w/o an alpha" +    ); + +    assert_eq!( +        micromark_with_options("<!>", DANGER), +        "<p><!></p>", +        "should not support declarations w/o an identifier" +    ); + +    assert_eq!( +        micromark_with_options("<!a>", DANGER), +        "<!a>", +        "should support declarations w/o a single alpha as identifier" +    ); + +    // Extra. +    assert_eq!( +        micromark_with_options("Foo\n<!d", DANGER), +        "<p>Foo</p>\n<!d", +        "should support interrupting paragraphs w/ declarations" +    ); + +    // Note about the lower letter: +    // <https://github.com/commonmark/commonmark-spec/pull/621> +    assert_eq!( +        micromark_with_options("<!a\n  \n  \n>", DANGER), +        "<!a\n  \n  \n>", +        "should support blank lines in declarations" +    ); + +    // To do: blockquote. 
+    // assert_eq!( +    //     micromark_with_options("> <!a\nb", DANGER), +    //     "<blockquote>\n<!a\n</blockquote>\n<p>b</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> a\n<!b", DANGER), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<!b", +    //     "should not support lazyness (2)" +    // ); +} + +#[test] +fn html_flow_5_cdata() { +    assert_eq!( +    micromark_with_options( +      "<![CDATA[\nfunction matchwo(a,b)\n{\n  if (a < b && a < 0) then {\n    return 1;\n\n  } else {\n\n    return 0;\n  }\n}\n]]>\nokay", +      DANGER +    ), +    "<![CDATA[\nfunction matchwo(a,b)\n{\n  if (a < b && a < 0) then {\n    return 1;\n\n  } else {\n\n    return 0;\n  }\n}\n]]>\n<p>okay</p>", +    "should support cdata (type 5)" +  ); + +    assert_eq!( +        micromark_with_options("<![CDATA[]]>", DANGER), +        "<![CDATA[]]>", +        "should support empty cdata" +    ); + +    assert_eq!( +        micromark_with_options("<![CDATA]]>", DANGER), +        "<p><![CDATA]]></p>", +        "should not support cdata w/ a missing `[`" +    ); + +    assert_eq!( +        micromark_with_options("<![CDATA[]]]>", DANGER), +        "<![CDATA[]]]>", +        "should support cdata w/ a single `]` as content" +    ); + +    // Extra. +    assert_eq!( +        micromark_with_options("Foo\n<![CDATA[", DANGER), +        "<p>Foo</p>\n<![CDATA[", +        "should support interrupting paragraphs w/ cdata" +    ); + +    // Note: cmjs parses this differently. +    // See: <https://github.com/commonmark/commonmark.js/issues/193> +    assert_eq!( +        micromark_with_options("<![cdata[]]>", DANGER), +        "<p><![cdata[]]></p>", +        "should not support lowercase cdata" +    ); + +    assert_eq!( +        micromark_with_options("<![CDATA[\n  \n  \n]]>", DANGER), +        "<![CDATA[\n  \n  \n]]>", +        "should support blank lines in cdata" +    ); + +    // To do: blockquote. 
+    // assert_eq!( +    //     micromark_with_options("> <![CDATA[\na", DANGER), +    //     "<blockquote>\n<![CDATA[\n</blockquote>\n<p>a</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> a\n<![CDATA[", DANGER), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<![CDATA[", +    //     "should not support lazyness (2)" +    // ); +} + +#[test] +fn html_flow_6_basic() { +    // To do: phrasing, paragraphs, etc. +    // assert_eq!( +    //     micromark_with_options( +    //         "<table><tr><td>\n<pre>\n**Hello**,\n\n_world_.\n</pre>\n</td></tr></table>", +    //         DANGER +    //     ), +    //     "<table><tr><td>\n<pre>\n**Hello**,\n<p><em>world</em>.\n</pre></p>\n</td></tr></table>", +    //     "should support html (basic)" +    // ); + +    // To do: paragraphs. +    //     assert_eq!( +    //         micromark_with_options( +    //             "<table> +    //   <tr> +    //     <td> +    //            hi +    //     </td> +    //   </tr> +    // </table> + +    // okay.", +    //             DANGER +    //         ), +    //         "<table> +    //   <tr> +    //     <td> +    //            hi +    //     </td> +    //   </tr> +    // </table> +    // <p>okay.</p>", +    //         "should support html of type 6 (1)" +    //     ); + +    assert_eq!( +        micromark_with_options(" <div>\n  *hello*\n         <foo><a>", DANGER), +        " <div>\n  *hello*\n         <foo><a>", +        "should support html of type 6 (2)" +    ); + +    assert_eq!( +        micromark_with_options("</div>\n*foo*", DANGER), +        "</div>\n*foo*", +        "should support html starting w/ a closing tag" +    ); + +    // To do: phrasing +    // assert_eq!( +    //     micromark_with_options("<DIV CLASS=\"foo\">\n\n*Markdown*\n\n</DIV>", DANGER), +    //     "<DIV CLASS=\"foo\">\n<p><em>Markdown</em></p>\n</DIV>", +    //     "should support html w/ markdown in between" +    // ); + +    
assert_eq!( +        micromark_with_options("<div id=\"foo\"\n  class=\"bar\">\n</div>", DANGER), +        "<div id=\"foo\"\n  class=\"bar\">\n</div>", +        "should support html w/ line endings (1)" +    ); + +    assert_eq!( +        micromark_with_options("<div id=\"foo\" class=\"bar\n  baz\">\n</div>", DANGER), +        "<div id=\"foo\" class=\"bar\n  baz\">\n</div>", +        "should support html w/ line endings (2)" +    ); + +    // To do: phrasing. +    // assert_eq!( +    //     micromark_with_options("<div>\n*foo*\n\n*bar*", DANGER), +    //     "<div>\n*foo*\n<p><em>bar</em></p>", +    //     "should support an unclosed html element" +    // ); + +    assert_eq!( +        micromark_with_options("<div id=\"foo\"\n*hi*", DANGER), +        "<div id=\"foo\"\n*hi*", +        "should support garbage html (1)" +    ); + +    assert_eq!( +        micromark_with_options("<div class\nfoo", DANGER), +        "<div class\nfoo", +        "should support garbage html (2)" +    ); + +    assert_eq!( +        micromark_with_options("<div *???-&&&-<---\n*foo*", DANGER), +        "<div *???-&&&-<---\n*foo*", +        "should support garbage html (3)" +    ); + +    assert_eq!( +        micromark_with_options("<div><a href=\"bar\">*foo*</a></div>", DANGER), +        "<div><a href=\"bar\">*foo*</a></div>", +        "should support other tags in the opening (1)" +    ); + +    assert_eq!( +        micromark_with_options("<table><tr><td>\nfoo\n</td></tr></table>", DANGER), +        "<table><tr><td>\nfoo\n</td></tr></table>", +        "should support other tags in the opening (2)" +    ); + +    assert_eq!( +        micromark_with_options("<div></div>\n``` c\nint x = 33;\n```", DANGER), +        "<div></div>\n``` c\nint x = 33;\n```", +        "should include everything ’till a blank line" +    ); + +    // To do: blockquote. 
+    // assert_eq!( +    //     micromark_with_options("> <div>\n> foo\n\nbar", DANGER), +    //     "<blockquote>\n<div>\nfoo\n</blockquote>\n<p>bar</p>", +    //     "should support basic tags w/o ending in containers (1)" +    // ); + +    // To do: list. +    // assert_eq!( +    //     micromark_with_options("- <div>\n- foo", DANGER), +    //     "<ul>\n<li>\n<div>\n</li>\n<li>foo</li>\n</ul>", +    //     "should support basic tags w/o ending in containers (2)" +    // ); + +    assert_eq!( +        micromark_with_options("  <div>", DANGER), +        "  <div>", +        "should support basic tags w/ indent" +    ); + +    assert_eq!( +        micromark_with_options("    <div>", DANGER), +        "<pre><code><div>\n</code></pre>", +        "should not support basic tags w/ a 4 character indent" +    ); + +    assert_eq!( +        micromark_with_options("Foo\n<div>\nbar\n</div>", DANGER), +        "<p>Foo</p>\n<div>\nbar\n</div>", +        "should support interrupting paragraphs w/ basic tags" +    ); + +    assert_eq!( +        micromark_with_options("<div>\nbar\n</div>\n*foo*", DANGER), +        "<div>\nbar\n</div>\n*foo*", +        "should require a blank line to end" +    ); + +    // To do: phrasing. 
+    // assert_eq!( +    //     micromark_with_options("<div>\n\n*Emphasized* text.\n\n</div>", DANGER), +    //     "<div>\n<p><em>Emphasized</em> text.</p>\n</div>", +    //     "should support interleaving w/ blank lines" +    // ); + +    assert_eq!( +        micromark_with_options("<div>\n*Emphasized* text.\n</div>", DANGER), +        "<div>\n*Emphasized* text.\n</div>", +        "should not support interleaving w/o blank lines" +    ); + +    assert_eq!( +        micromark_with_options( +            "<table>\n\n<tr>\n\n<td>\nHi\n</td>\n\n</tr>\n\n</table>", +            DANGER +        ), +        "<table>\n<tr>\n<td>\nHi\n</td>\n</tr>\n</table>", +        "should support blank lines between adjacent html" +    ); + +    assert_eq!( +        micromark_with_options( +            "<table> + +  <tr> + +    <td> +      Hi +    </td> + +  </tr> + +</table>", +            DANGER +        ), +        "<table> +  <tr> +<pre><code><td> +  Hi +</td> +</code></pre> +  </tr> +</table>", +        "should not support indented, blank-line delimited, adjacent html" +    ); + +    assert_eq!( +        micromark_with_options("</1>", DANGER), +        "<p></1></p>", +        "should not support basic tags w/ an incorrect name start character" +    ); + +    assert_eq!( +        micromark_with_options("<div", DANGER), +        "<div", +        "should support an eof directly after a basic tag name" +    ); + +    assert_eq!( +        micromark_with_options("<div\n", DANGER), +        "<div\n", +        "should support a line ending directly after a tag name" +    ); + +    assert_eq!( +        micromark_with_options("<div ", DANGER), +        "<div ", +        "should support an eof after a space directly after a tag name" +    ); + +    assert_eq!( +        micromark_with_options("<div/", DANGER), +        "<p><div/</p>", +        "should not support an eof directly after a self-closing slash" +    ); + +    // To do: phrasing. 
+    // assert_eq!( +    //     micromark_with_options("<div/\n*asd*", DANGER), +    //     "<p><div/\n<em>asd</em></p>", +    //     "should not support a line ending after a self-closing slash" +    // ); + +    assert_eq!( +        micromark_with_options("<div/>", DANGER), +        "<div/>", +        "should support an eof after a self-closing tag" +    ); + +    assert_eq!( +        micromark_with_options("<div/>\na", DANGER), +        "<div/>\na", +        "should support a line ending after a self-closing tag" +    ); + +    assert_eq!( +        micromark_with_options("<div/>a", DANGER), +        "<div/>a", +        "should support another character after a self-closing tag" +    ); + +    assert_eq!( +        micromark_with_options("<div>a", DANGER), +        "<div>a", +        "should support another character after a basic opening tag" +    ); + +    // Extra. +    assert_eq!( +        micromark_with_options("Foo\n<div/>", DANGER), +        "<p>Foo</p>\n<div/>", +        "should support interrupting paragraphs w/ self-closing basic tags" +    ); + +    // To do: block quote. +    // assert_eq!( +    //     micromark_with_options("<div\n  \n  \n>", DANGER), +    //     "<div\n<blockquote>\n</blockquote>", +    //     "should not support blank lines in basic" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> <div\na", DANGER), +    //     "<blockquote>\n<div\n</blockquote>\n<p>a</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> a\n<div", DANGER), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<div", +    //     "should not support lazyness (2)" +    // ); +} + +#[test] +fn html_flow_7_complete() { +    // To do: phrasing. 
+    // assert_eq!( +    //     micromark_with_options("<a href=\"foo\">\n*bar*\n</a>", DANGER), +    //     "<a href=\"foo\">\n*bar*\n</a>", +    //     "should support complete tags (type 7)" +    // ); + +    assert_eq!( +        micromark_with_options("<Warning>\n*bar*\n</Warning>", DANGER), +        "<Warning>\n*bar*\n</Warning>", +        "should support non-html tag names" +    ); + +    assert_eq!( +        micromark_with_options("<i class=\"foo\">\n*bar*\n</i>", DANGER), +        "<i class=\"foo\">\n*bar*\n</i>", +        "should support non-“block” html tag names (1)" +    ); + +    assert_eq!( +        micromark_with_options("<del>\n*foo*\n</del>", DANGER), +        "<del>\n*foo*\n</del>", +        "should support non-“block” html tag names (2)" +    ); + +    assert_eq!( +        micromark_with_options("</ins>\n*bar*", DANGER), +        "</ins>\n*bar*", +        "should support closing tags" +    ); + +    // To do: phrasing. +    // assert_eq!( +    //     micromark_with_options("<del>\n\n*foo*\n\n</del>", DANGER), +    //     "<del>\n<p><em>foo</em></p>\n</del>", +    //     "should support interleaving" +    // ); + +    // To do: html (text). +    // assert_eq!( +    //     micromark_with_options("<del>*foo*</del>", DANGER), +    //     "<p><del><em>foo</em></del></p>", +    //     "should not support interleaving w/o blank lines" +    // ); + +    assert_eq!( +        micromark_with_options("<div>\n  \nasd", DANGER), +        "<div>\n<p>asd</p>", +        "should support interleaving w/ whitespace-only blank lines" +    ); + +    // To do: interrupting. 
+    // assert_eq!( +    //     micromark_with_options("Foo\n<a href=\"bar\">\nbaz", DANGER), +    //     "<p>Foo\n<a href=\"bar\">\nbaz</p>", +    //     "should not support interrupting paragraphs w/ complete tags" +    // ); + +    assert_eq!( +        micromark_with_options("<x", DANGER), +        "<p><x</p>", +        "should not support an eof directly after a tag name" +    ); + +    assert_eq!( +        micromark_with_options("<x/", DANGER), +        "<p><x/</p>", +        "should not support an eof directly after a self-closing slash" +    ); + +    assert_eq!( +        micromark_with_options("<x\n", DANGER), +        "<p><x</p>\n", +        "should not support a line ending directly after a tag name" +    ); + +    // To do: paragraphs (trailing whitespace). +    // assert_eq!( +    //     micromark_with_options("<x ", DANGER), +    //     "<p><x</p>", +    //     "should not support an eof after a space directly after a tag name" +    // ); + +    assert_eq!( +        micromark_with_options("<x/", DANGER), +        "<p><x/</p>", +        "should not support an eof directly after a self-closing slash" +    ); + +    // To do: phrasing. +    // assert_eq!( +    //     micromark_with_options("<x/\n*asd*", DANGER), +    //     "<p><x/\n<em>asd</em></p>", +    //     "should not support a line ending after a self-closing slash" +    // ); + +    assert_eq!( +        micromark_with_options("<x/>", DANGER), +        "<x/>", +        "should support an eof after a self-closing tag" +    ); + +    assert_eq!( +        micromark_with_options("<x/>\na", DANGER), +        "<x/>\na", +        "should support a line ending after a self-closing tag" +    ); + +    // To do: html (text). +    // assert_eq!( +    //     micromark_with_options("<x/>a", DANGER), +    //     "<p><x/>a</p>", +    //     "should not support another character after a self-closing tag" +    // ); + +    // To do: html (text). 
+    // assert_eq!( +    //     micromark_with_options("<x>a", DANGER), +    //     "<p><x>a</p>", +    //     "should not support another character after an opening tag" +    // ); + +    assert_eq!( +        micromark_with_options("<x y>", DANGER), +        "<x y>", +        "should support boolean attributes in a complete tag" +    ); + +    // To do: html (text). +    // assert_eq!( +    //     micromark_with_options("<x\ny>", DANGER), +    //     "<p><x\ny></p>", +    //     "should not support a line ending before an attribute name" +    // ); + +    // To do: html (text). +    // assert_eq!( +    //     micromark_with_options("<x\n  y>", DANGER), +    //     "<p><x\ny></p>", +    //     "should not support a line ending w/ whitespace before an attribute name" +    // ); + +    assert_eq!( +    micromark_with_options("<x\n  \ny>", DANGER), +    "<p><x</p>\n<p>y></p>", +    "should not support a line ending w/ whitespace and another line ending before an attribute name" +  ); + +    // To do: html (text). 
+    // assert_eq!( +    //     micromark_with_options("<x y\nz>", DANGER), +    //     "<p><x y\nz></p>", +    //     "should not support a line ending between attribute names" +    // ); + +    assert_eq!( +        micromark_with_options("<x y   z>", DANGER), +        "<x y   z>", +        "should support whitespace between attribute names" +    ); + +    assert_eq!( +        micromark_with_options("<x:y>", DANGER), +        "<p><x:y></p>", +        "should not support a colon in a tag name" +    ); + +    assert_eq!( +        micromark_with_options("<x_y>", DANGER), +        "<p><x_y></p>", +        "should not support an underscore in a tag name" +    ); + +    assert_eq!( +        micromark_with_options("<x.y>", DANGER), +        "<p><x.y></p>", +        "should not support a dot in a tag name" +    ); + +    assert_eq!( +        micromark_with_options("<x :y>", DANGER), +        "<x :y>", +        "should support a colon to start an attribute name" +    ); + +    assert_eq!( +        micromark_with_options("<x _y>", DANGER), +        "<x _y>", +        "should support an underscore to start an attribute name" +    ); + +    assert_eq!( +        micromark_with_options("<x .y>", DANGER), +        "<p><x .y></p>", +        "should not support a dot to start an attribute name" +    ); + +    assert_eq!( +        micromark_with_options("<x y:>", DANGER), +        "<x y:>", +        "should support a colon to end an attribute name" +    ); + +    assert_eq!( +        micromark_with_options("<x y_>", DANGER), +        "<x y_>", +        "should support an underscore to end an attribute name" +    ); + +    assert_eq!( +        micromark_with_options("<x y.>", DANGER), +        "<x y.>", +        "should support a dot to end an attribute name" +    ); + +    assert_eq!( +        micromark_with_options("<x y123>", DANGER), +        "<x y123>", +        "should support numbers to end an attribute name" +    ); + +    assert_eq!( +        micromark_with_options("<x 
data->", DANGER),
+        "<x data->",
+        "should support a dash to end an attribute name"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=>", DANGER),
+        "<p><x y=></p>",
+        "should not support an initializer w/o a value"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y==>", DANGER),
+        "<p><x y==></p>",
+        "should not support an equals to as an initializer"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=z>", DANGER),
+        "<x y=z>",
+        "should support a single character as an unquoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=\"\">", DANGER),
+        "<x y=\"\">",
+        "should support an empty double quoted attribute value"
+    );
+
+    assert_eq!(
+        micromark_with_options("<x y=''>", DANGER),
+        "<x y=''>",
+        "should support an empty single quoted attribute value"
+    );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x y=\"\n\">", DANGER),
+    //     "<p><x y=\"\n\"></p>",
+    //     "should not support a line ending in a double quoted attribute value"
+    // );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<x y=\"\n\">", DANGER),
+    //     "<p><x y=\"\n\"></p>",
+    //     "should not support a line ending in a single quoted attribute value"
+    // );
+
+    // To do: html (text).
+    // assert_eq!(
+    //     micromark_with_options("<w x=y\nz>", DANGER),
+    //     "<p><w x=y\nz></p>",
+    //     "should not support a line ending in/after an unquoted attribute value"
+    // );
+
+    assert_eq!(
+        micromark_with_options("<w x=y\"z>", DANGER),
+        "<p><w x=y&quot;z></p>",
+        "should not support a double quote in/after an unquoted attribute value"
+    );
+
+    // To do: html (text). 
+    // assert_eq!( +    //     micromark_with_options("<w x=y\"z>", DANGER), +    //     "<p><w x=y\"z></p>", +    //     "should not support a single quote in/after an unquoted attribute value" +    // ); + +    assert_eq!( +        micromark_with_options("<x y=\"\"z>", DANGER), +        "<p><x y=""z></p>", +        "should not support an attribute after a double quoted attribute value" +    ); + +    // To do: blockquote. +    // assert_eq!( +    //     micromark_with_options("<x>\n  \n  \n>", DANGER), +    //     "<x>\n<blockquote>\n</blockquote>", +    //     "should not support blank lines in complete" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> <a>\n*bar*", DANGER), +    //     "<blockquote>\n<a>\n</blockquote>\n<p><em>bar</em></p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark_with_options("> a\n<a>", DANGER), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<a>", +    //     "should not support lazyness (2)" +    // ); +} diff --git a/tests/lib.rs b/tests/lib.rs new file mode 100644 index 0000000..18fcef2 --- /dev/null +++ b/tests/lib.rs @@ -0,0 +1,8 @@ +extern crate micromark; +use micromark::micromark; + +#[test] +fn basic() { +    assert_eq!(micromark("asd"), "<p>asd</p>", "should work"); +    assert_eq!(micromark("1 < 3"), "<p>1 < 3</p>", "should encode"); +} diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs new file mode 100644 index 0000000..833fa6f --- /dev/null +++ b/tests/thematic_break.rs @@ -0,0 +1,181 @@ +extern crate micromark; +use micromark::micromark; + +#[test] +fn thematic_break() { +    assert_eq!( +        micromark("***\n---\n___"), +        "<hr />\n<hr />\n<hr />", +        "should support thematic breaks w/ asterisks, dashes, and underscores" +    ); + +    assert_eq!( +        micromark("+++"), +        "<p>+++</p>", +        "should not support thematic breaks w/ plusses" +    ); + +    assert_eq!( +        micromark("==="), +        
"<p>===</p>", +        "should not support thematic breaks w/ equals" +    ); + +    assert_eq!( +        micromark("--"), +        "<p>--</p>", +        "should not support thematic breaks w/ two dashes" +    ); + +    assert_eq!( +        micromark("**"), +        "<p>**</p>", +        "should not support thematic breaks w/ two asterisks" +    ); + +    assert_eq!( +        micromark("__"), +        "<p>__</p>", +        "should not support thematic breaks w/ two underscores" +    ); + +    assert_eq!( +        micromark(" ***"), +        "<hr />", +        "should support thematic breaks w/ 1 space" +    ); + +    assert_eq!( +        micromark("  ***"), +        "<hr />", +        "should support thematic breaks w/ 2 spaces" +    ); + +    assert_eq!( +        micromark("   ***"), +        "<hr />", +        "should support thematic breaks w/ 3 spaces" +    ); + +    assert_eq!( +        micromark("    ***"), +        "<pre><code>***\n</code></pre>", +        "should not support thematic breaks w/ 4 spaces" +    ); + +    // To do: paragraphs. 
+    // assert_eq!( +    //     micromark("Foo\n    ***"), +    //     "<p>Foo\n***</p>", +    //     "should not support thematic breaks w/ 4 spaces as paragraph continuation" +    // ); + +    assert_eq!( +        micromark("_____________________________________"), +        "<hr />", +        "should support thematic breaks w/ many markers" +    ); + +    assert_eq!( +        micromark(" - - -"), +        "<hr />", +        "should support thematic breaks w/ spaces (1)" +    ); + +    assert_eq!( +        micromark(" **  * ** * ** * **"), +        "<hr />", +        "should support thematic breaks w/ spaces (2)" +    ); + +    assert_eq!( +        micromark("-     -      -      -"), +        "<hr />", +        "should support thematic breaks w/ spaces (3)" +    ); + +    assert_eq!( +        micromark("- - - -    "), +        "<hr />", +        "should support thematic breaks w/ trailing spaces" +    ); + +    assert_eq!( +        micromark("_ _ _ _ a"), +        "<p>_ _ _ _ a</p>", +        "should not support thematic breaks w/ other characters (1)" +    ); + +    assert_eq!( +        micromark("a------"), +        "<p>a------</p>", +        "should not support thematic breaks w/ other characters (2)" +    ); + +    assert_eq!( +        micromark("---a---"), +        "<p>---a---</p>", +        "should not support thematic breaks w/ other characters (3)" +    ); + +    // To do: phrasing. +    // assert_eq!( +    //     micromark(" *-*"), +    //     "<p><em>-</em></p>", +    //     "should not support thematic breaks w/ mixed markers" +    // ); + +    // To do: lists. 
+    // assert_eq!( +    //     micromark("- foo\n***\n- bar"), +    //     "<ul>\n<li>foo</li>\n</ul>\n<hr />\n<ul>\n<li>bar</li>\n</ul>", +    //     "should support thematic breaks mixed w/ lists (1)" +    // ); + +    // assert_eq!( +    //     micromark("* Foo\n* * *\n* Bar"), +    //     "<ul>\n<li>Foo</li>\n</ul>\n<hr />\n<ul>\n<li>Bar</li>\n</ul>", +    //     "should support thematic breaks mixed w/ lists (2)" +    // ); + +    // To do: paragraph. +    // assert_eq!( +    //     micromark("Foo\n***\nbar"), +    //     "<p>Foo</p>\n<hr />\n<p>bar</p>", +    //     "should support thematic breaks interrupting paragraphs" +    // ); + +    // To do: setext. +    // assert_eq!( +    //     micromark("Foo\n---\nbar"), +    //     "<h2>Foo</h2>\n<p>bar</p>", +    //     "should not support thematic breaks w/ dashes interrupting paragraphs (setext heading)" +    // ); + +    // To do: list. +    // assert_eq!( +    //     micromark("- Foo\n- * * *"), +    //     "<ul>\n<li>Foo</li>\n<li>\n<hr />\n</li>\n</ul>", +    //     "should support thematic breaks in lists" +    // ); + +    // To do: blockquote. +    // assert_eq!( +    //     micromark("> ---\na"), +    //     "<blockquote>\n<hr />\n</blockquote>\n<p>a</p>", +    //     "should not support lazyness (1)" +    // ); + +    // assert_eq!( +    //     micromark("> a\n---"), +    //     "<blockquote>\n<p>a</p>\n</blockquote>\n<hr />", +    //     "should not support lazyness (2)" +    // ); + +    // To do: extensions. +    // assert_eq!( +    //   micromark("***", {extensions: [{disable: {null: ["thematicBreak"]}}]}), +    //   "<p>***</p>", +    //   "should support turning off thematic breaks" +    // ); +} | 
