author    Titus Wormer <tituswormer@gmail.com>    2022-06-08 15:52:16 +0200
committer Titus Wormer <tituswormer@gmail.com>    2022-06-08 15:52:16 +0200
commit    4c06c8554c35887f8f5147783953b2b7e7c2327f (patch)
tree      1b2463848a3ae4c645f7f1a325877ee829ab65c5
-rw-r--r--  .editorconfig                           12
-rw-r--r--  .github/workflows/main.yml              24
-rw-r--r--  .gitignore                               5
-rw-r--r--  Cargo.toml                              18
-rw-r--r--  Untitled.txt                             1
-rw-r--r--  examples/lib.rs                         22
-rw-r--r--  funding.yml                              1
-rw-r--r--  license                                 22
-rw-r--r--  readme.md                              183
-rw-r--r--  src/compiler.rs                        367
-rw-r--r--  src/constant.rs                       2561
-rw-r--r--  src/construct/blank_line.rs             61
-rw-r--r--  src/construct/character_escape.rs       69
-rw-r--r--  src/construct/character_reference.rs   237
-rw-r--r--  src/construct/code_fenced.rs           581
-rw-r--r--  src/construct/code_indented.rs         190
-rw-r--r--  src/construct/heading_atx.rs           175
-rw-r--r--  src/construct/html_flow.rs            1068
-rw-r--r--  src/construct/mod.rs                    11
-rw-r--r--  src/construct/partial_whitespace.rs     66
-rw-r--r--  src/construct/thematic_break.rs        137
-rw-r--r--  src/content/flow.rs                    258
-rw-r--r--  src/content/mod.rs                       4
-rw-r--r--  src/content/string.rs                  120
-rw-r--r--  src/lib.rs                              52
-rw-r--r--  src/parser.rs                           14
-rw-r--r--  src/tokenizer.rs                       580
-rw-r--r--  src/util.rs                            241
-rw-r--r--  tests/code_fenced.rs                   266
-rw-r--r--  tests/code_indented.rs                 196
-rw-r--r--  tests/heading_atx.rs                   208
-rw-r--r--  tests/html_flow.rs                    1058
-rw-r--r--  tests/lib.rs                             8
-rw-r--r--  tests/thematic_break.rs                181
34 files changed, 8997 insertions(+), 0 deletions(-)
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..201f7b7
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.rs]
+indent_size = 4
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..cbee315
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,24 @@
+name: main
+on:
+ - pull_request
+ - push
+jobs:
+ main:
+ name: ${{matrix.rust}}
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: actions-rs/toolchain@v1
+ with:
+ toolchain: ${{matrix.rust}}
+ components: rustfmt, clippy
+ - run: cargo clippy -- -W clippy::pedantic
+ - run: cargo fmt --all -- --check
+ - run: cargo test
+ - run: cargo install cargo-tarpaulin && cargo tarpaulin --out Xml
+ - uses: codecov/codecov-action@v1
+ strategy:
+ matrix:
+ rust:
+ - stable
+ - beta
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..32a28f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.DS_Store
+*.log
+*.lock
+coverage/
+target
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..96f23d7
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "micromark"
+version = "0.0.0"
+authors = ["Titus Wormer <tituswormer@gmail.com>"]
+edition = "2015"
+rust-version = "1.56"
+description = "small commonmark compliant markdown parser with positional info and concrete tokens"
+homepage = "https://github.com/micromark/micromark-rs"
+repository = "https://github.com/micromark/micromark-rs"
+license = "MIT"
+keywords = ["commonmark", "markdown", "parse", "render", "tokenize"]
+categories = ["compilers", "encoding", "parser-implementations", "parsing", "text-processing"]
+include = ["src/", "license"]
+publish = false
+
+[dependencies]
+log = "0.4"
+env_logger = "0.9"
diff --git a/Untitled.txt b/Untitled.txt
new file mode 100644
index 0000000..cc1576f
--- /dev/null
+++ b/Untitled.txt
@@ -0,0 +1 @@
+micromark.js: unquoted: is `completeAttributeValueUnquoted`’s case for `completeAttributeNameAfter` missing a `/`? I’ve added it here.
diff --git a/examples/lib.rs b/examples/lib.rs
new file mode 100644
index 0000000..4d01161
--- /dev/null
+++ b/examples/lib.rs
@@ -0,0 +1,22 @@
+extern crate micromark;
+use micromark::{micromark, micromark_with_options, CompileOptions};
+
+fn main() {
+ // Turn on debugging.
+ // You can show it with `RUST_LOG=debug cargo run --example lib`
+ env_logger::init();
+
+ // Safely turn (untrusted?) markdown into HTML.
+ println!("{:?}", micromark("# Hello, world!"));
+
+ // Turn trusted markdown into HTML.
+ println!(
+ "{:?}",
+ micromark_with_options(
+ "<div style=\"color: tomato\">\n\n# Hello, tomato!\n\n</div>",
+ &CompileOptions {
+ allow_dangerous_html: true
+ }
+ )
+ );
+}
diff --git a/funding.yml b/funding.yml
new file mode 100644
index 0000000..dee132d
--- /dev/null
+++ b/funding.yml
@@ -0,0 +1 @@
+github: wooorm
diff --git a/license b/license
new file mode 100644
index 0000000..9ac1e96
--- /dev/null
+++ b/license
@@ -0,0 +1,22 @@
+(The MIT License)
+
+Copyright (c) 2022 Titus Wormer <tituswormer@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..8892183
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,183 @@
+# micromark-rs
+
+Here be dragons!
+🐉
+There’s a lot to do.
+Some major to-dos are described here; smaller ones are noted in the code.
+
+## Some useful scripts for now
+
+Run examples:
+
+```sh
+RUST_BACKTRACE=1 RUST_LOG=debug cargo run --example lib
+```
+
+Format:
+
+```sh
+cargo fmt --all
+```
+
+Lint:
+
+```sh
+cargo fmt --all -- --check && cargo clippy -- -W clippy::pedantic
+```
+
+Tests:
+
+```sh
+RUST_BACKTRACE=1 cargo test
+```
+
+Docs:
+
+```sh
+cargo doc --document-private-items
+```
+
+(add `--open` to open them in a browser)
+
+## To do
+
+### Some major obstacles
+
+- [ ] (8) Subtokenization: figure out a good, fast way to deal with constructs in
+  one content type that are also another content type
+- [ ] (1) Setext headings: can they be solved in content, or do they have to be
+  solved in flow somehow?
+- [ ] (8) Can content (and to a lesser extent string and text) operate more
+ performantly than checking whether other flow constructs start a line,
+ before exiting and actually attempting flow constructs?
+- [ ] (5) Figure out definitions and sharing those identifiers, and references
+ before definitions
+- [ ] (3) Interrupting: sometimes flow can or cannot start depending on the
+ previous construct (typically paragraph)
+- [ ] (5) Containers: this will be rather messy, and depends a lot on how
+ subtokenization is solved
+- [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into by
+ containers
+- [ ] (3) Lazy lines: in containers, in flow, and in content in a paragraph, a
+  line does not need to be indented
+- [ ] (5) There’s a lot of Rust-specific deciding whether to pass (mutable)
+  references around; that should be refactored
+- [ ] (5) Figure out extensions
+- [ ] (1) Support turning off constructs (see the sketch after this list)
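+
+As a sketch of that last item: turning constructs off could perhaps look
+something like the following. These names are hypothetical; nothing like this
+exists in the crate yet:
+
+```rust
+/// Hypothetical per-construct switches (all names are assumptions).
+#[derive(Default)]
+pub struct Constructs {
+    pub code_indented: bool,
+    pub heading_atx: bool,
+    pub thematic_break: bool,
+}
+```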
+
+### Small things
+
+- [ ] (3) Clean compiler
+- [ ] (1) Optionally remove dangerous protocols when compiling
+- [ ] (1) Use preferred line ending style in markdown
+- [ ] (1) Handle BOM at start
+- [ ] (1) Make sure tabs are handled properly and that positional info is perfect
+- [ ] (1) Make sure crlf/cr/lf are working perfectly
+- [ ] (3) Figure out lifetimes of things (see `life time` in source)
+- [ ] (3) Use `commonmark` tests
+- [ ] (3) Share a bunch of tests with `micromark-js`
+- [ ] (5) Do some research on Rust best practices for APIs, e.g., what to accept
+  and how to integrate with streams and such
+- [ ] (1) Go through clippy rules, and such, to add strict code styles
+- [ ] (1) Make sure that Rust character groups match CM character groups (e.g., is
+  `unicode_whitespace` the same?)
+- [ ] (1) Any special handling of surrogates?
+- [ ] (1) Make sure debugging is useful for other folks
+- [ ] (3) Add some benchmarks, do some perf testing
+- [ ] (3) Write comparison to other parsers
+- [ ] (3) Add node/etc bindings?
+- [ ] (8) After all extensions, including MDX, are done, see if we can integrate
+ this with SWC to compile MDX
+- [ ] (3) Bunch of docs
+- [ ] (5) Site
+
+### Constructs
+
+- [ ] (5) attention (strong, emphasis) (text)
+- [ ] (1) autolink
+- [x] blank line
+- [ ] (5) block quote
+- [x] character escape
+- [x] character reference
+- [x] code (fenced)
+- [x] code (indented)
+- [ ] (1) code (text)
+- [ ] (3) content
+- [ ] (3) definition
+- [ ] (1) hard break escape
+- [x] heading (atx)
+- [ ] (1) heading (setext)
+- [x] html (flow)
+- [ ] html (text)
+- [ ] (3) label end
+- [ ] (3) label start (image)
+- [ ] (3) label start (link)
+- [ ] (8) list
+- [ ] (1) paragraph
+- [x] thematic break
+
+### Content types
+
+- [ ] (8) container
+ - [ ] block quote
+ - [ ] list
+- [ ] (1) flow
+ - [x] blank line
+ - [x] code (fenced)
+ - [x] code (indented)
+ - [ ] content
+ - [x] heading (atx)
+ - [x] html (flow)
+ - [x] thematic break
+- [ ] (3) content
+ - [ ] definition
+ - [ ] heading (setext)
+ - [ ] paragraph
+- [ ] (5) text
+ - [ ] attention (strong, emphasis) (text)
+ - [ ] autolink
+ - [x] character escape
+ - [x] character reference
+ - [ ] code (text)
+ - [ ] hard break escape
+ - [ ] html (text)
+ - [ ] label end
+ - [ ] label start (image)
+ - [ ] label start (link)
+- [x] string
+ - [x] character escape
+ - [x] character reference
+
+### Extensions
+
+The main thing here is to figure out if folks could extend from the outside
+with their own code, or if we need to maintain it all here.
+Regardless, it is essential for the launch of `micromark-rs` that extensions
+are theoretically or practically possible.
+The extensions below are listed from most important to least important.
+
+- [ ] (1) frontmatter (yaml, toml) (flow)
+ — [`micromark-extension-frontmatter`](https://github.com/micromark/micromark-extension-frontmatter)
+- [ ] (3) autolink literal (GFM) (text)
+ — [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal)
+- [ ] (3) footnote (GFM) (content, text)
+ — [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote)
+- [ ] (3) strikethrough (GFM) (text)
+ — [`micromark-extension-gfm-strikethrough`](https://github.com/micromark/micromark-extension-gfm-strikethrough)
+- [ ] (5) table (GFM) (flow)
+ — [`micromark-extension-gfm-table`](https://github.com/micromark/micromark-extension-gfm-table)
+- [ ] (1) task list item (GFM) (text)
+ — [`micromark-extension-gfm-task-list-item`](https://github.com/micromark/micromark-extension-gfm-task-list-item)
+- [ ] (3) math (flow, text)
+ — [`micromark-extension-math`](https://github.com/micromark/micromark-extension-math)
+- [ ] (8) directive (flow, text)
+ — [`micromark-extension-directive`](https://github.com/micromark/micromark-extension-directive)
+- [ ] (8) expression (MDX) (flow, text)
+ — [`micromark-extension-mdx-expression`](https://github.com/micromark/micromark-extension-mdx-expression)
+- [ ] (5) JSX (MDX) (flow, text)
+ — [`micromark-extension-mdx-jsx`](https://github.com/micromark/micromark-extension-mdx-jsx)
+- [ ] (3) ESM (MDX) (flow)
+ — [`micromark-extension-mdxjs-esm`](https://github.com/micromark/micromark-extension-mdxjs-esm)
+- [ ] (1) tagfilter (GFM) (n/a, renderer)
+ — [`micromark-extension-gfm-tagfilter`](https://github.com/micromark/micromark-extension-gfm-tagfilter)
diff --git a/src/compiler.rs b/src/compiler.rs
new file mode 100644
index 0000000..166950e
--- /dev/null
+++ b/src/compiler.rs
@@ -0,0 +1,367 @@
+//! Turn events into a string of HTML.
+use crate::construct::character_reference::Kind as CharacterReferenceKind;
+use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::util::{
+ decode_named_character_reference, decode_numeric_character_reference, encode, get_span,
+ slice_serialize,
+};
+
+/// Configuration (optional).
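+///
+/// Pass it to `micromark_with_options` (see `examples/lib.rs`); a minimal
+/// sketch:
+///
+/// ```rust,ignore
+/// let html = micromark_with_options(
+///     "<i>hi</i>",
+///     &CompileOptions { allow_dangerous_html: true },
+/// );
+/// ```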
+#[derive(Default, Debug)]
+pub struct CompileOptions {
+ /// Whether to allow (dangerous) HTML.
+    /// The default is `false`; you can set it to `true` for trusted
+    /// content.
+ pub allow_dangerous_html: bool,
+}
+
+/// Turn events and codes into a string of HTML.
+#[allow(clippy::too_many_lines)]
+pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> String {
+ let mut index = 0;
+ // let mut last_was_tag = false;
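+    // Stack of output buffers: `buffer` pushes a new one to capture output
+    // (such as the fence info string) and `resume` pops and concatenates it.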
+ let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]];
+ let mut atx_opening_sequence_size: Option<usize> = None;
+ let mut atx_heading_buffer: Option<String> = None;
+ let mut code_flow_seen_data: Option<bool> = None;
+ let mut code_fenced_fences_count: Option<usize> = None;
+ let mut slurp_one_line_ending = false;
+ let mut ignore_encode = false;
+ let mut character_reference_kind: Option<CharacterReferenceKind> = None;
+ // let mut slurp_all_line_endings = false;
+
+ println!("events: {:#?}", events);
+
+ while index < events.len() {
+ let event = &events[index];
+ let token_type = &event.token_type;
+
+ match event.event_type {
+ EventType::Enter => match token_type {
+ TokenType::Content => {
+ buf_tail_mut(buffers).push("<p>".to_string());
+ }
+ TokenType::CodeIndented => {
+ code_flow_seen_data = Some(false);
+ line_ending_if_needed(buffers);
+ buf_tail_mut(buffers).push("<pre><code>".to_string());
+ }
+ TokenType::CodeFenced => {
+ code_flow_seen_data = Some(false);
+ line_ending_if_needed(buffers);
+ // Note: no `>`, which is added later.
+ buf_tail_mut(buffers).push("<pre><code".to_string());
+ code_fenced_fences_count = Some(0);
+ }
+ TokenType::CodeFencedFenceInfo | TokenType::CodeFencedFenceMeta => {
+ buffer(buffers);
+ }
+ TokenType::HtmlFlow => {
+ line_ending_if_needed(buffers);
+ if options.allow_dangerous_html {
+ ignore_encode = true;
+ }
+ }
+ TokenType::ContentPhrasing
+ | TokenType::AtxHeading
+ | TokenType::AtxHeadingSequence
+ | TokenType::AtxHeadingWhitespace
+ | TokenType::AtxHeadingText
+ | TokenType::LineEnding
+ | TokenType::ThematicBreak
+ | TokenType::ThematicBreakSequence
+ | TokenType::ThematicBreakWhitespace
+ | TokenType::CodeIndentedPrefixWhitespace
+ | TokenType::CodeFlowChunk
+ | TokenType::BlankLineEnding
+ | TokenType::BlankLineWhitespace
+ | TokenType::Whitespace
+ | TokenType::HtmlFlowData
+ | TokenType::CodeFencedFence
+ | TokenType::CodeFencedFenceSequence
+ | TokenType::ChunkString
+ | TokenType::CodeFencedFenceWhitespace
+ | TokenType::Data
+ | TokenType::CharacterEscape
+ | TokenType::CharacterEscapeMarker
+ | TokenType::CharacterEscapeValue
+ | TokenType::CharacterReference
+ | TokenType::CharacterReferenceMarker
+ | TokenType::CharacterReferenceMarkerNumeric
+ | TokenType::CharacterReferenceMarkerHexadecimal
+ | TokenType::CharacterReferenceMarkerSemi
+ | TokenType::CharacterReferenceValue => {}
+ #[allow(unreachable_patterns)]
+ _ => {
+ unreachable!("unhandled `enter` of TokenType {:?}", token_type)
+ }
+ },
+ EventType::Exit => match token_type {
+ TokenType::ThematicBreakSequence
+ | TokenType::ThematicBreakWhitespace
+ | TokenType::CodeIndentedPrefixWhitespace
+ | TokenType::BlankLineEnding
+ | TokenType::BlankLineWhitespace
+ | TokenType::Whitespace
+ | TokenType::CodeFencedFenceSequence
+ | TokenType::CodeFencedFenceWhitespace
+ | TokenType::CharacterEscape
+ | TokenType::CharacterEscapeMarker
+ | TokenType::CharacterReference
+ | TokenType::CharacterReferenceMarkerSemi => {}
+ TokenType::HtmlFlow => {
+ ignore_encode = false;
+ }
+ TokenType::HtmlFlowData => {
+ let slice = slice_serialize(codes, &get_span(events, index), false);
+
+ let res = if ignore_encode { slice } else { encode(&slice) };
+
+ // last_was_tag = false;
+ buf_tail_mut(buffers).push(res);
+ }
+ TokenType::Content => {
+ buf_tail_mut(buffers).push("</p>".to_string());
+ }
+ TokenType::CodeIndented | TokenType::CodeFenced => {
+ let seen_data =
+ code_flow_seen_data.expect("`code_flow_seen_data` must be defined");
+
+ // To do: containers.
+ // One special case is if we are inside a container, and the fenced code was
+ // not closed (meaning it runs to the end).
+ // In that case, the following line ending, is considered *outside* the
+ // fenced code and block quote by micromark, but CM wants to treat that
+ // ending as part of the code.
+ // if fenced_count != None && fenced_count < 2 && tightStack.length > 0 && !last_was_tag {
+ // line_ending();
+ // }
+
+ // But in most cases, it’s simpler: when we’ve seen some data, emit an extra
+ // line ending when needed.
+ if seen_data {
+ line_ending_if_needed(buffers);
+ }
+
+ buf_tail_mut(buffers).push("</code></pre>".to_string());
+
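+                    // Fewer than two fences means the closing fence was
+                    // absent, so the code ran to the end: add a line ending.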
+ if let Some(count) = code_fenced_fences_count {
+ if count < 2 {
+ line_ending_if_needed(buffers);
+ }
+ }
+
+ code_flow_seen_data = None;
+ code_fenced_fences_count = None;
+ slurp_one_line_ending = false;
+ }
+ TokenType::CodeFencedFence => {
+ let count = if let Some(count) = code_fenced_fences_count {
+ count
+ } else {
+ 0
+ };
+
+ if count == 0 {
+ buf_tail_mut(buffers).push(">".to_string());
+ // tag = true;
+ slurp_one_line_ending = true;
+ }
+
+ code_fenced_fences_count = Some(count + 1);
+ }
+ TokenType::CodeFencedFenceInfo => {
+ let value = resume(buffers);
+ buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value));
+ // tag = true;
+ }
+ TokenType::CodeFencedFenceMeta => {
+ resume(buffers);
+ }
+ TokenType::CodeFlowChunk => {
+ code_flow_seen_data = Some(true);
+ buf_tail_mut(buffers).push(encode(&slice_serialize(
+ codes,
+ &get_span(events, index),
+ false,
+ )));
+ }
+ // `AtxHeadingWhitespace` is ignored after the opening sequence,
+ // before the closing sequence, and after the closing sequence.
+ // But it is used around intermediate sequences.
+ // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`.
+ // `AtxHeadingSequence` is ignored as the opening and closing sequence,
+ // but not when intermediate.
+ TokenType::AtxHeadingWhitespace | TokenType::AtxHeadingSequence => {
+ if let Some(buf) = atx_heading_buffer {
+ atx_heading_buffer = Some(
+ buf.to_string()
+ + &encode(&slice_serialize(codes, &get_span(events, index), false)),
+ );
+ }
+
+                    // First sequence we see.
+ if None == atx_opening_sequence_size {
+ let rank = slice_serialize(codes, &get_span(events, index), false).len();
+ atx_opening_sequence_size = Some(rank);
+ buf_tail_mut(buffers).push(format!("<h{}>", rank));
+ }
+ }
+ TokenType::AtxHeadingText => {
+ println!("text: {:?}", atx_heading_buffer);
+ if let Some(ref buf) = atx_heading_buffer {
+ if !buf.is_empty() {
+ buf_tail_mut(buffers).push(encode(buf));
+ atx_heading_buffer = Some("".to_string());
+ }
+ } else {
+ atx_heading_buffer = Some("".to_string());
+ }
+
+ let slice = encode(&slice_serialize(codes, &get_span(events, index), false));
+ println!("slice: {:?}", slice);
+ buf_tail_mut(buffers).push(slice);
+ }
+ TokenType::AtxHeading => {
+ let rank = atx_opening_sequence_size
+ .expect("`atx_opening_sequence_size` must be set in headings");
+ buf_tail_mut(buffers).push(format!("</h{}>", rank));
+ atx_opening_sequence_size = None;
+ atx_heading_buffer = None;
+ }
+ TokenType::ThematicBreak => {
+ buf_tail_mut(buffers).push("<hr />".to_string());
+ }
+ TokenType::LineEnding => {
+ // if slurp_all_line_endings {
+ // // Empty.
+ // } else
+ if slurp_one_line_ending {
+ slurp_one_line_ending = false;
+ // } else if code_text_inside {
+ // buf_tail_mut(buffers).push(" ".to_string());
+ } else {
+ buf_tail_mut(buffers).push(encode(&slice_serialize(
+ codes,
+ &get_span(events, index),
+ false,
+ )));
+ }
+ }
+ TokenType::CharacterReferenceMarker => {
+ character_reference_kind = Some(CharacterReferenceKind::Named);
+ }
+ TokenType::CharacterReferenceMarkerNumeric => {
+ character_reference_kind = Some(CharacterReferenceKind::Decimal);
+ }
+ TokenType::CharacterReferenceMarkerHexadecimal => {
+ character_reference_kind = Some(CharacterReferenceKind::Hexadecimal);
+ }
+ TokenType::CharacterReferenceValue => {
+ let kind = character_reference_kind
+ .expect("expected `character_reference_kind` to be set");
+ let reference = slice_serialize(codes, &get_span(events, index), false);
+ let ref_string = reference.as_str();
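+                    // Decode based on the marker seen earlier: `&#x…;` is
+                    // hexadecimal, `&#…;` is decimal, and `&…;` is named.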
+ let value = match kind {
+ CharacterReferenceKind::Decimal => {
+ decode_numeric_character_reference(ref_string, 10).to_string()
+ }
+ CharacterReferenceKind::Hexadecimal => {
+ decode_numeric_character_reference(ref_string, 16).to_string()
+ }
+ CharacterReferenceKind::Named => {
+ decode_named_character_reference(ref_string)
+ }
+ };
+
+ buf_tail_mut(buffers).push(value);
+
+ character_reference_kind = None;
+ }
+ // To do: `ContentPhrasing` should be parsed as phrasing first.
+ // This branch below currently acts as the resulting `data` tokens.
+ TokenType::ContentPhrasing
+ // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported.
+ | TokenType::ChunkString
+ | TokenType::Data
+ | TokenType::CharacterEscapeValue => {
+ // last_was_tag = false;
+ buf_tail_mut(buffers).push(encode(&slice_serialize(
+ codes,
+ &get_span(events, index),
+ false,
+ )));
+ }
+ #[allow(unreachable_patterns)]
+ _ => {
+ unreachable!("unhandled `exit` of TokenType {:?}", token_type)
+ }
+ },
+ }
+
+ index += 1;
+ }
+
+ assert!(buffers.len() == 1, "expected 1 final buffer");
+ buffers.get(0).expect("expected 1 final buffer").concat()
+}
+
+/// Push a buffer.
+fn buffer(buffers: &mut Vec<Vec<String>>) {
+ buffers.push(vec![]);
+}
+
+/// Pop a buffer, returning its value.
+fn resume(buffers: &mut Vec<Vec<String>>) -> String {
+ let buf = buffers.pop().expect("Cannot resume w/o buffer");
+ buf.concat()
+}
+
+/// Get the last chunk of the current buffer.
+fn buf_tail_slice(buffers: &mut [Vec<String>]) -> Option<&String> {
+ let tail = buf_tail(buffers);
+ tail.last()
+}
+
+/// Get the current buffer, mutably.
+fn buf_tail_mut(buffers: &mut [Vec<String>]) -> &mut Vec<String> {
+ buffers
+ .last_mut()
+ .expect("at least one buffer should exist")
+}
+
+/// Get the current buffer.
+fn buf_tail(buffers: &mut [Vec<String>]) -> &Vec<String> {
+ buffers.last().expect("at least one buffer should exist")
+}
+
+/// Add a line ending.
+fn line_ending(buffers: &mut [Vec<String>]) {
+ let tail = buf_tail_mut(buffers);
+ // To do: use inferred line ending style.
+ // lastWasTag = false
+ tail.push("\n".to_string());
+}
+
+/// Add a line ending if needed (as in, there’s no eol/eof already).
+fn line_ending_if_needed(buffers: &mut [Vec<String>]) {
+ let slice = buf_tail_slice(buffers);
+ let last_char = if let Some(x) = slice {
+ x.chars().last()
+ } else {
+ None
+ };
+ let mut add = true;
+
+ if let Some(x) = last_char {
+ if x == '\n' || x == '\r' {
+ add = false;
+ }
+ } else {
+ add = false;
+ }
+
+ if add {
+ line_ending(buffers);
+ }
+}
diff --git a/src/constant.rs b/src/constant.rs
new file mode 100644
index 0000000..332fdaf
--- /dev/null
+++ b/src/constant.rs
@@ -0,0 +1,2561 @@
+//! Constants needed to parse markdown.
+//!
+//! Most of these constants are magic numbers, such as the number of markers
+//! needed to parse [code (fenced)][code_fenced]
+//! ([`CODE_FENCED_SEQUENCE_SIZE_MIN`][]) or the max number of allowed markers
+//! in a [heading (atx)][heading_atx]
+//! ([`HEADING_ATX_OPENING_FENCE_SIZE_MAX`][]).
+//!
+//! Some constants are instead lists of things, such as the list of tag names
+//! considered in the **raw** production of [HTML (flow)][html_flow]
+//! ([`HTML_RAW_NAMES`][]), or the list of allowed named character references
+//! ([`CHARACTER_REFERENCE_NAMES`][]).
+//!
+//! [code_fenced]: crate::construct::code_fenced
+//! [heading_atx]: crate::construct::heading_atx
+//! [html_flow]: crate::construct::html_flow
+
+/// The number of characters that form a tab stop.
+///
+/// This relates to the number of whitespace characters needed to form certain
+/// constructs in markdown, most notably the whitespace required to form
+/// [code (indented)][code_indented].
+///
+/// <!-- To do: link to somewhere that discusses virtual spaces. -->
+/// <!-- Ref: https://github.com/syntax-tree/mdast-util-to-markdown/issues/51 -->
+///
+/// [code_indented]: crate::construct::code_indented
+pub const TAB_SIZE: usize = 4;
+
+/// The number of markers needed for a [thematic break][thematic_break] to form.
+///
+/// Like many things in markdown, the number is `3`.
+///
+/// [thematic_break]: crate::construct::thematic_break
+pub const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3;
+
+/// The max number of markers allowed to form a [heading (atx)][heading_atx].
+///
+/// This limitation is imposed by HTML, which imposes a max heading rank of
+/// `6`.
+///
+/// [heading_atx]: crate::construct::heading_atx
+pub const HEADING_ATX_OPENING_FENCE_SIZE_MAX: usize = 6;
+
+/// The number of markers needed for [code (fenced)][code_fenced] to form.
+///
+/// Like many things in markdown, the number is `3`.
+///
+/// [code_fenced]: crate::construct::code_fenced
+pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
+
+/// List of HTML tag names that form the **raw** production of
+/// [HTML (flow)][html_flow].
+///
+/// The **raw** production allows blank lines and thus no interleaving with
+/// markdown.
+/// Tag name matching must be performed case-insensitively, and thus this list
+/// includes lowercase tag names.
+///
+/// The length of the longest tag name is also stored as a constant in
+/// [`HTML_RAW_SIZE_MAX`][].
+///
+/// > 👉 **Note**: `textarea` was added in `CommonMark@0.30`.
+///
+/// ## References
+///
+/// * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+///
+/// [html_flow]: crate::construct::html_flow
+pub const HTML_RAW_NAMES: [&str; 4] = ["pre", "script", "style", "textarea"];
+
+/// The length of the longest tag name in [`HTML_RAW_NAMES`][].
+///
+/// This is currently the length of `textarea`.
+pub const HTML_RAW_SIZE_MAX: usize = 8;
+
+/// List of HTML tag names that form the **basic** production of
+/// [HTML (flow)][html_flow].
+///
+/// The **basic** production allows interleaving HTML and markdown with blank lines
+/// and allows flow (block) elements to interrupt content.
+/// Tag name matching must be performed case-insensitively, and thus this list
+/// includes lowercase tag names.
+///
+/// Tag names not on this list result in the **complete** production.
+///
+/// > 👉 **Note**: `source` was removed on `main` of the `CommonMark` spec and
+/// > is slated to be released in `CommonMark@0.31`.
+///
+/// ## References
+///
+/// * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+/// * [*Remove source element as HTML block start condition* as `commonmark/commonmark-spec#710`](https://github.com/commonmark/commonmark-spec/pull/710)
+///
+/// [html_flow]: crate::construct::html_flow
+pub const HTML_BLOCK_NAMES: [&str; 61] = [
+ "address",
+ "article",
+ "aside",
+ "base",
+ "basefont",
+ "blockquote",
+ "body",
+ "caption",
+ "center",
+ "col",
+ "colgroup",
+ "dd",
+ "details",
+ "dialog",
+ "dir",
+ "div",
+ "dl",
+ "dt",
+ "fieldset",
+ "figcaption",
+ "figure",
+ "footer",
+ "form",
+ "frame",
+ "frameset",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "head",
+ "header",
+ "hr",
+ "html",
+ "iframe",
+ "legend",
+ "li",
+ "link",
+ "main",
+ "menu",
+ "menuitem",
+ "nav",
+ "noframes",
+ "ol",
+ "optgroup",
+ "option",
+ "p",
+ "param",
+ "section",
+ "summary",
+ "table",
+ "tbody",
+ "td",
+ "tfoot",
+ "th",
+ "thead",
+ "title",
+ "tr",
+ "track",
+ "ul",
+];
+
+/// The max number of characters in a hexadecimal numeric
+/// [character reference][character_reference].
+///
+/// To illustrate, this allows `&#xff9999;` and disallows `&#xff99990;`.
+/// This limit is imposed because all bigger numbers are invalid.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX: usize = 6;
+
+/// The max number of characters in a decimal numeric
+/// [character reference][character_reference].
+///
+/// To illustrate, this allows `&#9999999;` and disallows `&#99999990;`.
+/// This limit is imposed because all bigger numbers are invalid.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_DECIMAL_SIZE_MAX: usize = 7;
+
+/// The max number of characters in a named
+/// [character reference][character_reference].
+///
+/// This is the length of the longest name in [`CHARACTER_REFERENCE_NAMES`][].
+/// It allows `&CounterClockwiseContourIntegral;` and prevents the parser from
+/// continuing for eons.
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_NAMED_SIZE_MAX: usize = 31;
+
+/// List of names that can form a named
+/// [character reference][character_reference].
+///
+/// This list is case-sensitive.
+///
+/// The length of the longest name (`CounterClockwiseContourIntegral`) is also
+/// stored as a constant in [`CHARACTER_REFERENCE_NAMED_SIZE_MAX`][].
+///
+/// The corresponding values of this list are stored in
+/// [`CHARACTER_REFERENCE_VALUES`][].
+/// They correspond through their index.
+///
+/// ## References
+///
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_NAMES: [&str; 2222] = [
+ "AEli",
+ "AElig",
+ "AM",
+ "AMP",
+ "Aacut",
+ "Aacute",
+ "Abreve",
+ "Acir",
+ "Acirc",
+ "Acy",
+ "Afr",
+ "Agrav",
+ "Agrave",
+ "Alpha",
+ "Amacr",
+ "And",
+ "Aogon",
+ "Aopf",
+ "ApplyFunction",
+ "Arin",
+ "Aring",
+ "Ascr",
+ "Assign",
+ "Atild",
+ "Atilde",
+ "Aum",
+ "Auml",
+ "Backslash",
+ "Barv",
+ "Barwed",
+ "Bcy",
+ "Because",
+ "Bernoullis",
+ "Beta",
+ "Bfr",
+ "Bopf",
+ "Breve",
+ "Bscr",
+ "Bumpeq",
+ "CHcy",
+ "COP",
+ "COPY",
+ "Cacute",
+ "Cap",
+ "CapitalDifferentialD",
+ "Cayleys",
+ "Ccaron",
+ "Ccedi",
+ "Ccedil",
+ "Ccirc",
+ "Cconint",
+ "Cdot",
+ "Cedilla",
+ "CenterDot",
+ "Cfr",
+ "Chi",
+ "CircleDot",
+ "CircleMinus",
+ "CirclePlus",
+ "CircleTimes",
+ "ClockwiseContourIntegral",
+ "CloseCurlyDoubleQuote",
+ "CloseCurlyQuote",
+ "Colon",
+ "Colone",
+ "Congruent",
+ "Conint",
+ "ContourIntegral",
+ "Copf",
+ "Coproduct",
+ "CounterClockwiseContourIntegral",
+ "Cross",
+ "Cscr",
+ "Cup",
+ "CupCap",
+ "DD",
+ "DDotrahd",
+ "DJcy",
+ "DScy",
+ "DZcy",
+ "Dagger",
+ "Darr",
+ "Dashv",
+ "Dcaron",
+ "Dcy",
+ "Del",
+ "Delta",
+ "Dfr",
+ "DiacriticalAcute",
+ "DiacriticalDot",
+ "DiacriticalDoubleAcute",
+ "DiacriticalGrave",
+ "DiacriticalTilde",
+ "Diamond",
+ "DifferentialD",
+ "Dopf",
+ "Dot",
+ "DotDot",
+ "DotEqual",
+ "DoubleContourIntegral",
+ "DoubleDot",
+ "DoubleDownArrow",
+ "DoubleLeftArrow",
+ "DoubleLeftRightArrow",
+ "DoubleLeftTee",
+ "DoubleLongLeftArrow",
+ "DoubleLongLeftRightArrow",
+ "DoubleLongRightArrow",
+ "DoubleRightArrow",
+ "DoubleRightTee",
+ "DoubleUpArrow",
+ "DoubleUpDownArrow",
+ "DoubleVerticalBar",
+ "DownArrow",
+ "DownArrowBar",
+ "DownArrowUpArrow",
+ "DownBreve",
+ "DownLeftRightVector",
+ "DownLeftTeeVector",
+ "DownLeftVector",
+ "DownLeftVectorBar",
+ "DownRightTeeVector",
+ "DownRightVector",
+ "DownRightVectorBar",
+ "DownTee",
+ "DownTeeArrow",
+ "Downarrow",
+ "Dscr",
+ "Dstrok",
+ "ENG",
+ "ET",
+ "ETH",
+ "Eacut",
+ "Eacute",
+ "Ecaron",
+ "Ecir",
+ "Ecirc",
+ "Ecy",
+ "Edot",
+ "Efr",
+ "Egrav",
+ "Egrave",
+ "Element",
+ "Emacr",
+ "EmptySmallSquare",
+ "EmptyVerySmallSquare",
+ "Eogon",
+ "Eopf",
+ "Epsilon",
+ "Equal",
+ "EqualTilde",
+ "Equilibrium",
+ "Escr",
+ "Esim",
+ "Eta",
+ "Eum",
+ "Euml",
+ "Exists",
+ "ExponentialE",
+ "Fcy",
+ "Ffr",
+ "FilledSmallSquare",
+ "FilledVerySmallSquare",
+ "Fopf",
+ "ForAll",
+ "Fouriertrf",
+ "Fscr",
+ "GJcy",
+ "G",
+ "GT",
+ "Gamma",
+ "Gammad",
+ "Gbreve",
+ "Gcedil",
+ "Gcirc",
+ "Gcy",
+ "Gdot",
+ "Gfr",
+ "Gg",
+ "Gopf",
+ "GreaterEqual",
+ "GreaterEqualLess",
+ "GreaterFullEqual",
+ "GreaterGreater",
+ "GreaterLess",
+ "GreaterSlantEqual",
+ "GreaterTilde",
+ "Gscr",
+ "Gt",
+ "HARDcy",
+ "Hacek",
+ "Hat",
+ "Hcirc",
+ "Hfr",
+ "HilbertSpace",
+ "Hopf",
+ "HorizontalLine",
+ "Hscr",
+ "Hstrok",
+ "HumpDownHump",
+ "HumpEqual",
+ "IEcy",
+ "IJlig",
+ "IOcy",
+ "Iacut",
+ "Iacute",
+ "Icir",
+ "Icirc",
+ "Icy",
+ "Idot",
+ "Ifr",
+ "Igrav",
+ "Igrave",
+ "Im",
+ "Imacr",
+ "ImaginaryI",
+ "Implies",
+ "Int",
+ "Integral",
+ "Intersection",
+ "InvisibleComma",
+ "InvisibleTimes",
+ "Iogon",
+ "Iopf",
+ "Iota",
+ "Iscr",
+ "Itilde",
+ "Iukcy",
+ "Ium",
+ "Iuml",
+ "Jcirc",
+ "Jcy",
+ "Jfr",
+ "Jopf",
+ "Jscr",
+ "Jsercy",
+ "Jukcy",
+ "KHcy",
+ "KJcy",
+ "Kappa",
+ "Kcedil",
+ "Kcy",
+ "Kfr",
+ "Kopf",
+ "Kscr",
+ "LJcy",
+ "L",
+ "LT",
+ "Lacute",
+ "Lambda",
+ "Lang",
+ "Laplacetrf",
+ "Larr",
+ "Lcaron",
+ "Lcedil",
+ "Lcy",
+ "LeftAngleBracket",
+ "LeftArrow",
+ "LeftArrowBar",
+ "LeftArrowRightArrow",
+ "LeftCeiling",
+ "LeftDoubleBracket",
+ "LeftDownTeeVector",
+ "LeftDownVector",
+ "LeftDownVectorBar",
+ "LeftFloor",
+ "LeftRightArrow",
+ "LeftRightVector",
+ "LeftTee",
+ "LeftTeeArrow",
+ "LeftTeeVector",
+ "LeftTriangle",
+ "LeftTriangleBar",
+ "LeftTriangleEqual",
+ "LeftUpDownVector",
+ "LeftUpTeeVector",
+ "LeftUpVector",
+ "LeftUpVectorBar",
+ "LeftVector",
+ "LeftVectorBar",
+ "Leftarrow",
+ "Leftrightarrow",
+ "LessEqualGreater",
+ "LessFullEqual",
+ "LessGreater",
+ "LessLess",
+ "LessSlantEqual",
+ "LessTilde",
+ "Lfr",
+ "Ll",
+ "Lleftarrow",
+ "Lmidot",
+ "LongLeftArrow",
+ "LongLeftRightArrow",
+ "LongRightArrow",
+ "Longleftarrow",
+ "Longleftrightarrow",
+ "Longrightarrow",
+ "Lopf",
+ "LowerLeftArrow",
+ "LowerRightArrow",
+ "Lscr",
+ "Lsh",
+ "Lstrok",
+ "Lt",
+ "Map",
+ "Mcy",
+ "MediumSpace",
+ "Mellintrf",
+ "Mfr",
+ "MinusPlus",
+ "Mopf",
+ "Mscr",
+ "Mu",
+ "NJcy",
+ "Nacute",
+ "Ncaron",
+ "Ncedil",
+ "Ncy",
+ "NegativeMediumSpace",
+ "NegativeThickSpace",
+ "NegativeThinSpace",
+ "NegativeVeryThinSpace",
+ "NestedGreaterGreater",
+ "NestedLessLess",
+ "NewLine",
+ "Nfr",
+ "NoBreak",
+ "NonBreakingSpace",
+ "Nopf",
+ "Not",
+ "NotCongruent",
+ "NotCupCap",
+ "NotDoubleVerticalBar",
+ "NotElement",
+ "NotEqual",
+ "NotEqualTilde",
+ "NotExists",
+ "NotGreater",
+ "NotGreaterEqual",
+ "NotGreaterFullEqual",
+ "NotGreaterGreater",
+ "NotGreaterLess",
+ "NotGreaterSlantEqual",
+ "NotGreaterTilde",
+ "NotHumpDownHump",
+ "NotHumpEqual",
+ "NotLeftTriangle",
+ "NotLeftTriangleBar",
+ "NotLeftTriangleEqual",
+ "NotLess",
+ "NotLessEqual",
+ "NotLessGreater",
+ "NotLessLess",
+ "NotLessSlantEqual",
+ "NotLessTilde",
+ "NotNestedGreaterGreater",
+ "NotNestedLessLess",
+ "NotPrecedes",
+ "NotPrecedesEqual",
+ "NotPrecedesSlantEqual",
+ "NotReverseElement",
+ "NotRightTriangle",
+ "NotRightTriangleBar",
+ "NotRightTriangleEqual",
+ "NotSquareSubset",
+ "NotSquareSubsetEqual",
+ "NotSquareSuperset",
+ "NotSquareSupersetEqual",
+ "NotSubset",
+ "NotSubsetEqual",
+ "NotSucceeds",
+ "NotSucceedsEqual",
+ "NotSucceedsSlantEqual",
+ "NotSucceedsTilde",
+ "NotSuperset",
+ "NotSupersetEqual",
+ "NotTilde",
+ "NotTildeEqual",
+ "NotTildeFullEqual",
+ "NotTildeTilde",
+ "NotVerticalBar",
+ "Nscr",
+ "Ntild",
+ "Ntilde",
+ "Nu",
+ "OElig",
+ "Oacut",
+ "Oacute",
+ "Ocir",
+ "Ocirc",
+ "Ocy",
+ "Odblac",
+ "Ofr",
+ "Ograv",
+ "Ograve",
+ "Omacr",
+ "Omega",
+ "Omicron",
+ "Oopf",
+ "OpenCurlyDoubleQuote",
+ "OpenCurlyQuote",
+ "Or",
+ "Oscr",
+ "Oslas",
+ "Oslash",
+ "Otild",
+ "Otilde",
+ "Otimes",
+ "Oum",
+ "Ouml",
+ "OverBar",
+ "OverBrace",
+ "OverBracket",
+ "OverParenthesis",
+ "PartialD",
+ "Pcy",
+ "Pfr",
+ "Phi",
+ "Pi",
+ "PlusMinus",
+ "Poincareplane",
+ "Popf",
+ "Pr",
+ "Precedes",
+ "PrecedesEqual",
+ "PrecedesSlantEqual",
+ "PrecedesTilde",
+ "Prime",
+ "Product",
+ "Proportion",
+ "Proportional",
+ "Pscr",
+ "Psi",
+ "QUO",
+ "QUOT",
+ "Qfr",
+ "Qopf",
+ "Qscr",
+ "RBarr",
+ "RE",
+ "REG",
+ "Racute",
+ "Rang",
+ "Rarr",
+ "Rarrtl",
+ "Rcaron",
+ "Rcedil",
+ "Rcy",
+ "Re",
+ "ReverseElement",
+ "ReverseEquilibrium",
+ "ReverseUpEquilibrium",
+ "Rfr",
+ "Rho",
+ "RightAngleBracket",
+ "RightArrow",
+ "RightArrowBar",
+ "RightArrowLeftArrow",
+ "RightCeiling",
+ "RightDoubleBracket",
+ "RightDownTeeVector",
+ "RightDownVector",
+ "RightDownVectorBar",
+ "RightFloor",
+ "RightTee",
+ "RightTeeArrow",
+ "RightTeeVector",
+ "RightTriangle",
+ "RightTriangleBar",
+ "RightTriangleEqual",
+ "RightUpDownVector",
+ "RightUpTeeVector",
+ "RightUpVector",
+ "RightUpVectorBar",
+ "RightVector",
+ "RightVectorBar",
+ "Rightarrow",
+ "Ropf",
+ "RoundImplies",
+ "Rrightarrow",
+ "Rscr",
+ "Rsh",
+ "RuleDelayed",
+ "SHCHcy",
+ "SHcy",
+ "SOFTcy",
+ "Sacute",
+ "Sc",
+ "Scaron",
+ "Scedil",
+ "Scirc",
+ "Scy",
+ "Sfr",
+ "ShortDownArrow",
+ "ShortLeftArrow",
+ "ShortRightArrow",
+ "ShortUpArrow",
+ "Sigma",
+ "SmallCircle",
+ "Sopf",
+ "Sqrt",
+ "Square",
+ "SquareIntersection",
+ "SquareSubset",
+ "SquareSubsetEqual",
+ "SquareSuperset",
+ "SquareSupersetEqual",
+ "SquareUnion",
+ "Sscr",
+ "Star",
+ "Sub",
+ "Subset",
+ "SubsetEqual",
+ "Succeeds",
+ "SucceedsEqual",
+ "SucceedsSlantEqual",
+ "SucceedsTilde",
+ "SuchThat",
+ "Sum",
+ "Sup",
+ "Superset",
+ "SupersetEqual",
+ "Supset",
+ "THOR",
+ "THORN",
+ "TRADE",
+ "TSHcy",
+ "TScy",
+ "Tab",
+ "Tau",
+ "Tcaron",
+ "Tcedil",
+ "Tcy",
+ "Tfr",
+ "Therefore",
+ "Theta",
+ "ThickSpace",
+ "ThinSpace",
+ "Tilde",
+ "TildeEqual",
+ "TildeFullEqual",
+ "TildeTilde",
+ "Topf",
+ "TripleDot",
+ "Tscr",
+ "Tstrok",
+ "Uacut",
+ "Uacute",
+ "Uarr",
+ "Uarrocir",
+ "Ubrcy",
+ "Ubreve",
+ "Ucir",
+ "Ucirc",
+ "Ucy",
+ "Udblac",
+ "Ufr",
+ "Ugrav",
+ "Ugrave",
+ "Umacr",
+ "UnderBar",
+ "UnderBrace",
+ "UnderBracket",
+ "UnderParenthesis",
+ "Union",
+ "UnionPlus",
+ "Uogon",
+ "Uopf",
+ "UpArrow",
+ "UpArrowBar",
+ "UpArrowDownArrow",
+ "UpDownArrow",
+ "UpEquilibrium",
+ "UpTee",
+ "UpTeeArrow",
+ "Uparrow",
+ "Updownarrow",
+ "UpperLeftArrow",
+ "UpperRightArrow",
+ "Upsi",
+ "Upsilon",
+ "Uring",
+ "Uscr",
+ "Utilde",
+ "Uum",
+ "Uuml",
+ "VDash",
+ "Vbar",
+ "Vcy",
+ "Vdash",
+ "Vdashl",
+ "Vee",
+ "Verbar",
+ "Vert",
+ "VerticalBar",
+ "VerticalLine",
+ "VerticalSeparator",
+ "VerticalTilde",
+ "VeryThinSpace",
+ "Vfr",
+ "Vopf",
+ "Vscr",
+ "Vvdash",
+ "Wcirc",
+ "Wedge",
+ "Wfr",
+ "Wopf",
+ "Wscr",
+ "Xfr",
+ "Xi",
+ "Xopf",
+ "Xscr",
+ "YAcy",
+ "YIcy",
+ "YUcy",
+ "Yacut",
+ "Yacute",
+ "Ycirc",
+ "Ycy",
+ "Yfr",
+ "Yopf",
+ "Yscr",
+ "Yuml",
+ "ZHcy",
+ "Zacute",
+ "Zcaron",
+ "Zcy",
+ "Zdot",
+ "ZeroWidthSpace",
+ "Zeta",
+ "Zfr",
+ "Zopf",
+ "Zscr",
+ "aacut",
+ "aacute",
+ "abreve",
+ "ac",
+ "acE",
+ "acd",
+ "acir",
+ "acirc",
+ "acut",
+ "acute",
+ "acy",
+ "aeli",
+ "aelig",
+ "af",
+ "afr",
+ "agrav",
+ "agrave",
+ "alefsym",
+ "aleph",
+ "alpha",
+ "amacr",
+ "amalg",
+ "am",
+ "amp",
+ "and",
+ "andand",
+ "andd",
+ "andslope",
+ "andv",
+ "ang",
+ "ange",
+ "angle",
+ "angmsd",
+ "angmsdaa",
+ "angmsdab",
+ "angmsdac",
+ "angmsdad",
+ "angmsdae",
+ "angmsdaf",
+ "angmsdag",
+ "angmsdah",
+ "angrt",
+ "angrtvb",
+ "angrtvbd",
+ "angsph",
+ "angst",
+ "angzarr",
+ "aogon",
+ "aopf",
+ "ap",
+ "apE",
+ "apacir",
+ "ape",
+ "apid",
+ "apos",
+ "approx",
+ "approxeq",
+ "arin",
+ "aring",
+ "ascr",
+ "ast",
+ "asymp",
+ "asympeq",
+ "atild",
+ "atilde",
+ "aum",
+ "auml",
+ "awconint",
+ "awint",
+ "bNot",
+ "backcong",
+ "backepsilon",
+ "backprime",
+ "backsim",
+ "backsimeq",
+ "barvee",
+ "barwed",
+ "barwedge",
+ "bbrk",
+ "bbrktbrk",
+ "bcong",
+ "bcy",
+ "bdquo",
+ "becaus",
+ "because",
+ "bemptyv",
+ "bepsi",
+ "bernou",
+ "beta",
+ "beth",
+ "between",
+ "bfr",
+ "bigcap",
+ "bigcirc",
+ "bigcup",
+ "bigodot",
+ "bigoplus",
+ "bigotimes",
+ "bigsqcup",
+ "bigstar",
+ "bigtriangledown",
+ "bigtriangleup",
+ "biguplus",
+ "bigvee",
+ "bigwedge",
+ "bkarow",
+ "blacklozenge",
+ "blacksquare",
+ "blacktriangle",
+ "blacktriangledown",
+ "blacktriangleleft",
+ "blacktriangleright",
+ "blank",
+ "blk12",
+ "blk14",
+ "blk34",
+ "block",
+ "bne",
+ "bnequiv",
+ "bnot",
+ "bopf",
+ "bot",
+ "bottom",
+ "bowtie",
+ "boxDL",
+ "boxDR",
+ "boxDl",
+ "boxDr",
+ "boxH",
+ "boxHD",
+ "boxHU",
+ "boxHd",
+ "boxHu",
+ "boxUL",
+ "boxUR",
+ "boxUl",
+ "boxUr",
+ "boxV",
+ "boxVH",
+ "boxVL",
+ "boxVR",
+ "boxVh",
+ "boxVl",
+ "boxVr",
+ "boxbox",
+ "boxdL",
+ "boxdR",
+ "boxdl",
+ "boxdr",
+ "boxh",
+ "boxhD",
+ "boxhU",
+ "boxhd",
+ "boxhu",
+ "boxminus",
+ "boxplus",
+ "boxtimes",
+ "boxuL",
+ "boxuR",
+ "boxul",
+ "boxur",
+ "boxv",
+ "boxvH",
+ "boxvL",
+ "boxvR",
+ "boxvh",
+ "boxvl",
+ "boxvr",
+ "bprime",
+ "breve",
+ "brvba",
+ "brvbar",
+ "bscr",
+ "bsemi",
+ "bsim",
+ "bsime",
+ "bsol",
+ "bsolb",
+ "bsolhsub",
+ "bull",
+ "bullet",
+ "bump",
+ "bumpE",
+ "bumpe",
+ "bumpeq",
+ "cacute",
+ "cap",
+ "capand",
+ "capbrcup",
+ "capcap",
+ "capcup",
+ "capdot",
+ "caps",
+ "caret",
+ "caron",
+ "ccaps",
+ "ccaron",
+ "ccedi",
+ "ccedil",
+ "ccirc",
+ "ccups",
+ "ccupssm",
+ "cdot",
+ "cedi",
+ "cedil",
+ "cemptyv",
+ "cen",
+ "cent",
+ "centerdot",
+ "cfr",
+ "chcy",
+ "check",
+ "checkmark",
+ "chi",
+ "cir",
+ "cirE",
+ "circ",
+ "circeq",
+ "circlearrowleft",
+ "circlearrowright",
+ "circledR",
+ "circledS",
+ "circledast",
+ "circledcirc",
+ "circleddash",
+ "cire",
+ "cirfnint",
+ "cirmid",
+ "cirscir",
+ "clubs",
+ "clubsuit",
+ "colon",
+ "colone",
+ "coloneq",
+ "comma",
+ "commat",
+ "comp",
+ "compfn",
+ "complement",
+ "complexes",
+ "cong",
+ "congdot",
+ "conint",
+ "copf",
+ "coprod",
+ "cop",
+ "copy",
+ "copysr",
+ "crarr",
+ "cross",
+ "cscr",
+ "csub",
+ "csube",
+ "csup",
+ "csupe",
+ "ctdot",
+ "cudarrl",
+ "cudarrr",
+ "cuepr",
+ "cuesc",
+ "cularr",
+ "cularrp",
+ "cup",
+ "cupbrcap",
+ "cupcap",
+ "cupcup",
+ "cupdot",
+ "cupor",
+ "cups",
+ "curarr",
+ "curarrm",
+ "curlyeqprec",
+ "curlyeqsucc",
+ "curlyvee",
+ "curlywedge",
+ "curre",
+ "curren",
+ "curvearrowleft",
+ "curvearrowright",
+ "cuvee",
+ "cuwed",
+ "cwconint",
+ "cwint",
+ "cylcty",
+ "dArr",
+ "dHar",
+ "dagger",
+ "daleth",
+ "darr",
+ "dash",
+ "dashv",
+ "dbkarow",
+ "dblac",
+ "dcaron",
+ "dcy",
+ "dd",
+ "ddagger",
+ "ddarr",
+ "ddotseq",
+ "de",
+ "deg",
+ "delta",
+ "demptyv",
+ "dfisht",
+ "dfr",
+ "dharl",
+ "dharr",
+ "diam",
+ "diamond",
+ "diamondsuit",
+ "diams",
+ "die",
+ "digamma",
+ "disin",
+ "div",
+ "divid",
+ "divide",
+ "divideontimes",
+ "divonx",
+ "djcy",
+ "dlcorn",
+ "dlcrop",
+ "dollar",
+ "dopf",
+ "dot",
+ "doteq",
+ "doteqdot",
+ "dotminus",
+ "dotplus",
+ "dotsquare",
+ "doublebarwedge",
+ "downarrow",
+ "downdownarrows",
+ "downharpoonleft",
+ "downharpoonright",
+ "drbkarow",
+ "drcorn",
+ "drcrop",
+ "dscr",
+ "dscy",
+ "dsol",
+ "dstrok",
+ "dtdot",
+ "dtri",
+ "dtrif",
+ "duarr",
+ "duhar",
+ "dwangle",
+ "dzcy",
+ "dzigrarr",
+ "eDDot",
+ "eDot",
+ "eacut",
+ "eacute",
+ "easter",
+ "ecaron",
+ "ecir",
+ "ecirc",
+ "ecolon",
+ "ecy",
+ "edot",
+ "ee",
+ "efDot",
+ "efr",
+ "eg",
+ "egrav",
+ "egrave",
+ "egs",
+ "egsdot",
+ "el",
+ "elinters",
+ "ell",
+ "els",
+ "elsdot",
+ "emacr",
+ "empty",
+ "emptyset",
+ "emptyv",
+ "emsp13",
+ "emsp14",
+ "emsp",
+ "eng",
+ "ensp",
+ "eogon",
+ "eopf",
+ "epar",
+ "eparsl",
+ "eplus",
+ "epsi",
+ "epsilon",
+ "epsiv",
+ "eqcirc",
+ "eqcolon",
+ "eqsim",
+ "eqslantgtr",
+ "eqslantless",
+ "equals",
+ "equest",
+ "equiv",
+ "equivDD",
+ "eqvparsl",
+ "erDot",
+ "erarr",
+ "escr",
+ "esdot",
+ "esim",
+ "eta",
+ "et",
+ "eth",
+ "eum",
+ "euml",
+ "euro",
+ "excl",
+ "exist",
+ "expectation",
+ "exponentiale",
+ "fallingdotseq",
+ "fcy",
+ "female",
+ "ffilig",
+ "fflig",
+ "ffllig",
+ "ffr",
+ "filig",
+ "fjlig",
+ "flat",
+ "fllig",
+ "fltns",
+ "fnof",
+ "fopf",
+ "forall",
+ "fork",
+ "forkv",
+ "fpartint",
+ "frac1",
+ "frac12",
+ "frac13",
+ "frac14",
+ "frac15",
+ "frac16",
+ "frac18",
+ "frac23",
+ "frac25",
+ "frac3",
+ "frac34",
+ "frac35",
+ "frac38",
+ "frac45",
+ "frac56",
+ "frac58",
+ "frac78",
+ "frasl",
+ "frown",
+ "fscr",
+ "gE",
+ "gEl",
+ "gacute",
+ "gamma",
+ "gammad",
+ "gap",
+ "gbreve",
+ "gcirc",
+ "gcy",
+ "gdot",
+ "ge",
+ "gel",
+ "geq",
+ "geqq",
+ "geqslant",
+ "ges",
+ "gescc",
+ "gesdot",
+ "gesdoto",
+ "gesdotol",
+ "gesl",
+ "gesles",
+ "gfr",
+ "gg",
+ "ggg",
+ "gimel",
+ "gjcy",
+ "gl",
+ "glE",
+ "gla",
+ "glj",
+ "gnE",
+ "gnap",
+ "gnapprox",
+ "gne",
+ "gneq",
+ "gneqq",
+ "gnsim",
+ "gopf",
+ "grave",
+ "gscr",
+ "gsim",
+ "gsime",
+ "gsiml",
+ "g",
+ "gt",
+ "gtcc",
+ "gtcir",
+ "gtdot",
+ "gtlPar",
+ "gtquest",
+ "gtrapprox",
+ "gtrarr",
+ "gtrdot",
+ "gtreqless",
+ "gtreqqless",
+ "gtrless",
+ "gtrsim",
+ "gvertneqq",
+ "gvnE",
+ "hArr",
+ "hairsp",
+ "half",
+ "hamilt",
+ "hardcy",
+ "harr",
+ "harrcir",
+ "harrw",
+ "hbar",
+ "hcirc",
+ "hearts",
+ "heartsuit",
+ "hellip",
+ "hercon",
+ "hfr",
+ "hksearow",
+ "hkswarow",
+ "hoarr",
+ "homtht",
+ "hookleftarrow",
+ "hookrightarrow",
+ "hopf",
+ "horbar",
+ "hscr",
+ "hslash",
+ "hstrok",
+ "hybull",
+ "hyphen",
+ "iacut",
+ "iacute",
+ "ic",
+ "icir",
+ "icirc",
+ "icy",
+ "iecy",
+ "iexc",
+ "iexcl",
+ "iff",
+ "ifr",
+ "igrav",
+ "igrave",
+ "ii",
+ "iiiint",
+ "iiint",
+ "iinfin",
+ "iiota",
+ "ijlig",
+ "imacr",
+ "image",
+ "imagline",
+ "imagpart",
+ "imath",
+ "imof",
+ "imped",
+ "in",
+ "incare",
+ "infin",
+ "infintie",
+ "inodot",
+ "int",
+ "intcal",
+ "integers",
+ "intercal",
+ "intlarhk",
+ "intprod",
+ "iocy",
+ "iogon",
+ "iopf",
+ "iota",
+ "iprod",
+ "iques",
+ "iquest",
+ "iscr",
+ "isin",
+ "isinE",
+ "isindot",
+ "isins",
+ "isinsv",
+ "isinv",
+ "it",
+ "itilde",
+ "iukcy",
+ "ium",
+ "iuml",
+ "jcirc",
+ "jcy",
+ "jfr",
+ "jmath",
+ "jopf",
+ "jscr",
+ "jsercy",
+ "jukcy",
+ "kappa",
+ "kappav",
+ "kcedil",
+ "kcy",
+ "kfr",
+ "kgreen",
+ "khcy",
+ "kjcy",
+ "kopf",
+ "kscr",
+ "lAarr",
+ "lArr",
+ "lAtail",
+ "lBarr",
+ "lE",
+ "lEg",
+ "lHar",
+ "lacute",
+ "laemptyv",
+ "lagran",
+ "lambda",
+ "lang",
+ "langd",
+ "langle",
+ "lap",
+ "laqu",
+ "laquo",
+ "larr",
+ "larrb",
+ "larrbfs",
+ "larrfs",
+ "larrhk",
+ "larrlp",
+ "larrpl",
+ "larrsim",
+ "larrtl",
+ "lat",
+ "latail",
+ "late",
+ "lates",
+ "lbarr",
+ "lbbrk",
+ "lbrace",
+ "lbrack",
+ "lbrke",
+ "lbrksld",
+ "lbrkslu",
+ "lcaron",
+ "lcedil",
+ "lceil",
+ "lcub",
+ "lcy",
+ "ldca",
+ "ldquo",
+ "ldquor",
+ "ldrdhar",
+ "ldrushar",
+ "ldsh",
+ "le",
+ "leftarrow",
+ "leftarrowtail",
+ "leftharpoondown",
+ "leftharpoonup",
+ "leftleftarrows",
+ "leftrightarrow",
+ "leftrightarrows",
+ "leftrightharpoons",
+ "leftrightsquigarrow",
+ "leftthreetimes",
+ "leg",
+ "leq",
+ "leqq",
+ "leqslant",
+ "les",
+ "lescc",
+ "lesdot",
+ "lesdoto",
+ "lesdotor",
+ "lesg",
+ "lesges",
+ "lessapprox",
+ "lessdot",
+ "lesseqgtr",
+ "lesseqqgtr",
+ "lessgtr",
+ "lesssim",
+ "lfisht",
+ "lfloor",
+ "lfr",
+ "lg",
+ "lgE",
+ "lhard",
+ "lharu",
+ "lharul",
+ "lhblk",
+ "ljcy",
+ "ll",
+ "llarr",
+ "llcorner",
+ "llhard",
+ "lltri",
+ "lmidot",
+ "lmoust",
+ "lmoustache",
+ "lnE",
+ "lnap",
+ "lnapprox",
+ "lne",
+ "lneq",
+ "lneqq",
+ "lnsim",
+ "loang",
+ "loarr",
+ "lobrk",
+ "longleftarrow",
+ "longleftrightarrow",
+ "longmapsto",
+ "longrightarrow",
+ "looparrowleft",
+ "looparrowright",
+ "lopar",
+ "lopf",
+ "loplus",
+ "lotimes",
+ "lowast",
+ "lowbar",
+ "loz",
+ "lozenge",
+ "lozf",
+ "lpar",
+ "lparlt",
+ "lrarr",
+ "lrcorner",
+ "lrhar",
+ "lrhard",
+ "lrm",
+ "lrtri",
+ "lsaquo",
+ "lscr",
+ "lsh",
+ "lsim",
+ "lsime",
+ "lsimg",
+ "lsqb",
+ "lsquo",
+ "lsquor",
+ "lstrok",
+ "l",
+ "lt",
+ "ltcc",
+ "ltcir",
+ "ltdot",
+ "lthree",
+ "ltimes",
+ "ltlarr",
+ "ltquest",
+ "ltrPar",
+ "ltri",
+ "ltrie",
+ "ltrif",
+ "lurdshar",
+ "luruhar",
+ "lvertneqq",
+ "lvnE",
+ "mDDot",
+ "mac",
+ "macr",
+ "male",
+ "malt",
+ "maltese",
+ "map",
+ "mapsto",
+ "mapstodown",
+ "mapstoleft",
+ "mapstoup",
+ "marker",
+ "mcomma",
+ "mcy",
+ "mdash",
+ "measuredangle",
+ "mfr",
+ "mho",
+ "micr",
+ "micro",
+ "mid",
+ "midast",
+ "midcir",
+ "middo",
+ "middot",
+ "minus",
+ "minusb",
+ "minusd",
+ "minusdu",
+ "mlcp",
+ "mldr",
+ "mnplus",
+ "models",
+ "mopf",
+ "mp",
+ "mscr",
+ "mstpos",
+ "mu",
+ "multimap",
+ "mumap",
+ "nGg",
+ "nGt",
+ "nGtv",
+ "nLeftarrow",
+ "nLeftrightarrow",
+ "nLl",
+ "nLt",
+ "nLtv",
+ "nRightarrow",
+ "nVDash",
+ "nVdash",
+ "nabla",
+ "nacute",
+ "nang",
+ "nap",
+ "napE",
+ "napid",
+ "napos",
+ "napprox",
+ "natur",
+ "natural",
+ "naturals",
+ "nbs",
+ "nbsp",
+ "nbump",
+ "nbumpe",
+ "ncap",
+ "ncaron",
+ "ncedil",
+ "ncong",
+ "ncongdot",
+ "ncup",
+ "ncy",
+ "ndash",
+ "ne",
+ "neArr",
+ "nearhk",
+ "nearr",
+ "nearrow",
+ "nedot",
+ "nequiv",
+ "nesear",
+ "nesim",
+ "nexist",
+ "nexists",
+ "nfr",
+ "ngE",
+ "nge",
+ "ngeq",
+ "ngeqq",
+ "ngeqslant",
+ "nges",
+ "ngsim",
+ "ngt",
+ "ngtr",
+ "nhArr",
+ "nharr",
+ "nhpar",
+ "ni",
+ "nis",
+ "nisd",
+ "niv",
+ "njcy",
+ "nlArr",
+ "nlE",
+ "nlarr",
+ "nldr",
+ "nle",
+ "nleftarrow",
+ "nleftrightarrow",
+ "nleq",
+ "nleqq",
+ "nleqslant",
+ "nles",
+ "nless",
+ "nlsim",
+ "nlt",
+ "nltri",
+ "nltrie",
+ "nmid",
+ "nopf",
+ "no",
+ "not",
+ "notin",
+ "notinE",
+ "notindot",
+ "notinva",
+ "notinvb",
+ "notinvc",
+ "notni",
+ "notniva",
+ "notnivb",
+ "notnivc",
+ "npar",
+ "nparallel",
+ "nparsl",
+ "npart",
+ "npolint",
+ "npr",
+ "nprcue",
+ "npre",
+ "nprec",
+ "npreceq",
+ "nrArr",
+ "nrarr",
+ "nrarrc",
+ "nrarrw",
+ "nrightarrow",
+ "nrtri",
+ "nrtrie",
+ "nsc",
+ "nsccue",
+ "nsce",
+ "nscr",
+ "nshortmid",
+ "nshortparallel",
+ "nsim",
+ "nsime",
+ "nsimeq",
+ "nsmid",
+ "nspar",
+ "nsqsube",
+ "nsqsupe",
+ "nsub",
+ "nsubE",
+ "nsube",
+ "nsubset",
+ "nsubseteq",
+ "nsubseteqq",
+ "nsucc",
+ "nsucceq",
+ "nsup",
+ "nsupE",
+ "nsupe",
+ "nsupset",
+ "nsupseteq",
+ "nsupseteqq",
+ "ntgl",
+ "ntild",
+ "ntilde",
+ "ntlg",
+ "ntriangleleft",
+ "ntrianglelefteq",
+ "ntriangleright",
+ "ntrianglerighteq",
+ "nu",
+ "num",
+ "numero",
+ "numsp",
+ "nvDash",
+ "nvHarr",
+ "nvap",
+ "nvdash",
+ "nvge",
+ "nvgt",
+ "nvinfin",
+ "nvlArr",
+ "nvle",
+ "nvlt",
+ "nvltrie",
+ "nvrArr",
+ "nvrtrie",
+ "nvsim",
+ "nwArr",
+ "nwarhk",
+ "nwarr",
+ "nwarrow",
+ "nwnear",
+ "oS",
+ "oacut",
+ "oacute",
+ "oast",
+ "ocir",
+ "ocirc",
+ "ocy",
+ "odash",
+ "odblac",
+ "odiv",
+ "odot",
+ "odsold",
+ "oelig",
+ "ofcir",
+ "ofr",
+ "ogon",
+ "ograv",
+ "ograve",
+ "ogt",
+ "ohbar",
+ "ohm",
+ "oint",
+ "olarr",
+ "olcir",
+ "olcross",
+ "oline",
+ "olt",
+ "omacr",
+ "omega",
+ "omicron",
+ "omid",
+ "ominus",
+ "oopf",
+ "opar",
+ "operp",
+ "oplus",
+ "or",
+ "orarr",
+ "ord",
+ "order",
+ "orderof",
+ "ordf",
+ "ordm",
+ "origof",
+ "oror",
+ "orslope",
+ "orv",
+ "oscr",
+ "oslas",
+ "oslash",
+ "osol",
+ "otild",
+ "otilde",
+ "otimes",
+ "otimesas",
+ "oum",
+ "ouml",
+ "ovbar",
+ "par",
+ "para",
+ "parallel",
+ "parsim",
+ "parsl",
+ "part",
+ "pcy",
+ "percnt",
+ "period",
+ "permil",
+ "perp",
+ "pertenk",
+ "pfr",
+ "phi",
+ "phiv",
+ "phmmat",
+ "phone",
+ "pi",
+ "pitchfork",
+ "piv",
+ "planck",
+ "planckh",
+ "plankv",
+ "plus",
+ "plusacir",
+ "plusb",
+ "pluscir",
+ "plusdo",
+ "plusdu",
+ "pluse",
+ "plusm",
+ "plusmn",
+ "plussim",
+ "plustwo",
+ "pm",
+ "pointint",
+ "popf",
+ "poun",
+ "pound",
+ "pr",
+ "prE",
+ "prap",
+ "prcue",
+ "pre",
+ "prec",
+ "precapprox",
+ "preccurlyeq",
+ "preceq",
+ "precnapprox",
+ "precneqq",
+ "precnsim",
+ "precsim",
+ "prime",
+ "primes",
+ "prnE",
+ "prnap",
+ "prnsim",
+ "prod",
+ "profalar",
+ "profline",
+ "profsurf",
+ "prop",
+ "propto",
+ "prsim",
+ "prurel",
+ "pscr",
+ "psi",
+ "puncsp",
+ "qfr",
+ "qint",
+ "qopf",
+ "qprime",
+ "qscr",
+ "quaternions",
+ "quatint",
+ "quest",
+ "questeq",
+ "quo",
+ "quot",
+ "rAarr",
+ "rArr",
+ "rAtail",
+ "rBarr",
+ "rHar",
+ "race",
+ "racute",
+ "radic",
+ "raemptyv",
+ "rang",
+ "rangd",
+ "range",
+ "rangle",
+ "raqu",
+ "raquo",
+ "rarr",
+ "rarrap",
+ "rarrb",
+ "rarrbfs",
+ "rarrc",
+ "rarrfs",
+ "rarrhk",
+ "rarrlp",
+ "rarrpl",
+ "rarrsim",
+ "rarrtl",
+ "rarrw",
+ "ratail",
+ "ratio",
+ "rationals",
+ "rbarr",
+ "rbbrk",
+ "rbrace",
+ "rbrack",
+ "rbrke",
+ "rbrksld",
+ "rbrkslu",
+ "rcaron",
+ "rcedil",
+ "rceil",
+ "rcub",
+ "rcy",
+ "rdca",
+ "rdldhar",
+ "rdquo",
+ "rdquor",
+ "rdsh",
+ "real",
+ "realine",
+ "realpart",
+ "reals",
+ "rect",
+ "re",
+ "reg",
+ "rfisht",
+ "rfloor",
+ "rfr",
+ "rhard",
+ "rharu",
+ "rharul",
+ "rho",
+ "rhov",
+ "rightarrow",
+ "rightarrowtail",
+ "rightharpoondown",
+ "rightharpoonup",
+ "rightleftarrows",
+ "rightleftharpoons",
+ "rightrightarrows",
+ "rightsquigarrow",
+ "rightthreetimes",
+ "ring",
+ "risingdotseq",
+ "rlarr",
+ "rlhar",
+ "rlm",
+ "rmoust",
+ "rmoustache",
+ "rnmid",
+ "roang",
+ "roarr",
+ "robrk",
+ "ropar",
+ "ropf",
+ "roplus",
+ "rotimes",
+ "rpar",
+ "rpargt",
+ "rppolint",
+ "rrarr",
+ "rsaquo",
+ "rscr",
+ "rsh",
+ "rsqb",
+ "rsquo",
+ "rsquor",
+ "rthree",
+ "rtimes",
+ "rtri",
+ "rtrie",
+ "rtrif",
+ "rtriltri",
+ "ruluhar",
+ "rx",
+ "sacute",
+ "sbquo",
+ "sc",
+ "scE",
+ "scap",
+ "scaron",
+ "sccue",
+ "sce",
+ "scedil",
+ "scirc",
+ "scnE",
+ "scnap",
+ "scnsim",
+ "scpolint",
+ "scsim",
+ "scy",
+ "sdot",
+ "sdotb",
+ "sdote",
+ "seArr",
+ "searhk",
+ "searr",
+ "searrow",
+ "sec",
+ "sect",
+ "semi",
+ "seswar",
+ "setminus",
+ "setmn",
+ "sext",
+ "sfr",
+ "sfrown",
+ "sharp",
+ "shchcy",
+ "shcy",
+ "shortmid",
+ "shortparallel",
+ "sh",
+ "shy",
+ "sigma",
+ "sigmaf",
+ "sigmav",
+ "sim",
+ "simdot",
+ "sime",
+ "simeq",
+ "simg",
+ "simgE",
+ "siml",
+ "simlE",
+ "simne",
+ "simplus",
+ "simrarr",
+ "slarr",
+ "smallsetminus",
+ "smashp",
+ "smeparsl",
+ "smid",
+ "smile",
+ "smt",
+ "smte",
+ "smtes",
+ "softcy",
+ "sol",
+ "solb",
+ "solbar",
+ "sopf",
+ "spades",
+ "spadesuit",
+ "spar",
+ "sqcap",
+ "sqcaps",
+ "sqcup",
+ "sqcups",
+ "sqsub",
+ "sqsube",
+ "sqsubset",
+ "sqsubseteq",
+ "sqsup",
+ "sqsupe",
+ "sqsupset",
+ "sqsupseteq",
+ "squ",
+ "square",
+ "squarf",
+ "squf",
+ "srarr",
+ "sscr",
+ "ssetmn",
+ "ssmile",
+ "sstarf",
+ "star",
+ "starf",
+ "straightepsilon",
+ "straightphi",
+ "strns",
+ "sub",
+ "subE",
+ "subdot",
+ "sube",
+ "subedot",
+ "submult",
+ "subnE",
+ "subne",
+ "subplus",
+ "subrarr",
+ "subset",
+ "subseteq",
+ "subseteqq",
+ "subsetneq",
+ "subsetneqq",
+ "subsim",
+ "subsub",
+ "subsup",
+ "succ",
+ "succapprox",
+ "succcurlyeq",
+ "succeq",
+ "succnapprox",
+ "succneqq",
+ "succnsim",
+ "succsim",
+ "sum",
+ "sung",
+ "sup",
+ "sup1",
+ "sup2",
+ "sup3",
+ "supE",
+ "supdot",
+ "supdsub",
+ "supe",
+ "supedot",
+ "suphsol",
+ "suphsub",
+ "suplarr",
+ "supmult",
+ "supnE",
+ "supne",
+ "supplus",
+ "supset",
+ "supseteq",
+ "supseteqq",
+ "supsetneq",
+ "supsetneqq",
+ "supsim",
+ "supsub",
+ "supsup",
+ "swArr",
+ "swarhk",
+ "swarr",
+ "swarrow",
+ "swnwar",
+ "szli",
+ "szlig",
+ "target",
+ "tau",
+ "tbrk",
+ "tcaron",
+ "tcedil",
+ "tcy",
+ "tdot",
+ "telrec",
+ "tfr",
+ "there4",
+ "therefore",
+ "theta",
+ "thetasym",
+ "thetav",
+ "thickapprox",
+ "thicksim",
+ "thinsp",
+ "thkap",
+ "thksim",
+ "thor",
+ "thorn",
+ "tilde",
+ "time",
+ "times",
+ "timesb",
+ "timesbar",
+ "timesd",
+ "tint",
+ "toea",
+ "top",
+ "topbot",
+ "topcir",
+ "topf",
+ "topfork",
+ "tosa",
+ "tprime",
+ "trade",
+ "triangle",
+ "triangledown",
+ "triangleleft",
+ "trianglelefteq",
+ "triangleq",
+ "triangleright",
+ "trianglerighteq",
+ "tridot",
+ "trie",
+ "triminus",
+ "triplus",
+ "trisb",
+ "tritime",
+ "trpezium",
+ "tscr",
+ "tscy",
+ "tshcy",
+ "tstrok",
+ "twixt",
+ "twoheadleftarrow",
+ "twoheadrightarrow",
+ "uArr",
+ "uHar",
+ "uacut",
+ "uacute",
+ "uarr",
+ "ubrcy",
+ "ubreve",
+ "ucir",
+ "ucirc",
+ "ucy",
+ "udarr",
+ "udblac",
+ "udhar",
+ "ufisht",
+ "ufr",
+ "ugrav",
+ "ugrave",
+ "uharl",
+ "uharr",
+ "uhblk",
+ "ulcorn",
+ "ulcorner",
+ "ulcrop",
+ "ultri",
+ "umacr",
+ "um",
+ "uml",
+ "uogon",
+ "uopf",
+ "uparrow",
+ "updownarrow",
+ "upharpoonleft",
+ "upharpoonright",
+ "uplus",
+ "upsi",
+ "upsih",
+ "upsilon",
+ "upuparrows",
+ "urcorn",
+ "urcorner",
+ "urcrop",
+ "uring",
+ "urtri",
+ "uscr",
+ "utdot",
+ "utilde",
+ "utri",
+ "utrif",
+ "uuarr",
+ "uum",
+ "uuml",
+ "uwangle",
+ "vArr",
+ "vBar",
+ "vBarv",
+ "vDash",
+ "vangrt",
+ "varepsilon",
+ "varkappa",
+ "varnothing",
+ "varphi",
+ "varpi",
+ "varpropto",
+ "varr",
+ "varrho",
+ "varsigma",
+ "varsubsetneq",
+ "varsubsetneqq",
+ "varsupsetneq",
+ "varsupsetneqq",
+ "vartheta",
+ "vartriangleleft",
+ "vartriangleright",
+ "vcy",
+ "vdash",
+ "vee",
+ "veebar",
+ "veeeq",
+ "vellip",
+ "verbar",
+ "vert",
+ "vfr",
+ "vltri",
+ "vnsub",
+ "vnsup",
+ "vopf",
+ "vprop",
+ "vrtri",
+ "vscr",
+ "vsubnE",
+ "vsubne",
+ "vsupnE",
+ "vsupne",
+ "vzigzag",
+ "wcirc",
+ "wedbar",
+ "wedge",
+ "wedgeq",
+ "weierp",
+ "wfr",
+ "wopf",
+ "wp",
+ "wr",
+ "wreath",
+ "wscr",
+ "xcap",
+ "xcirc",
+ "xcup",
+ "xdtri",
+ "xfr",
+ "xhArr",
+ "xharr",
+ "xi",
+ "xlArr",
+ "xlarr",
+ "xmap",
+ "xnis",
+ "xodot",
+ "xopf",
+ "xoplus",
+ "xotime",
+ "xrArr",
+ "xrarr",
+ "xscr",
+ "xsqcup",
+ "xuplus",
+ "xutri",
+ "xvee",
+ "xwedge",
+ "yacut",
+ "yacute",
+ "yacy",
+ "ycirc",
+ "ycy",
+ "ye",
+ "yen",
+ "yfr",
+ "yicy",
+ "yopf",
+ "yscr",
+ "yucy",
+ "yum",
+ "yuml",
+ "zacute",
+ "zcaron",
+ "zcy",
+ "zdot",
+ "zeetrf",
+ "zeta",
+ "zfr",
+ "zhcy",
+ "zigrarr",
+ "zopf",
+ "zscr",
+ "zwj",
+ "zwnj",
+];
+
+/// List of values corresponding to names of named
+/// [character references][character_reference].
+///
+/// The corresponding names of this list are stored in
+/// [`CHARACTER_REFERENCE_NAMES`][].
+/// They correspond through their index.
+///
+/// ## References
+///
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+///
+/// [character_reference]: crate::construct::character_reference
+pub const CHARACTER_REFERENCE_VALUES: [&str; 2222] = [
+ "Æ", "Æ", "&", "&", "Á", "Á", "Ă", "Â", "Â", "А", "𝔄", "À", "À", "Α", "Ā", "⩓", "Ą", "𝔸", "⁡",
+ "Å", "Å", "𝒜", "≔", "Ã", "Ã", "Ä", "Ä", "∖", "⫧", "⌆", "Б", "∵", "ℬ", "Β", "𝔅", "𝔹", "˘", "ℬ",
+ "≎", "Ч", "©", "©", "Ć", "⋒", "ⅅ", "ℭ", "Č", "Ç", "Ç", "Ĉ", "∰", "Ċ", "¸", "·", "ℭ", "Χ", "⊙",
+ "⊖", "⊕", "⊗", "∲", "”", "’", "∷", "⩴", "≡", "∯", "∮", "ℂ", "∐", "∳", "⨯", "𝒞", "⋓", "≍", "ⅅ",
+ "⤑", "Ђ", "Ѕ", "Џ", "‡", "↡", "⫤", "Ď", "Д", "∇", "Δ", "𝔇", "´", "˙", "˝", "`", "˜", "⋄", "ⅆ",
+ "𝔻", "¨", "⃜", "≐", "∯", "¨", "⇓", "⇐", "⇔", "⫤", "⟸", "⟺", "⟹", "⇒", "⊨", "⇑", "⇕", "∥", "↓",
+ "⤓", "⇵", "̑", "⥐", "⥞", "↽", "⥖", "⥟", "⇁", "⥗", "⊤", "↧", "⇓", "𝒟", "Đ", "Ŋ", "Ð", "Ð", "É",
+ "É", "Ě", "Ê", "Ê", "Э", "Ė", "𝔈", "È", "È", "∈", "Ē", "◻", "▫", "Ę", "𝔼", "Ε", "⩵", "≂", "⇌",
+ "ℰ", "⩳", "Η", "Ë", "Ë", "∃", "ⅇ", "Ф", "𝔉", "◼", "▪", "𝔽", "∀", "ℱ", "ℱ", "Ѓ", ">", ">", "Γ",
+ "Ϝ", "Ğ", "Ģ", "Ĝ", "Г", "Ġ", "𝔊", "⋙", "𝔾", "≥", "⋛", "≧", "⪢", "≷", "⩾", "≳", "𝒢", "≫", "Ъ",
+ "ˇ", "^", "Ĥ", "ℌ", "ℋ", "ℍ", "─", "ℋ", "Ħ", "≎", "≏", "Е", "IJ", "Ё", "Í", "Í", "Î", "Î", "И",
+ "İ", "ℑ", "Ì", "Ì", "ℑ", "Ī", "ⅈ", "⇒", "∬", "∫", "⋂", "⁣", "⁢", "Į", "𝕀", "Ι", "ℐ", "Ĩ", "І",
+ "Ï", "Ï", "Ĵ", "Й", "𝔍", "𝕁", "𝒥", "Ј", "Є", "Х", "Ќ", "Κ", "Ķ", "К", "𝔎", "𝕂", "𝒦", "Љ", "<",
+ "<", "Ĺ", "Λ", "⟪", "ℒ", "↞", "Ľ", "Ļ", "Л", "⟨", "←", "⇤", "⇆", "⌈", "⟦", "⥡", "⇃", "⥙", "⌊",
+ "↔", "⥎", "⊣", "↤", "⥚", "⊲", "⧏", "⊴", "⥑", "⥠", "↿", "⥘", "↼", "⥒", "⇐", "⇔", "⋚", "≦", "≶",
+ "⪡", "⩽", "≲", "𝔏", "⋘", "⇚", "Ŀ", "⟵", "⟷", "⟶", "⟸", "⟺", "⟹", "𝕃", "↙", "↘", "ℒ", "↰", "Ł",
+ "≪", "⤅", "М", " ", "ℳ", "𝔐", "∓", "𝕄", "ℳ", "Μ", "Њ", "Ń", "Ň", "Ņ", "Н", "\u{200B}",
+ "\u{200B}", "\u{200B}", "\u{200B}", "≫", "≪", "\n", "𝔑", "\u{2060}", " ", "ℕ", "⫬", "≢", "≭",
+ "∦", "∉", "≠", "≂̸", "∄", "≯", "≱", "≧̸", "≫̸", "≹", "⩾̸", "≵", "≎̸", "≏̸", "⋪", "⧏̸", "⋬", "≮", "≰",
+ "≸", "≪̸", "⩽̸", "≴", "⪢̸", "⪡̸", "⊀", "⪯̸", "⋠", "∌", "⋫", "⧐̸", "⋭", "⊏̸", "⋢", "⊐̸", "⋣", "⊂⃒", "⊈",
+ "⊁", "⪰̸", "⋡", "≿̸", "⊃⃒", "⊉", "≁", "≄", "≇", "≉", "∤", "𝒩", "Ñ", "Ñ", "Ν", "Œ", "Ó", "Ó", "Ô",
+ "Ô", "О", "Ő", "𝔒", "Ò", "Ò", "Ō", "Ω", "Ο", "𝕆", "“", "‘", "⩔", "𝒪", "Ø", "Ø", "Õ", "Õ", "⨷",
+ "Ö", "Ö", "‾", "⏞", "⎴", "⏜", "∂", "П", "𝔓", "Φ", "Π", "±", "ℌ", "ℙ", "⪻", "≺", "⪯", "≼", "≾",
+ "″", "∏", "∷", "∝", "𝒫", "Ψ", "\"", "\"", "𝔔", "ℚ", "𝒬", "⤐", "®", "®", "Ŕ", "⟫", "↠", "⤖",
+ "Ř", "Ŗ", "Р", "ℜ", "∋", "⇋", "⥯", "ℜ", "Ρ", "⟩", "→", "⇥", "⇄", "⌉", "⟧", "⥝", "⇂", "⥕", "⌋",
+ "⊢", "↦", "⥛", "⊳", "⧐", "⊵", "⥏", "⥜", "↾", "⥔", "⇀", "⥓", "⇒", "ℝ", "⥰", "⇛", "ℛ", "↱", "⧴",
+ "Щ", "Ш", "Ь", "Ś", "⪼", "Š", "Ş", "Ŝ", "С", "𝔖", "↓", "←", "→", "↑", "Σ", "∘", "𝕊", "√", "□",
+ "⊓", "⊏", "⊑", "⊐", "⊒", "⊔", "𝒮", "⋆", "⋐", "⋐", "⊆", "≻", "⪰", "≽", "≿", "∋", "∑", "⋑", "⊃",
+ "⊇", "⋑", "Þ", "Þ", "™", "Ћ", "Ц", "\t", "Τ", "Ť", "Ţ", "Т", "𝔗", "∴", "Θ", "  ", " ", "∼",
+ "≃", "≅", "≈", "𝕋", "⃛", "𝒯", "Ŧ", "Ú", "Ú", "↟", "⥉", "Ў", "Ŭ", "Û", "Û", "У", "Ű", "𝔘", "Ù",
+ "Ù", "Ū", "_", "⏟", "⎵", "⏝", "⋃", "⊎", "Ų", "𝕌", "↑", "⤒", "⇅", "↕", "⥮", "⊥", "↥", "⇑", "⇕",
+ "↖", "↗", "ϒ", "Υ", "Ů", "𝒰", "Ũ", "Ü", "Ü", "⊫", "⫫", "В", "⊩", "⫦", "⋁", "‖", "‖", "∣", "|",
+ "❘", "≀", " ", "𝔙", "𝕍", "𝒱", "⊪", "Ŵ", "⋀", "𝔚", "𝕎", "𝒲", "𝔛", "Ξ", "𝕏", "𝒳", "Я", "Ї", "Ю",
+ "Ý", "Ý", "Ŷ", "Ы", "𝔜", "𝕐", "𝒴", "Ÿ", "Ж", "Ź", "Ž", "З", "Ż", "\u{200B}", "Ζ", "ℨ", "ℤ",
+ "𝒵", "á", "á", "ă", "∾", "∾̳", "∿", "â", "â", "´", "´", "а", "æ", "æ", "⁡", "𝔞", "à", "à", "ℵ",
+ "ℵ", "α", "ā", "⨿", "&", "&", "∧", "⩕", "⩜", "⩘", "⩚", "∠", "⦤", "∠", "∡", "⦨", "⦩", "⦪", "⦫",
+ "⦬", "⦭", "⦮", "⦯", "∟", "⊾", "⦝", "∢", "Å", "⍼", "ą", "𝕒", "≈", "⩰", "⩯", "≊", "≋", "'", "≈",
+ "≊", "å", "å", "𝒶", "*", "≈", "≍", "ã", "ã", "ä", "ä", "∳", "⨑", "⫭", "≌", "϶", "‵", "∽", "⋍",
+ "⊽", "⌅", "⌅", "⎵", "⎶", "≌", "б", "„", "∵", "∵", "⦰", "϶", "ℬ", "β", "ℶ", "≬", "𝔟", "⋂", "◯",
+ "⋃", "⨀", "⨁", "⨂", "⨆", "★", "▽", "△", "⨄", "⋁", "⋀", "⤍", "⧫", "▪", "▴", "▾", "◂", "▸", "␣",
+ "▒", "░", "▓", "█", "=⃥", "≡⃥", "⌐", "𝕓", "⊥", "⊥", "⋈", "╗", "╔", "╖", "╓", "═", "╦", "╩", "╤",
+ "╧", "╝", "╚", "╜", "╙", "║", "╬", "╣", "╠", "╫", "╢", "╟", "⧉", "╕", "╒", "┐", "┌", "─", "╥",
+ "╨", "┬", "┴", "⊟", "⊞", "⊠", "╛", "╘", "┘", "└", "│", "╪", "╡", "╞", "┼", "┤", "├", "‵", "˘",
+ "¦", "¦", "𝒷", "⁏", "∽", "⋍", "\\", "⧅", "⟈", "•", "•", "≎", "⪮", "≏", "≏", "ć", "∩", "⩄", "⩉",
+ "⩋", "⩇", "⩀", "∩︀", "⁁", "ˇ", "⩍", "č", "ç", "ç", "ĉ", "⩌", "⩐", "ċ", "¸", "¸", "⦲", "¢", "¢",
+ "·", "𝔠", "ч", "✓", "✓", "χ", "○", "⧃", "ˆ", "≗", "↺", "↻", "®", "Ⓢ", "⊛", "⊚", "⊝", "≗", "⨐",
+ "⫯", "⧂", "♣", "♣", ":", "≔", "≔", ",", "@", "∁", "∘", "∁", "ℂ", "≅", "⩭", "∮", "𝕔", "∐", "©",
+ "©", "℗", "↵", "✗", "𝒸", "⫏", "⫑", "⫐", "⫒", "⋯", "⤸", "⤵", "⋞", "⋟", "↶", "⤽", "∪", "⩈", "⩆",
+ "⩊", "⊍", "⩅", "∪︀", "↷", "⤼", "⋞", "⋟", "⋎", "⋏", "¤", "¤", "↶", "↷", "⋎", "⋏", "∲", "∱", "⌭",
+ "⇓", "⥥", "†", "ℸ", "↓", "‐", "⊣", "⤏", "˝", "ď", "д", "ⅆ", "‡", "⇊", "⩷", "°", "°", "δ", "⦱",
+ "⥿", "𝔡", "⇃", "⇂", "⋄", "⋄", "♦", "♦", "¨", "ϝ", "⋲", "÷", "÷", "÷", "⋇", "⋇", "ђ", "⌞", "⌍",
+ "$", "𝕕", "˙", "≐", "≑", "∸", "∔", "⊡", "⌆", "↓", "⇊", "⇃", "⇂", "⤐", "⌟", "⌌", "𝒹", "ѕ", "⧶",
+ "đ", "⋱", "▿", "▾", "⇵", "⥯", "⦦", "џ", "⟿", "⩷", "≑", "é", "é", "⩮", "ě", "ê", "ê", "≕", "э",
+ "ė", "ⅇ", "≒", "𝔢", "⪚", "è", "è", "⪖", "⪘", "⪙", "⏧", "ℓ", "⪕", "⪗", "ē", "∅", "∅", "∅", " ",
+ " ", " ", "ŋ", " ", "ę", "𝕖", "⋕", "⧣", "⩱", "ε", "ε", "ϵ", "≖", "≕", "≂", "⪖", "⪕", "=", "≟",
+ "≡", "⩸", "⧥", "≓", "⥱", "ℯ", "≐", "≂", "η", "ð", "ð", "ë", "ë", "€", "!", "∃", "ℰ", "ⅇ", "≒",
+ "ф", "♀", "ffi", "ff", "ffl", "𝔣", "fi", "fj", "♭", "fl", "▱", "ƒ", "𝕗", "∀", "⋔", "⫙", "⨍", "¼", "½",
+ "⅓", "¼", "⅕", "⅙", "⅛", "⅔", "⅖", "¾", "¾", "⅗", "⅜", "⅘", "⅚", "⅝", "⅞", "⁄", "⌢", "𝒻", "≧",
+ "⪌", "ǵ", "γ", "ϝ", "⪆", "ğ", "ĝ", "г", "ġ", "≥", "⋛", "≥", "≧", "⩾", "⩾", "⪩", "⪀", "⪂", "⪄",
+ "⋛︀", "⪔", "𝔤", "≫", "⋙", "ℷ", "ѓ", "≷", "⪒", "⪥", "⪤", "≩", "⪊", "⪊", "⪈", "⪈", "≩", "⋧", "𝕘",
+ "`", "ℊ", "≳", "⪎", "⪐", ">", ">", "⪧", "⩺", "⋗", "⦕", "⩼", "⪆", "⥸", "⋗", "⋛", "⪌", "≷", "≳",
+ "≩︀", "≩︀", "⇔", " ", "½", "ℋ", "ъ", "↔", "⥈", "↭", "ℏ", "ĥ", "♥", "♥", "…", "⊹", "𝔥", "⤥", "⤦",
+ "⇿", "∻", "↩", "↪", "𝕙", "―", "𝒽", "ℏ", "ħ", "⁃", "‐", "í", "í", "⁣", "î", "î", "и", "е", "¡",
+ "¡", "⇔", "𝔦", "ì", "ì", "ⅈ", "⨌", "∭", "⧜", "℩", "ij", "ī", "ℑ", "ℐ", "ℑ", "ı", "⊷", "Ƶ", "∈",
+ "℅", "∞", "⧝", "ı", "∫", "⊺", "ℤ", "⊺", "⨗", "⨼", "ё", "į", "𝕚", "ι", "⨼", "¿", "¿", "𝒾", "∈",
+ "⋹", "⋵", "⋴", "⋳", "∈", "⁢", "ĩ", "і", "ï", "ï", "ĵ", "й", "𝔧", "ȷ", "𝕛", "𝒿", "ј", "є", "κ",
+ "ϰ", "ķ", "к", "𝔨", "ĸ", "х", "ќ", "𝕜", "𝓀", "⇚", "⇐", "⤛", "⤎", "≦", "⪋", "⥢", "ĺ", "⦴", "ℒ",
+ "λ", "⟨", "⦑", "⟨", "⪅", "«", "«", "←", "⇤", "⤟", "⤝", "↩", "↫", "⤹", "⥳", "↢", "⪫", "⤙", "⪭",
+ "⪭︀", "⤌", "❲", "{", "[", "⦋", "⦏", "⦍", "ľ", "ļ", "⌈", "{", "л", "⤶", "“", "„", "⥧", "⥋", "↲",
+ "≤", "←", "↢", "↽", "↼", "⇇", "↔", "⇆", "⇋", "↭", "⋋", "⋚", "≤", "≦", "⩽", "⩽", "⪨", "⩿", "⪁",
+ "⪃", "⋚︀", "⪓", "⪅", "⋖", "⋚", "⪋", "≶", "≲", "⥼", "⌊", "𝔩", "≶", "⪑", "↽", "↼", "⥪", "▄", "љ",
+ "≪", "⇇", "⌞", "⥫", "◺", "ŀ", "⎰", "⎰", "≨", "⪉", "⪉", "⪇", "⪇", "≨", "⋦", "⟬", "⇽", "⟦", "⟵",
+ "⟷", "⟼", "⟶", "↫", "↬", "⦅", "𝕝", "⨭", "⨴", "∗", "_", "◊", "◊", "⧫", "(", "⦓", "⇆", "⌟", "⇋",
+ "⥭", "‎", "⊿", "‹", "𝓁", "↰", "≲", "⪍", "⪏", "[", "‘", "‚", "ł", "<", "<", "⪦", "⩹", "⋖", "⋋",
+ "⋉", "⥶", "⩻", "⦖", "◃", "⊴", "◂", "⥊", "⥦", "≨︀", "≨︀", "∺", "¯", "¯", "♂", "✠", "✠", "↦", "↦",
+ "↧", "↤", "↥", "▮", "⨩", "м", "—", "∡", "𝔪", "℧", "µ", "µ", "∣", "*", "⫰", "·", "·", "−", "⊟",
+ "∸", "⨪", "⫛", "…", "∓", "⊧", "𝕞", "∓", "𝓂", "∾", "μ", "⊸", "⊸", "⋙̸", "≫⃒", "≫̸", "⇍", "⇎", "⋘̸",
+ "≪⃒", "≪̸", "⇏", "⊯", "⊮", "∇", "ń", "∠⃒", "≉", "⩰̸", "≋̸", "ʼn", "≉", "♮", "♮", "ℕ", " ", " ", "≎̸",
+ "≏̸", "⩃", "ň", "ņ", "≇", "⩭̸", "⩂", "н", "–", "≠", "⇗", "⤤", "↗", "↗", "≐̸", "≢", "⤨", "≂̸", "∄",
+ "∄", "𝔫", "≧̸", "≱", "≱", "≧̸", "⩾̸", "⩾̸", "≵", "≯", "≯", "⇎", "↮", "⫲", "∋", "⋼", "⋺", "∋", "њ",
+ "⇍", "≦̸", "↚", "‥", "≰", "↚", "↮", "≰", "≦̸", "⩽̸", "⩽̸", "≮", "≴", "≮", "⋪", "⋬", "∤", "𝕟", "¬",
+ "¬", "∉", "⋹̸", "⋵̸", "∉", "⋷", "⋶", "∌", "∌", "⋾", "⋽", "∦", "∦", "⫽⃥", "∂̸", "⨔", "⊀", "⋠", "⪯̸",
+ "⊀", "⪯̸", "⇏", "↛", "⤳̸", "↝̸", "↛", "⋫", "⋭", "⊁", "⋡", "⪰̸", "𝓃", "∤", "∦", "≁", "≄", "≄", "∤",
+ "∦", "⋢", "⋣", "⊄", "⫅̸", "⊈", "⊂⃒", "⊈", "⫅̸", "⊁", "⪰̸", "⊅", "⫆̸", "⊉", "⊃⃒", "⊉", "⫆̸", "≹", "ñ",
+ "ñ", "≸", "⋪", "⋬", "⋫", "⋭", "ν", "#", "№", " ", "⊭", "⤄", "≍⃒", "⊬", "≥⃒", ">⃒", "⧞", "⤂", "≤⃒",
+ "<⃒", "⊴⃒", "⤃", "⊵⃒", "∼⃒", "⇖", "⤣", "↖", "↖", "⤧", "Ⓢ", "ó", "ó", "⊛", "ô", "ô", "о", "⊝", "ő",
+ "⨸", "⊙", "⦼", "œ", "⦿", "𝔬", "˛", "ò", "ò", "⧁", "⦵", "Ω", "∮", "↺", "⦾", "⦻", "‾", "⧀", "ō",
+ "ω", "ο", "⦶", "⊖", "𝕠", "⦷", "⦹", "⊕", "∨", "↻", "º", "ℴ", "ℴ", "ª", "º", "⊶", "⩖", "⩗", "⩛",
+ "ℴ", "ø", "ø", "⊘", "õ", "õ", "⊗", "⨶", "ö", "ö", "⌽", "¶", "¶", "∥", "⫳", "⫽", "∂", "п", "%",
+ ".", "‰", "⊥", "‱", "𝔭", "φ", "ϕ", "ℳ", "☎", "π", "⋔", "ϖ", "ℏ", "ℎ", "ℏ", "+", "⨣", "⊞", "⨢",
+ "∔", "⨥", "⩲", "±", "±", "⨦", "⨧", "±", "⨕", "𝕡", "£", "£", "≺", "⪳", "⪷", "≼", "⪯", "≺", "⪷",
+ "≼", "⪯", "⪹", "⪵", "⋨", "≾", "′", "ℙ", "⪵", "⪹", "⋨", "∏", "⌮", "⌒", "⌓", "∝", "∝", "≾", "⊰",
+ "𝓅", "ψ", " ", "𝔮", "⨌", "𝕢", "⁗", "𝓆", "ℍ", "⨖", "?", "≟", "\"", "\"", "⇛", "⇒", "⤜", "⤏",
+ "⥤", "∽̱", "ŕ", "√", "⦳", "⟩", "⦒", "⦥", "⟩", "»", "»", "→", "⥵", "⇥", "⤠", "⤳", "⤞", "↪", "↬",
+ "⥅", "⥴", "↣", "↝", "⤚", "∶", "ℚ", "⤍", "❳", "}", "]", "⦌", "⦎", "⦐", "ř", "ŗ", "⌉", "}", "р",
+ "⤷", "⥩", "”", "”", "↳", "ℜ", "ℛ", "ℜ", "ℝ", "▭", "®", "®", "⥽", "⌋", "𝔯", "⇁", "⇀", "⥬", "ρ",
+ "ϱ", "→", "↣", "⇁", "⇀", "⇄", "⇌", "⇉", "↝", "⋌", "˚", "≓", "⇄", "⇌", "‏", "⎱", "⎱", "⫮", "⟭",
+ "⇾", "⟧", "⦆", "𝕣", "⨮", "⨵", ")", "⦔", "⨒", "⇉", "›", "𝓇", "↱", "]", "’", "’", "⋌", "⋊", "▹",
+ "⊵", "▸", "⧎", "⥨", "℞", "ś", "‚", "≻", "⪴", "⪸", "š", "≽", "⪰", "ş", "ŝ", "⪶", "⪺", "⋩", "⨓",
+ "≿", "с", "⋅", "⊡", "⩦", "⇘", "⤥", "↘", "↘", "§", "§", ";", "⤩", "∖", "∖", "✶", "𝔰", "⌢", "♯",
+ "щ", "ш", "∣", "∥", "\u{AD}", "\u{AD}", "σ", "ς", "ς", "∼", "⩪", "≃", "≃", "⪞", "⪠", "⪝", "⪟",
+ "≆", "⨤", "⥲", "←", "∖", "⨳", "⧤", "∣", "⌣", "⪪", "⪬", "⪬︀", "ь", "/", "⧄", "⌿", "𝕤", "♠", "♠",
+ "∥", "⊓", "⊓︀", "⊔", "⊔︀", "⊏", "⊑", "⊏", "⊑", "⊐", "⊒", "⊐", "⊒", "□", "□", "▪", "▪", "→", "𝓈",
+ "∖", "⌣", "⋆", "☆", "★", "ϵ", "ϕ", "¯", "⊂", "⫅", "⪽", "⊆", "⫃", "⫁", "⫋", "⊊", "⪿", "⥹", "⊂",
+ "⊆", "⫅", "⊊", "⫋", "⫇", "⫕", "⫓", "≻", "⪸", "≽", "⪰", "⪺", "⪶", "⋩", "≿", "∑", "♪", "⊃", "¹",
+ "²", "³", "⫆", "⪾", "⫘", "⊇", "⫄", "⟉", "⫗", "⥻", "⫂", "⫌", "⊋", "⫀", "⊃", "⊇", "⫆", "⊋", "⫌",
+ "⫈", "⫔", "⫖", "⇙", "⤦", "↙", "↙", "⤪", "ß", "ß", "⌖", "τ", "⎴", "ť", "ţ", "т", "⃛", "⌕", "𝔱",
+ "∴", "∴", "θ", "ϑ", "ϑ", "≈", "∼", " ", "≈", "∼", "þ", "þ", "˜", "×", "×", "⊠", "⨱", "⨰", "∭",
+ "⤨", "⊤", "⌶", "⫱", "𝕥", "⫚", "⤩", "‴", "™", "▵", "▿", "◃", "⊴", "≜", "▹", "⊵", "◬", "≜", "⨺",
+ "⨹", "⧍", "⨻", "⏢", "𝓉", "ц", "ћ", "ŧ", "≬", "↞", "↠", "⇑", "⥣", "ú", "ú", "↑", "ў", "ŭ", "û",
+ "û", "у", "⇅", "ű", "⥮", "⥾", "𝔲", "ù", "ù", "↿", "↾", "▀", "⌜", "⌜", "⌏", "◸", "ū", "¨", "¨",
+ "ų", "𝕦", "↑", "↕", "↿", "↾", "⊎", "υ", "ϒ", "υ", "⇈", "⌝", "⌝", "⌎", "ů", "◹", "𝓊", "⋰", "ũ",
+ "▵", "▴", "⇈", "ü", "ü", "⦧", "⇕", "⫨", "⫩", "⊨", "⦜", "ϵ", "ϰ", "∅", "ϕ", "ϖ", "∝", "↕", "ϱ",
+ "ς", "⊊︀", "⫋︀", "⊋︀", "⫌︀", "ϑ", "⊲", "⊳", "в", "⊢", "∨", "⊻", "≚", "⋮", "|", "|", "𝔳", "⊲", "⊂⃒",
+ "⊃⃒", "𝕧", "∝", "⊳", "𝓋", "⫋︀", "⊊︀", "⫌︀", "⊋︀", "⦚", "ŵ", "⩟", "∧", "≙", "℘", "𝔴", "𝕨", "℘", "≀",
+ "≀", "𝓌", "⋂", "◯", "⋃", "▽", "𝔵", "⟺", "⟷", "ξ", "⟸", "⟵", "⟼", "⋻", "⨀", "𝕩", "⨁", "⨂", "⟹",
+ "⟶", "𝓍", "⨆", "⨄", "△", "⋁", "⋀", "ý", "ý", "я", "ŷ", "ы", "¥", "¥", "𝔶", "ї", "𝕪", "𝓎", "ю",
+ "ÿ", "ÿ", "ź", "ž", "з", "ż", "ℨ", "ζ", "𝔷", "ж", "⇝", "𝕫", "𝓏", "‍", "‌",
+];
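+
+// A minimal sketch (hypothetical helper, not part of this commit) of how the
+// two parallel lists correspond: find the index of a name in
+// `CHARACTER_REFERENCE_NAMES`, then read the replacement at the same index in
+// `CHARACTER_REFERENCE_VALUES`.
+// The tokenizer itself only checks membership (see `character_reference.rs`);
+// resolving a value like this would presumably happen when compiling to HTML.
+#[allow(dead_code)]
+fn decode_named_sketch(name: &str) -> Option<&'static str> {
+    CHARACTER_REFERENCE_NAMES
+        .iter()
+        .position(|candidate| *candidate == name)
+        .map(|index| CHARACTER_REFERENCE_VALUES[index])
+}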
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs
new file mode 100644
index 0000000..7b7962b
--- /dev/null
+++ b/src/construct/blank_line.rs
@@ -0,0 +1,61 @@
+//! Blank lines are a construct that occurs in the flow content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! blank_line ::= *(' ' '\t')
+//! ```
+//!
+//! Blank lines are sometimes needed, such as to differentiate a paragraph
+//! from another paragraph.
+//! In several cases, blank lines are not needed between flow constructs,
+//! such as between two headings.
+//! Sometimes, whether blank lines are present changes how HTML is rendered,
+//! such as whether blank lines are present between list items in a list.
+//! More than one blank line is never needed in `CommonMark`.
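+//!
+//! For example, the blank line here is what separates the two paragraphs:
+//!
+//! ```markdown
+//! Lorem.
+//!
+//! Ipsum.
+//! ```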
+//!
+//! Because blank lines can be empty (line endings are not considered part of
+//! them), and events cannot be empty, blank lines are not present as a token.
+//!
+//! ## References
+//!
+//! * [`blank-line.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/blank-line.js)
+//! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines)
+//!
+//! <!-- To do: link `flow`, `heading`, `list`, `paragraph` -->
+
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a blank line.
+///
+/// Note: `␠` represents a space character.
+///
+/// ```markdown
+/// |␠␠
+/// |
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace),
+ |_ok| Box::new(after),
+ )(tokenizer, code)
+}
+
+/// After zero or more spaces or tabs, before a line ending or EOF.
+///
+/// Note: `␠` represents a space character.
+///
+/// ```markdown
+/// |␠␠
+/// |
+/// ```
+fn after(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
new file mode 100644
index 0000000..5ea995e
--- /dev/null
+++ b/src/construct/character_escape.rs
@@ -0,0 +1,69 @@
+//! Character escapes are a construct that occurs in the string and text
+//! content types.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! character_escape ::= '\\' ascii_punctuation
+//! ```
+//!
+//! Like much of markdown, there are no “invalid” character escapes: just a
+//! slash, or a slash followed by anything other than an ASCII punctuation
+//! character, is exactly that: just a slash.
+//! To escape (most) arbitrary characters, use a
+//! [character reference][] instead
+//! (as in, `&amp;`, `&#123;`, or say `&#x9;`).
+//! It is also possible to escape a line ending in text with a similar
+//! construct: a backslash followed by a line ending (that is part of the
+//! construct instead of ending it).
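+//!
+//! For example:
+//!
+//! ```markdown
+//! a\*b
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p>a*b</p>
+//! ```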
+//!
+//! ## References
+//!
+//! * [`character-escape.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-escape.js)
+//! * [*§ 2.4 Backslash escapes* in `CommonMark`](https://spec.commonmark.org/0.30/#backslash-escapes)
+//!
+//! [character reference]: crate::construct::character_reference
+//!
+//! <!-- To do: link `hard_break_escape`, `string`, `text` -->
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a character escape.
+///
+/// ```markdown
+/// a|\*b
+/// a|\b
+/// a|\ b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('\\') => {
+ tokenizer.enter(TokenType::CharacterEscape);
+ tokenizer.enter(TokenType::CharacterEscapeMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CharacterEscapeMarker);
+ (State::Fn(Box::new(inside)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside a character escape, after `\`.
+///
+/// ```markdown
+/// a\|*b
+/// a\|b
+/// a\| b
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char.is_ascii_punctuation() => {
+ tokenizer.enter(TokenType::CharacterEscapeValue);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CharacterEscapeValue);
+ tokenizer.exit(TokenType::CharacterEscape);
+ (State::Ok, None)
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
new file mode 100644
index 0000000..27275d5
--- /dev/null
+++ b/src/construct/character_reference.rs
@@ -0,0 +1,237 @@
+//! Character references are a construct that occurs in the string and text
+//! content types.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! character_reference ::= '&' (numeric | named) ';'
+//!
+//! numeric ::= '#' (hexadecimal | decimal)
+//! ; Note: Limit of `6` imposed as all bigger numbers are invalid:
+//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit)
+//! ; Note: Limit of `7` imposed as all bigger numbers are invalid:
+//! decimal ::= 1*7(ascii_digit)
+//! ; Note: Limit of `31` imposed by `CounterClockwiseContourIntegral`:
+//! ; Note: Limited to any known named character reference (see `constant.rs`)
+//! named ::= 1*31(ascii_alphanumeric)
+//! ```
+//!
+//! Like much of markdown, there are no “invalid” character references.
+//! However, for security reasons, several numeric character references parse
+//! fine but are not rendered as their corresponding character and they are
+//! instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`).
+//! See [`decode_numeric_character_reference`][decode_numeric] for more info.
+//!
+//! To escape ASCII punctuation characters, use the terser
+//! [character escape][character_escape] construct instead (as in, `\&`).
+//!
+//! Character references in markdown are not the same as character references
+//! in HTML.
+//! Notably, HTML allows several character references without a closing
+//! semicolon.
+//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info.
+//!
+//! Character references are parsed case-insensitively.
+//! The casing of hexadecimal numeric character references has no effect.
+//! The casing of named character references does not matter when parsing them,
+//! but does affect whether they match.
+//! Depending on the name, one or more cases are allowed, such as that `AMP`
+//! and `amp` are both allowed but other cases are not.
+//! See [`CHARACTER_REFERENCE_NAMES`][character_reference_names] for which
+//! names match.
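+//!
+//! For example:
+//!
+//! ```markdown
+//! a&amp;b and a&#35;b
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p>a&amp;b and a#b</p>
+//! ```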
+//!
+//! ## References
+//!
+//! * [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js)
+//! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+//!
+//! [character_escape]: crate::construct::character_escape
+//! [decode_numeric]: crate::util::decode_numeric_character_reference
+//! [character_reference_names]: crate::constant::CHARACTER_REFERENCE_NAMES
+//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
+//!
+//! <!-- To do: link `string`, `text` -->
+
+use crate::constant::{
+ CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
+ CHARACTER_REFERENCE_NAMED_SIZE_MAX, CHARACTER_REFERENCE_NAMES,
+};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of a character reference.
+#[derive(Debug, Clone)]
+pub enum Kind {
+    /// Numeric decimal character reference (`&#123;`).
+    Decimal,
+    /// Numeric hexadecimal character reference (`&#x9;`).
+ Hexadecimal,
+ /// Named character reference (`&amp;`).
+ Named,
+}
+
+/// State needed to parse character references.
+#[derive(Debug, Clone)]
+struct Info {
+ /// All parsed characters.
+ buffer: Vec<char>,
+ /// Kind of character reference.
+ kind: Kind,
+}
+
+/// Start of a character reference.
+///
+/// ```markdown
+/// a|&amp;b
+/// a|&#123;b
+/// a|&#x9;b
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('&') => {
+ tokenizer.enter(TokenType::CharacterReference);
+ tokenizer.enter(TokenType::CharacterReferenceMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CharacterReferenceMarker);
+ (State::Fn(Box::new(open)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside a character reference, after `&`, before `#` for numeric references
+/// or an alphanumeric for named references.
+///
+/// ```markdown
+/// a&|amp;b
+/// a&|#123;b
+/// a&|#x9;b
+/// ```
+fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if let Code::Char('#') = code {
+ tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric);
+ (State::Fn(Box::new(numeric)), None)
+ } else {
+ tokenizer.enter(TokenType::CharacterReferenceValue);
+ value(
+ tokenizer,
+ code,
+ Info {
+ buffer: vec![],
+ kind: Kind::Named,
+ },
+ )
+ }
+}
+
+/// Inside a numeric character reference, right before `x` for hexadecimals,
+/// or a digit for decimals.
+///
+/// ```markdown
+/// a&#|123;b
+/// a&#|x9;b
+/// ```
+fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == 'x' || char == 'X' => {
+ tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal);
+ tokenizer.enter(TokenType::CharacterReferenceValue);
+
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ value(
+ tokenizer,
+ code,
+ Info {
+ buffer: vec![],
+ kind: Kind::Hexadecimal,
+ },
+ )
+ })),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.enter(TokenType::CharacterReferenceValue);
+
+ value(
+ tokenizer,
+ code,
+ Info {
+ buffer: vec![],
+ kind: Kind::Decimal,
+ },
+ )
+ }
+ }
+}
+
+/// Inside a character reference value, after the markers (`&#x`, `&#`, or
+/// `&`) that define its kind, but before the `;`.
+/// The character reference kind defines what and how many characters are
+/// allowed.
+///
+/// ```markdown
+/// a&a|mp;b
+/// a&#1|23;b
+/// a&#x|9;b
+/// ```
+fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
+ match code {
+ Code::Char(';') if !info.buffer.is_empty() => {
+ tokenizer.exit(TokenType::CharacterReferenceValue);
+ let value = info.buffer.iter().collect::<String>();
+
+ if let Kind::Named = info.kind {
+ if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) {
+ return (State::Nok, Some(vec![code]));
+ }
+ }
+
+ tokenizer.enter(TokenType::CharacterReferenceMarkerSemi);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CharacterReferenceMarkerSemi);
+ tokenizer.exit(TokenType::CharacterReference);
+ (State::Ok, None)
+ }
+ Code::Char(char) => {
+ let len = info.buffer.len();
+
+ let cont = match info.kind {
+ Kind::Hexadecimal
+ if char.is_ascii_hexdigit()
+ && len < CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX =>
+ {
+ true
+ }
+ Kind::Decimal
+ if char.is_ascii_digit() && len < CHARACTER_REFERENCE_DECIMAL_SIZE_MAX =>
+ {
+ true
+ }
+ Kind::Named
+ if char.is_ascii_alphanumeric() && len < CHARACTER_REFERENCE_NAMED_SIZE_MAX =>
+ {
+ true
+ }
+ _ => false,
+ };
+
+ if cont {
+ let mut clone = info;
+ clone.buffer.push(char);
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| value(tokenizer, code, clone))),
+ None,
+ )
+ } else {
+ (State::Nok, None)
+ }
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
new file mode 100644
index 0000000..2068a62
--- /dev/null
+++ b/src/construct/code_fenced.rs
@@ -0,0 +1,581 @@
+//! Code (fenced) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ]
+//!
+//! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab
+//! ; Restriction: the number of markers in the closing fence sequence must be
+//! ; equal to or greater than the number of markers in the opening fence
+//! ; sequence.
+//! ; Restriction: the marker in the closing fence sequence must match the
+//! ; marker in the opening fence sequence
+//! fence_close ::= sequence *space_or_tab
+//! sequence ::= 3*'`' | 3*'~'
+//! info ::= 1*text
+//! meta ::= 1*text *( *space_or_tab 1*text )
+//!
+//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the
+//! ; marker of the opening fence sequence.
+//! text ::= code - eol - space_or_tab
+//! eol ::= '\r' | '\r\n' | '\n'
+//! space_or_tab ::= ' ' | '\t'
+//! code ::= . ; any unicode code point (other than line endings).
+//! ```
+//!
+//! The above grammar does not show how whitespace is handled.
+//! To parse code (fenced), let `X` be the number of whitespace characters
+//! before the opening fence sequence.
+//! Each line of content is then allowed (not required) to be indented with up
+//! to `X` spaces or tabs, which are then ignored as an indent instead of being
+//! considered as part of the code.
+//! This indent does not affect the closing fence.
+//! It can itself be indented up to 3 spaces or tabs.
+//! A bigger indent makes it part of the code instead of a fence.
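+//!
+//! For example, with the opening fence sequence indented 2 spaces, up to 2
+//! spaces are ignored on each content line:
+//!
+//! ```markdown
+//!   ~~~
+//!   aa
+//!  aa
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code>aa
+//! aa
+//! </code></pre>
+//! ```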
+//!
+//! Code (fenced) relates to both the `<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! The optional `meta` part is ignored: it is not used when parsing or
+//! rendering.
+//! The optional `info` part is used and is expected to specify the programming
+//! language that the code is in.
+//! Which value it holds depends on what your syntax highlighter supports, if
+//! one is used.
+//! The `info` is, when rendering to HTML, typically exposed as a class.
+//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
+//! element*][html-code]).
+//! For example:
+//!
+//! ```markdown
+//! ~~~css
+//! * { color: tomato }
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code class="language-css">* { color: tomato }
+//! </code></pre>
+//! ```
+//!
+//! The `info` and `meta` parts are interpreted as the string content type.
+//! That means that character escapes and character references are allowed.
+//!
+//! In markdown, it is also possible to use code (text) in the text content
+//! type.
+//! It is also possible to create code with the
+//! [code (indented)][code-indented] construct.
+//! That construct is less explicit, different from code (text), and has no
+//! support for specifying the programming language, so it is recommended to
+//! use code (fenced) instead of code (indented).
+//!
+//! ## References
+//!
+//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js)
+//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks)
+//!
+//! [code-indented]: crate::construct::code_indented
+//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+//!
+//! <!-- To do: link `flow`, `text`, `code_text`, `string` -->
+
+use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::get_span;
+
+/// Kind of fences.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Kind {
+ /// Grave accent (tick) code.
+ GraveAccent,
+ /// Tilde code.
+ Tilde,
+}
+
+/// State needed to parse code (fenced).
+#[derive(Debug, Clone)]
+struct Info {
+ /// Number of markers on the opening fence sequence.
+ size: usize,
+ /// Number of tabs or spaces of indentation before the opening fence
+ /// sequence.
+ prefix: usize,
+ /// Kind of fences.
+ kind: Kind,
+}
+
+/// Start of fenced code.
+///
+/// ```markdown
+/// | ~~~js
+/// console.log(1);
+/// ~~~
+/// ```
+///
+/// Parsing note: normally, the prefix is already stripped.
+/// `flow.rs` makes sure that this doesn’t happen for code (fenced), as we need
+/// it.
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::CodeFenced);
+ tokenizer.enter(TokenType::CodeFencedFence);
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(before_sequence_open),
+ )(tokenizer, code)
+}
+
+/// Inside the opening fence, after an optional prefix, before a sequence.
+///
+/// ```markdown
+/// |~~~js
+/// console.log(1);
+/// ~~~
+/// ```
+fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let tail = tokenizer.events.last();
+ let mut prefix = 0;
+
+ if let Some(event) = tail {
+ if event.token_type == TokenType::Whitespace {
+ let span = get_span(&tokenizer.events, tokenizer.events.len() - 1);
+ prefix = span.end_index - span.start_index;
+ }
+ }
+
+ match code {
+ Code::Char(char) if char == '`' || char == '~' => {
+ tokenizer.enter(TokenType::CodeFencedFenceSequence);
+ sequence_open(
+ tokenizer,
+ Info {
+ prefix,
+ size: 0,
+ kind: if char == '`' {
+ Kind::GraveAccent
+ } else {
+ Kind::Tilde
+ },
+ },
+ code,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside the opening fence sequence.
+///
+/// ```markdown
+/// ~|~~js
+/// console.log(1);
+/// ~~~
+/// ```
+fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ let marker = if info.kind == Kind::GraveAccent {
+ '`'
+ } else {
+ '~'
+ };
+
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ let mut info = info;
+ info.size += 1;
+ sequence_open(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => {
+ if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN {
+ (State::Nok, None)
+ } else {
+ tokenizer.exit(TokenType::CodeFencedFenceSequence);
+ tokenizer.attempt(
+ |tokenizer, code| {
+ whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace)
+ },
+ |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)),
+ )(tokenizer, code)
+ }
+ }
+ }
+}
+
+/// Inside the opening fence, after the sequence (and optional whitespace), before the info.
+///
+/// ```markdown
+/// ~~~|js
+/// console.log(1);
+/// ~~~
+/// ```
+fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::CodeFencedFenceInfo);
+ tokenizer.enter(TokenType::ChunkString);
+ info_inside(tokenizer, info, code, vec![])
+ }
+ }
+}
+
+/// Inside the opening fence info.
+///
+/// ```markdown
+/// ~~~j|s
+/// console.log(1);
+/// ~~~
+/// ```
+fn info_inside(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+ codes: Vec<Code>,
+) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ println!("to do: subtokenize: {:?}", codes);
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::CodeFencedFenceInfo);
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ println!("to do: subtokenize: {:?}", codes);
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::CodeFencedFenceInfo);
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace),
+ |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)),
+ )(tokenizer, code)
+ }
+ Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
+ Code::Char(_) => {
+ let mut codes = codes;
+ codes.push(code);
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ info_inside(tokenizer, info, code, codes)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// Inside the opening fence, after the info and whitespace, before the meta.
+///
+/// ```markdown
+/// ~~~js |eval
+/// console.log(1);
+/// ~~~
+/// ```
+fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::CodeFencedFenceMeta);
+ tokenizer.enter(TokenType::ChunkString);
+ meta(tokenizer, info, code)
+ }
+ }
+}
+
+/// Inside the opening fence meta.
+///
+/// ```markdown
+/// ~~~js e|val
+/// console.log(1);
+/// ~~~
+/// ```
+fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::CodeFencedFenceMeta);
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))),
+ None,
+ )
+ }
+ }
+}
+
+/// At an eol/eof in code, before a closing fence or before content.
+///
+/// ```markdown
+/// ~~~js|
+/// aa|
+/// ~~~
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ let clone = info.clone();
+
+ match code {
+ Code::None => after(tokenizer, code),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt(
+ |tokenizer, code| {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ close_before(tokenizer, info, code)
+ })),
+ None,
+ )
+ },
+ |ok| {
+ if ok {
+ Box::new(after)
+ } else {
+ Box::new(|tokenizer, code| {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ content_start(tokenizer, clone, code)
+ })),
+ None,
+ )
+ })
+ }
+ },
+ )(tokenizer, code),
+ _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code),
+ }
+}
+
+/// Before a closing fence, before optional whitespace.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// |~~~
+///
+/// ~~~js
+/// console.log('1')
+/// | ~~~
+/// ```
+fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::CodeFencedFence);
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)),
+ )(tokenizer, code)
+}
+
+/// In a closing fence, after optional whitespace, before sequence.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// |~~~
+///
+/// ~~~js
+/// console.log('1')
+/// |~~~
+/// ```
+fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ let tail = tokenizer.events.last();
+ let mut prefix = 0;
+ let marker = if info.kind == Kind::GraveAccent {
+ '`'
+ } else {
+ '~'
+ };
+
+ if let Some(event) = tail {
+ if event.token_type == TokenType::Whitespace {
+ let span = get_span(&tokenizer.events, tokenizer.events.len() - 1);
+ prefix = span.end_index - span.start_index;
+ }
+ }
+
+ // To do: 4+ should be okay if code (indented) is turned off!
+ if prefix >= TAB_SIZE {
+ return (State::Nok, None);
+ }
+
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.enter(TokenType::CodeFencedFenceSequence);
+ close_sequence(tokenizer, info, code, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In the closing fence sequence.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// ~|~~
+/// ```
+fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult {
+ let marker = if info.kind == Kind::GraveAccent {
+ '`'
+ } else {
+ '~'
+ };
+
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ close_sequence(tokenizer, info, code, size + 1)
+ })),
+ None,
+ )
+ }
+ _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => {
+ tokenizer.exit(TokenType::CodeFencedFenceSequence);
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace),
+ |_ok| Box::new(close_whitespace_after),
+ )(tokenizer, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After the closing fence sequence, after optional whitespace.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// ~~~ |
+/// ```
+fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFencedFence);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Before code content, definitely not before a closing fence.
+///
+/// ```markdown
+/// ~~~js
+/// |aa
+/// ~~~
+/// ```
+fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ at_break(tokenizer, info, code)
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => {
+ tokenizer.enter(TokenType::Whitespace);
+ content_prefix(tokenizer, info, 0, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::CodeFlowChunk);
+ content_continue(tokenizer, info, code)
+ }
+ }
+}
+
+/// Before code content, in a prefix.
+///
+/// ```markdown
+/// ~~~js
+/// | aa
+/// ~~~
+/// ```
+fn content_prefix(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ prefix: usize,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ content_prefix(tokenizer, info, prefix + 1, code)
+ })),
+ None,
+ )
+ }
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::Whitespace);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.exit(TokenType::Whitespace);
+ tokenizer.enter(TokenType::CodeFlowChunk);
+ content_continue(tokenizer, info, code)
+ }
+ }
+}
+
+/// In code content.
+///
+/// ```markdown
+/// ~~~js
+/// |ab
+/// a|b
+/// ab|
+/// ~~~
+/// ```
+fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFlowChunk);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ content_continue(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// After fenced code.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// ~~~|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.exit(TokenType::CodeFenced);
+ (State::Ok, Some(vec![code]))
+}
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
new file mode 100644
index 0000000..6bf089b
--- /dev/null
+++ b/src/construct/code_indented.rs
@@ -0,0 +1,190 @@
+//! Code (indented) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line )
+//!
+//! ; Restriction: at least one `code` must not be whitespace.
+//! indented_filled_line ::= 4space_or_tab *code
+//! blank_line ::= *space_or_tab
+//! eol ::= '\r' | '\r\n' | '\n'
+//! code ::= . ; any unicode code point (other than line endings).
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! Code (indented) relates to both the `<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! In markdown, it is also possible to use code (text) in the text content
+//! type.
+//! It is also possible to create code with the [code (fenced)][code-fenced]
+//! construct.
+//! That construct is more explicit, more similar to code (text), and has
+//! support for specifying the programming language that the code is in, so it
+//! is recommended to use that instead of indented code.
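+//!
+//! For example:
+//!
+//! ```markdown
+//!     console.log(1)
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code>console.log(1)
+//! </code></pre>
+//! ```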
+//!
+//! ## References
+//!
+//! * [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js)
+//! * [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks)
+//!
+//! [code-fenced]: crate::construct::code_fenced
+//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+//!
+//! <!-- To do: link `flow`, `code_text` -->
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of code (indented).
+///
+/// ```markdown
+/// | asd
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.enter(TokenType::CodeIndented);
+ tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+ indent(tokenizer, code, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside the initial whitespace.
+///
+/// ```markdown
+/// | asd
+/// | asd
+/// | asd
+/// |asd
+/// ```
+///
+/// > **Parsing note**: there is no need to check whether this first line is a
+/// > filled line (that it has a non-whitespace character), because blank lines
+/// > are already parsed, so we never run into that.
+fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ match code {
+ _ if size == TAB_SIZE => {
+ tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+ at_break(tokenizer, code)
+ }
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ indent(tokenizer, code, size + 1)
+ })),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// At a break.
+///
+/// ```markdown
+/// |asd
+/// asd|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => after(tokenizer, code),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
+ .attempt(further_start, |ok| {
+ Box::new(if ok { at_break } else { after })
+ })(tokenizer, code),
+ _ => {
+ tokenizer.enter(TokenType::CodeFlowChunk);
+ content(tokenizer, code)
+ }
+ }
+}
+
+/// Inside code content.
+///
+/// ```markdown
+/// |ab
+/// a|b
+/// ab|
+/// ```
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFlowChunk);
+ at_break(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(content)), None)
+ }
+ }
+}
+
+/// After indented code.
+///
+/// ```markdown
+/// ab|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.exit(TokenType::CodeIndented);
+ (State::Ok, Some(vec![code]))
+}
+
+/// Right at a line ending, trying to parse another indent.
+///
+/// ```markdown
+/// ab|
+/// cd
+/// ```
+fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // To do: `nok` if lazy line.
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(further_start)), None)
+ }
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+ further_indent(tokenizer, code, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside further whitespace.
+///
+/// ```markdown
+/// asd
+/// | asd
+/// ```
+fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ match code {
+ _ if size == TAB_SIZE => {
+ tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+ (State::Ok, Some(vec![code]))
+ }
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ further_indent(tokenizer, code, size + 1)
+ })),
+ None,
+ )
+ }
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+ further_start(tokenizer, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
new file mode 100644
index 0000000..b3aef1b
--- /dev/null
+++ b/src/construct/heading_atx.rs
@@ -0,0 +1,175 @@
+//! Heading (atx) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab
+//!
+//! code ::= . ; any unicode code point (other than line endings).
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML.
+//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
+//! HTML spec][html] for more info.
+//!
+//! `CommonMark` introduced the requirement that whitespace exist after the
+//! opening sequence and before the text.
+//! In older markdown versions, this was not required, and headings would form
+//! without it.
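+//!
+//! For example:
+//!
+//! ```markdown
+//! ## alpha
+//!
+//! ##bravo
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <h2>alpha</h2>
+//! <p>##bravo</p>
+//! ```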
+//!
+//! In markdown, it is also possible to create headings with the setext heading
+//! construct.
+//! The benefit of setext headings is that their text can include line endings.
+//! However, their limit is that they cannot form `<h3>` through `<h6>`
+//! headings.
+//! Due to this limitation, it is recommended to use atx headings.
+//!
+//! > 🏛 **Background**: the word *setext* originates from a small markup
+//! > language by Ian Feldman from 1991.
+//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info.
+//! > The word *atx* originates from a tiny markup language by Aaron Swartz
+//! > from 2002.
+//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
+//! > more info.
+//!
+//! ## References
+//!
+//! * [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js)
+//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
+//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
+//! [atx]: http://www.aaronsw.com/2002/atx/
+//!
+//! <!-- To do: link `flow`, `setext` -->
+
+use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a heading (atx).
+///
+/// ```markdown
+/// |## alpha
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if Code::Char('#') == code {
+ tokenizer.enter(TokenType::AtxHeading);
+ tokenizer.enter(TokenType::AtxHeadingSequence);
+ sequence_open(tokenizer, code, 0)
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// In the opening sequence.
+///
+/// ```markdown
+/// #|# alpha
+/// ```
+fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult {
+ match code {
+ Code::None
+ | Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\t' | '\n' | '\r' | ' ')
+ if rank > 0 =>
+ {
+ tokenizer.exit(TokenType::AtxHeadingSequence);
+ at_break(tokenizer, code)
+ }
+ Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ sequence_open(tokenizer, code, rank + 1)
+ })),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After a part of the heading, before the next part or the end.
+///
+/// ```markdown
+/// ## |alpha
+/// ## alpha| bravo
+/// ## alpha |bravo
+/// ## alpha bravo|##
+/// ## alpha bravo ##|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::AtxHeading);
+ (State::Ok, Some(vec![code]))
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.enter(TokenType::AtxHeadingWhitespace);
+ whitespace(tokenizer, code)
+ }
+ Code::Char('#') => {
+ tokenizer.enter(TokenType::AtxHeadingSequence);
+ further_sequence(tokenizer, code)
+ }
+ Code::Char(_) => {
+ tokenizer.enter(TokenType::AtxHeadingText);
+ data(tokenizer, code)
+ }
+ }
+}
+
+/// In a further sequence (after whitespace).
+/// Could be normal “visible” hashes in the heading or a final sequence.
+///
+/// ```markdown
+/// ## alpha #|#
+/// ```
+fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if let Code::Char('#') = code {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(further_sequence)), None)
+ } else {
+ tokenizer.exit(TokenType::AtxHeadingSequence);
+ at_break(tokenizer, code)
+ }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// ## alpha | bravo
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(whitespace)), None)
+ }
+ _ => {
+ tokenizer.exit(TokenType::AtxHeadingWhitespace);
+ at_break(tokenizer, code)
+ }
+ }
+}
+
+/// In text.
+///
+/// ```markdown
+/// ## al|pha
+/// ```
+fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
+ tokenizer.exit(TokenType::AtxHeadingText);
+ at_break(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(data)), None)
+ }
+ }
+}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
new file mode 100644
index 0000000..b7d5570
--- /dev/null
+++ b/src/construct/html_flow.rs
@@ -0,0 +1,1068 @@
+//! HTML (flow) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete
+//!
+//! ; Note: the closing tag name needs to match the opening tag name.
+//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ]
+//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ]
+//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ]
+//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ]
+//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ]
+//! basic ::= '<' [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ]
+//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional )
+//!
+//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive.
+//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constant.rs`, and note: case-insensitive.
+//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>'
+//! closing_tag ::= '</' tag_name whitespace_optional '>'
+//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric )
+//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ]
+//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric )
+//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`')
+//!
+//! whitespace ::= 1*space_or_tab
+//! whitespace_optional ::= [ space_or_tab ]
+//! line ::= code - eol
+//! eol ::= '\r' | '\r\n' | '\n'
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! The grammar for HTML in markdown does not resemble the rules of parsing
+//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
+//! spec][html-parsing].
+//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?)
+//! attempt to parse an XML-like language.
+//! By extension, another notable property of the grammar is that it can
+//! result in invalid HTML, in that it allows things that wouldn’t work or
+//! wouldn’t work well in HTML, such as mismatched tags.
+//!
+//! Because the **basic** and **complete** productions in the grammar form with
+//! a tag, followed by more stuff, and stop at a blank line, it is possible to
+//! interleave (a word for switching between languages) markdown and HTML
+//! together, by placing the opening and closing tags on their own lines,
+//! with blank lines between them and markdown.
+//! For example:
+//!
+//! ```markdown
+//! <div>This is a <code>div</code> but *this* is not emphasis.</div>
+//!
+//! <div>
+//!
+//! This is a paragraph in a `div` and *this* is emphasis.
+//!
+//! </div>
+//! ```
+//!
+//! The **complete** production of HTML (flow) is not allowed to interrupt
+//! content.
+//! That means that a blank line is needed between a paragraph and it.
+//! However, HTML (text) has a similar production, which will typically kick in
+//! instead.
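+//!
+//! For example:
+//!
+//! ```markdown
+//! Foo
+//! <a href="bar">
+//! baz
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p>Foo
+//! <a href="bar">
+//! baz</p>
+//! ```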
+//!
+//! The list of tag names allowed in the **raw** production is defined in
+//! [`HTML_RAW_NAMES`][html_raw_names].
+//! This production exists because there are a few cases where markdown
+//! *inside* some elements, and hence interleaving, does not make sense.
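+//!
+//! For example, markdown inside a raw element is not parsed:
+//!
+//! ```markdown
+//! <pre>
+//! *not emphasis*
+//! </pre>
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre>
+//! *not emphasis*
+//! </pre>
+//! ```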
+//!
+//! The list of tag names allowed in the **basic** production is defined in
+//! [`HTML_BLOCK_NAMES`][html_block_names].
+//! This production exists because there are a few cases where we can decide
+//! early that something is going to be a flow (block) element instead of a
+//! phrasing (inline) element.
+//! We *can* interrupt and don’t have to care too much about it being
+//! well-formed.
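+//!
+//! For example, `<div>` can even interrupt a paragraph:
+//!
+//! ```markdown
+//! Foo
+//! <div>
+//! bar
+//! </div>
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p>Foo</p>
+//! <div>
+//! bar
+//! </div>
+//! ```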
+//!
+//! ## References
+//!
+//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js)
+//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+//!
+//! [html_raw_names]: crate::constant::HTML_RAW_NAMES
+//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
+//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
+//!
+//! <!-- To do: link stuff -->
+
+use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX};
+use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of HTML (flow).
+#[derive(Debug, Clone, PartialEq)]
+enum Kind {
+ /// Not yet known.
+ Unknown,
+ /// Symbol for `<script>` (condition 1).
+ Raw,
+ /// Symbol for `<!---->` (condition 2).
+ Comment,
+ /// Symbol for `<?php?>` (condition 3).
+ Instruction,
+ /// Symbol for `<!doctype>` (condition 4).
+ Declaration,
+ /// Symbol for `<![CDATA[]]>` (condition 5).
+ Cdata,
+ /// Symbol for `<div` (condition 6).
+ Basic,
+ /// Symbol for `<x>` (condition 7).
+ Complete,
+}
+
+/// Type of quote, if we’re in an attribute, in complete (condition 7).
+#[derive(Debug, Clone, PartialEq)]
+enum QuoteKind {
+ /// Not in a quoted attribute.
+ None,
+ /// In a double quoted (`"`) attribute.
+ Double,
+    /// In a single quoted (`'`) attribute.
+ Single,
+}
+
+/// State needed to parse HTML (flow).
+#[derive(Debug, Clone)]
+struct Info {
+ /// Kind of HTML (flow).
+ kind: Kind,
+ /// Whether this is a start tag (`<` not followed by `/`).
+ start_tag: bool,
+ /// Used depending on `kind` to either collect all parsed characters, or to
+ /// store expected characters.
+ buffer: Vec<char>,
+ /// `index` into `buffer` when expecting certain characters.
+ index: usize,
+ /// Current quote, when in a double or single quoted attribute value.
+ quote: QuoteKind,
+}
+
+// To do: mark as concrete (block quotes or lists can’t “pierce” into HTML).
+
+/// Start of HTML (flow), before optional whitespace.
+///
+/// ```markdown
+/// |<x />
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::HtmlFlow);
+ tokenizer.enter(TokenType::HtmlFlowData);
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(before),
+ )(tokenizer, code)
+}
+
+/// After optional whitespace, before `<`.
+///
+/// ```markdown
+/// |<x />
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if Code::Char('<') == code {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ open(
+ tokenizer,
+ Info {
+ kind: Kind::Unknown,
+ start_tag: false,
+ buffer: vec![],
+ index: 0,
+ quote: QuoteKind::None,
+ },
+ code,
+ )
+ })),
+ None,
+ )
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// After `<`, before a tag name or other stuff.
+///
+/// ```markdown
+/// <|x />
+/// <|!doctype />
+/// <|!--xxx--/>
+/// ```
+fn open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('!') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ declaration_start(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char('/') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ tag_close_start(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char('?') => {
+ // To do: life times.
+ let mut clone = info;
+ clone.kind = Kind::Instruction;
+ tokenizer.consume(code);
+            // Although we’re in an instruction instead of a declaration,
+            // we’re on a `?` right now, so we do need to search for `>`,
+            // similar to declarations.
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_declaration_inside(tokenizer, clone, code)
+ })),
+ None,
+ )
+ }
+ Code::Char(char) if char.is_ascii_alphabetic() => {
+ // To do: life times.
+ let mut clone = info;
+ clone.start_tag = true;
+ tag_name(tokenizer, clone, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After `<!`, so inside a declaration, comment, or CDATA.
+///
+/// ```markdown
+/// <!|doctype />
+/// <!|--xxx--/>
+/// <!|[CDATA[>&<]]>
+/// ```
+fn declaration_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('-') => {
+ tokenizer.consume(code);
+ let mut clone = info;
+ clone.kind = Kind::Comment;
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ comment_open_inside(tokenizer, clone, code)
+ })),
+ None,
+ )
+ }
+ Code::Char('[') => {
+ tokenizer.consume(code);
+ let mut clone = info;
+ clone.kind = Kind::Cdata;
+ clone.buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
+ clone.index = 0;
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ cdata_open_inside(tokenizer, clone, code)
+ })),
+ None,
+ )
+ }
+ Code::Char(char) if char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ // To do: life times.
+ let mut clone = info;
+ clone.kind = Kind::Declaration;
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_declaration_inside(tokenizer, clone, code)
+ })),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After `<!-`, inside a comment, before another `-`.
+///
+/// ```markdown
+/// <!-|-xxx--/>
+/// ```
+fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('-') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_declaration_inside(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After `<![`, inside CDATA, expecting `CDATA[`.
+///
+/// ```markdown
+/// <![|CDATA[>&<]]>
+/// <![CD|ATA[>&<]]>
+/// <![CDA|TA[>&<]]>
+/// <![CDAT|A[>&<]]>
+/// <![CDATA|[>&<]]>
+/// ```
+fn cdata_open_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == info.buffer[info.index] => {
+ let mut clone = info;
+ clone.index += 1;
+ tokenizer.consume(code);
+
+ if clone.index == clone.buffer.len() {
+ clone.buffer.clear();
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation(tokenizer, clone, code)
+ })),
+ None,
+ )
+ } else {
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ cdata_open_inside(tokenizer, clone, code)
+ })),
+ None,
+ )
+ }
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After `</`, in a closing tag, before a tag name.
+///
+/// ```markdown
+/// </|x>
+/// ```
+fn tag_close_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ // To do: life times.
+ let mut clone = info;
+ clone.buffer.push(char);
+ (
+ State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In a tag name.
+///
+/// ```markdown
+/// <a|b>
+/// </a|b>
+/// ```
+fn tag_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None
+ | Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
+ let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
+ let name = tag_name_buffer.as_str();
+ let slash = if let Code::Char(char) = code {
+ char == '/'
+ } else {
+ false
+ };
+
+ if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) {
+ // To do: life times.
+ let mut clone = info;
+ clone.kind = Kind::Raw;
+ clone.buffer.clear();
+ continuation(tokenizer, clone, code)
+ } else if HTML_BLOCK_NAMES.contains(&name) {
+ // To do: life times.
+ let mut clone = info;
+ clone.kind = Kind::Basic;
+ clone.buffer.clear();
+
+ if slash {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ basic_self_closing(tokenizer, clone, code)
+ })),
+ None,
+ )
+ } else {
+ continuation(tokenizer, clone, code)
+ }
+ } else {
+ // To do: life times.
+ let mut clone = info;
+ clone.kind = Kind::Complete;
+
+ // To do: do not support complete HTML when interrupting.
+ if clone.start_tag {
+ complete_attribute_name_before(tokenizer, clone, code)
+ } else {
+ complete_closing_tag_after(tokenizer, clone, code)
+ }
+ }
+ }
+ Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+ tokenizer.consume(code);
+ let mut clone = info;
+ clone.buffer.push(char);
+ (
+ State::Fn(Box::new(|tokenizer, code| tag_name(tokenizer, clone, code))),
+ None,
+ )
+ }
+ Code::Char(_) => (State::Nok, None),
+ }
+}
+
+/// After a closing slash of a basic tag name.
+///
+/// ```markdown
+/// <div/|>
+/// ```
+fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After a closing slash of a complete tag name.
+///
+/// ```markdown
+/// <x/|>
+/// </x/|>
+/// ```
+fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_closing_tag_after(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => complete_end(tokenizer, info, code),
+ }
+}
+
+/// At a place where an attribute name would be valid.
+///
+/// At first, this state is used after a complete tag name, after whitespace,
+/// where it expects optional attributes or the end of the tag.
+/// It is also reused after attributes, when expecting more optional
+/// attributes.
+///
+/// ```markdown
+/// <x |/>
+/// <x |:asd>
+/// <x |_asd>
+/// <x |asd>
+/// <x | >
+/// <x |>
+/// ```
+fn complete_attribute_name_before(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::Char('/') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_end(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_name(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_name_before(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => complete_end(tokenizer, info, code),
+ }
+}
+
+/// In an attribute name.
+///
+/// ```markdown
+/// <x :|>
+/// <x _|>
+/// <x a|>
+/// ```
+fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char)
+ if char == '-'
+ || char == '.'
+ || char == ':'
+ || char == '_'
+ || char.is_ascii_alphanumeric() =>
+ {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_name(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => complete_attribute_name_after(tokenizer, info, code),
+ }
+}
+
+/// After an attribute name, before an attribute initializer, the end of the
+/// tag, or whitespace.
+///
+/// ```markdown
+/// <x a|>
+/// <x a|=b>
+/// <x a|="c">
+/// ```
+fn complete_attribute_name_after(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::Char('=') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_value_before(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_name_after(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => complete_attribute_name_before(tokenizer, info, code),
+ }
+}
+
+/// Before an unquoted, double quoted, or single quoted attribute value,
+/// allowing whitespace.
+///
+/// ```markdown
+/// <x a=|b>
+/// <x a=|"c">
+/// ```
+fn complete_attribute_value_before(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
+ Code::Char(char) if char == '"' || char == '\'' => {
+ tokenizer.consume(code);
+ // To do: life times.
+ let mut clone = info;
+ clone.quote = if char == '"' {
+ QuoteKind::Double
+ } else {
+ QuoteKind::Single
+ };
+
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_value_quoted(tokenizer, clone, code)
+ })),
+ None,
+ )
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_value_before(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => complete_attribute_value_unquoted(tokenizer, info, code),
+ }
+}
+
+/// In a double or single quoted attribute value.
+///
+/// ```markdown
+/// <x a="|">
+/// <x a='|'>
+/// ```
+fn complete_attribute_value_quoted(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ let marker = if info.quote == QuoteKind::Double {
+ '"'
+ } else {
+ '\''
+ };
+
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None),
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_value_quoted_after(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_value_quoted(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// In an unquoted attribute value.
+///
+/// ```markdown
+/// <x a=b|c>
+/// ```
+fn complete_attribute_value_unquoted(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::None
+ | Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
+ complete_attribute_name_after(tokenizer, info, code)
+ }
+ Code::Char(_) => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_attribute_value_unquoted(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// After a double or single quoted attribute value, before whitespace or the
+/// end of the tag.
+///
+/// ```markdown
+/// <x a="b"|>
+/// ```
+fn complete_attribute_value_quoted_after(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
+ complete_attribute_name_before(tokenizer, info, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In certain circumstances of a complete tag where only an `>` is allowed.
+///
+/// ```markdown
+/// <x a="b"|>
+/// ```
+fn complete_end(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_after(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After `>` in a complete tag.
+///
+/// ```markdown
+/// <x>|
+/// ```
+fn complete_after(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ continuation(tokenizer, info, code)
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ complete_after(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char(_) => (State::Nok, None),
+ }
+}
+
+/// Inside continuation of any HTML kind.
+///
+/// ```markdown
+/// <!--x|xx-->
+/// ```
+fn continuation(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('-') if info.kind == Kind::Comment => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_comment_inside(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char('<') if info.kind == Kind::Raw => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_raw_tag_open(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char('>') if info.kind == Kind::Declaration => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_close(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char('?') if info.kind == Kind::Instruction => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_declaration_inside(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char(']') if info.kind == Kind::Cdata => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_character_data_inside(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+ if info.kind == Kind::Basic || info.kind == Kind::Complete =>
+ {
+ let clone = info;
+
+ tokenizer.check(blank_line_before, |ok| {
+ if ok {
+ Box::new(|tokenizer, code| continuation_close(tokenizer, clone, code))
+ } else {
+ Box::new(|tokenizer, code| continuation_at_line_ending(tokenizer, clone, code))
+ }
+ })(tokenizer, code)
+ }
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ continuation_at_line_ending(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// In continuation, before an eol or eof.
+///
+/// ```markdown
+/// <x>|
+/// ```
+fn continuation_at_line_ending(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ tokenizer.exit(TokenType::HtmlFlowData);
+ html_continue_start(tokenizer, info, code)
+}
+
+/// In continuation, after an eol.
+///
+/// ```markdown
+/// <x>|
+/// asd
+/// ```
+fn html_continue_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None => {
+ tokenizer.exit(TokenType::HtmlFlow);
+ (State::Ok, Some(vec![code]))
+ }
+ // To do: do not allow lazy lines.
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ html_continue_start(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.enter(TokenType::HtmlFlowData);
+ continuation(tokenizer, info, code)
+ }
+ }
+}
+
+/// In comment continuation, after one `-`, expecting another.
+///
+/// ```markdown
+/// <!--xxx-|->
+/// ```
+fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('-') if info.kind == Kind::Comment => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_declaration_inside(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => continuation(tokenizer, info, code),
+ }
+}
+
+/// In raw continuation, after `<`, expecting a `/`.
+///
+/// ```markdown
+/// <script>console.log(1)<|/script>
+/// ```
+fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('/') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_raw_end_tag(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => continuation(tokenizer, info, code),
+ }
+}
+
+/// In raw continuation, after `</`, expecting or inside a raw tag name.
+///
+/// ```markdown
+/// <script>console.log(1)</|script>
+/// <script>console.log(1)</s|cript>
+/// <script>console.log(1)</script|>
+/// ```
+fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
+ // To do: life times.
+ let mut clone = info;
+ clone.buffer.clear();
+
+ if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_close(tokenizer, clone, code)
+ })),
+ None,
+ )
+ } else {
+ continuation(tokenizer, clone, code)
+ }
+ }
+ Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => {
+ tokenizer.consume(code);
+ // To do: life times.
+ let mut clone = info;
+ clone.buffer.push(char);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_raw_end_tag(tokenizer, clone, code)
+ })),
+ None,
+ )
+ }
+ _ => continuation(tokenizer, info, code),
+ }
+}
+
+/// In cdata continuation, after `]`, expecting `]>`.
+///
+/// ```markdown
+/// <![CDATA[>&<]|]>
+/// ```
+fn continuation_character_data_inside(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::Char(']') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_declaration_inside(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => continuation(tokenizer, info, code),
+ }
+}
+
+/// In declaration or instruction continuation, waiting for `>` to close it.
+///
+/// ```markdown
+/// <!--|>
+/// <?ab?|>
+/// <?|>
+/// <!q|>
+/// <!--ab--|>
+/// <!--ab--|->
+/// <!--ab---|>
+/// <![CDATA[>&<]]|>
+/// ```
+fn continuation_declaration_inside(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_close(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ Code::Char('-') if info.kind == Kind::Comment => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_declaration_inside(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => continuation(tokenizer, info, code),
+ }
+}
+
+/// In closed continuation: everything we get until the eol/eof is part of it.
+///
+/// ```markdown
+/// <!doctype>|
+/// ```
+fn continuation_close(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::HtmlFlowData);
+ tokenizer.exit(TokenType::HtmlFlow);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ continuation_close(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// Before a line ending, expecting a blank line.
+///
+/// ```markdown
+/// <div>|
+///
+/// ```
+fn blank_line_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(blank_line)), None)
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
new file mode 100644
index 0000000..d671db6
--- /dev/null
+++ b/src/construct/mod.rs
@@ -0,0 +1,11 @@
+//! Constructs found in markdown.
+
+pub mod blank_line;
+pub mod character_escape;
+pub mod character_reference;
+pub mod code_fenced;
+pub mod code_indented;
+pub mod heading_atx;
+pub mod html_flow;
+pub mod partial_whitespace;
+pub mod thematic_break;
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
new file mode 100644
index 0000000..dd0d2b5
--- /dev/null
+++ b/src/construct/partial_whitespace.rs
@@ -0,0 +1,66 @@
+//! A little helper to parse `space_or_tab`.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! space_or_tab ::= 1*(' ' | '\t')
+//! ```
+//!
+//! Depending on where whitespace can occur, it can be optional (or not),
+//! and present in the rendered result (or not).
+//!
+//! ## References
+//!
+//! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js)
+//!
+//! <!-- To do: link stuff -->
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+// To do: should `token_type` be a `Some`, with `None` defaulting to something?
+// To do: should `max: Some(usize)` be added?
+
+/// Before whitespace.
+///
+/// ```markdown
+/// alpha| bravo
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ // To do: lifetimes.
+ let clone = token_type.clone();
+ tokenizer.enter(token_type);
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| inside(tokenizer, code, clone))),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// alpha |bravo
+/// alpha | bravo
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ inside(tokenizer, code, token_type)
+ })),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.exit(token_type);
+ (State::Ok, Some(vec![code]))
+ }
+ }
+}
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
new file mode 100644
index 0000000..15ebac7
--- /dev/null
+++ b/src/construct/thematic_break.rs
@@ -0,0 +1,137 @@
+//! Thematic breaks, sometimes called horizontal rules, are a construct that
+//! occurs in the flow content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: all markers must be identical.
+//! ; Restriction: at least 3 markers must be used.
+//! thematic_break ::= *space_or_tab 1*(1*marker *space_or_tab)
+//!
+//! space_or_tab ::= ' ' | '\t'
+//! marker ::= '*' | '-' | '_'
+//! ```
+//!
+//! Thematic breaks in markdown typically relate to the HTML element `<hr>`.
+//! See [*§ 4.4.2 The `hr` element* in the HTML spec][html] for more info.
+//!
+//! It is recommended to use exactly three asterisks without whitespace when
+//! writing markdown, as using more than three markers has no effect other
+//! than wasting space.
+//! Thematic breaks formed with asterisks or dashes can interfere with lists
+//! if there is whitespace between them: `* * *` and `- - -`.
+//! For this reason, it is recommended to not use spaces or tabs between the
+//! markers.
+//! Thematic breaks formed with dashes (without whitespace) can also form
+//! setext headings.
+//! As dashes and underscores frequently occur in natural language and URLs, it
+//! is recommended to use asterisks for thematic breaks to distinguish from
+//! such use.
+//! Because asterisks can be used to form most markdown constructs, using
+//! them has the added benefit of making it easier to gloss over markdown: you
+//! can look for asterisks to find syntax while not worrying about other
+//! characters.
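+//!
+//! For example, both of the following form a thematic break, but the second
+//! is easily confused with a list:
+//!
+//! ```markdown
+//! ***
+//!
+//! * * *
+//! ```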
+//!
+//! ## References
+//!
+//! * [`thematic-break.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/thematic-break.js)
+//! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element
+//!
+//! <!-- To do: link `flow` -->
+
+use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a thematic break.
+///
+/// ```markdown
+/// |***
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == '*' || char == '-' || char == '_' => {
+ tokenizer.enter(TokenType::ThematicBreak);
+ at_break(tokenizer, code, char, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After something but before something else.
+///
+/// ```markdown
+/// |***
+/// *| * *
+/// * |* *
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.enter(TokenType::ThematicBreakSequence);
+ sequence(tokenizer, code, marker, size)
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.enter(TokenType::ThematicBreakWhitespace);
+ whitespace(tokenizer, code, marker, size)
+ }
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
+ if size >= THEMATIC_BREAK_MARKER_COUNT_MIN =>
+ {
+ tokenizer.exit(TokenType::ThematicBreak);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In a sequence of markers.
+///
+/// ```markdown
+/// |***
+/// *|**
+/// **|*
+/// ```
+fn sequence(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ sequence(tokenizer, code, marker, size + 1)
+ })),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.exit(TokenType::ThematicBreakSequence);
+ at_break(tokenizer, code, marker, size)
+ }
+ }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// * |* *
+/// * | * *
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ whitespace(tokenizer, code, marker, size)
+ })),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.exit(TokenType::ThematicBreakWhitespace);
+ at_break(tokenizer, code, marker, size)
+ }
+ }
+}
diff --git a/src/content/flow.rs b/src/content/flow.rs
new file mode 100644
index 0000000..21c5721
--- /dev/null
+++ b/src/content/flow.rs
@@ -0,0 +1,258 @@
+//! The flow content type.
+//!
+//! **Flow** represents the sections, such as headings, code, and content,
+//! which are parsed per line.
+//! An example is HTML, which has a certain starting condition (such as
+//! `<script>` on its own line), then continues for a while, until an end
+//! condition is found (such as `</style>`).
+//! If that line with an end condition is never found, that flow goes until
+//! the end.
+//!
+//! The constructs found in flow are:
+//!
+//! * [Blank line][crate::construct::blank_line]
+//! * [Code (fenced)][crate::construct::code_fenced]
+//! * [Code (indented)][crate::construct::code_indented]
+//! * [Heading (atx)][crate::construct::heading_atx]
+//! * [HTML (flow)][crate::construct::html_flow]
+//! * [Thematic break][crate::construct::thematic_break]
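+//!
+//! For example, each of the following blocks is a different flow construct:
+//!
+//! ```markdown
+//! # heading (atx)
+//!
+//! ***
+//!
+//!     code (indented)
+//! ```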
+//!
+//! <!-- To do: `setext` in content? Link to content. -->
+
+use crate::construct::{
+ blank_line::start as blank_line, code_fenced::start as code_fenced,
+ code_indented::start as code_indented, heading_atx::start as heading_atx,
+ html_flow::start as html_flow, partial_whitespace::start as whitespace,
+ thematic_break::start as thematic_break,
+};
+use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+
+/// Turn `codes` as the flow content type into events.
+// To do: remove this `allow` when all the content types are glued together.
+#[allow(dead_code)]
+pub fn flow(codes: Vec<Code>) -> Vec<Event> {
+ let mut tokenizer = Tokenizer::new();
+ let (state, remainder) = tokenizer.feed(codes, Box::new(start), true);
+
+ if let Some(ref x) = remainder {
+ if !x.is_empty() {
+ unreachable!("expected no final remainder {:?}", x);
+ }
+ }
+
+ match state {
+ State::Ok => {}
+ _ => unreachable!("expected final state to be `State::Ok`"),
+ }
+
+ tokenizer.events
+}
+
+/// Before flow.
+///
+/// First we assume a blank line.
+///
+/// ```markdown
+/// |
+/// |## alpha
+/// | bravo
+/// |***
+/// ```
+fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(blank_line, |ok| {
+ Box::new(if ok { blank_line_after } else { initial_before })
+ })(tokenizer, code),
+ }
+}
+
+/// After a blank line.
+///
+/// Move to `start` afterwards.
+///
+/// ```markdown
+/// ␠␠|
+/// ```
+fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::BlankLineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::BlankLineEnding);
+ (State::Fn(Box::new(start)), None)
+ }
+ _ => unreachable!("expected eol/eof after blank line `{:?}`", code),
+ }
+}
+
+/// Before flow (initial).
+///
+/// “Initial” flow means unprefixed flow, so right at the start of a line.
+/// Interestingly, the only flow (initial) construct is indented code.
+/// Move to `before` afterwards.
+///
+/// ```markdown
+/// |qwe
+/// | asd
+/// ```
+fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(code_indented, |ok| {
+ Box::new(if ok {
+ after
+ } else {
+ initial_before_not_code_indented
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// After a flow construct.
+///
+/// ```markdown
+/// ## alpha|
+/// |
+/// ~~~js
+/// asd
+/// ~~~|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(start)), None)
+ }
+ _ => unreachable!("unexpected non-eol/eof after flow `{:?}`", code),
+ }
+}
+
+/// Before flow (initial), but not at code (indented).
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn initial_before_not_code_indented(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(code_fenced, |ok| {
+ Box::new(if ok {
+ after
+ } else {
+ initial_before_not_code_fenced
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// Before flow (initial), but not at code (fenced).
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn initial_before_not_code_fenced(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(html_flow, |ok| Box::new(if ok { after } else { before }))(
+ tokenizer, code,
+ ),
+ }
+}
+
+/// Before flow, but not at code (indented) or code (fenced).
+///
+/// Compared to flow (initial), normal flow can be arbitrarily prefixed.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(before_after_prefix),
+ )(tokenizer, code)
+}
+
+/// Before flow, after potential whitespace.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(heading_atx, |ok| {
+ Box::new(if ok { after } else { before_not_heading_atx })
+ })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx)
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_not_heading_atx(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(thematic_break, |ok| {
+ Box::new(if ok { after } else { before_not_thematic_break })
+ })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx) or thematic break.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_not_thematic_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(html_flow, |ok| {
+ Box::new(if ok { after } else { content_before })
+ })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx), a thematic break, or HTML
+/// (flow).
+///
+/// At this point, we’re at content (zero or more definitions and zero or one
+/// paragraph/setext heading).
+///
+/// ```markdown
+/// |qwe
+/// ```
+// To do: currently only parses a single line.
+// To do:
+// - Multiline
+// - One or more definitions.
+// - Setext heading.
+fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ after(tokenizer, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::Content);
+ tokenizer.enter(TokenType::ContentPhrasing);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(content)), None)
+ }
+ }
+}
+
+/// In content.
+///
+/// ```markdown
+/// al|pha
+/// ```
+// To do: lift limitations as documented above.
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::ContentPhrasing);
+ tokenizer.exit(TokenType::Content);
+ after(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(content)), None)
+ }
+ }
+}
diff --git a/src/content/mod.rs b/src/content/mod.rs
new file mode 100644
index 0000000..d5771a3
--- /dev/null
+++ b/src/content/mod.rs
@@ -0,0 +1,4 @@
+//! Content types found in markdown.
+
+pub mod flow;
+pub mod string;
diff --git a/src/content/string.rs b/src/content/string.rs
new file mode 100644
index 0000000..a8a81b2
--- /dev/null
+++ b/src/content/string.rs
@@ -0,0 +1,120 @@
+//! The string content type.
+//!
+//! **String** is a limited **text**-like content type which only allows
+//! character escapes and character references.
+//! It exists in things such as identifiers (media references, definitions),
+//! titles, URLs, code (fenced) info and meta parts.
+//!
+//! The constructs found in string are:
+//!
+//! * [Character escape][crate::construct::character_escape]
+//! * [Character reference][crate::construct::character_reference]
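+//!
+//! For example (a sketch of a string, such as the info of code (fenced)):
+//!
+//! ```markdown
+//! a &amp; b \& c
+//! ```
+//!
+//! Here, `&amp;` is a character reference, `\&` is a character escape, and
+//! the rest is data.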
+
+use crate::construct::{
+ character_escape::start as character_escape, character_reference::start as character_reference,
+};
+use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+
+/// Turn `codes` as the string content type into events.
+// To do: remove this `allow` when all the content types are glued together.
+#[allow(dead_code)]
+pub fn string(codes: Vec<Code>) -> Vec<Event> {
+ let mut tokenizer = Tokenizer::new();
+ let (state, remainder) = tokenizer.feed(codes, Box::new(before), true);
+
+ if let Some(ref x) = remainder {
+ if !x.is_empty() {
+ unreachable!("expected no final remainder {:?}", x);
+ }
+ }
+
+ match state {
+ State::Ok => {}
+ _ => unreachable!("expected final state to be `State::Ok`"),
+ }
+
+ tokenizer.events
+}
+
+/// Before string.
+///
+/// First we assume character reference.
+///
+/// ```markdown
+/// |&amp;
+/// |\&
+/// |qwe
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(character_reference, |ok| {
+ Box::new(if ok {
+ before
+ } else {
+ before_not_character_reference
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// Before string, not at a character reference.
+///
+/// Assume character escape.
+///
+/// ```markdown
+/// |\&
+/// |qwe
+/// ```
+fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(character_escape, |ok| {
+ Box::new(if ok {
+ before
+ } else {
+ before_not_character_escape
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// Before string, not at a character reference or character escape.
+///
+/// We’re at data.
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn before_not_character_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if let Code::None = code {
+ (State::Ok, None)
+ } else {
+ tokenizer.enter(TokenType::Data);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(in_data)), None)
+ }
+}
+
+/// In data.
+///
+/// ```markdown
+/// q|w|e
+/// ```
+fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => {
+ tokenizer.exit(TokenType::Data);
+ (State::Ok, None)
+ }
+ // To do: somehow get these markers from constructs.
+ Code::Char('&' | '\\') => {
+ tokenizer.exit(TokenType::Data);
+ before(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(in_data)), None)
+ }
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..1624a22
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,52 @@
+//! Public API of micromark.
+//!
+//! This module exposes [`micromark`][] (and [`micromark_with_options`][]).
+//! `micromark` is a safe way to transform (untrusted?) markdown into HTML.
+//! `micromark_with_options` allows you to configure how markdown is turned into
+//! HTML, such as by allowing dangerous HTML when you trust it.
+mod compiler;
+mod constant;
+mod construct;
+mod content;
+mod parser;
+mod tokenizer;
+mod util;
+
+use crate::compiler::compile;
+pub use crate::compiler::CompileOptions;
+use crate::parser::parse;
+
+/// Turn markdown into HTML.
+///
+/// ## Examples
+///
+/// ```rust
+/// use micromark::micromark;
+///
+/// let result = micromark("# Hello, world!");
+///
+/// assert_eq!(result, "<h1>Hello, world!</h1>");
+/// ```
+#[must_use]
+pub fn micromark(value: &str) -> String {
+ micromark_with_options(value, &CompileOptions::default())
+}
+
+/// Turn markdown into HTML, with configuration.
+///
+/// ## Examples
+///
+/// ```rust
+/// use micromark::{micromark_with_options, CompileOptions};
+///
+/// let result = micromark_with_options("<div>\n\n# Hello, world!\n\n</div>", &CompileOptions {
+/// allow_dangerous_html: true,
+/// });
+///
+/// assert_eq!(result, "<div>\n<h1>Hello, world!</h1>\n</div>");
+/// ```
+#[must_use]
+pub fn micromark_with_options(value: &str, options: &CompileOptions) -> String {
+ let (events, codes) = parse(value);
+ compile(&events, &codes, options)
+}
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..10c6e7a
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,14 @@
+//! Turn a string of markdown into events.
+// To do: this should start with `containers`, when they’re done.
+// To do: definitions and such will mean more data has to be passed around.
+use crate::content::flow::flow;
+use crate::tokenizer::{as_codes, Code, Event};
+
+/// Turn a string of markdown into events.
+/// Passes the codes back so the compiler can access the source.
+pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) {
+ let codes = as_codes(value);
+ // To do: pass a reference to this around, and slices in the (back)feeding. Might be tough.
+ let events = flow(codes.clone());
+ (events, codes)
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 0000000..c8b1440
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,580 @@
+//! The tokenizer glues states from the state machine together.
+//!
+//! It facilitates everything needed to turn codes into tokens and events with
+//! a state machine.
+//! It also enables logic needed for parsing markdown, such as an [`attempt`][]
+//! to parse something, which can succeed or, when unsuccessful, revert the
+//! attempt.
+//! Similarly, a [`check`][] exists, which does the same as an `attempt` but
+//! reverts even if successful.
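+//!
+//! For example (a sketch of how this is used elsewhere in this crate;
+//! `next_construct` is a placeholder state), an attempt routes to one of two
+//! next states depending on whether the wrapped state succeeded:
+//!
+//! ```rust ignore
+//! tokenizer.attempt(code_fenced, |ok| {
+//!     Box::new(if ok { after } else { next_construct })
+//! })(tokenizer, code)
+//! ```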
+//!
+//! [`attempt`]: Tokenizer::attempt
+//! [`check`]: Tokenizer::check
+
+use crate::constant::TAB_SIZE;
+
+/// Semantic label of a span.
+// To do: figure out how to share this so extensions can add their own stuff,
+// though perhaps that’s impossible and we should inline all extensions?
+// To do: document each variant.
+#[derive(Debug, Clone, PartialEq)]
+pub enum TokenType {
+ AtxHeading,
+ AtxHeadingSequence,
+ AtxHeadingWhitespace,
+ AtxHeadingText,
+
+ CharacterEscape,
+ CharacterEscapeMarker,
+ CharacterEscapeValue,
+
+ CharacterReference,
+ CharacterReferenceMarker,
+ CharacterReferenceMarkerNumeric,
+ CharacterReferenceMarkerHexadecimal,
+ CharacterReferenceMarkerSemi,
+ CharacterReferenceValue,
+
+ CodeFenced,
+ CodeFencedFence,
+ CodeFencedFenceSequence,
+ CodeFencedFenceWhitespace,
+ CodeFencedFenceInfo,
+ CodeFencedFenceMeta,
+
+ CodeIndented,
+ CodeIndentedPrefixWhitespace,
+
+ CodeFlowChunk,
+
+ Data,
+
+ HtmlFlow,
+ HtmlFlowData,
+
+ ThematicBreak,
+ ThematicBreakSequence,
+ ThematicBreakWhitespace,
+
+ Whitespace,
+ LineEnding,
+ BlankLineEnding,
+ BlankLineWhitespace,
+
+ Content,
+ ContentPhrasing,
+ ChunkString,
+}
+
+/// Enum representing a character code.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum Code {
+ /// End of the input stream (called eof).
+ None,
+ /// Used to make parsing line endings easier as it represents both
+ /// `Code::Char('\r')` and `Code::Char('\n')` combined.
+ CarriageReturnLineFeed,
+    /// The expansion of a tab (`Code::Char('\t')`); depending on where the tab
+    /// occurred, it’s followed by 0 to 3 (both inclusive) `Code::VirtualSpace`s.
+ VirtualSpace,
+ /// The most frequent variant of this enum is `Code::Char(char)`, which just
+ /// represents a char, but micromark adds meaning to certain other values.
+ Char(char),
+}
+
+/// A location in the document (`line`/`column`/`offset`).
+///
+/// The interface for the location in the document comes from unist `Point`:
+/// <https://github.com/syntax-tree/unist#point>.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Point {
+ /// 1-indexed line number.
+ pub line: usize,
+ /// 1-indexed column number.
+    /// Note that this increases up to a tab stop for tabs.
+    /// Some editors count tabs as 1 character, so this position is not
+    /// always the same as in editors.
+ pub column: usize,
+ /// 0-indexed position in the document.
+ pub offset: usize,
+}
+
+/// Possible event types.
+#[derive(Debug, PartialEq)]
+pub enum EventType {
+ /// The start of something.
+ Enter,
+ /// The end of something.
+ Exit,
+}
+
+/// Something semantic happening somewhere.
+#[derive(Debug)]
+pub struct Event {
+ pub event_type: EventType,
+ pub token_type: TokenType,
+ pub point: Point,
+ pub index: usize,
+}
+
+/// The essence of the state machine are functions: `StateFn`.
+/// It’s responsible for dealing with that single passed [`Code`][].
+/// It yields a [`StateFnResult`][].
+pub type StateFn = dyn FnOnce(&mut Tokenizer, Code) -> StateFnResult;
+/// Each [`StateFn`][] yields something back: primarily the state.
+/// In certain cases, it can also yield back unparsed codes that were passed down.
+pub type StateFnResult = (State, Option<Vec<Code>>);
+
+/// The result of a state.
+pub enum State {
+ /// There is a future state: a boxed [`StateFn`][] to pass the next code to.
+ Fn(Box<StateFn>),
+ /// The state is successful.
+ Ok,
+ /// The state is not successful.
+ Nok,
+}
+
+/// The internal state of a tokenizer, not to be confused with states from the
+/// state machine, this instead is all the information about where we currently
+/// are and what’s going on.
+#[derive(Debug, Clone)]
+struct InternalState {
+ /// Length of `events`. We only add to events, so reverting will just pop stuff off.
+ events_len: usize,
+ /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt.
+ stack_len: usize,
+ /// Current code.
+ current: Code,
+ /// `index` in codes of the current code.
+ index: usize,
+ /// Current relative and absolute position in the file.
+ point: Point,
+}
+
+/// A tokenizer itself.
+#[derive(Debug)]
+pub struct Tokenizer {
+ /// Track whether a character is expected to be consumed, and whether it’s
+    /// actually consumed.
+ ///
+ /// Tracked to make sure everything’s valid.
+ consumed: bool,
+ /// Semantic labels of one or more codes in `codes`.
+ pub events: Vec<Event>,
+ /// Hierarchy of semantic labels.
+ ///
+ /// Tracked to make sure everything’s valid.
+ stack: Vec<TokenType>,
+ /// Current character code.
+ current: Code,
+ /// `index` in codes of the current code.
+ index: usize,
+ /// Current relative and absolute place in the file.
+ point: Point,
+}
+
+impl Tokenizer {
+ /// Create a new tokenizer.
+ pub fn new() -> Tokenizer {
+ Tokenizer {
+ current: Code::None,
+ index: 0,
+ consumed: true,
+ point: Point {
+ line: 1,
+ column: 1,
+ offset: 0,
+ },
+ stack: vec![],
+ events: vec![],
+ }
+ }
+
+ /// Prepare for a next code to get consumed.
+ fn expect(&mut self, code: Code) {
+ assert!(self.consumed, "expected previous character to be consumed");
+ self.consumed = false;
+ self.current = code;
+ }
+
+ /// Consume the current character.
+ /// Each [`StateFn`][] is expected to call this to signal that this code is
+ /// used, or call a next `StateFn`.
+ pub fn consume(&mut self, code: Code) {
+ assert_eq!(
+ code, self.current,
+ "expected given code to equal expected code"
+ );
+ log::debug!("consume: `{:?}` ({:?})", code, self.point);
+ assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned");
+
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ self.point.line += 1;
+ self.point.column = 1;
+ self.point.offset += if code == Code::CarriageReturnLineFeed {
+ 2
+ } else {
+ 1
+ };
+ // To do: accountForPotentialSkip()
+ log::debug!("position: after eol: `{:?}`", self.point);
+ }
+ Code::VirtualSpace => {
+ // Empty.
+ }
+ _ => {
+ self.point.column += 1;
+ self.point.offset += 1;
+ }
+ }
+
+ self.index += 1;
+ // Mark as consumed.
+ self.consumed = true;
+ }
+
+ /// Mark the start of a semantic label.
+ pub fn enter(&mut self, token_type: TokenType) {
+ log::debug!("enter `{:?}` ({:?})", token_type, self.point);
+ let event = Event {
+ event_type: EventType::Enter,
+ token_type: token_type.clone(),
+ point: self.point.clone(),
+ index: self.index,
+ };
+
+ self.events.push(event);
+ self.stack.push(token_type);
+ }
+
+ /// Mark the end of a semantic label.
+ pub fn exit(&mut self, token_type: TokenType) {
+ let token_on_stack = self.stack.pop().expect("cannot close w/o open tokens");
+
+ assert_eq!(
+ token_on_stack, token_type,
+ "expected exit TokenType to match current TokenType"
+ );
+
+ let ev = self.events.last().expect("cannot close w/o open event");
+
+ let point = self.point.clone();
+
+ assert!(
+ token_on_stack != ev.token_type || ev.point != point,
+ "expected non-empty TokenType"
+ );
+
+ log::debug!("exit `{:?}` ({:?})", token_type, self.point);
+ let event = Event {
+ event_type: EventType::Exit,
+ token_type,
+ point,
+ index: self.index,
+ };
+
+ self.events.push(event);
+ }
+
+ /// Capture the internal state.
+ fn capture(&mut self) -> InternalState {
+ InternalState {
+ index: self.index,
+ current: self.current,
+ point: self.point.clone(),
+ events_len: self.events.len(),
+ stack_len: self.stack.len(),
+ }
+ }
+
+ /// Apply the internal state.
+ fn free(&mut self, previous: InternalState) {
+ self.index = previous.index;
+ self.current = previous.current;
+ self.point = previous.point;
+ assert!(
+ self.events.len() >= previous.events_len,
+ "expected to restore less events than before"
+ );
+ self.events.truncate(previous.events_len);
+ assert!(
+ self.stack.len() >= previous.stack_len,
+ "expected to restore less stack items than before"
+ );
+ self.stack.truncate(previous.stack_len);
+ }
+
+ /// Check if `state` and its future states are successful or not.
+ ///
+ /// This captures the current state of the tokenizer, returns a wrapped
+ /// state that captures all codes and feeds them to `state` and its future
+ /// states until it yields [`State::Ok`][] or [`State::Nok`][].
+ /// It then applies the captured state, calls `done`, and feeds all
+ /// captured codes to its future states.
+ pub fn check(
+ &mut self,
+ state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+ ) -> Box<StateFn> {
+ let previous = self.capture();
+
+ attempt_impl(
+ state,
+ vec![],
+ |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| {
+ let codes = result.0;
+ tokenizer.free(previous);
+ log::debug!(
+ "check: {:?}, codes: {:?}, at {:?}",
+ ok,
+ codes,
+ tokenizer.point
+ );
+ let result = done(ok);
+ tokenizer.feed(codes, result, false)
+ },
+ )
+ }
+
+ /// Attempt to parse with `state` and its future states, reverting if
+ /// unsuccessful.
+ ///
+ /// This captures the current state of the tokenizer, returns a wrapped
+ /// state that captures all codes and feeds them to `state` and its future
+ /// states until it yields [`State::Ok`][], at which point it calls `done`
+ /// and yields its result.
+ /// If instead [`State::Nok`][] was yielded, the captured state is applied,
+ /// `done` is called, and all captured codes are fed to its future states.
+ pub fn attempt(
+ &mut self,
+ state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+ ) -> Box<StateFn> {
+ let previous = self.capture();
+
+ attempt_impl(
+ state,
+ vec![],
+ |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| {
+ let codes = if ok {
+ result.1
+ } else {
+ tokenizer.free(previous);
+ result.0
+ };
+
+ log::debug!(
+ "attempt: {:?}, codes: {:?}, at {:?}",
+ ok,
+ codes,
+ tokenizer.point
+ );
+ let result = done(ok);
+ tokenizer.feed(codes, result, false)
+ },
+ )
+ }
+
+ /// Feed a list of `codes` into `start`.
+ ///
+ /// This is set up to support repeatedly calling `feed`, and thus streaming
+ /// markdown into the state machine, and normally pauses after feeding.
+    /// When `drain: true` is passed, the EOF is fed.
+ pub fn feed(
+ &mut self,
+ codes: Vec<Code>,
+ start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ drain: bool,
+ ) -> StateFnResult {
+ let mut codes = codes;
+ let mut state = State::Fn(Box::new(start));
+ let mut index = 0;
+
+ self.consumed = true;
+
+ while index < codes.len() {
+ let code = codes[index];
+
+ match state {
+ State::Nok | State::Ok => {
+ break;
+ }
+ State::Fn(func) => {
+ log::debug!("main: passing `{:?}`", code);
+ self.expect(code);
+ let (next, remainder) = check_statefn_result(func(self, code));
+ state = next;
+ index = index + 1
+ - (if let Some(ref x) = remainder {
+ x.len()
+ } else {
+ 0
+ });
+ }
+ }
+ }
+
+ // Yield to a higher loop if we shouldn’t feed EOFs.
+ if !drain {
+ return (state, Some(codes.split_off(index)));
+ }
+
+ loop {
+ // Feed EOF.
+ match state {
+ State::Ok | State::Nok => break,
+ State::Fn(func) => {
+ let code = Code::None;
+ log::debug!("main: passing eof");
+ self.expect(code);
+ let (next, remainder) = check_statefn_result(func(self, code));
+
+ if let Some(ref x) = remainder {
+ if !x.is_empty() {
+ // To do: handle?
+ unreachable!("drain:remainder {:?}", x);
+ }
+ }
+
+ state = next;
+ }
+ }
+ }
+
+ check_statefn_result((state, None))
+ }
+}
+
+/// Internal utility to wrap states to also capture codes.
+///
+/// Recurses into itself.
+/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
+fn attempt_impl(
+ state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ codes: Vec<Code>,
+ done: impl FnOnce((Vec<Code>, Vec<Code>), bool, &mut Tokenizer) -> StateFnResult + 'static,
+) -> Box<StateFn> {
+ Box::new(|tokenizer, code| {
+ let mut codes = codes;
+
+ let (next, remainder) = check_statefn_result(state(tokenizer, code));
+
+ match code {
+ Code::None => {}
+ _ => {
+ codes.push(code);
+ }
+ }
+
+ // To do: `remainder` must never be bigger than codes I guess?
+        // To do: `remainder` probably has to be taken *from* `codes`, in a similar vein to the `Ok` handling below.
+ match next {
+ State::Ok => {
+ let remaining = if let Some(x) = remainder { x } else { vec![] };
+ check_statefn_result(done((codes, remaining), true, tokenizer))
+ }
+ State::Nok => check_statefn_result(done((codes, vec![]), false, tokenizer)),
+ State::Fn(func) => {
+ check_statefn_result((State::Fn(attempt_impl(func, codes, done)), None))
+ }
+ }
+ })
+}
+
+/// Turn a string into codes.
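+///
+/// For example (a sketch, assuming `TAB_SIZE` is 4), `"a\tb"` becomes
+/// `[Char('a'), Char('\t'), VirtualSpace, VirtualSpace, Char('b')]`: the tab
+/// at column 2 is expanded with virtual spaces up to the next tab stop.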
+// To do: handle BOM at start?
+pub fn as_codes(value: &str) -> Vec<Code> {
+ let mut codes: Vec<Code> = vec![];
+ let mut at_carriage_return = false;
+ let mut column = 1;
+
+ for char in value.chars() {
+ // Send a CRLF.
+ if at_carriage_return && '\n' == char {
+ at_carriage_return = false;
+ codes.push(Code::CarriageReturnLineFeed);
+ } else {
+ // Send the previous CR: we’re not at a next `\n`.
+ if at_carriage_return {
+ at_carriage_return = false;
+ codes.push(Code::Char('\r'));
+ }
+
+ match char {
+ // Send a replacement character.
+ '\0' => {
+ column += 1;
+ codes.push(Code::Char('�'));
+ }
+ // Send a tab and virtual spaces.
+ '\t' => {
+ // To do: is this correct?
+ let virtual_spaces = TAB_SIZE - (column % TAB_SIZE);
+ println!("tabs, expand {:?}, {:?}", column, virtual_spaces);
+ codes.push(Code::Char(char));
+ column += 1;
+ let mut index = 0;
+ while index < virtual_spaces {
+ codes.push(Code::VirtualSpace);
+ column += 1;
+ index += 1;
+ }
+ }
+ // Send an LF.
+ '\n' => {
+ column = 1;
+ codes.push(Code::Char(char));
+ }
+ // Don’t send anything yet.
+ '\r' => {
+ column = 1;
+ at_carriage_return = true;
+ }
+ // Send the char.
+ _ => {
+ column += 1;
+ codes.push(Code::Char(char));
+ }
+ }
+ };
+ }
+
+ // To do: handle a final CR?
+
+ codes
+}
+
+/// Check a [`StateFnResult`][], make sure it’s valid (that there are no bugs),
+/// and clean a final eof passed back in `remainder`.
+fn check_statefn_result(result: StateFnResult) -> StateFnResult {
+ let (state, mut remainder) = result;
+
+ match state {
+ State::Nok | State::Fn(_) => {
+ if let Some(ref x) = remainder {
+ assert_eq!(
+ x.len(),
+ 0,
+ "expected `None` to be passed back as remainder from `State::Nok`, `State::Fn`"
+ );
+ }
+ }
+ State::Ok => {}
+ }
+
+ // Remove an eof.
+    // For convenience, feeding back an eof is allowed, but cleaned here.
+ // Most states handle eof and eol in the same branch, and hence pass
+ // all back.
+ // This might not be needed, because if EOF is passed back, we’re at the EOF.
+ // But they’re not supposed to be in codes, so here we remove them.
+ if let Some(ref mut list) = remainder {
+ if Some(&Code::None) == list.last() {
+ list.pop();
+ }
+ }
+
+ (state, remainder)
+}
diff --git a/src/util.rs b/src/util.rs
new file mode 100644
index 0000000..47359a3
--- /dev/null
+++ b/src/util.rs
@@ -0,0 +1,241 @@
+//! Some utilities helpful when parsing and compiling markdown.
+
+use crate::constant::{CHARACTER_REFERENCE_NAMES, CHARACTER_REFERENCE_VALUES};
+use crate::tokenizer::{Code, Event, EventType};
+
+/// Encode dangerous html characters.
+///
+/// This ensures that certain characters which have special meaning in HTML are
+/// dealt with.
+/// Technically, we can skip `>` and `"` in many cases, but CM includes them.
+///
+/// This behavior is not explained in prose in `CommonMark` but can be inferred
+/// from the input/output test cases.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::encode;
+///
+/// assert_eq!(encode("I <3 🦀"), "I &lt;3 🦀");
+/// ```
+///
+/// ## References
+///
+/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
+pub fn encode(value: &str) -> String {
+ value
+ .replace('&', "&amp;")
+ .replace('"', "&quot;")
+ .replace('<', "&lt;")
+ .replace('>', "&gt;")
+}
+
+/// Decode numeric character references.
+///
+/// Turn the number (in string form as either hexadecimal or decimal) coming
+/// from a numeric character reference into a character.
+/// Whether the base of the string form is `10` (decimal) or `16` (hexadecimal)
+/// must be passed as the `radix` parameter.
+///
+/// This returns the `char` associated with that number or a replacement
+/// character for C0 control characters (except for ASCII whitespace), C1
+/// control characters, lone surrogates, noncharacters, and out of range
+/// characters.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::decode_numeric_character_reference;
+///
+/// assert_eq!(decode_numeric_character_reference("123", 10), '{');
+/// assert_eq!(decode_numeric_character_reference("9", 16), '\t');
+/// assert_eq!(decode_numeric_character_reference("0", 10), '�'); // Not allowed.
+/// ```
+///
+/// ## Panics
+///
+/// This function panics if an invalid string or an out of bounds valid
+/// string is given.
+/// It is expected that figuring out whether a number is allowed is handled in
+/// the parser.
+/// When `micromark` is used, this function never panics.
+///
+/// ## References
+///
+/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_numeric_character_reference(value: &str, radix: u32) -> char {
+ let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int");
+
+ if
+ // C0 except for HT, LF, FF, CR, space
+ code < 0x09 ||
+ code == 0x0B ||
+ (code > 0x0D && code < 0x20) ||
+ // Control character (DEL) of the basic block and C1 controls.
+ (code > 0x7E && code < 0xA0) ||
+ // Lone high surrogates and low surrogates.
+ (code > 0xd7ff && code < 0xe000) ||
+ // Noncharacters.
+ (code > 0xfdcf && code < 0xfdf0) ||
+ ((code & 0xffff) == 0xffff) ||
+ ((code & 0xffff) == 0xfffe) ||
+ // Out of range
+ code > 0x0010_ffff
+ {
+ '�'
+ } else {
+ char::from_u32(code).expect("expected valid `code`")
+ }
+}
+
+/// Decode named character references.
+///
+/// Turn the name coming from a named character reference (without the `&` or
+/// `;`) into a string.
+/// This looks the given string up in [`CHARACTER_REFERENCE_NAMES`][] and then
+/// takes the corresponding value from [`CHARACTER_REFERENCE_VALUES`][].
+///
+/// The result is `String` instead of `char` because named character references
+/// can expand into multiple characters.
+///
+/// ## Examples
+///
+/// ```rust ignore
+/// use micromark::util::decode_named_character_reference;
+///
+/// assert_eq!(decode_named_character_reference("amp"), "&");
+/// assert_eq!(decode_named_character_reference("AElig"), "Æ");
+/// assert_eq!(decode_named_character_reference("aelig"), "æ");
+/// ```
+///
+/// ## Panics
+///
+/// This function panics if a name not in [`CHARACTER_REFERENCE_NAMES`][] is
+/// given.
+/// It is expected that figuring out whether a name is allowed is handled in
+/// the parser.
+/// When `micromark` is used, this function never panics.
+///
+/// ## References
+///
+/// * [`wooorm/decode-named-character-reference`](https://github.com/wooorm/decode-named-character-reference)
+/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
+pub fn decode_named_character_reference(value: &str) -> String {
+ let position = CHARACTER_REFERENCE_NAMES.iter().position(|&x| x == value);
+ if let Some(index) = position {
+ CHARACTER_REFERENCE_VALUES[index].to_string()
+ } else {
+ unreachable!("expected valid `name`")
+ }
+}
+
+/// A struct representing the span of an opening and closing event of a token.
+#[derive(Debug)]
+pub struct Span {
+ // To do: probably needed in the future.
+ // start: Point,
+ /// Absolute offset (and `index` in `codes`) of where this span starts.
+ pub start_index: usize,
+ // To do: probably needed in the future.
+ // end: Point,
+ /// Absolute offset (and `index` in `codes`) of where this span ends.
+ pub end_index: usize,
+ // To do: probably needed in the future.
+ // token_type: TokenType,
+}
+
+/// Get a span from an event.
+///
+/// Get the span of an `exit` event, by looking backwards through the events to
+/// find the corresponding `enter` event.
+/// This assumes that tokens with the same type are not nested.
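+///
+/// ## Examples
+///
+/// A sketch (not compiled as a doc-test), assuming `events` holds one
+/// matching `enter`/`exit` pair for a token:
+///
+/// ```rust ignore
+/// let span = get_span(&events, 1);
+/// assert_eq!(span.start_index, events[0].index);
+/// assert_eq!(span.end_index, events[1].index);
+/// ```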
+///
+/// ## Panics
+///
+/// This function panics if an enter event is given.
+/// When `micromark` is used, this function never panics.
+pub fn get_span(events: &[Event], index: usize) -> Span {
+ let exit = &events[index];
+ // let end = exit.point.clone();
+ let end_index = exit.index;
+ let token_type = exit.token_type.clone();
+ // To do: support `enter` events if needed and walk forwards?
+ assert_eq!(
+ exit.event_type,
+ EventType::Exit,
+ "expected get_span to be called on `exit` event"
+ );
+ let mut start_index = index - 1;
+
+ loop {
+ let enter = &events[start_index];
+ if enter.event_type == EventType::Enter && enter.token_type == token_type {
+ return Span {
+ // start: enter.point.clone(),
+ start_index: enter.index,
+ // end,
+ end_index,
+ // token_type,
+ };
+ }
+
+ start_index -= 1;
+ }
+}
+
+/// Serialize a span, optionally expanding tabs.
+pub fn slice_serialize(codes: &[Code], span: &Span, expand_tabs: bool) -> String {
+ serialize_chunks(slice_codes(codes, span), expand_tabs)
+}
+
+/// Get a slice of codes from a span.
+pub fn slice_codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
+ &codes[span.start_index..span.end_index]
+}
+
+/// Serialize a slice of codes, optionally expanding tabs.
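+///
+/// ## Examples
+///
+/// A sketch of the expected behavior (not compiled as a doc-test):
+///
+/// ```rust ignore
+/// let codes = &[Code::Char('a'), Code::Char('\t'), Code::VirtualSpace];
+/// // Expanding tabs turns the tab and its virtual spaces into spaces.
+/// assert_eq!(serialize_chunks(codes, true), "a  ");
+/// // Without expanding, the tab is kept and its virtual spaces are dropped.
+/// assert_eq!(serialize_chunks(codes, false), "a\t");
+/// ```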
+pub fn serialize_chunks(codes: &[Code], expand_tabs: bool) -> String {
+ let mut at_tab = false;
+ let mut index = 0;
+ let mut value: Vec<char> = vec![];
+
+ while index < codes.len() {
+ let code = codes[index];
+ let mut at_tab_next = false;
+
+ match code {
+ Code::CarriageReturnLineFeed => {
+ value.push('\r');
+ value.push('\n');
+ }
+ Code::Char(char) if char == '\n' || char == '\r' => {
+ value.push(char);
+ }
+ Code::Char(char) if char == '\t' => {
+ at_tab_next = true;
+ value.push(if expand_tabs { ' ' } else { char });
+ }
+ Code::VirtualSpace => {
+ if !expand_tabs && at_tab {
+ index += 1;
+ continue;
+ }
+ value.push(' ');
+ }
+ Code::Char(char) => {
+ value.push(char);
+ }
+ Code::None => {
+ unreachable!("unexpected EOF code in codes");
+ }
+ }
+
+ at_tab = at_tab_next;
+
+ index += 1;
+ }
+
+ value.into_iter().collect()
+}
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
new file mode 100644
index 0000000..46fa9cb
--- /dev/null
+++ b/tests/code_fenced.rs
@@ -0,0 +1,266 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn code_fenced() {
+ assert_eq!(
+ micromark("```\n<\n >\n```"),
+ "<pre><code>&lt;\n &gt;\n</code></pre>",
+ "should support fenced code w/ grave accents"
+ );
+
+ assert_eq!(
+ micromark("~~~\n<\n >\n~~~"),
+ "<pre><code>&lt;\n &gt;\n</code></pre>",
+ "should support fenced code w/ tildes"
+ );
+
+ // To do: code (text).
+ // assert_eq!(
+ // micromark("``\nfoo\n``"),
+ // "<p><code>foo</code></p>",
+ // "should not support fenced code w/ less than three markers"
+ // );
+
+ assert_eq!(
+ micromark("```\naaa\n~~~\n```"),
+ "<pre><code>aaa\n~~~\n</code></pre>",
+ "should not support a tilde closing sequence for a grave accent opening sequence"
+ );
+
+ assert_eq!(
+ micromark("~~~\naaa\n```\n~~~"),
+ "<pre><code>aaa\n```\n</code></pre>",
+ "should not support a grave accent closing sequence for a tilde opening sequence"
+ );
+
+ assert_eq!(
+ micromark("````\naaa\n```\n``````"),
+ "<pre><code>aaa\n```\n</code></pre>",
+ "should support a closing sequence longer, but not shorter than, the opening"
+ );
+
+ assert_eq!(
+ micromark("~~~~\naaa\n~~~\n~~~~"),
+ "<pre><code>aaa\n~~~\n</code></pre>",
+ "should support a closing sequence equal to, but not shorter than, the opening"
+ );
+
+ assert_eq!(
+ micromark("```"),
+ "<pre><code></code></pre>\n",
+ "should support an eof right after an opening sequence"
+ );
+
+ assert_eq!(
+ micromark("`````\n\n```\naaa\n"),
+ "<pre><code>\n```\naaa\n</code></pre>\n",
+ "should support an eof somewhere in content"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark("> ```\n> aaa\n\nbbb"),
+ // "<blockquote>\n<pre><code>aaa\n</code></pre>\n</blockquote>\n<p>bbb</p>",
+ // "should support no closing sequence in a block quote"
+ // );
+
+ assert_eq!(
+ micromark("```\n\n \n```"),
+ "<pre><code>\n \n</code></pre>",
+ "should support blank lines in fenced code"
+ );
+
+ assert_eq!(
+ micromark("```\n```"),
+ "<pre><code></code></pre>",
+ "should support empty fenced code"
+ );
+
+ assert_eq!(
+ micromark(" ```\n aaa\naaa\n```"),
+ "<pre><code>aaa\naaa\n</code></pre>",
+ "should remove up to one space from the content if the opening sequence is indented w/ 1 space"
+ );
+
+ assert_eq!(
+ micromark(" ```\naaa\n aaa\naaa\n ```"),
+ "<pre><code>aaa\naaa\naaa\n</code></pre>",
+ "should remove up to two space from the content if the opening sequence is indented w/ 2 spaces"
+ );
+
+ assert_eq!(
+ micromark(" ```\n aaa\n aaa\n aaa\n ```"),
+ "<pre><code>aaa\n aaa\naaa\n</code></pre>",
+ "should remove up to three space from the content if the opening sequence is indented w/ 3 spaces"
+ );
+
+ assert_eq!(
+ micromark(" ```\n aaa\n ```"),
+ "<pre><code>```\naaa\n```\n</code></pre>",
+ "should not support indenteding the opening sequence w/ 4 spaces"
+ );
+
+ assert_eq!(
+ micromark("```\naaa\n ```"),
+ "<pre><code>aaa\n</code></pre>",
+ "should support an indented closing sequence"
+ );
+
+ assert_eq!(
+ micromark(" ```\naaa\n ```"),
+ "<pre><code>aaa\n</code></pre>",
+ "should support a differently indented closing sequence than the opening sequence"
+ );
+
+ assert_eq!(
+ micromark("```\naaa\n ```\n"),
+ "<pre><code>aaa\n ```\n</code></pre>\n",
+ "should not support an indented closing sequence w/ 4 spaces"
+ );
+
+ // To do: code (text).
+ // assert_eq!(
+ // micromark("``` ```\naaa"),
+ // "<p><code> </code>\naaa</p>",
+ // "should not support grave accents in the opening fence after the opening sequence"
+ // );
+
+ assert_eq!(
+ micromark("~~~~~~\naaa\n~~~ ~~\n"),
+ "<pre><code>aaa\n~~~ ~~\n</code></pre>\n",
+ "should not support spaces in the closing sequence"
+ );
+
+ assert_eq!(
+ micromark("foo\n```\nbar\n```\nbaz"),
+ "<p>foo</p>\n<pre><code>bar\n</code></pre>\n<p>baz</p>",
+ "should support interrupting paragraphs"
+ );
+
+ // To do: setext.
+ // assert_eq!(
+ // micromark("foo\n---\n~~~\nbar\n~~~\n# baz"),
+ // "<h2>foo</h2>\n<pre><code>bar\n</code></pre>\n<h1>baz</h1>",
+ // "should support interrupting other content"
+ // );
+
+ assert_eq!(
+ micromark("```ruby\ndef foo(x)\n return 3\nend\n```"),
+ "<pre><code class=\"language-ruby\">def foo(x)\n return 3\nend\n</code></pre>",
+ "should support the info string as a `language-` class (1)"
+ );
+
+ assert_eq!(
+ micromark("````;\n````"),
+ "<pre><code class=\"language-;\"></code></pre>",
+ "should support the info string as a `language-` class (2)"
+ );
+
+ assert_eq!(
+ micromark("~~~~ ruby startline=3 $%@#$\ndef foo(x)\n return 3\nend\n~~~~~~~"),
+ "<pre><code class=\"language-ruby\">def foo(x)\n return 3\nend\n</code></pre>",
+ "should support the info string as a `language-` class, but not the meta string"
+ );
+
+ // To do: code (text).
+ // assert_eq!(
+ // micromark("``` aa ```\nfoo"),
+ // "<p><code>aa</code>\nfoo</p>",
+ // "should not support grave accents in the meta string"
+ // );
+
+ assert_eq!(
+ micromark("~~~ aa ``` ~~~\nfoo\n~~~"),
+ "<pre><code class=\"language-aa\">foo\n</code></pre>",
+ "should support grave accents and tildes in the meta string of tilde fenced code"
+ );
+
+ assert_eq!(
+ micromark("```\n``` aaa\n```"),
+ "<pre><code>``` aaa\n</code></pre>",
+ "should not support info string on closing sequences"
+ );
+
+ // Our own:
+ assert_eq!(
+ micromark("``` "),
+ "<pre><code></code></pre>\n",
+ "should support an eof after whitespace, after the start fence sequence"
+ );
+
+ assert_eq!(
+ micromark("``` js\nalert(1)\n```"),
+ "<pre><code class=\"language-js\">alert(1)\n</code></pre>",
+ "should support whitespace between the sequence and the info string"
+ );
+
+ assert_eq!(
+ micromark("```js"),
+ "<pre><code class=\"language-js\"></code></pre>\n",
+ "should support an eof after the info string"
+ );
+
+ assert_eq!(
+ micromark("``` js \nalert(1)\n```"),
+ "<pre><code class=\"language-js\">alert(1)\n</code></pre>",
+ "should support whitespace after the info string"
+ );
+
+ assert_eq!(
+ micromark("```\n "),
+ "<pre><code> \n</code></pre>\n",
+ "should support an eof after whitespace in content"
+ );
+
+ assert_eq!(
+ micromark(" ```\n "),
+ "<pre><code></code></pre>\n",
+ "should support an eof in the prefix, in content"
+ );
+
+ // To do: strings.
+ // assert_eq!(
+ // micromark("```j\\+s&copy;"),
+ // "<pre><code class=\"language-j+s©\"></code></pre>\n",
+ // "should support character escapes and character references in info strings"
+ // );
+
+ assert_eq!(
+ micromark(" ```\naaa\n ```"),
+ "<pre><code>aaa\n ```\n</code></pre>\n",
+ "should not support a closing sequence w/ too much indent, regardless of opening sequence (1)"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark("> ```\n>\n>\n>\n\na"),
+ // "<blockquote>\n<pre><code>\n\n\n</code></pre>\n</blockquote>\n<p>a</p>",
+ // "should not support a closing sequence w/ too much indent, regardless of opening sequence (2)"
+ // );
+
+ // assert_eq!(
+ // micromark("> ```a\nb"),
+ // "<blockquote>\n<pre><code class=\"language-a\"></code></pre>\n</blockquote>\n<p>b</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n```b"),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<pre><code class=\"language-b\"></code></pre>\n",
+ // "should not support lazyness (2)"
+ // );
+
+ // assert_eq!(
+ // micromark("> ```a\n```"),
+ // "<blockquote>\n<pre><code class=\"language-a\"></code></pre>\n</blockquote>\n<pre><code></code></pre>\n",
+ // "should not support lazyness (3)"
+ // );
+
+ // To do: extensions.
+ // assert_eq!(
+ // micromark("```", {extensions: [{disable: {null: ["codeFenced"]}}]}),
+ // "<p>```</p>",
+ // "should support turning off code (fenced)"
+ // );
+}
diff --git a/tests/code_indented.rs b/tests/code_indented.rs
new file mode 100644
index 0000000..f5926c0
--- /dev/null
+++ b/tests/code_indented.rs
@@ -0,0 +1,196 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn code_indented() {
+ assert_eq!(
+ micromark(" a simple\n indented code block"),
+ "<pre><code>a simple\n indented code block\n</code></pre>",
+ "should support indented code"
+ );
+
+ // To do: list.
+ // assert_eq!(
+ // micromark(" - foo\n\n bar"),
+ // "<ul>\n<li>\n<p>foo</p>\n<p>bar</p>\n</li>\n</ul>",
+ // "should prefer list item content over indented code (1)"
+ // );
+
+ // assert_eq!(
+ // micromark("1. foo\n\n - bar"),
+ // "<ol>\n<li>\n<p>foo</p>\n<ul>\n<li>bar</li>\n</ul>\n</li>\n</ol>",
+ // "should prefer list item content over indented code (2)"
+ // );
+
+ assert_eq!(
+ micromark(" <a/>\n *hi*\n\n - one"),
+ "<pre><code>&lt;a/&gt;\n*hi*\n\n- one\n</code></pre>",
+ "should support blank lines in indented code (1)"
+ );
+
+ assert_eq!(
+ micromark(" chunk1\n\n chunk2\n \n \n \n chunk3"),
+ "<pre><code>chunk1\n\nchunk2\n\n\n\nchunk3\n</code></pre>",
+ "should support blank lines in indented code (2)"
+ );
+
+ assert_eq!(
+ micromark(" chunk1\n \n chunk2"),
+ "<pre><code>chunk1\n \n chunk2\n</code></pre>",
+ "should support blank lines in indented code (3)"
+ );
+
+ // To do: paragraphs.
+ // assert_eq!(
+ // micromark("Foo\n bar"),
+ // "<p>Foo\nbar</p>",
+ // "should not support interrupting paragraphs"
+ // );
+
+ // To do: paragraphs.
+ // assert_eq!(
+ // micromark(" foo\nbar"),
+ // "<pre><code>foo\n</code></pre>\n<p>bar</p>",
+ // "should support paragraphs directly after indented code"
+ // );
+
+ // To do: setext.
+ // assert_eq!(
+ // micromark("# Heading\n foo\nHeading\n------\n foo\n----"),
+ // "<h1>Heading</h1>\n<pre><code>foo\n</code></pre>\n<h2>Heading</h2>\n<pre><code>foo\n</code></pre>\n<hr />",
+ // "should mix w/ other content"
+ // );
+
+ assert_eq!(
+ micromark(" foo\n bar"),
+ "<pre><code> foo\nbar\n</code></pre>",
+ "should support extra whitespace on the first line"
+ );
+
+ assert_eq!(
+ micromark("\n \n foo\n "),
+ "<pre><code>foo\n</code></pre>",
+ "should not support initial blank lines"
+ );
+
+ assert_eq!(
+ micromark(" foo "),
+ "<pre><code>foo \n</code></pre>",
+ "should support trailing whitespace"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark("> a\nb"),
+ // "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<p>b</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n b"),
+ // "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+ // "should not support lazyness (2)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n b"),
+ // "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+ // "should not support lazyness (3)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n b"),
+ // "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+ // "should not support lazyness (4)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n b"),
+ // "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code>b\n</code></pre>",
+ // "should not support lazyness (5)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n b"),
+ // "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code> b\n</code></pre>",
+ // "should not support lazyness (6)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n b"),
+ // "<blockquote>\n<pre><code>a\n</code></pre>\n</blockquote>\n<pre><code> b\n</code></pre>",
+ // "should not support lazyness (7)"
+ // );
+
+ // To do: extensions.
+ // assert_eq!(
+ // micromark(" a", {extensions: [{disable: {null: ["codeIndented"]}}]}),
+ // "<p>a</p>",
+ // "should support turning off code (indented, 1)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n b", {
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<blockquote>\n<p>a\nb</p>\n</blockquote>",
+ // "should support turning off code (indented, 2)"
+ // );
+
+ // assert_eq!(
+ // micromark("- a\n b", {
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<ul>\n<li>a\nb</li>\n</ul>",
+ // "should support turning off code (indented, 3)"
+ // );
+
+ // assert_eq!(
+ // micromark("- a\n - b", {
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<ul>\n<li>a\n<ul>\n<li>b</li>\n</ul>\n</li>\n</ul>",
+ // "should support turning off code (indented, 4)"
+ // );
+
+ // assert_eq!(
+ // micromark("- a\n - b", {
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<ul>\n<li>a\n<ul>\n<li>b</li>\n</ul>\n</li>\n</ul>",
+ // "should support turning off code (indented, 5)"
+ // );
+
+ // assert_eq!(
+ // micromark("```\na\n ```", {
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<pre><code>a\n</code></pre>",
+ // "should support turning off code (indented, 6)"
+ // );
+
+ // assert_eq!(
+ // micromark("a <?\n ?>", {
+ // allowDangerousHtml: true,
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<p>a <?\n?></p>",
+ // "should support turning off code (indented, 7)"
+ // );
+
+ // assert_eq!(
+ // micromark("- Foo\n---", {
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<ul>\n<li>Foo</li>\n</ul>\n<hr />",
+ // "should support turning off code (indented, 8)"
+ // );
+
+ // assert_eq!(
+ // micromark("- Foo\n ---", {
+ // extensions: [{disable: {null: ["codeIndented"]}}]
+ // }),
+ // "<ul>\n<li>\n<h2>Foo</h2>\n</li>\n</ul>",
+ // "should support turning off code (indented, 9)"
+ // );
+}
diff --git a/tests/heading_atx.rs b/tests/heading_atx.rs
new file mode 100644
index 0000000..b75d058
--- /dev/null
+++ b/tests/heading_atx.rs
@@ -0,0 +1,208 @@
+extern crate micromark;
+use micromark::micromark;
+#[test]
+fn heading_atx() {
+ assert_eq!(
+ micromark("# foo"),
+ "<h1>foo</h1>",
+ "should support a heading w/ rank 1"
+ );
+
+ assert_eq!(
+ micromark("## foo"),
+ "<h2>foo</h2>",
+ "should support a heading w/ rank 2"
+ );
+
+ assert_eq!(
+ micromark("### foo"),
+ "<h3>foo</h3>",
+ "should support a heading w/ rank 3"
+ );
+
+ assert_eq!(
+ micromark("#### foo"),
+ "<h4>foo</h4>",
+ "should support a heading w/ rank 4"
+ );
+
+ assert_eq!(
+ micromark("##### foo"),
+ "<h5>foo</h5>",
+ "should support a heading w/ rank 5"
+ );
+
+ assert_eq!(
+ micromark("###### foo"),
+ "<h6>foo</h6>",
+ "should support a heading w/ rank 6"
+ );
+
+ assert_eq!(
+ micromark("####### foo"),
+ "<p>####### foo</p>",
+ "should not support a heading w/ rank 7"
+ );
+
+ assert_eq!(
+ micromark("#5 bolt"),
+ "<p>#5 bolt</p>",
+ "should not support a heading for a number sign not followed by whitespace (1)"
+ );
+
+ assert_eq!(
+ micromark("#hashtag"),
+ "<p>#hashtag</p>",
+ "should not support a heading for a number sign not followed by whitespace (2)"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark("\\## foo"),
+ // "<p>## foo</p>",
+ // "should not support a heading for an escaped number sign"
+ // );
+
+ // assert_eq!(
+ // micromark("# foo *bar* \\*baz\\*"),
+ // "<h1>foo <em>bar</em> *baz*</h1>",
+ // "should support text content in headings"
+ // );
+
+ assert_eq!(
+ micromark("# foo "),
+ "<h1>foo</h1>",
+ "should support arbitrary initial and final whitespace"
+ );
+
+ assert_eq!(
+ micromark(" ### foo"),
+ "<h3>foo</h3>",
+ "should support an initial space"
+ );
+
+ assert_eq!(
+ micromark(" ## foo"),
+ "<h2>foo</h2>",
+ "should support two initial spaces"
+ );
+
+ assert_eq!(
+ micromark(" # foo"),
+ "<h1>foo</h1>",
+ "should support three initial spaces"
+ );
+
+ assert_eq!(
+ micromark(" # foo"),
+ "<pre><code># foo\n</code></pre>",
+ "should not support four initial spaces"
+ );
+
+ // To do: lazy.
+ // assert_eq!(
+ // micromark("foo\n # bar"),
+ // "<p>foo\n# bar</p>",
+ // "should not support four initial spaces when interrupting"
+ // );
+
+ assert_eq!(
+ micromark("## foo ##"),
+ "<h2>foo</h2>",
+ "should support a closing sequence (1)"
+ );
+
+ assert_eq!(
+ micromark(" ### bar ###"),
+ "<h3>bar</h3>",
+ "should support a closing sequence (2)"
+ );
+
+ assert_eq!(
+ micromark("# foo ##################################"),
+ "<h1>foo</h1>",
+ "should support a closing sequence w/ an arbitrary number of number signs (1)"
+ );
+
+ assert_eq!(
+ micromark("##### foo ##"),
+ "<h5>foo</h5>",
+ "should support a closing sequence w/ an arbitrary number of number signs (2)"
+ );
+
+ assert_eq!(
+ micromark("### foo ### "),
+ "<h3>foo</h3>",
+ "should support trailing whitespace after a closing sequence"
+ );
+
+ assert_eq!(
+ micromark("### foo ### b"),
+ "<h3>foo ### b</h3>",
+ "should not support other content after a closing sequence"
+ );
+
+ assert_eq!(
+ micromark("# foo#"),
+ "<h1>foo#</h1>",
+ "should not support a closing sequence w/o whitespace before it"
+ );
+
+ // Phrasing.
+ // assert_eq!(
+ // micromark("### foo \\###"),
+ // "<h3>foo ###</h3>",
+ // "should not support an “escaped” closing sequence (1)"
+ // );
+
+ // assert_eq!(
+ // micromark("## foo #\\##"),
+ // "<h2>foo ###</h2>",
+ // "should not support an “escaped” closing sequence (2)"
+ // );
+
+ // assert_eq!(
+ // micromark("# foo \\#"),
+ // "<h1>foo #</h1>",
+ // "should not support an “escaped” closing sequence (3)"
+ // );
+
+ assert_eq!(
+ micromark("****\n## foo\n****"),
+ "<hr />\n<h2>foo</h2>\n<hr />",
+ "should support atx headings when not surrounded by blank lines"
+ );
+
+ assert_eq!(
+ micromark("Foo bar\n# baz\nBar foo"),
+ "<p>Foo bar</p>\n<h1>baz</h1>\n<p>Bar foo</p>",
+ "should support atx headings interrupting paragraphs"
+ );
+
+ // Line endings.
+ assert_eq!(
+ micromark("## \n#\n### ###"),
+ "<h2></h2>\n<h1></h1>\n<h3></h3>",
+ "should support empty atx headings"
+ );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark("> #\na"),
+ // "<blockquote>\n<h1></h1>\n</blockquote>\n<p>a</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n#"),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<h1></h1>",
+ // "should not support lazyness (2)"
+ // );
+
+ // Extensions:
+ // assert_eq!(
+ // micromark("# a", {extensions: [{disable: {null: ["headingAtx"]}}]}),
+ // "<p># a</p>",
+ // "should support turning off heading (atx)"
+ // );
+}
diff --git a/tests/html_flow.rs b/tests/html_flow.rs
new file mode 100644
index 0000000..51d1a2a
--- /dev/null
+++ b/tests/html_flow.rs
@@ -0,0 +1,1058 @@
+extern crate micromark;
+use micromark::{micromark, micromark_with_options, CompileOptions};
+
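+// Compile options that allow dangerous (raw) html in the output.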
+const DANGER: &CompileOptions = &CompileOptions {
+ allow_dangerous_html: true,
+};
+
+#[test]
+fn html_flow() {
+ assert_eq!(
+ micromark("<!-- asd -->"),
+ "&lt;!-- asd --&gt;",
+ "should support a heading w/ rank 1"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!-- asd -->", DANGER),
+ "<!-- asd -->",
+ "should support a heading w/ rank 1"
+ );
+
+ // To do: extensions.
+ // assert_eq!(
+ // micromark_with_options("<x>", {extensions: [{disable: {null: ["htmlFlow"]}}]}),
+ // "<p>&lt;x&gt;</p>",
+ // "should support turning off html (flow)"
+ // );
+}
+
+#[test]
+fn html_flow_1_raw() {
+ assert_eq!(
+ micromark_with_options(
+ "<pre language=\"haskell\"><code>
+import Text.HTML.TagSoup
+
+main :: IO ()
+main = print $ parseTags tags
+</code></pre>
+okay",
+ DANGER
+ ),
+ "<pre language=\"haskell\"><code>
+import Text.HTML.TagSoup
+
+main :: IO ()
+main = print $ parseTags tags
+</code></pre>
+<p>okay</p>",
+ "should support raw pre tags (type 1)"
+ );
+
+ assert_eq!(
+ micromark_with_options(
+ "<script type=\"text/javascript\">
+// JavaScript example
+
+document.getElementById(\"demo\").innerHTML = \"Hello JavaScript!\";
+</script>
+okay",
+ DANGER
+ ),
+ "<script type=\"text/javascript\">
+// JavaScript example
+
+document.getElementById(\"demo\").innerHTML = \"Hello JavaScript!\";
+</script>
+<p>okay</p>",
+ "should support raw script tags"
+ );
+
+ assert_eq!(
+ micromark_with_options(
+ "<style
+ type=\"text/css\">
+h1 {color:red;}
+
+p {color:blue;}
+</style>
+okay",
+ DANGER
+ ),
+ "<style
+ type=\"text/css\">
+h1 {color:red;}
+
+p {color:blue;}
+</style>
+<p>okay</p>",
+ "should support raw style tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("<style\n type=\"text/css\">\n\nfoo", DANGER),
+ "<style\n type=\"text/css\">\n\nfoo",
+ "should support raw tags w/o ending"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<style>p{color:red;}</style>\n*foo*", DANGER),
+ // "<style>p{color:red;}</style>\n<p><em>foo</em></p>",
+ // "should support raw tags w/ start and end on a single line"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<script>\nfoo\n</script>1. *bar*", DANGER),
+ "<script>\nfoo\n</script>1. *bar*",
+ "should support raw tags w/ more data on ending line"
+ );
+
+ assert_eq!(
+ micromark_with_options("<script", DANGER),
+ "<script",
+ "should support an eof directly after a raw tag name"
+ );
+
+ // To do: paragraphs.
+ // assert_eq!(
+ // micromark_with_options("</script\nmore", DANGER),
+ // "<p>&lt;/script\nmore</p>",
+ // "should not support a raw closing tag"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<script/", DANGER),
+ "<p>&lt;script/</p>",
+ "should not support an eof after a self-closing slash"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<script/\n*asd*", DANGER),
+ // "<p>&lt;script/\n<em>asd</em></p>",
+ // "should not support a line ending after a self-closing slash"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<script/>", DANGER),
+ "<script/>",
+ "should support an eof after a self-closing tag"
+ );
+
+ assert_eq!(
+ micromark_with_options("<script/>\na", DANGER),
+ "<script/>\na",
+ "should support a line ending after a self-closing tag"
+ );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<script/>a", DANGER),
+ // "<p><script/>a</p>",
+ // "should not support other characters after a self-closing tag"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<script>a", DANGER),
+ "<script>a",
+ "should support other characters after a raw opening tag"
+ );
+
+ // Extra.
+ assert_eq!(
+ micromark_with_options("Foo\n<script", DANGER),
+ "<p>Foo</p>\n<script",
+ "should support interrupting paragraphs w/ raw tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("<script>\n \n \n</script>", DANGER),
+ "<script>\n \n \n</script>",
+ "should support blank lines in raw"
+ );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark_with_options("> <script>\na", DANGER),
+ // "<blockquote>\n<script>\n</blockquote>\n<p>a</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> a\n<script>", DANGER),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<script>",
+ // "should not support lazyness (2)"
+ // );
+}
+
+#[test]
+fn html_flow_2_comment() {
+ assert_eq!(
+ micromark_with_options("<!-- Foo\n\nbar\n baz -->\nokay", DANGER),
+ "<!-- Foo\n\nbar\n baz -->\n<p>okay</p>",
+ "should support comments (type 2)"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<!-- foo -->*bar*\n*baz*", DANGER),
+ // "<!-- foo -->*bar*\n<p><em>baz</em></p>",
+ // "should support comments w/ start and end on a single line"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<!-asd-->", DANGER),
+ "<p>&lt;!-asd--&gt;</p>",
+ "should not support a single dash to start comments"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!-->", DANGER),
+ "<!-->",
+ "should support comments where the start dashes are the end dashes (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!--->", DANGER),
+ "<!--->",
+ "should support comments where the start dashes are the end dashes (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!---->", DANGER),
+ "<!---->",
+ "should support empty comments"
+ );
+
+ // If the `\"` is encoded, we’re in text. If it remains, we’re in HTML.
+ assert_eq!(
+ micromark_with_options("<!--\n->\n\"", DANGER),
+ "<!--\n->\n\"",
+ "should not end a comment at one dash (`->`)"
+ );
+ assert_eq!(
+ micromark_with_options("<!--\n-->\n\"", DANGER),
+ "<!--\n-->\n<p>&quot;</p>",
+ "should end a comment at two dashes (`-->`)"
+ );
+ assert_eq!(
+ micromark_with_options("<!--\n--->\n\"", DANGER),
+ "<!--\n--->\n<p>&quot;</p>",
+ "should end a comment at three dashes (`--->`)"
+ );
+ assert_eq!(
+ micromark_with_options("<!--\n---->\n\"", DANGER),
+ "<!--\n---->\n<p>&quot;</p>",
+ "should end a comment at four dashes (`---->`)"
+ );
+
+ assert_eq!(
+ micromark_with_options(" <!-- foo -->", DANGER),
+ " <!-- foo -->",
+ "should support comments w/ indent"
+ );
+
+ assert_eq!(
+ micromark_with_options(" <!-- foo -->", DANGER),
+ "<pre><code>&lt;!-- foo --&gt;\n</code></pre>",
+ "should not support comments w/ a 4 character indent"
+ );
+
+ // Extra.
+ assert_eq!(
+ micromark_with_options("Foo\n<!--", DANGER),
+ "<p>Foo</p>\n<!--",
+ "should support interrupting paragraphs w/ comments"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!--\n \n \n-->", DANGER),
+ "<!--\n \n \n-->",
+ "should support blank lines in comments"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark_with_options("> <!--\na", DANGER),
+ // "<blockquote>\n<!--\n</blockquote>\n<p>a</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> a\n<!--", DANGER),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<!--",
+ // "should not support lazyness (2)"
+ // );
+}
+
+#[test]
+fn html_flow_3_instruction() {
+ assert_eq!(
+ micromark_with_options("<?php\n\n echo \">\";\n\n?>\nokay", DANGER),
+ "<?php\n\n echo \">\";\n\n?>\n<p>okay</p>",
+ "should support instructions (type 3)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<?>", DANGER),
+ "<?>",
+ "should support empty instructions where the `?` is part of both the start and the end"
+ );
+
+ assert_eq!(
+ micromark_with_options("<??>", DANGER),
+ "<??>",
+ "should support empty instructions"
+ );
+
+ // Extra.
+ assert_eq!(
+ micromark_with_options("Foo\n<?", DANGER),
+ "<p>Foo</p>\n<?",
+ "should support interrupting paragraphs w/ instructions"
+ );
+
+ assert_eq!(
+ micromark_with_options("<?\n \n \n?>", DANGER),
+ "<?\n \n \n?>",
+ "should support blank lines in instructions"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark_with_options("> <?\na", DANGER),
+ // "<blockquote>\n<?\n</blockquote>\n<p>a</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> a\n<?", DANGER),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<?",
+ // "should not support lazyness (2)"
+ // );
+}
+
+#[test]
+fn html_flow_4_declaration() {
+ assert_eq!(
+ micromark_with_options("<!DOCTYPE html>", DANGER),
+ "<!DOCTYPE html>",
+ "should support declarations (type 4)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!123>", DANGER),
+ "<p>&lt;!123&gt;</p>",
+ "should not support declarations that start w/o an alpha"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!>", DANGER),
+ "<p>&lt;!&gt;</p>",
+ "should not support declarations w/o an identifier"
+ );
+
+ assert_eq!(
+ micromark_with_options("<!a>", DANGER),
+ "<!a>",
+ "should support declarations w/o a single alpha as identifier"
+ );
+
+ // Extra.
+ assert_eq!(
+ micromark_with_options("Foo\n<!d", DANGER),
+ "<p>Foo</p>\n<!d",
+ "should support interrupting paragraphs w/ declarations"
+ );
+
+ // Note about the lower letter:
+ // <https://github.com/commonmark/commonmark-spec/pull/621>
+ assert_eq!(
+ micromark_with_options("<!a\n \n \n>", DANGER),
+ "<!a\n \n \n>",
+ "should support blank lines in declarations"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark_with_options("> <!a\nb", DANGER),
+ // "<blockquote>\n<!a\n</blockquote>\n<p>b</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> a\n<!b", DANGER),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<!b",
+ // "should not support lazyness (2)"
+ // );
+}
+
+#[test]
+fn html_flow_5_cdata() {
+ assert_eq!(
+ micromark_with_options(
+ "<![CDATA[\nfunction matchwo(a,b)\n{\n if (a < b && a < 0) then {\n return 1;\n\n } else {\n\n return 0;\n }\n}\n]]>\nokay",
+ DANGER
+ ),
+ "<![CDATA[\nfunction matchwo(a,b)\n{\n if (a < b && a < 0) then {\n return 1;\n\n } else {\n\n return 0;\n }\n}\n]]>\n<p>okay</p>",
+ "should support cdata (type 5)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<![CDATA[]]>", DANGER),
+ "<![CDATA[]]>",
+ "should support empty cdata"
+ );
+
+ assert_eq!(
+ micromark_with_options("<![CDATA]]>", DANGER),
+ "<p>&lt;![CDATA]]&gt;</p>",
+ "should not support cdata w/ a missing `[`"
+ );
+
+ assert_eq!(
+ micromark_with_options("<![CDATA[]]]>", DANGER),
+ "<![CDATA[]]]>",
+ "should support cdata w/ a single `]` as content"
+ );
+
+ // Extra.
+ assert_eq!(
+ micromark_with_options("Foo\n<![CDATA[", DANGER),
+ "<p>Foo</p>\n<![CDATA[",
+ "should support interrupting paragraphs w/ cdata"
+ );
+
+ // Note: cmjs parses this differently.
+ // See: <https://github.com/commonmark/commonmark.js/issues/193>
+ assert_eq!(
+ micromark_with_options("<![cdata[]]>", DANGER),
+ "<p>&lt;![cdata[]]&gt;</p>",
+ "should not support lowercase cdata"
+ );
+
+ assert_eq!(
+ micromark_with_options("<![CDATA[\n \n \n]]>", DANGER),
+ "<![CDATA[\n \n \n]]>",
+ "should support blank lines in cdata"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark_with_options("> <![CDATA[\na", DANGER),
+ // "<blockquote>\n<![CDATA[\n</blockquote>\n<p>a</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> a\n<![CDATA[", DANGER),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<![CDATA[",
+ // "should not support lazyness (2)"
+ // );
+}
+
+#[test]
+fn html_flow_6_basic() {
+ // To do: phrasing, paragraphs, etc.
+ // assert_eq!(
+ // micromark_with_options(
+ // "<table><tr><td>\n<pre>\n**Hello**,\n\n_world_.\n</pre>\n</td></tr></table>",
+ // DANGER
+ // ),
+ // "<table><tr><td>\n<pre>\n**Hello**,\n<p><em>world</em>.\n</pre></p>\n</td></tr></table>",
+ // "should support html (basic)"
+ // );
+
+ // To do: paragraphs.
+ // assert_eq!(
+ // micromark_with_options(
+ // "<table>
+ // <tr>
+ // <td>
+ // hi
+ // </td>
+ // </tr>
+ // </table>
+
+ // okay.",
+ // DANGER
+ // ),
+ // "<table>
+ // <tr>
+ // <td>
+ // hi
+ // </td>
+ // </tr>
+ // </table>
+ // <p>okay.</p>",
+ // "should support html of type 6 (1)"
+ // );
+
+ assert_eq!(
+ micromark_with_options(" <div>\n *hello*\n <foo><a>", DANGER),
+ " <div>\n *hello*\n <foo><a>",
+ "should support html of type 6 (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("</div>\n*foo*", DANGER),
+ "</div>\n*foo*",
+ "should support html starting w/ a closing tag"
+ );
+
+ // To do: phrasing
+ // assert_eq!(
+ // micromark_with_options("<DIV CLASS=\"foo\">\n\n*Markdown*\n\n</DIV>", DANGER),
+ // "<DIV CLASS=\"foo\">\n<p><em>Markdown</em></p>\n</DIV>",
+ // "should support html w/ markdown in between"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<div id=\"foo\"\n class=\"bar\">\n</div>", DANGER),
+ "<div id=\"foo\"\n class=\"bar\">\n</div>",
+ "should support html w/ line endings (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div id=\"foo\" class=\"bar\n baz\">\n</div>", DANGER),
+ "<div id=\"foo\" class=\"bar\n baz\">\n</div>",
+ "should support html w/ line endings (2)"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<div>\n*foo*\n\n*bar*", DANGER),
+ // "<div>\n*foo*\n<p><em>bar</em></p>",
+ // "should support an unclosed html element"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<div id=\"foo\"\n*hi*", DANGER),
+ "<div id=\"foo\"\n*hi*",
+ "should support garbage html (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div class\nfoo", DANGER),
+ "<div class\nfoo",
+ "should support garbage html (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div *???-&&&-<---\n*foo*", DANGER),
+ "<div *???-&&&-<---\n*foo*",
+ "should support garbage html (3)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div><a href=\"bar\">*foo*</a></div>", DANGER),
+ "<div><a href=\"bar\">*foo*</a></div>",
+ "should support other tags in the opening (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<table><tr><td>\nfoo\n</td></tr></table>", DANGER),
+ "<table><tr><td>\nfoo\n</td></tr></table>",
+ "should support other tags in the opening (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div></div>\n``` c\nint x = 33;\n```", DANGER),
+ "<div></div>\n``` c\nint x = 33;\n```",
+ "should include everything ’till a blank line"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark_with_options("> <div>\n> foo\n\nbar", DANGER),
+ // "<blockquote>\n<div>\nfoo\n</blockquote>\n<p>bar</p>",
+ // "should support basic tags w/o ending in containers (1)"
+ // );
+
+ // To do: list.
+ // assert_eq!(
+ // micromark_with_options("- <div>\n- foo", DANGER),
+ // "<ul>\n<li>\n<div>\n</li>\n<li>foo</li>\n</ul>",
+ // "should support basic tags w/o ending in containers (2)"
+ // );
+
+ assert_eq!(
+ micromark_with_options(" <div>", DANGER),
+ " <div>",
+ "should support basic tags w/ indent"
+ );
+
+ assert_eq!(
+ micromark_with_options(" <div>", DANGER),
+ "<pre><code>&lt;div&gt;\n</code></pre>",
+ "should not support basic tags w/ a 4 character indent"
+ );
+
+ assert_eq!(
+ micromark_with_options("Foo\n<div>\nbar\n</div>", DANGER),
+ "<p>Foo</p>\n<div>\nbar\n</div>",
+ "should support interrupting paragraphs w/ basic tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div>\nbar\n</div>\n*foo*", DANGER),
+ "<div>\nbar\n</div>\n*foo*",
+ "should require a blank line to end"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<div>\n\n*Emphasized* text.\n\n</div>", DANGER),
+ // "<div>\n<p><em>Emphasized</em> text.</p>\n</div>",
+ // "should support interleaving w/ blank lines"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<div>\n*Emphasized* text.\n</div>", DANGER),
+ "<div>\n*Emphasized* text.\n</div>",
+ "should not support interleaving w/o blank lines"
+ );
+
+ assert_eq!(
+ micromark_with_options(
+ "<table>\n\n<tr>\n\n<td>\nHi\n</td>\n\n</tr>\n\n</table>",
+ DANGER
+ ),
+ "<table>\n<tr>\n<td>\nHi\n</td>\n</tr>\n</table>",
+ "should support blank lines between adjacent html"
+ );
+
+ assert_eq!(
+ micromark_with_options(
+ "<table>
+
+ <tr>
+
+ <td>
+ Hi
+ </td>
+
+ </tr>
+
+</table>",
+ DANGER
+ ),
+ "<table>
+ <tr>
+<pre><code>&lt;td&gt;
+ Hi
+&lt;/td&gt;
+</code></pre>
+ </tr>
+</table>",
+ "should not support indented, blank-line delimited, adjacent html"
+ );
+
+ assert_eq!(
+ micromark_with_options("</1>", DANGER),
+ "<p>&lt;/1&gt;</p>",
+ "should not support basic tags w/ an incorrect name start character"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div", DANGER),
+ "<div",
+ "should support an eof directly after a basic tag name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div\n", DANGER),
+ "<div\n",
+ "should support a line ending directly after a tag name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div ", DANGER),
+ "<div ",
+ "should support an eof after a space directly after a tag name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div/", DANGER),
+ "<p>&lt;div/</p>",
+ "should not support an eof directly after a self-closing slash"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<div/\n*asd*", DANGER),
+ // "<p>&lt;div/\n<em>asd</em></p>",
+ // "should not support a line ending after a self-closing slash"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<div/>", DANGER),
+ "<div/>",
+ "should support an eof after a self-closing tag"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div/>\na", DANGER),
+ "<div/>\na",
+ "should support a line ending after a self-closing tag"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div/>a", DANGER),
+ "<div/>a",
+ "should support another character after a self-closing tag"
+ );
+
+ assert_eq!(
+ micromark_with_options("<div>a", DANGER),
+ "<div>a",
+ "should support another character after a basic opening tag"
+ );
+
+ // Extra.
+ assert_eq!(
+ micromark_with_options("Foo\n<div/>", DANGER),
+ "<p>Foo</p>\n<div/>",
+ "should support interrupting paragraphs w/ self-closing basic tags"
+ );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark_with_options("<div\n \n \n>", DANGER),
+ // "<div\n<blockquote>\n</blockquote>",
+ // "should not support blank lines in basic"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> <div\na", DANGER),
+ // "<blockquote>\n<div\n</blockquote>\n<p>a</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> a\n<div", DANGER),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<div",
+ // "should not support lazyness (2)"
+ // );
+}
+
+#[test]
+fn html_flow_7_complete() {
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<a href=\"foo\">\n*bar*\n</a>", DANGER),
+ // "<a href=\"foo\">\n*bar*\n</a>",
+ // "should support complete tags (type 7)"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<Warning>\n*bar*\n</Warning>", DANGER),
+ "<Warning>\n*bar*\n</Warning>",
+ "should support non-html tag names"
+ );
+
+ assert_eq!(
+ micromark_with_options("<i class=\"foo\">\n*bar*\n</i>", DANGER),
+ "<i class=\"foo\">\n*bar*\n</i>",
+ "should support non-“block” html tag names (1)"
+ );
+
+ assert_eq!(
+ micromark_with_options("<del>\n*foo*\n</del>", DANGER),
+ "<del>\n*foo*\n</del>",
+ "should support non-“block” html tag names (2)"
+ );
+
+ assert_eq!(
+ micromark_with_options("</ins>\n*bar*", DANGER),
+ "</ins>\n*bar*",
+ "should support closing tags"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<del>\n\n*foo*\n\n</del>", DANGER),
+ // "<del>\n<p><em>foo</em></p>\n</del>",
+ // "should support interleaving"
+ // );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<del>*foo*</del>", DANGER),
+ // "<p><del><em>foo</em></del></p>",
+ // "should not support interleaving w/o blank lines"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<div>\n \nasd", DANGER),
+ "<div>\n<p>asd</p>",
+ "should support interleaving w/ whitespace-only blank lines"
+ );
+
+ // To do: interrupting.
+ // assert_eq!(
+ // micromark_with_options("Foo\n<a href=\"bar\">\nbaz", DANGER),
+ // "<p>Foo\n<a href=\"bar\">\nbaz</p>",
+ // "should not support interrupting paragraphs w/ complete tags"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<x", DANGER),
+ "<p>&lt;x</p>",
+ "should not support an eof directly after a tag name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x/", DANGER),
+ "<p>&lt;x/</p>",
+ "should not support an eof directly after a self-closing slash"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x\n", DANGER),
+ "<p>&lt;x</p>\n",
+ "should not support a line ending directly after a tag name"
+ );
+
+ // To do: paragraphs (trailing whitespace).
+ // assert_eq!(
+ // micromark_with_options("<x ", DANGER),
+ // "<p>&lt;x</p>",
+ // "should not support an eof after a space directly after a tag name"
+ // );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark_with_options("<x/\n*asd*", DANGER),
+ // "<p>&lt;x/\n<em>asd</em></p>",
+ // "should not support a line ending after a self-closing slash"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<x/>", DANGER),
+ "<x/>",
+ "should support an eof after a self-closing tag"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x/>\na", DANGER),
+ "<x/>\na",
+ "should support a line ending after a self-closing tag"
+ );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<x/>a", DANGER),
+ // "<p><x/>a</p>",
+ // "should not support another character after a self-closing tag"
+ // );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<x>a", DANGER),
+ // "<p><x>a</p>",
+ // "should not support another character after an opening tag"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<x y>", DANGER),
+ "<x y>",
+ "should support boolean attributes in a complete tag"
+ );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<x\ny>", DANGER),
+ // "<p><x\ny></p>",
+ // "should not support a line ending before an attribute name"
+ // );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<x\n y>", DANGER),
+ // "<p><x\ny></p>",
+ // "should not support a line ending w/ whitespace before an attribute name"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<x\n \ny>", DANGER),
+ "<p>&lt;x</p>\n<p>y&gt;</p>",
+ "should not support a line ending w/ whitespace and another line ending before an attribute name"
+ );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<x y\nz>", DANGER),
+ // "<p><x y\nz></p>",
+ // "should not support a line ending between attribute names"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<x y z>", DANGER),
+ "<x y z>",
+ "should support whitespace between attribute names"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x:y>", DANGER),
+ "<p>&lt;x:y&gt;</p>",
+ "should not support a colon in a tag name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x_y>", DANGER),
+ "<p>&lt;x_y&gt;</p>",
+ "should not support an underscore in a tag name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x.y>", DANGER),
+ "<p>&lt;x.y&gt;</p>",
+ "should not support a dot in a tag name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x :y>", DANGER),
+ "<x :y>",
+ "should support a colon to start an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x _y>", DANGER),
+ "<x _y>",
+ "should support an underscore to start an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x .y>", DANGER),
+ "<p>&lt;x .y&gt;</p>",
+ "should not support a dot to start an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y:>", DANGER),
+ "<x y:>",
+ "should support a colon to end an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y_>", DANGER),
+ "<x y_>",
+ "should support an underscore to end an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y.>", DANGER),
+ "<x y.>",
+ "should support a dot to end an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y123>", DANGER),
+ "<x y123>",
+ "should support numbers to end an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x data->", DANGER),
+ "<x data->",
+ "should support a dash to end an attribute name"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y=>", DANGER),
+ "<p>&lt;x y=&gt;</p>",
+ "should not upport an initializer w/o a value"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y==>", DANGER),
+ "<p>&lt;x y==&gt;</p>",
+ "should not support an equals to as an initializer"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y=z>", DANGER),
+ "<x y=z>",
+ "should support a single character as an unquoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y=\"\">", DANGER),
+ "<x y=\"\">",
+ "should support an empty double quoted attribute value"
+ );
+
+ assert_eq!(
+ micromark_with_options("<x y=\"\">", DANGER),
+ "<x y=\"\">",
+ "should support an empty single quoted attribute value"
+ );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<x y=\"\n\">", DANGER),
+ // "<p><x y=\"\n\"></p>",
+ // "should not support a line ending in a double quoted attribute value"
+ // );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<x y=\"\n\">", DANGER),
+ // "<p><x y=\"\n\"></p>",
+ // "should not support a line ending in a single quoted attribute value"
+ // );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<w x=y\nz>", DANGER),
+ // "<p><w x=y\nz></p>",
+ // "should not support a line ending in/after an unquoted attribute value"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<w x=y\"z>", DANGER),
+ "<p>&lt;w x=y&quot;z&gt;</p>",
+ "should not support a double quote in/after an unquoted attribute value"
+ );
+
+ // To do: html (text).
+ // assert_eq!(
+ // micromark_with_options("<w x=y\"z>", DANGER),
+ // "<p>&lt;w x=y\"z&gt;</p>",
+ // "should not support a single quote in/after an unquoted attribute value"
+ // );
+
+ assert_eq!(
+ micromark_with_options("<x y=\"\"z>", DANGER),
+ "<p>&lt;x y=&quot;&quot;z&gt;</p>",
+ "should not support an attribute after a double quoted attribute value"
+ );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark_with_options("<x>\n \n \n>", DANGER),
+ // "<x>\n<blockquote>\n</blockquote>",
+ // "should not support blank lines in complete"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> <a>\n*bar*", DANGER),
+ // "<blockquote>\n<a>\n</blockquote>\n<p><em>bar</em></p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark_with_options("> a\n<a>", DANGER),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<a>",
+ // "should not support lazyness (2)"
+ // );
+}
diff --git a/tests/lib.rs b/tests/lib.rs
new file mode 100644
index 0000000..18fcef2
--- /dev/null
+++ b/tests/lib.rs
@@ -0,0 +1,8 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn basic() {
+ assert_eq!(micromark("asd"), "<p>asd</p>", "should work");
+ assert_eq!(micromark("1 < 3"), "<p>1 &lt; 3</p>", "should encode");
+}
diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs
new file mode 100644
index 0000000..833fa6f
--- /dev/null
+++ b/tests/thematic_break.rs
@@ -0,0 +1,181 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn thematic_break() {
+ assert_eq!(
+ micromark("***\n---\n___"),
+ "<hr />\n<hr />\n<hr />",
+ "should support thematic breaks w/ asterisks, dashes, and underscores"
+ );
+
+ assert_eq!(
+ micromark("+++"),
+ "<p>+++</p>",
+ "should not support thematic breaks w/ plusses"
+ );
+
+ assert_eq!(
+ micromark("==="),
+ "<p>===</p>",
+ "should not support thematic breaks w/ equals"
+ );
+
+ assert_eq!(
+ micromark("--"),
+ "<p>--</p>",
+ "should not support thematic breaks w/ two dashes"
+ );
+
+ assert_eq!(
+ micromark("**"),
+ "<p>**</p>",
+ "should not support thematic breaks w/ two asterisks"
+ );
+
+ assert_eq!(
+ micromark("__"),
+ "<p>__</p>",
+ "should not support thematic breaks w/ two underscores"
+ );
+
+ assert_eq!(
+ micromark(" ***"),
+ "<hr />",
+ "should support thematic breaks w/ 1 space"
+ );
+
+ assert_eq!(
+ micromark(" ***"),
+ "<hr />",
+ "should support thematic breaks w/ 2 spaces"
+ );
+
+ assert_eq!(
+ micromark(" ***"),
+ "<hr />",
+ "should support thematic breaks w/ 3 spaces"
+ );
+
+ assert_eq!(
+ micromark(" ***"),
+ "<pre><code>***\n</code></pre>",
+ "should not support thematic breaks w/ 4 spaces"
+ );
+
+ // To do: paragraphs.
+ // assert_eq!(
+ // micromark("Foo\n ***"),
+ // "<p>Foo\n***</p>",
+ // "should not support thematic breaks w/ 4 spaces as paragraph continuation"
+ // );
+
+ assert_eq!(
+ micromark("_____________________________________"),
+ "<hr />",
+ "should support thematic breaks w/ many markers"
+ );
+
+ assert_eq!(
+ micromark(" - - -"),
+ "<hr />",
+ "should support thematic breaks w/ spaces (1)"
+ );
+
+ assert_eq!(
+ micromark(" ** * ** * ** * **"),
+ "<hr />",
+ "should support thematic breaks w/ spaces (2)"
+ );
+
+ assert_eq!(
+ micromark("- - - -"),
+ "<hr />",
+ "should support thematic breaks w/ spaces (3)"
+ );
+
+ assert_eq!(
+ micromark("- - - - "),
+ "<hr />",
+ "should support thematic breaks w/ trailing spaces"
+ );
+
+ assert_eq!(
+ micromark("_ _ _ _ a"),
+ "<p>_ _ _ _ a</p>",
+ "should not support thematic breaks w/ other characters (1)"
+ );
+
+ assert_eq!(
+ micromark("a------"),
+ "<p>a------</p>",
+ "should not support thematic breaks w/ other characters (2)"
+ );
+
+ assert_eq!(
+ micromark("---a---"),
+ "<p>---a---</p>",
+ "should not support thematic breaks w/ other characters (3)"
+ );
+
+ // To do: phrasing.
+ // assert_eq!(
+ // micromark(" *-*"),
+ // "<p><em>-</em></p>",
+ // "should not support thematic breaks w/ mixed markers"
+ // );
+
+ // To do: lists.
+ // assert_eq!(
+ // micromark("- foo\n***\n- bar"),
+ // "<ul>\n<li>foo</li>\n</ul>\n<hr />\n<ul>\n<li>bar</li>\n</ul>",
+ // "should support thematic breaks mixed w/ lists (1)"
+ // );
+
+ // assert_eq!(
+ // micromark("* Foo\n* * *\n* Bar"),
+ // "<ul>\n<li>Foo</li>\n</ul>\n<hr />\n<ul>\n<li>Bar</li>\n</ul>",
+ // "should support thematic breaks mixed w/ lists (2)"
+ // );
+
+ // To do: paragraph.
+ // assert_eq!(
+ // micromark("Foo\n***\nbar"),
+ // "<p>Foo</p>\n<hr />\n<p>bar</p>",
+ // "should support thematic breaks interrupting paragraphs"
+ // );
+
+ // To do: setext.
+ // assert_eq!(
+ // micromark("Foo\n---\nbar"),
+ // "<h2>Foo</h2>\n<p>bar</p>",
+ // "should not support thematic breaks w/ dashes interrupting paragraphs (setext heading)"
+ // );
+
+ // To do: list.
+ // assert_eq!(
+ // micromark("- Foo\n- * * *"),
+ // "<ul>\n<li>Foo</li>\n<li>\n<hr />\n</li>\n</ul>",
+ // "should support thematic breaks in lists"
+ // );
+
+ // To do: blockquote.
+ // assert_eq!(
+ // micromark("> ---\na"),
+ // "<blockquote>\n<hr />\n</blockquote>\n<p>a</p>",
+ // "should not support lazyness (1)"
+ // );
+
+ // assert_eq!(
+ // micromark("> a\n---"),
+ // "<blockquote>\n<p>a</p>\n</blockquote>\n<hr />",
+ // "should not support lazyness (2)"
+ // );
+
+ // To do: extensions.
+ // assert_eq!(
+ // micromark("***", {extensions: [{disable: {null: ["thematicBreak"]}}]}),
+ // "<p>***</p>",
+ // "should support turning off thematic breaks"
+ // );
+}