From 4c06c8554c35887f8f5147783953b2b7e7c2327f Mon Sep 17 00:00:00 2001
From: Titus Wormer ".to_string());
+ }
+ TokenType::CodeIndented => {
+ code_flow_seen_data = Some(false);
+ line_ending_if_needed(buffers);
+ buf_tail_mut(buffers).push("".to_string());
+ }
+ TokenType::CodeFenced => {
+ code_flow_seen_data = Some(false);
+ line_ending_if_needed(buffers);
+ // Note: no `>`, which is added later.
+ buf_tail_mut(buffers).push("
{
+ buffer(buffers);
+ }
+ TokenType::HtmlFlow => {
+ line_ending_if_needed(buffers);
+ if options.allow_dangerous_html {
+ ignore_encode = true;
+ }
+ }
+ TokenType::ContentPhrasing
+ | TokenType::AtxHeading
+ | TokenType::AtxHeadingSequence
+ | TokenType::AtxHeadingWhitespace
+ | TokenType::AtxHeadingText
+ | TokenType::LineEnding
+ | TokenType::ThematicBreak
+ | TokenType::ThematicBreakSequence
+ | TokenType::ThematicBreakWhitespace
+ | TokenType::CodeIndentedPrefixWhitespace
+ | TokenType::CodeFlowChunk
+ | TokenType::BlankLineEnding
+ | TokenType::BlankLineWhitespace
+ | TokenType::Whitespace
+ | TokenType::HtmlFlowData
+ | TokenType::CodeFencedFence
+ | TokenType::CodeFencedFenceSequence
+ | TokenType::ChunkString
+ | TokenType::CodeFencedFenceWhitespace
+ | TokenType::Data
+ | TokenType::CharacterEscape
+ | TokenType::CharacterEscapeMarker
+ | TokenType::CharacterEscapeValue
+ | TokenType::CharacterReference
+ | TokenType::CharacterReferenceMarker
+ | TokenType::CharacterReferenceMarkerNumeric
+ | TokenType::CharacterReferenceMarkerHexadecimal
+ | TokenType::CharacterReferenceMarkerSemi
+ | TokenType::CharacterReferenceValue => {}
+ #[allow(unreachable_patterns)]
+ _ => {
+ unreachable!("unhandled `enter` of TokenType {:?}", token_type)
+ }
+ },
+ EventType::Exit => match token_type {
+ TokenType::ThematicBreakSequence
+ | TokenType::ThematicBreakWhitespace
+ | TokenType::CodeIndentedPrefixWhitespace
+ | TokenType::BlankLineEnding
+ | TokenType::BlankLineWhitespace
+ | TokenType::Whitespace
+ | TokenType::CodeFencedFenceSequence
+ | TokenType::CodeFencedFenceWhitespace
+ | TokenType::CharacterEscape
+ | TokenType::CharacterEscapeMarker
+ | TokenType::CharacterReference
+ | TokenType::CharacterReferenceMarkerSemi => {}
+ TokenType::HtmlFlow => {
+ ignore_encode = false;
+ }
+ TokenType::HtmlFlowData => {
+ let slice = slice_serialize(codes, &get_span(events, index), false);
+
+ let res = if ignore_encode { slice } else { encode(&slice) };
+
+ // last_was_tag = false;
+ buf_tail_mut(buffers).push(res);
+ }
+ TokenType::Content => {
+ buf_tail_mut(buffers).push("
` and the `` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! The optional `meta` part is ignored: it is not used when parsing or
+//! rendering.
+//! The optional `info` part is used and is expected to specify the programming
+//! language that the code is in.
+//! Which value it holds depends on what your syntax highlighter supports, if
+//! one is used.
+//! The `info` is, when rendering to HTML, typically exposed as a class.
+//! This behavior stems from the HTML spec ([*§ 4.5.15 The `code`
+//! element*][html-code]).
+//! For example:
+//!
+//! ```markdown
+//! ~~~css
+//! * { color: tomato }
+//! ~~~
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <pre><code class="language-css">* { color: tomato }
+//! </code></pre>
+//! ```
+//!
+//! The `info` and `meta` parts are interpreted as the string content type.
+//! That means that character escapes and character reference are allowed.
+//!
+//! In markdown, it is also possible to use code (text) in the text content
+//! type.
+//! It is also possible to create code with the
+//! [code (indented)][code-indented] construct.
+//! That construct is less explicit, different from code (text), and has no
+//! support for specifying the programming language, so it is recommended to
+//! use code (fenced) instead of code (indented).
+//!
+//! ## References
+//!
+//! * [`code-fenced.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-fenced.js)
+//! * [*§ 4.5 Fenced code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#fenced-code-blocks)
+//!
+//! [code-indented]: crate::construct::code_indented
+//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+//!
+//!
+
+use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::get_span;
+
+/// Kind of fences.
+///
+/// The closing fence must use the same marker character as the opening fence.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Kind {
+ /// Grave accent (tick) code, fenced with `` ` `` markers.
+ GraveAccent,
+ /// Tilde code, fenced with `~` markers.
+ Tilde,
+}
+
+/// State needed to parse code (fenced).
+///
+/// Threaded by value through the state functions below (and cloned where one
+/// state function must hand it to two alternative continuations).
+#[derive(Debug, Clone)]
+struct Info {
+ /// Number of markers on the opening fence sequence.
+ /// The closing fence must have at least this many markers (`close_sequence`).
+ size: usize,
+ /// Number of tabs or spaces of indentation before the opening fence
+ /// sequence.
+ /// At most this much leading whitespace is stripped from each content line
+ /// (`content_prefix`).
+ prefix: usize,
+ /// Kind of fences.
+ kind: Kind,
+}
+
+/// Start of fenced code.
+///
+/// ```markdown
+/// | ~~~js
+/// console.log(1);
+/// ~~~
+/// ```
+///
+/// Parsing note: normally, the prefix is already stripped.
+/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need
+/// it.
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::CodeFenced);
+ tokenizer.enter(TokenType::CodeFencedFence);
+ // Consume optional indentation first; `before_sequence_open` then measures
+ // it back from the `Whitespace` event this may emit.
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(before_sequence_open),
+ )(tokenizer, code)
+}
+
+/// Inside the opening fence, after an optional prefix, before a sequence.
+///
+/// ```markdown
+/// |~~~js
+/// console.log(1);
+/// ~~~
+/// ```
+fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let tail = tokenizer.events.last();
+ let mut prefix = 0;
+
+ // If the whitespace attempt in `start` tokenized something, the length of
+ // that last event's span is the indentation (prefix) of the opening fence.
+ if let Some(event) = tail {
+ if event.token_type == TokenType::Whitespace {
+ let span = get_span(&tokenizer.events, tokenizer.events.len() - 1);
+ prefix = span.end_index - span.start_index;
+ }
+ }
+
+ // Only `` ` `` and `~` can open a fence; anything else is not code (fenced).
+ match code {
+ Code::Char(char) if char == '`' || char == '~' => {
+ tokenizer.enter(TokenType::CodeFencedFenceSequence);
+ sequence_open(
+ tokenizer,
+ Info {
+ prefix,
+ size: 0,
+ kind: if char == '`' {
+ Kind::GraveAccent
+ } else {
+ Kind::Tilde
+ },
+ },
+ code,
+ )
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside the opening fence sequence.
+///
+/// ```markdown
+/// ~|~~js
+/// console.log(1);
+/// ~~~
+/// ```
+fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ let marker = if info.kind == Kind::GraveAccent {
+ '`'
+ } else {
+ '~'
+ };
+
+ match code {
+ // Another marker: consume it and count it by recursing with `size + 1`.
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ let mut info = info;
+ info.size += 1;
+ sequence_open(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ _ => {
+ // Fewer than `CODE_FENCED_SEQUENCE_SIZE_MIN` markers: not a fence.
+ if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN {
+ (State::Nok, None)
+ } else {
+ tokenizer.exit(TokenType::CodeFencedFenceSequence);
+ tokenizer.attempt(
+ |tokenizer, code| {
+ whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace)
+ },
+ |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)),
+ )(tokenizer, code)
+ }
+ }
+ }
+}
+
+/// Inside the opening fence, after the sequence (and optional whitespace), before the info.
+///
+/// ```markdown
+/// ~~~|js
+/// console.log(1);
+/// ~~~
+/// ```
+fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ // At eol/eof there is no info string: the opening fence is done.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::CodeFencedFenceInfo);
+ tokenizer.enter(TokenType::ChunkString);
+ info_inside(tokenizer, info, code, vec![])
+ }
+ }
+}
+
+/// Inside the opening fence info.
+///
+/// ```markdown
+/// ~~~j|s
+/// console.log(1);
+/// ~~~
+/// ```
+///
+/// `codes` accumulates the code points of the info seen so far; they are
+/// meant to be subtokenized as string content later (see the `to do` prints).
+fn info_inside(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ code: Code,
+ codes: Vec<Code>,
+) -> StateFnResult {
+ match code {
+ // Eol/eof: info and the whole opening fence are done.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ println!("to do: subtokenize: {:?}", codes);
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::CodeFencedFenceInfo);
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ // Whitespace after the info: an optional meta part may follow.
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ println!("to do: subtokenize: {:?}", codes);
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::CodeFencedFenceInfo);
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace),
+ |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)),
+ )(tokenizer, code)
+ }
+ // Per `CommonMark`, the info of tick-fenced code must not contain ticks.
+ Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
+ Code::Char(_) => {
+ let mut codes = codes;
+ codes.push(code);
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ info_inside(tokenizer, info, code, codes)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// Inside the opening fence, after the info and whitespace, before the meta.
+///
+/// ```markdown
+/// ~~~js |eval
+/// console.log(1);
+/// ~~~
+/// ```
+fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ // Eol/eof: no meta part; the opening fence is done.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::CodeFencedFenceMeta);
+ tokenizer.enter(TokenType::ChunkString);
+ meta(tokenizer, info, code)
+ }
+ }
+}
+
+/// Inside the opening fence meta.
+///
+/// ```markdown
+/// ~~~js e|val
+/// console.log(1);
+/// ~~~
+/// ```
+fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ // Eol/eof ends the meta and the whole opening fence.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::CodeFencedFenceMeta);
+ tokenizer.exit(TokenType::CodeFencedFence);
+ at_break(tokenizer, info, code)
+ }
+ // As with the info: no ticks allowed in the meta of tick-fenced code.
+ Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))),
+ None,
+ )
+ }
+ }
+}
+
+/// At an eol/eof in code, before a closing fence or before content.
+///
+/// ```markdown
+/// ~~~js|
+/// aa|
+/// ~~~
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ // `info` is moved into the closing-fence attempt below; keep a copy for
+ // the content (fallback) path.
+ let clone = info.clone();
+
+ match code {
+ Code::None => after(tokenizer, code),
+ // Try: consume the eol, then a closing fence.
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.attempt(
+ |tokenizer, code| {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ close_before(tokenizer, info, code)
+ })),
+ None,
+ )
+ },
+ |ok| {
+ if ok {
+ Box::new(after)
+ } else {
+ // Not a closing fence: the eol is (re)tokenized here as a
+ // plain line ending, then content follows.
+ // NOTE(review): assumes `Tokenizer::attempt` rewinds on
+ // failure — confirm against `tokenizer.rs`.
+ Box::new(|tokenizer, code| {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ content_start(tokenizer, clone, code)
+ })),
+ None,
+ )
+ })
+ }
+ },
+ )(tokenizer, code),
+ _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code),
+ }
+}
+
+/// Before a closing fence, before optional whitespace.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// |~~~
+///
+/// ~~~js
+/// console.log('1')
+/// | ~~~
+/// ```
+fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::CodeFencedFence);
+ // Optional indentation before the closing sequence; measured below.
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)),
+ )(tokenizer, code)
+}
+
+/// In a closing fence, after optional whitespace, before sequence.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// |~~~
+///
+/// ~~~js
+/// console.log('1')
+/// |~~~
+/// ```
+fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ let tail = tokenizer.events.last();
+ let mut prefix = 0;
+ let marker = if info.kind == Kind::GraveAccent {
+ '`'
+ } else {
+ '~'
+ };
+
+ // Same trick as `before_sequence_open`: the length of a just-emitted
+ // `Whitespace` event is the indentation of this candidate closing fence.
+ if let Some(event) = tail {
+ if event.token_type == TokenType::Whitespace {
+ let span = get_span(&tokenizer.events, tokenizer.events.len() - 1);
+ prefix = span.end_index - span.start_index;
+ }
+ }
+
+ // A fence indented `TAB_SIZE` or more cannot close the block.
+ // To do: 4+ should be okay if code (indented) is turned off!
+ if prefix >= TAB_SIZE {
+ return (State::Nok, None);
+ }
+
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.enter(TokenType::CodeFencedFenceSequence);
+ close_sequence(tokenizer, info, code, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In the closing fence sequence.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// ~|~~
+/// ```
+fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult {
+ let marker = if info.kind == Kind::GraveAccent {
+ '`'
+ } else {
+ '~'
+ };
+
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ close_sequence(tokenizer, info, code, size + 1)
+ })),
+ None,
+ )
+ }
+ // The closing sequence must be at least the minimum fence length and
+ // at least as long as the opening sequence.
+ _ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => {
+ tokenizer.exit(TokenType::CodeFencedFenceSequence);
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace),
+ |_ok| Box::new(close_whitespace_after),
+ )(tokenizer, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After the closing fence sequence after optional whitespace.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// ~~~ |
+/// ```
+fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ // Only eol/eof may follow a closing fence; the current code is handed
+ // back to the caller unconsumed.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFencedFence);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Before code content, definitely not before a closing fence.
+///
+/// ```markdown
+/// ~~~js
+/// |aa
+/// ~~~
+/// ```
+fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ // Blank line (or eof): back to deciding between fence and content.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ at_break(tokenizer, info, code)
+ }
+ // Leading whitespace is stripped only when the opening fence itself
+ // was indented (`info.prefix > 0`).
+ Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => {
+ tokenizer.enter(TokenType::Whitespace);
+ content_prefix(tokenizer, info, 0, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::CodeFlowChunk);
+ content_continue(tokenizer, info, code)
+ }
+ }
+}
+
+/// Before code content, in a prefix.
+///
+/// ```markdown
+/// ~~~js
+/// | aa
+/// ~~~
+/// ```
+fn content_prefix(
+ tokenizer: &mut Tokenizer,
+ info: Info,
+ prefix: usize,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ // Strip at most `info.prefix` spaces/tabs, mirroring the opening
+ // fence’s indentation.
+ Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ content_prefix(tokenizer, info, prefix + 1, code)
+ })),
+ None,
+ )
+ }
+ // Line contained only (stripped) whitespace.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::Whitespace);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.exit(TokenType::Whitespace);
+ tokenizer.enter(TokenType::CodeFlowChunk);
+ content_continue(tokenizer, info, code)
+ }
+ }
+}
+
+/// In code content.
+///
+/// ```markdown
+/// ~~~js
+/// |ab
+/// a|b
+/// ab|
+/// ~~~
+/// ```
+fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+ match code {
+ // Eol/eof ends this chunk; `at_break` decides what comes next.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFlowChunk);
+ at_break(tokenizer, info, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(|tokenizer, code| {
+ content_continue(tokenizer, info, code)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// After fenced code.
+///
+/// ```markdown
+/// ~~~js
+/// console.log('1')
+/// ~~~|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.exit(TokenType::CodeFenced);
+ // Hand the current (unconsumed) code back to the caller.
+ (State::Ok, Some(vec![code]))
+}
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
new file mode 100644
index 0000000..6bf089b
--- /dev/null
+++ b/src/construct/code_indented.rs
@@ -0,0 +1,190 @@
+//! Code (indented) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line )
+//!
+//! ; Restriction: at least one `code` must not be whitespace.
+//! indented_filled_line ::= 4space_or_tab *code
+//! blank_line ::= *space_or_tab
+//! eol ::= '\r' | '\r\n' | '\n'
+//! code ::= . ; any unicode code point (other than line endings).
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! Code (indented) relates to both the `<pre>` and the `<code>` elements in
+//! HTML.
+//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
+//! element*][html-code] in the HTML spec for more info.
+//!
+//! In markdown, it is also possible to use code (text) in the text content
+//! type.
+//! It is also possible to create code with the [code (fenced)][code-fenced]
+//! construct.
+//! That construct is more explicit, more similar to code (text), and has
+//! support for specifying the programming language that the code is in, so it
+//! is recommended to use that instead of indented code.
+//!
+//! ## References
+//!
+//! * [`code-indented.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-indented.js)
+//! * [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks)
+//!
+//! [code-fenced]: crate::construct::code_fenced
+//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+//!
+//!
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of code (indented).
+///
+/// ```markdown
+/// | asd
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ // Must begin with whitespace; `indent` then checks there is enough.
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.enter(TokenType::CodeIndented);
+ tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+ indent(tokenizer, code, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside the initial whitespace.
+///
+/// ```markdown
+/// | asd
+/// | asd
+/// | asd
+/// |asd
+/// ```
+///
+/// > **Parsing note**: it is not needed to check if this first line is a
+/// > filled line (that it has a non-whitespace character), because blank lines
+/// > are parsed already, so we never run into that.
+fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ match code {
+ // `TAB_SIZE` codes of whitespace consumed: the prefix is complete,
+ // whatever the current code is.
+ _ if size == TAB_SIZE => {
+ tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+ at_break(tokenizer, code)
+ }
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ indent(tokenizer, code, size + 1)
+ })),
+ None,
+ )
+ }
+ // Content before `TAB_SIZE` whitespace: not code (indented).
+ _ => (State::Nok, None),
+ }
+}
+
+/// At a break.
+///
+/// ```markdown
+/// |asd
+/// asd|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => after(tokenizer, code),
+ // At an eol: try to parse another indented line (possibly after blank
+ // lines); if that fails the construct ends before this eol.
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
+ .attempt(further_start, |ok| {
+ Box::new(if ok { at_break } else { after })
+ })(tokenizer, code),
+ _ => {
+ tokenizer.enter(TokenType::CodeFlowChunk);
+ content(tokenizer, code)
+ }
+ }
+}
+
+/// Inside code content.
+///
+/// ```markdown
+/// |ab
+/// a|b
+/// ab|
+/// ```
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ // Eol/eof ends this chunk.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeFlowChunk);
+ at_break(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(content)), None)
+ }
+ }
+}
+
+/// After indented code.
+///
+/// ```markdown
+/// ab|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.exit(TokenType::CodeIndented);
+ // Hand the current (unconsumed) code back to the caller.
+ (State::Ok, Some(vec![code]))
+}
+
+/// Right at a line ending, trying to parse another indent.
+///
+/// ```markdown
+/// ab|
+/// cd
+/// ```
+fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // To do: `nok` if lazy line.
+ match code {
+ // Consume the eol; looping back here lets blank lines inside the
+ // block (which start with another eol) be skipped.
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(further_start)), None)
+ }
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
+ further_indent(tokenizer, code, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// Inside further whitespace.
+///
+/// ```markdown
+/// asd
+/// | asd
+/// ```
+fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ match code {
+ // Full `TAB_SIZE` prefix found: this line continues the block.
+ _ if size == TAB_SIZE => {
+ tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+ (State::Ok, Some(vec![code]))
+ }
+ Code::VirtualSpace | Code::Char(' ' | '\t') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ further_indent(tokenizer, code, size + 1)
+ })),
+ None,
+ )
+ }
+ // Partially indented blank line: treat it as blank and try again.
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
+ further_start(tokenizer, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
new file mode 100644
index 0000000..b3aef1b
--- /dev/null
+++ b/src/construct/heading_atx.rs
@@ -0,0 +1,175 @@
+//! Heading (atx) is a construct that occurs in the flow content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab
+//!
+//! code ::= . ; any unicode code point (other than line endings).
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML.
+//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
+//! HTML spec][html] for more info.
+//!
+//! `CommonMark` introduced the requirement on whitespace existing after the
+//! opening sequence and before text.
+//! In older markdown versions, this was not required, and headings would form
+//! without it.
+//!
+//! In markdown, it is also possible to create headings with the setext heading
+//! construct.
+//! The benefit of setext headings is that their text can include line endings.
+//! However, their limit is that they cannot form `<h3>` through `<h6>`
+//! headings.
+//! Due to this limitation, it is recommended to use atx headings.
+//!
+//! > 🏛 **Background**: the word *setext* originates from a small markup
+//! > language by Ian Feldman from 1991.
+//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info.
+//! > The word *atx* originates from a tiny markup language by Aaron Swartz
+//! > from 2002.
+//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
+//! > more info.
+//!
+//! ## References
+//!
+//! * [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js)
+//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
+//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
+//! [atx]: http://www.aaronsw.com/2002/atx/
+//!
+//!
+
+use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a heading (atx).
+///
+/// ```markdown
+/// |## alpha
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if Code::Char('#') == code {
+ tokenizer.enter(TokenType::AtxHeading);
+ tokenizer.enter(TokenType::AtxHeadingSequence);
+ sequence_open(tokenizer, code, 0)
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// In the opening sequence.
+///
+/// ```markdown
+/// #|# alpha
+/// ```
+fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult {
+ match code {
+ // Whitespace or eol/eof after at least one `#`: the sequence is done.
+ Code::None
+ | Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\t' | '\n' | '\r' | ' ')
+ if rank > 0 =>
+ {
+ tokenizer.exit(TokenType::AtxHeadingSequence);
+ at_break(tokenizer, code)
+ }
+ // Another `#`, up to the maximum rank.
+ Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ sequence_open(tokenizer, code, rank + 1)
+ })),
+ None,
+ )
+ }
+ // Too many `#`s, or content directly after the sequence: not a heading.
+ _ => (State::Nok, None),
+ }
+}
+
+/// After something but before something else.
+///
+/// ```markdown
+/// ## |alpha
+/// ## alpha| bravo
+/// ## alpha |bravo
+/// ## alpha bravo|##
+/// ## alpha bravo ##|
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // Note: exhaustive over `Code` without a `_` arm.
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::AtxHeading);
+ (State::Ok, Some(vec![code]))
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.enter(TokenType::AtxHeadingWhitespace);
+ whitespace(tokenizer, code)
+ }
+ Code::Char('#') => {
+ tokenizer.enter(TokenType::AtxHeadingSequence);
+ further_sequence(tokenizer, code)
+ }
+ Code::Char(_) => {
+ tokenizer.enter(TokenType::AtxHeadingText);
+ data(tokenizer, code)
+ }
+ }
+}
+
+/// In a further sequence (after whitespace).
+/// Could be normal “visible” hashes in the heading or a final sequence.
+///
+/// ```markdown
+/// ## alpha #|#
+/// ```
+fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if let Code::Char('#') = code {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(further_sequence)), None)
+ } else {
+ tokenizer.exit(TokenType::AtxHeadingSequence);
+ at_break(tokenizer, code)
+ }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// ## alpha | bravo
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(whitespace)), None)
+ }
+ _ => {
+ tokenizer.exit(TokenType::AtxHeadingWhitespace);
+ at_break(tokenizer, code)
+ }
+ }
+}
+
+/// In text.
+///
+/// ```markdown
+/// ## al|pha
+/// ```
+fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
+ tokenizer.exit(TokenType::AtxHeadingText);
+ at_break(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(data)), None)
+ }
+ }
+}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
new file mode 100644
index 0000000..b7d5570
--- /dev/null
+++ b/src/construct/html_flow.rs
@@ -0,0 +1,1068 @@
+//! HTML (flow) is a construct that occurs in the flow content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete
+//!
+//! ; Note: closing tag name need to match opening tag name.
+//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ]
+//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ]
+//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ]
+//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ]
+//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ]
+//! basic ::= '< [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ]
+//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional )
+//!
+//! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive.
+//! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive.
+//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>'
+//! closing_tag ::= '</' tag_name whitespace_optional '>'
+//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric )
+//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ]
+//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric )
+//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" ) "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`')
+//!
+//! whitespace ::= 1*space_or_tab
+//! whitespace_optional ::= [ space_or_tab ]
+//! line ::= code - eol
+//! eol ::= '\r' | '\r\n' | '\n'
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! The grammar for HTML in markdown does not resemble the rules of parsing
+//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
+//! spec][html-parsing].
+//! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?)
+//! attempt to parse an XML-like language.
+//! By extension, another notable property of the grammar is that it can
+//! result in invalid HTML, in that it allows things that wouldn’t work or
+//! wouldn’t work well in HTML, such as mismatched tags.
+//!
+//! Because the **basic** and **complete** productions in the grammar form with
+//! a tag, followed by more stuff, and stop at a blank line, it is possible to
+//! interleave (a word for switching between languages) markdown and HTML
+//! together, by placing the opening and closing tags on their own lines,
+//! with blank lines between them and markdown.
+//! For example:
+//!
+//! ```markdown
+//! <div>This is a div but *this* is not emphasis</div>.
+//!
+//! <div>
+//!
+//! This is a paragraph in a `div` and *this* is emphasis.
+//!
+//! </div>
+//! ```
+//!
+//! The **complete** production of HTML (flow) is not allowed to interrupt
+//! content.
+//! That means that a blank line is needed between a paragraph and it.
+//! However, HTML (text) has a similar production, which will typically kick-in
+//! instead.
+//!
+//! The list of tag names allowed in the **raw** production are defined in
+//! [`HTML_RAW_NAMES`][html_raw_names].
+//! This production exists because there are a few cases where markdown
+//! *inside* some elements, and hence interleaving, does not make sense.
+//!
+//! The list of tag names allowed in the **basic** production are defined in
+//! [`HTML_BLOCK_NAMES`][html_block_names].
+//! This production exists because there are a few cases where we can decide
+//! early that something is going to be a flow (block) element instead of a
+//! phrasing (inline) element.
+//! We *can* interrupt and don’t have to care too much about it being
+//! well-formed.
+//!
+//! ## References
+//!
+//! * [`html-flow.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-flow.js)
+//! * [*§ 4.6 HTML blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#html-blocks)
+//!
+//! [html_raw_names]: crate::constant::HTML_RAW_NAMES
+//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
+//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
+//!
+//!
+
+use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX};
+use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of HTML (flow).
+#[derive(Debug, Clone, PartialEq)]
+enum Kind {
+ /// Not yet known.
+ Unknown,
+ /// Symbol for `
+/// ```
+fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+    match code {
+        Code::Char('>') => {
+            // The buffered closing-tag name is complete: compare it
+            // case-insensitively against the known raw tag names.
+            let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
+            // To do: life times.
+            let mut clone = info;
+            clone.buffer.clear();
+
+            if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) {
+                // A matching raw closing tag: consume `>` and close out the
+                // rest of the line.
+                tokenizer.consume(code);
+                (
+                    State::Fn(Box::new(|tokenizer, code| {
+                        continuation_close(tokenizer, clone, code)
+                    })),
+                    None,
+                )
+            } else {
+                // Not a raw tag name: treat it as plain continuation.
+                continuation(tokenizer, clone, code)
+            }
+        }
+        // Buffer alphabetic characters of the tag name, bounded so a runaway
+        // name cannot grow the buffer indefinitely.
+        Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => {
+            tokenizer.consume(code);
+            // To do: life times.
+            let mut clone = info;
+            clone.buffer.push(char);
+            (
+                State::Fn(Box::new(|tokenizer, code| {
+                    continuation_raw_end_tag(tokenizer, clone, code)
+                })),
+                None,
+            )
+        }
+        _ => continuation(tokenizer, info, code),
+    }
+}
+
+/// After `]` in cdata continuation: a second `]` may start the `]]>` closer.
+///
+/// ```markdown
+/// &<]|]>
+/// ```
+fn continuation_character_data_inside(
+    tokenizer: &mut Tokenizer,
+    info: Info,
+    code: Code,
+) -> StateFnResult {
+    if let Code::Char(']') = code {
+        // A second `]`: from here, `>` closes just like a declaration.
+        tokenizer.consume(code);
+        (
+            State::Fn(Box::new(|tokenizer, code| {
+                continuation_declaration_inside(tokenizer, info, code)
+            })),
+            None,
+        )
+    } else {
+        continuation(tokenizer, info, code)
+    }
+}
+
+/// In declaration or instruction continuation, waiting for `>` to close it.
+///
+/// ```markdown
+///
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+// To do: should `token_type` be a `Some`, with `None` defaulting to something?
+// To do: should `max: Some(usize)` be added?
+
+/// Before whitespace tokenized as the given `token_type`.
+///
+/// ```markdown
+/// alpha| bravo
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
+    if let Code::VirtualSpace | Code::Char('\t' | ' ') = code {
+        // To do: lifetimes.
+        let clone = token_type.clone();
+        tokenizer.enter(token_type);
+        tokenizer.consume(code);
+        let next = move |tokenizer: &mut Tokenizer, code: Code| inside(tokenizer, code, clone);
+        (State::Fn(Box::new(next)), None)
+    } else {
+        // Not whitespace: this construct does not apply here.
+        (State::Nok, None)
+    }
+}
+
+/// In whitespace tokenized as the given `token_type`.
+///
+/// ```markdown
+/// alpha |bravo
+/// alpha | bravo
+/// ```
+fn inside(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
+    if let Code::VirtualSpace | Code::Char('\t' | ' ') = code {
+        // More whitespace: keep consuming.
+        tokenizer.consume(code);
+        (
+            State::Fn(Box::new(|tokenizer, code| {
+                inside(tokenizer, code, token_type)
+            })),
+            None,
+        )
+    } else {
+        // Done: close the token and hand the current code back.
+        tokenizer.exit(token_type);
+        (State::Ok, Some(vec![code]))
+    }
+}
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
new file mode 100644
index 0000000..15ebac7
--- /dev/null
+++ b/src/construct/thematic_break.rs
@@ -0,0 +1,137 @@
+//! Thematic breaks, sometimes called horizontal rules, are a construct that
+//! occurs in the flow content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: all markers must be identical.
+//! ; Restriction: at least 3 markers must be used.
+//! thematic_break ::= *space_or_tab 1*(1*marker *space_or_tab)
+//!
+//! space_or_tab ::= ' ' | '\t'
+//! marker ::= '*' | '-' | '_'
+//! ```
+//!
+//! Thematic breaks in markdown typically relate to the HTML element `<hr>`.
+//! See [*§ 4.4.2 The `hr` element* in the HTML spec][html] for more info.
+//!
+//! It is recommended to use exactly three asterisks without whitespace when
+//! writing markdown.
+//! As using more than three markers has no effect other than wasting space,
+//! it is recommended to use exactly three markers.
+//! Thematic breaks formed with asterisks or dashes can interfere with lists
+//! in if there is whitespace between them: `* * *` and `- - -`.
+//! For these reasons, it is recommended to not use spaces or tabs between the
+//! markers.
+//! Thematic breaks formed with dashes (without whitespace) can also form
+//! setext headings.
+//! As dashes and underscores frequently occur in natural language and URLs, it
+//! is recommended to use asterisks for thematic breaks to distinguish from
+//! such use.
+//! Because asterisks can be used to form the most markdown constructs, using
+//! them has the added benefit of making it easier to gloss over markdown: you
+//! can look for asterisks to find syntax while not worrying about other
+//! characters.
+//!
+//! ## References
+//!
+//! * [`thematic-break.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/thematic-break.js)
+//! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks)
+//!
+//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element
+//!
+//!
+
+use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of a thematic break.
+///
+/// ```markdown
+/// |***
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        // The first marker decides which character the rest must repeat.
+        Code::Char(char) if matches!(char, '*' | '-' | '_') => {
+            tokenizer.enter(TokenType::ThematicBreak);
+            at_break(tokenizer, code, char, 0)
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// After something, before something else.
+///
+/// ```markdown
+/// |***
+/// *| * *
+/// * |* *
+/// ```
+fn at_break(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    match code {
+        // Another marker: open a sequence.
+        Code::Char(c) if c == marker => {
+            tokenizer.enter(TokenType::ThematicBreakSequence);
+            sequence(tokenizer, code, marker, size)
+        }
+        // Whitespace between sequences.
+        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+            tokenizer.enter(TokenType::ThematicBreakWhitespace);
+            whitespace(tokenizer, code, marker, size)
+        }
+        // End of the line: only a break if enough markers were seen.
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            if size >= THEMATIC_BREAK_MARKER_COUNT_MIN {
+                tokenizer.exit(TokenType::ThematicBreak);
+                (State::Ok, Some(vec![code]))
+            } else {
+                (State::Nok, None)
+            }
+        }
+        _ => (State::Nok, None),
+    }
+}
+
+/// In a sequence of markers.
+///
+/// ```markdown
+/// |***
+/// *|**
+/// **|*
+/// ```
+fn sequence(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    if matches!(code, Code::Char(c) if c == marker) {
+        // Count the marker and continue the sequence.
+        tokenizer.consume(code);
+        (
+            State::Fn(Box::new(move |tokenizer, code| {
+                sequence(tokenizer, code, marker, size + 1)
+            })),
+            None,
+        )
+    } else {
+        // Sequence is over.
+        tokenizer.exit(TokenType::ThematicBreakSequence);
+        at_break(tokenizer, code, marker, size)
+    }
+}
+
+/// In whitespace between sequences.
+///
+/// ```markdown
+/// * |* *
+/// * | * *
+/// ```
+fn whitespace(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
+    if let Code::VirtualSpace | Code::Char('\t' | ' ') = code {
+        // More whitespace: keep consuming; `size` is unchanged.
+        tokenizer.consume(code);
+        (
+            State::Fn(Box::new(move |tokenizer, code| {
+                whitespace(tokenizer, code, marker, size)
+            })),
+            None,
+        )
+    } else {
+        // Whitespace is over.
+        tokenizer.exit(TokenType::ThematicBreakWhitespace);
+        at_break(tokenizer, code, marker, size)
+    }
+}
diff --git a/src/content/flow.rs b/src/content/flow.rs
new file mode 100644
index 0000000..21c5721
--- /dev/null
+++ b/src/content/flow.rs
@@ -0,0 +1,258 @@
+//! The flow content type.
+//!
+//! **Flow** represents the sections, such as headings, code, and content, which
+//! is parsed per line.
+//! An example is HTML, which has a certain starting condition (such as
+//! `
+okay",
+ DANGER
+ ),
+ "
+okay
",
+ "should support raw script tags"
+ );
+
+ assert_eq!(
+ micromark_with_options(
+ "
+okay",
+ DANGER
+ ),
+ "
+okay
",
+ "should support raw style tags"
+ );
+
+ assert_eq!(
+ micromark_with_options("\n*foo*", DANGER),
+ // "\nfoo
",
+ // "should support raw tags w/ start and end on a single line"
+ // );
+
+ assert_eq!(
+ micromark_with_options("1. *bar*", DANGER),
+ "1. *bar*",
+ "should support raw tags w/ more data on ending line"
+ );
+
+ assert_eq!(
+ micromark_with_options("", DANGER),
+ "",
+ "should support blank lines in raw"
+ );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark_with_options(">