From 24fec22e912c1aa2569e95683ca95edf1aafce8b Mon Sep 17 00:00:00 2001
From: Titus Wormer [foo]: /url [foo] [foo]: /url 'title with blank line' [foo] [foo]: [foo] [foo]: [foo] bar [foo]: /url "title" ok "title" ok [foo] [foo] Foo\n[bar]: /baz [bar] bar ===\nfoo Link: [+]. [x]: < [x]
".to_string());
}
diff --git a/src/constant.rs b/src/constant.rs
index ff9e62e..1f833c2 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -72,6 +72,9 @@ pub const HEADING_ATX_OPENING_FENCE_SIZE_MAX: usize = 6;
/// [code_fenced]: crate::construct::code_fenced
pub const CODE_FENCED_SEQUENCE_SIZE_MIN: usize = 3;
+/// To safeguard performance, labels are capped at a large number: `999`.
+pub const LINK_REFERENCE_SIZE_MAX: usize = 999;
+
/// List of HTML tag names that form the **raw** production of
/// [HTML (flow)][html_flow].
///
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
new file mode 100644
index 0000000..e540b44
--- /dev/null
+++ b/src/construct/definition.rs
@@ -0,0 +1,318 @@
+//! Definition is a construct that occurs in the [flow] content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! definition ::= label ':' whitespace destination [ whitespace title ] [ space_or_tab ]
+//!
+//! ; Restriction: maximum `999` codes allowed between brackets.
+//! ; Restriction: no blank lines.
+//! ; Restriction: at least 1 non-space and non-eol code must exist.
+//! label ::= '[' *( label_text | label_escape ) ']'
+//! label_text ::= code - '[' - '\\' - ']'
+//! label_escape ::= '\\' [ '[' | '\\' | ']' ]
+//!
+//! destination ::= destination_enclosed | destination_raw
+//! destination_enclosed ::= '<' *( destination_enclosed_text | destination_enclosed_escape ) '>'
+//! destination_enclosed_text ::= code - '<' - '\\' - eol
+//! destination_enclosed_escape ::= '\\' [ '<' | '\\' | '>' ]
+//! destination_raw ::= 1*( destination_raw_text | destination_raw_escape )
+//! ; Restriction: unbalanced `)` characters are not allowed.
+//! destination_raw_text ::= code - '\\' - ascii_control - space_or_tab - eol
+//! destination_raw_escape ::= '\\' [ '(' | ')' | '\\' ]
+//!
+//! ; Restriction: no blank lines.
+//! ; Restriction: markers must match (in case of `(` with `)`).
+//! title ::= marker [ *( code - '\\' | '\\' [ marker ] ) ] marker
+//! marker ::= '"' | '\'' | '('
+//!
+//! whitespace ::= eol *whitespace | 1*space_or_tab [ eol *whitespace ]
+//! space_or_tab ::= ' ' | '\t'
+//! ```
+//!
+//! Definitions in markdown do not, on their own, relate to anything in HTML.
+//! When connected with a link (reference), they together relate to the `<a>`
+//! element in HTML.
+//! The definition forms its `href`, and optionally `title`, attributes.
+//! See [*§ 4.5.1 The `a` element*][html] in the HTML spec for more info.
+//!
+//! The `label`, `destination`, and `title` parts are interpreted as the
+//! [string][] content type.
+//! That means that character escapes and character references are allowed.
+//!
+//! ## References
+//!
+//! * [`definition.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/definition.js)
+//! * [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.30/#link-reference-definitions)
+//!
+//! [flow]: crate::content::flow
+//! [string]: crate::content::string
+//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
+//!
+//!
+//!
+//!
+
+use crate::construct::{
+ partial_destination::start as destination, partial_label::start as label,
+ partial_title::start as title, partial_whitespace::start as whitespace,
+};
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// At the start of a definition.
+///
+/// ```markdown
+/// |[a]: b "c"
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('[') => {
+ tokenizer.enter(TokenType::Definition);
+ tokenizer.go(label, label_after)(tokenizer, code)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After the label of a definition.
+///
+/// ```markdown
+/// [a]|: b "c"
+/// ```
+pub fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // To do: get the identifier:
+ // identifier = normalizeIdentifier(
+ // self.sliceSerialize(self.events[self.events.length - 1][1]).slice(1, -1)
+ // )
+
+ match code {
+ Code::Char(':') => {
+ tokenizer.enter(TokenType::DefinitionMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::DefinitionMarker);
+ (State::Fn(Box::new(marker_after)), None)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After the marker of a definition.
+///
+/// ```markdown
+/// [a]:| b "c"
+///
+/// [a]:| ␊
+/// b "c"
+/// ```
+pub fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |t, c| whitespace(t, c, TokenType::Whitespace),
+ |_ok| Box::new(marker_after_optional_whitespace),
+ )(tokenizer, code)
+}
+
+/// After the marker, after whitespace.
+///
+/// ```markdown
+/// [a]: |b "c"
+///
+/// [a]: |␊
+/// b "c"
+/// ```
+pub fn marker_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(marker_after_optional_line_ending)), None)
+ }
+ _ => destination_before(tokenizer, code),
+ }
+}
+
+/// After the marker, after a line ending.
+///
+/// ```markdown
+/// [a]:
+/// | b "c"
+/// ```
+pub fn marker_after_optional_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |t, c| whitespace(t, c, TokenType::Whitespace),
+ |_ok| Box::new(destination_before),
+ )(tokenizer, code)
+}
+
+/// Before a destination.
+///
+/// ```markdown
+/// [a]: |b "c"
+///
+/// [a]:
+/// |b "c"
+/// ```
+pub fn destination_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let event = tokenizer.events.last().unwrap();
+ // Blank line not ok.
+ let char_nok = matches!(
+ code,
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ );
+
+ if !char_nok
+ && (event.token_type == TokenType::LineEnding || event.token_type == TokenType::Whitespace)
+ {
+ tokenizer.go(destination, destination_after)(tokenizer, code)
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// After a destination.
+///
+/// ```markdown
+/// [a]: b| "c"
+///
+/// [a]: b| ␊
+/// "c"
+/// ```
+pub fn destination_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(title_before, |_ok| Box::new(after))(tokenizer, code)
+}
+
+/// After a definition.
+///
+/// ```markdown
+/// [a]: b|
+/// [a]: b "c"|
+/// ```
+pub fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |t, c| whitespace(t, c, TokenType::Whitespace),
+ |_ok| Box::new(after_whitespace),
+ )(tokenizer, code)
+}
+
+/// After a definition, after optional whitespace.
+///
+/// ```markdown
+/// [a]: b |
+/// [a]: b "c"|
+/// ```
+pub fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.exit(TokenType::Definition);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// After a destination, presumably before a title.
+///
+/// ```markdown
+/// [a]: b| "c"
+///
+/// [a]: b| ␊
+/// "c"
+/// ```
+pub fn title_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |t, c| whitespace(t, c, TokenType::Whitespace),
+ |_ok| Box::new(title_before_after_optional_whitespace),
+ )(tokenizer, code)
+}
+
+/// Before a title, after optional whitespace.
+///
+/// ```markdown
+/// [a]: b |"c"
+///
+/// [a]: b |␊
+/// "c"
+/// ```
+pub fn title_before_after_optional_whitespace(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(title_before_after_optional_line_ending)),
+ None,
+ )
+ }
+ _ => title_before_marker(tokenizer, code),
+ }
+}
+
+/// Before a title, after a line ending.
+///
+/// ```markdown
+/// [a]: b␊
+/// | "c"
+/// ```
+pub fn title_before_after_optional_line_ending(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+) -> StateFnResult {
+ tokenizer.attempt(
+ |t, c| whitespace(t, c, TokenType::Whitespace),
+ |_ok| Box::new(title_before_marker),
+ )(tokenizer, code)
+}
+
+/// Before a title, after optional whitespace.
+///
+/// ```markdown
+/// [a]: b␊
+/// | "c"
+/// ```
+pub fn title_before_marker(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let event = tokenizer.events.last().unwrap();
+
+ if event.token_type == TokenType::LineEnding || event.token_type == TokenType::Whitespace {
+ tokenizer.go(title, title_after)(tokenizer, code)
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// After a title.
+///
+/// ```markdown
+/// [a]: b "c"|
+///
+/// [a]: b␊
+/// "c"|
+/// ```
+pub fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |t, c| whitespace(t, c, TokenType::Whitespace),
+ |_ok| Box::new(title_after_after_optional_whitespace),
+ )(tokenizer, code)
+}
+
+/// After a title, after optional whitespace.
+///
+/// ```markdown
+/// [a]: b "c"|
+///
+/// [a]: b "c" |
+/// ```
+pub fn title_after_after_optional_whitespace(
+ _tokenizer: &mut Tokenizer,
+ code: Code,
+) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index ca1149f..fb79f68 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -16,7 +16,7 @@
//! The following constructs are found in markdown:
//!
//! * attention (strong, emphasis) (text)
-//! * [autolink][autolink]
+//! * [autolink][]
//! * [blank line][blank_line]
//! * block quote
//! * [character escape][character_escape]
@@ -25,7 +25,7 @@
//! * [code (indented)][code_indented]
//! * [code (text)][code_text]
//! * content
-//! * definition
+//! * [definition][]
//! * [hard break (escape)][hard_break_escape]
//! * [hard break (trailing)][hard_break_trailing]
//! * [heading (atx)][heading_atx]
@@ -61,11 +61,15 @@ pub mod character_reference;
pub mod code_fenced;
pub mod code_indented;
pub mod code_text;
+pub mod definition;
pub mod hard_break_escape;
pub mod hard_break_trailing;
pub mod heading_atx;
pub mod heading_setext;
pub mod html_flow;
pub mod html_text;
+pub mod partial_destination;
+pub mod partial_label;
+pub mod partial_title;
pub mod partial_whitespace;
pub mod thematic_break;
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
new file mode 100644
index 0000000..8cf5b77
--- /dev/null
+++ b/src/construct/partial_destination.rs
@@ -0,0 +1,154 @@
+// To do: pass token types in.
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('<') => {
+ tokenizer.enter(TokenType::DefinitionDestination);
+ tokenizer.enter(TokenType::DefinitionDestinationLiteral);
+ tokenizer.enter(TokenType::DefinitionDestinationLiteralMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::DefinitionDestinationLiteralMarker);
+ (State::Fn(Box::new(enclosed_before)), None)
+ }
+ Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(')') => {
+ (State::Nok, None)
+ }
+ Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
+ Code::Char(_) => {
+ tokenizer.enter(TokenType::DefinitionDestination);
+ tokenizer.enter(TokenType::DefinitionDestinationRaw);
+ tokenizer.enter(TokenType::DefinitionDestinationString);
+ // To do: link.
+ tokenizer.enter(TokenType::ChunkString);
+ raw(tokenizer, code, 0)
+ }
+ }
+}
+
+/// To do.
+fn enclosed_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if let Code::Char('>') = code {
+ tokenizer.enter(TokenType::DefinitionDestinationLiteralMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::DefinitionDestinationLiteralMarker);
+ tokenizer.exit(TokenType::DefinitionDestinationLiteral);
+ tokenizer.exit(TokenType::DefinitionDestination);
+ (State::Ok, None)
+ } else {
+ tokenizer.enter(TokenType::DefinitionDestinationString);
+ // To do: link.
+ tokenizer.enter(TokenType::ChunkString);
+ enclosed(tokenizer, code)
+ }
+}
+
+/// To do.
+fn enclosed(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('>') => {
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::DefinitionDestinationString);
+ enclosed_before(tokenizer, code)
+ }
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '<') => {
+ (State::Nok, None)
+ }
+ Code::Char('\\') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(enclosed_escape)), None)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(enclosed)), None)
+ }
+ }
+}
+
+/// To do.
+fn enclosed_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('<' | '>' | '\\') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(enclosed)), None)
+ }
+ _ => enclosed(tokenizer, code),
+ }
+}
+
+/// To do.
+// To do: these arms can be improved?
+fn raw(tokenizer: &mut Tokenizer, code: Code, balance: usize) -> StateFnResult {
+ // To do: configurable.
+ let limit = usize::MAX;
+
+ match code {
+ Code::Char('(') if balance >= limit => (State::Nok, None),
+ Code::Char('(') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| raw(t, c, balance + 1))),
+ None,
+ )
+ }
+ Code::Char(')') if balance == 0 => {
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::DefinitionDestinationString);
+ tokenizer.exit(TokenType::DefinitionDestinationRaw);
+ tokenizer.exit(TokenType::DefinitionDestination);
+ (State::Ok, Some(vec![code]))
+ }
+ Code::Char(')') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| raw(t, c, balance - 1))),
+ None,
+ )
+ }
+ Code::None
+ | Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\t' | '\r' | '\n' | ' ')
+ if balance > 0 =>
+ {
+ (State::Nok, None)
+ }
+ Code::None
+ | Code::CarriageReturnLineFeed
+ | Code::VirtualSpace
+ | Code::Char('\t' | '\r' | '\n' | ' ') => {
+ tokenizer.exit(TokenType::ChunkString);
+ tokenizer.exit(TokenType::DefinitionDestinationString);
+ tokenizer.exit(TokenType::DefinitionDestinationRaw);
+ tokenizer.exit(TokenType::DefinitionDestination);
+ (State::Ok, Some(vec![code]))
+ }
+ Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
+ Code::Char('\\') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| raw_escape(t, c, balance))),
+ None,
+ )
+ }
+ Code::Char(_) => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(move |t, c| raw(t, c, balance))), None)
+ }
+ }
+}
+
+/// To do.
+fn raw_escape(tokenizer: &mut Tokenizer, code: Code, balance: usize) -> StateFnResult {
+ match code {
+ Code::Char('(' | ')' | '\\') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| raw(t, c, balance + 1))),
+ None,
+ )
+ }
+ _ => raw(tokenizer, code, balance),
+ }
+}
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
new file mode 100644
index 0000000..c772c56
--- /dev/null
+++ b/src/construct/partial_label.rs
@@ -0,0 +1,100 @@
+// To do: pass token types in.
+
+use crate::constant::LINK_REFERENCE_SIZE_MAX;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// To do.
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::Char('[') => {
+ tokenizer.enter(TokenType::DefinitionLabel);
+ tokenizer.enter(TokenType::DefinitionLabelMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::DefinitionLabelMarker);
+ tokenizer.enter(TokenType::DefinitionLabelData);
+ (State::Fn(Box::new(|t, c| at_break(t, c, false, 0))), None)
+ }
+ // To do: allow?
+ _ => unreachable!("expected `[` at start of label"),
+ }
+}
+
+/// To do.
+fn at_break(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult {
+ match code {
+ Code::None | Code::Char('[') => (State::Nok, None),
+ Code::Char(']') if !data => (State::Nok, None),
+ _ if size > LINK_REFERENCE_SIZE_MAX => (State::Nok, None),
+ Code::Char(']') => {
+ tokenizer.exit(TokenType::DefinitionLabelData);
+ tokenizer.enter(TokenType::DefinitionLabelMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::DefinitionLabelMarker);
+ tokenizer.exit(TokenType::DefinitionLabel);
+ (State::Ok, None)
+ }
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(move |t, c| at_break(t, c, data, size))),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.enter(TokenType::ChunkString);
+ // To do: link.
+ label(tokenizer, code, data, size)
+ }
+ }
+}
+
+/// To do.
+fn label(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '[' | ']') => {
+ tokenizer.exit(TokenType::ChunkString);
+ at_break(tokenizer, code, data, size)
+ }
+ _ if size > LINK_REFERENCE_SIZE_MAX => {
+ tokenizer.exit(TokenType::ChunkString);
+ at_break(tokenizer, code, data, size)
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| label(t, c, data, size + 1))),
+ None,
+ )
+ }
+ Code::Char('\\') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| escape(t, c, true, size + 1))),
+ None,
+ )
+ }
+ Code::Char(_) => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| label(t, c, true, size + 1))),
+ None,
+ )
+ }
+ }
+}
+
+/// To do.
+fn escape(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult {
+ match code {
+ Code::Char('[' | '\\' | ']') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| label(t, c, true, size + 1))),
+ None,
+ )
+ }
+ _ => label(tokenizer, code, data, size),
+ }
+}
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
new file mode 100644
index 0000000..4c7b527
--- /dev/null
+++ b/src/construct/partial_title.rs
@@ -0,0 +1,136 @@
+// To do: pass token types in.
+
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Kind of a title: double quoted, single quoted, or parenthesised.
+#[derive(Debug, Clone, PartialEq)]
+enum TitleKind {
+ /// In a parenthesised (`(` and `)`) title.
+ Paren,
+ /// In a double quoted (`"`) title.
+ Double,
+ /// In a single quoted (`'`) title.
+ Single,
+}
+
+fn kind_to_marker(kind: &TitleKind) -> char {
+ match kind {
+ TitleKind::Double => '"',
+ TitleKind::Single => '\'',
+ TitleKind::Paren => ')',
+ }
+}
+
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let kind = match code {
+ Code::Char('"') => Some(TitleKind::Double),
+ Code::Char('\'') => Some(TitleKind::Single),
+ Code::Char('(') => Some(TitleKind::Paren),
+ _ => None,
+ };
+
+ if let Some(kind) = kind {
+ tokenizer.enter(TokenType::DefinitionTitle);
+ tokenizer.enter(TokenType::DefinitionTitleMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::DefinitionTitleMarker);
+ (State::Fn(Box::new(|t, c| at_first_break(t, c, kind))), None)
+ } else {
+ (State::Nok, None)
+ }
+}
+
+/// To do.
+fn at_first_break(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == kind_to_marker(&kind) => {
+ tokenizer.enter(TokenType::DefinitionTitleMarker);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::DefinitionTitleMarker);
+ tokenizer.exit(TokenType::DefinitionTitle);
+ (State::Ok, None)
+ }
+ _ => {
+ tokenizer.enter(TokenType::DefinitionTitleString);
+ at_break(tokenizer, code, kind)
+ }
+ }
+}
+
+/// To do.
+fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == kind_to_marker(&kind) => {
+ tokenizer.exit(TokenType::DefinitionTitleString);
+ at_first_break(tokenizer, code, kind)
+ }
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (
+ State::Fn(Box::new(|t, c| at_break_line_start(t, c, kind))),
+ None,
+ )
+ }
+ _ => {
+ // To do: link.
+ tokenizer.enter(TokenType::ChunkString);
+ title(tokenizer, code, kind)
+ }
+ }
+}
+
+fn at_break_line_start(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult {
+ tokenizer.attempt(
+ |t, c| whitespace(t, c, TokenType::Whitespace),
+ |_ok| Box::new(|t, c| at_break_line_begin(t, c, kind)),
+ )(tokenizer, code)
+}
+
+fn at_break_line_begin(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult {
+ match code {
+ // Blank line not allowed.
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None),
+ _ => at_break(tokenizer, code, kind),
+ }
+}
+
+/// To do.
+fn title(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == kind_to_marker(&kind) => {
+ tokenizer.exit(TokenType::ChunkString);
+ at_break(tokenizer, code, kind)
+ }
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.exit(TokenType::ChunkString);
+ at_break(tokenizer, code, kind)
+ }
+ Code::Char('\\') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(|t, c| escape(t, c, kind))), None)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(|t, c| title(t, c, kind))), None)
+ }
+ }
+}
+
+/// To do.
+fn escape(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult {
+ match code {
+ Code::Char(char) if char == kind_to_marker(&kind) => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(move |t, c| title(t, c, kind))), None)
+ }
+ Code::Char('\\') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(move |t, c| title(t, c, kind))), None)
+ }
+ _ => title(tokenizer, code, kind),
+ }
+}
diff --git a/src/content/content.rs b/src/content/content.rs
index 4ca69ee..86bc290 100644
--- a/src/content/content.rs
+++ b/src/content/content.rs
@@ -16,10 +16,9 @@
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
-/// Before content.
+/// Before a paragraph.
///
/// ```markdown
-/// |[x]: y
/// |asd
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
@@ -27,48 +26,10 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
unreachable!("expected non-eol/eof");
}
- _ => after_definitions(tokenizer, code)
- // To do: definition.
- // _ => tokenizer.attempt(definition, |ok| {
- // Box::new(if ok {
- // a
- // } else {
- // b
- // })
- // })(tokenizer, code),
- }
-}
-
-/// Before a paragraph.
-///
-/// ```markdown
-/// |asd
-/// ```
-fn after_definitions(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::None => (State::Ok, None),
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- unreachable!("to do: handle eol after definition");
- }
- _ => paragraph_initial(tokenizer, code),
- }
-}
-
-/// Before a paragraph.
-///
-/// ```markdown
-/// |asd
-/// ```
-fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::None => (State::Ok, None),
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- unreachable!("to do: handle eol after definition");
- }
_ => {
tokenizer.enter(TokenType::Paragraph);
tokenizer.enter(TokenType::ChunkText);
- data(tokenizer, code, tokenizer.events.len() - 1)
+ inside(tokenizer, code, tokenizer.events.len() - 1)
}
}
}
@@ -79,7 +40,7 @@ fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// |\&
/// |qwe
/// ```
-fn data(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
+fn inside(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
match code {
Code::None => {
tokenizer.exit(TokenType::ChunkText);
@@ -94,14 +55,14 @@ fn data(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFn
tokenizer.events[previous_index].next = Some(next_index);
tokenizer.events[next_index].previous = Some(previous_index);
(
- State::Fn(Box::new(move |t, c| data(t, c, next_index))),
+ State::Fn(Box::new(move |t, c| inside(t, c, next_index))),
None,
)
}
_ => {
tokenizer.consume(code);
(
- State::Fn(Box::new(move |t, c| data(t, c, previous_index))),
+ State::Fn(Box::new(move |t, c| inside(t, c, previous_index))),
None,
)
}
diff --git a/src/content/flow.rs b/src/content/flow.rs
index d7509d7..3fab523 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -13,6 +13,7 @@
//! * [Blank line][crate::construct::blank_line]
//! * [Code (fenced)][crate::construct::code_fenced]
//! * [Code (indented)][crate::construct::code_indented]
+//! * [Definition][crate::construct::definition]
//! * [Heading (atx)][crate::construct::heading_atx]
//! * [Heading (setext)][crate::construct::heading_setext]
//! * [HTML (flow)][crate::construct::html_flow]
@@ -23,9 +24,10 @@
use crate::constant::TAB_SIZE;
use crate::construct::{
blank_line::start as blank_line, code_fenced::start as code_fenced,
- code_indented::start as code_indented, heading_atx::start as heading_atx,
- heading_setext::start as heading_setext, html_flow::start as html_flow,
- partial_whitespace::start as whitespace, thematic_break::start as thematic_break,
+ code_indented::start as code_indented, definition::start as definition,
+ heading_atx::start as heading_atx, heading_setext::start as heading_setext,
+ html_flow::start as html_flow, partial_whitespace::start as whitespace,
+ thematic_break::start as thematic_break,
};
use crate::subtokenize::subtokenize;
use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
@@ -96,6 +98,7 @@ fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
+ // To do: should all flow just start before the prefix?
_ => tokenizer.attempt_3(code_indented, code_fenced, html_flow, |ok| {
Box::new(if ok { after } else { before })
})(tokenizer, code),
@@ -145,9 +148,13 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// |***
/// ```
pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt_3(heading_atx, thematic_break, heading_setext, |ok| {
- Box::new(if ok { after } else { content_before })
- })(tokenizer, code)
+ tokenizer.attempt_4(
+ heading_atx,
+ thematic_break,
+ definition,
+ heading_setext,
+ |ok| Box::new(if ok { after } else { content_before }),
+ )(tokenizer, code)
}
/// Before content.
@@ -156,9 +163,7 @@ pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResu
/// |qwe
/// ```
///
-// To do:
-// - Multiline
-// - One or more definitions.
+// To do: we don’t need content anymore in `micromark-rs` it seems?
fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
@@ -247,6 +252,7 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) ->
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
// To do: If code is disabled, indented lines are part of the content.
_ if prefix >= TAB_SIZE => (State::Ok, None),
+ // To do: definitions, setext headings, etc?
_ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
let result = if ok {
(State::Nok, None)
diff --git a/src/content/string.rs b/src/content/string.rs
index 25d8582..e8134c4 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -15,6 +15,8 @@ use crate::construct::{
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+// To do: line endings?
+
/// Before string.
///
/// ```markdown
diff --git a/src/parser.rs b/src/parser.rs
index 5648942..250ff0c 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,6 +1,5 @@
//! Turn a string of markdown into events.
// To do: this should start with `containers`, when they’re done.
-// To do: definitions and such will mean more data has to be passed around.
use crate::content::flow::flow;
use crate::tokenizer::{as_codes, Code, Event, Point};
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index fc9e177..9884986 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -50,6 +50,19 @@ pub enum TokenType {
CodeTextData,
Content,
Data,
+ Definition,
+ DefinitionLabel,
+ DefinitionLabelMarker,
+ DefinitionLabelData,
+ DefinitionMarker,
+ DefinitionDestination,
+ DefinitionDestinationLiteral,
+ DefinitionDestinationLiteralMarker,
+ DefinitionDestinationRaw,
+ DefinitionDestinationString,
+ DefinitionTitle,
+ DefinitionTitleMarker,
+ DefinitionTitleString,
HardBreakEscape,
HardBreakEscapeMarker,
HardBreakTrailing,
@@ -350,6 +363,39 @@ impl Tokenizer {
self.stack.truncate(previous.stack_len);
}
+ /// To do.
+ pub fn go(
+ &mut self,
+ state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ ok: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ ) -> Box, Vec
), is_ok, tokenizer: &mut Tokenizer| {
+ let codes = if is_ok { result.1 } else { result.0 };
+ log::debug!(
+ "go: {:?}, codes: {:?}, at {:?}",
+ is_ok,
+ codes,
+ tokenizer.point
+ );
+
+ if is_ok {
+ tokenizer.feed(&codes, ok, false)
+ } else {
+ tokenizer.free(previous);
+ (State::Nok, None)
+ }
+ },
+ )
+ }
+
/// Check if `state` and its future states are successful or not.
///
/// This captures the current state of the tokenizer, returns a wrapped
@@ -461,6 +507,27 @@ impl Tokenizer {
)
}
+ pub fn attempt_4(
+ &mut self,
+ a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ done: impl FnOnce(bool) -> Box
{
fn check_statefn_result(result: StateFnResult) -> StateFnResult {
let (state, mut remainder) = result;
- match state {
- State::Nok | State::Fn(_) => {
- if let Some(ref x) = remainder {
- assert_eq!(
- x.len(),
- 0,
- "expected `None` to be passed back as remainder from `State::Nok`, `State::Fn`"
- );
- }
- }
- State::Ok => {}
- }
-
// Remove an eof.
// For convencience, feeding back an eof is allowed, but cleaned here.
// Most states handle eof and eol in the same branch, and hence pass
diff --git a/tests/character_escape.rs b/tests/character_escape.rs
index 9e2a5c8..ba94ab3 100644
--- a/tests/character_escape.rs
+++ b/tests/character_escape.rs
@@ -67,7 +67,7 @@ fn character_escape() {
// "should escape in resource and title"
// );
- // To do: definition.
+ // To do: link (reference).
// assert_eq!(
// micromark("[foo]: /bar\\* \"ti\\*tle\"\n\n[foo]"),
// "",
diff --git a/tests/character_reference.rs b/tests/character_reference.rs
index e351088..bcd0aca 100644
--- a/tests/character_reference.rs
+++ b/tests/character_reference.rs
@@ -61,7 +61,7 @@ fn character_reference() {
// "should support character references in resource URLs and titles"
// );
- // To do: definition.
+ // To do: link (resource).
// assert_eq!(
// micromark("[foo]: /föö \"föö\"\n\n[foo]"),
// "",
diff --git a/tests/definition.rs b/tests/definition.rs
new file mode 100644
index 0000000..f0869a3
--- /dev/null
+++ b/tests/definition.rs
@@ -0,0 +1,446 @@
+extern crate micromark;
+use micromark::{micromark, micromark_with_options, CompileOptions};
+
+const DANGER: &CompileOptions = &CompileOptions {
+ allow_dangerous_html: true,
+ allow_dangerous_protocol: true,
+};
+
+#[test]
+fn definition() {
+ // To do: link (reference).
+ // assert_eq!(
+ // micromark("[foo]: /url \"title\"\n\n[foo]"),
+ // "",
+ // "should support link definitions"
+ // );
+
+ assert_eq!(
+ micromark("[foo]:\n\n/url\n\n[foo]"),
+ "
\n[foo]: /url "title"\n
\n[foo]: /url\n
Foo
\n\n
",
+ // "should not support definitions in headings"
+ // );
+
+ // To do: link (reference).
+ // assert_eq!(
+ // micromark("[foo]: /url\nbar\n===\n[foo]"),
+ // "bar
\n",
+ // "should support setext headings after definitions"
+ // );
+
+ // To do: link (reference).
+ // assert_eq!(
+ // micromark("[foo]: /url\n===\n[foo]"),
+ // "\n
\n",
+ // "should support definitions in block quotes"
+ // );
+
+ // Extra
+ // To do: link (reference).
+ // assert_eq!(
+ // micromark("[\\[\\+\\]]: example.com\n\nLink: [\\[\\+\\]]."),
+ // "
[x]
", + "should not support a line ending in enclosed destination" + ); + + assert_eq!( + micromark("[x]: \u{000b}a\n\n[x]"), + "[x]: \u{000b}a
\n[x]
", + "should not support ascii control characters at the start of destination" + ); + + assert_eq!( + micromark("[x]: a\u{000b}b\n\n[x]"), + "[x]: a\u{000b}b
\n[x]
", + "should not support ascii control characters in destination" + ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: <\u{000b}a>\n\n[x]"), + // "", + // "should support ascii control characters at the start of enclosed destination" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: \n\n[x]"), + // "", + // "should support ascii control characters in enclosed destinations" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: a \"\\\"\"\n\n[x]"), + // "", + // "should support character escapes at the start of a title" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: a \"\\\"\"\n\n[x]"), + // "", + // "should support double quoted titles" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: a '\"'\n\n[x]"), + // "", + // "should support double quoted titles" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: a (\"\")\n\n[x]"), + // "", + // "should support paren enclosed titles" + // ); + + assert_eq!( + micromark("[x]: a(()\n\n[x]"), + "[x]: a(()
\n[x]
", + "should not support more opening than closing parens in the destination" + ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: a(())\n\n[x]"), + // "", + // "should support balanced opening and closing parens in the destination" + // ); + + assert_eq!( + micromark("[x]: a())\n\n[x]"), + "[x]: a())
\n[x]
", + "should not support more closing than opening parens in the destination" + ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: a \t\n\n[x]"), + // "", + // "should support trailing whitespace after a destination" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: a \"\"X \t\n\n[x]"), + // "", + // "should support trailing whitespace after a destination" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[&©&]: example.com/&©& \"&©&\"\n\n[&©&]"), + // "", + // "should support character references in definitions" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]:\nexample.com\n\n[x]"), + // "", + // "should support a line ending before a destination" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[x]: \t\nexample.com\n\n[x]"), + // "", + // "should support whitespace before a destination" + // ); + + // See:[a]
\n[a]: <b
[a]
\n[a]: b(c
", + "should not support an extra left paren (`(`) in a raw destination" + ); + + assert_eq!( + micromark("[a]\n\n[a]: b)c"), + "[a]
\n[a]: b)c
", + "should not support an extra right paren (`)`) in a raw destination" + ); + + assert_eq!( + micromark("[a]\n\n[a]: b)c"), + "[a]
\n[a]: b)c
", + "should not support an extra right paren (`)`) in a raw destination" + ); + + // To do: link (reference). + // assert_eq!( + // micromark("[a]\n\n[a]: a(1(2(3(4()))))b"), + // "\n", + // "should support 4 or more sets of parens in a raw destination (link resources don’t)" + // ); + + assert_eq!( + micromark("[a]\n\n[a]: aaa)"), + "[a]
\n[a]: aaa)
", + "should not support a final (unbalanced) right paren in a raw destination" + ); + + assert_eq!( + micromark("[a]\n\n[a]: aaa) \"a\""), + "[a]
\n[a]: aaa) "a"
", + "should not support a final (unbalanced) right paren in a raw destination “before” a title" + ); + + // To do: link (reference). + // assert_eq!( + // micromark(" [a]: b \"c\"\n [d]: e\n [f]: g \"h\"\n [i]: j\n\t[k]: l (m)\n\t n [k] o"), + // "n k o
", + // "should support subsequent indented definitions" + // ); + + // To do: link (reference). + // assert_eq!( + // micromark("[a\n b]: c\n\n[a\n b]"), + // "", + // "should support line prefixes in definition labels" + // ); + + assert_eq!( + micromark("[a]: )\n\n[a]"), + "[a]: )
\n[a]
", + "should not support definitions w/ only a closing paren as a raw destination" + ); + + assert_eq!( + micromark("[a]: )b\n\n[a]"), + "[a]: )b
\n[a]
", + "should not support definitions w/ closing paren + more text as a raw destination" + ); + + assert_eq!( + micromark("[a]: b)\n\n[a]"), + "[a]: b)
\n[a]
", + "should not support definitions w/ text + a closing paren as a raw destination" + ); + + // To do: support turning off things. + // assert_eq!( + // micromark("[foo]: /url \"title\"", { + // extensions: [{disable: {null: ["definition"]}}] + // }), + // "[foo]: /url "title"
", + // "should support turning off definitions" + // ); +} -- cgit