Diffstat:
-rw-r--r-- src/construct/gfm_autolink_literal.rs | 46
-rw-r--r-- src/construct/mdx_jsx_text.rs | 877
-rw-r--r-- src/construct/mod.rs | 12
-rw-r--r-- src/construct/text.rs | 23
-rw-r--r-- src/event.rs | 22
-rw-r--r-- src/lib.rs | 29
-rw-r--r-- src/state.rs | 64
-rw-r--r-- src/util/slice.rs | 25
8 files changed, 1050 insertions, 48 deletions
diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs
index 62f18ef..ae483a7 100644
--- a/src/construct/gfm_autolink_literal.rs
+++ b/src/construct/gfm_autolink_literal.rs
@@ -148,8 +148,8 @@ use crate::event::{Event, Kind, Name};
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;
use crate::util::{
- classify_character::{classify_opt, Kind as CharacterKind},
- slice::{char_after_index, Position, Slice},
+ classify_character::Kind as CharacterKind,
+ slice::{byte_to_kind, Position, Slice},
};
use alloc::vec::Vec;
@@ -366,11 +366,8 @@ pub fn domain_inside(tokenizer: &mut Tokenizer) -> State {
}
_ => {
// Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
- if byte_to_kind(
- tokenizer.parse_state.bytes,
- tokenizer.point.index,
- tokenizer.current,
- ) == CharacterKind::Other
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Other
{
tokenizer.tokenize_state.seen = true;
tokenizer.consume();
@@ -473,11 +470,8 @@ pub fn path_inside(tokenizer: &mut Tokenizer) -> State {
}
_ => {
// Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
- if byte_to_kind(
- tokenizer.parse_state.bytes,
- tokenizer.point.index,
- tokenizer.current,
- ) == CharacterKind::Whitespace
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
{
State::Retry(StateName::GfmAutolinkLiteralPathAfter)
} else {
@@ -549,11 +543,8 @@ pub fn trail(tokenizer: &mut Tokenizer) -> State {
}
_ => {
// Whitespace is the end of the URL, anything else is continuation.
- if byte_to_kind(
- tokenizer.parse_state.bytes,
- tokenizer.point.index,
- tokenizer.current,
- ) == CharacterKind::Whitespace
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
{
State::Ok
} else {
@@ -937,24 +928,3 @@ fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {
split
}
-
-/// Classify a byte (or `char`).
-fn byte_to_kind(bytes: &[u8], index: usize, byte: Option<u8>) -> CharacterKind {
- match byte {
- None => CharacterKind::Whitespace,
- Some(byte) => {
- if byte.is_ascii_whitespace() {
- CharacterKind::Whitespace
- } else if byte.is_ascii_punctuation() {
- CharacterKind::Punctuation
- } else if byte.is_ascii_alphanumeric() {
- CharacterKind::Other
- } else {
- // Otherwise: seems to be an ASCII control, so it seems to be a
- // non-ASCII `char`.
- let char = char_after_index(bytes, index);
- classify_opt(char)
- }
- }
- }
-}
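The three call sites above now use the shared `byte_to_kind` helper that this diff moves into `src/util/slice.rs` (last hunk below); end of input is detected from the index alone, so the `Option<u8>` argument is gone. An informal sketch of the classification those call sites rely on (illustrative asserts, not part of the commit):

```rust
// Sketch only: `byte_to_kind` as defined in src/util/slice.rs below.
assert_eq!(byte_to_kind(b"a!", 0), CharacterKind::Other);       // ASCII alphanumeric
assert_eq!(byte_to_kind(b"a!", 1), CharacterKind::Punctuation); // ASCII punctuation
assert_eq!(byte_to_kind(b"a!", 2), CharacterKind::Whitespace);  // index == len: end of input
```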
diff --git a/src/construct/mdx_jsx_text.rs b/src/construct/mdx_jsx_text.rs
new file mode 100644
index 0000000..7a33499
--- /dev/null
+++ b/src/construct/mdx_jsx_text.rs
@@ -0,0 +1,877 @@
+//! To do.
+
+use crate::construct::partial_space_or_tab_eol::space_or_tab_eol;
+use crate::event::Name;
+use crate::state::{Name as StateName, State};
+use crate::tokenizer::Tokenizer;
+use crate::util::{classify_character::Kind as CharacterKind, slice::byte_to_kind};
+use alloc::{
+ format,
+ string::{String, ToString},
+};
+use core::str;
+
+/// Start of MDX: JSX (text).
+///
+/// ```markdown
+/// > | a <B /> c
+/// ^
+/// ```
+pub fn start(tokenizer: &mut Tokenizer) -> State {
+ if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.mdx_jsx_text {
+ tokenizer.enter(Name::MdxJsxTextTag);
+ tokenizer.enter(Name::MdxJsxTextTagMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagMarker);
+ State::Next(StateName::MdxJsxTextStartAfter)
+ } else {
+ State::Nok
+ }
+}
+
+/// After `<`.
+///
+/// ```markdown
+/// > | a <B /> c
+/// ^
+/// ```
+pub fn start_after(tokenizer: &mut Tokenizer) -> State {
+ // Deviate from JSX, which allows arbitrary whitespace.
+ // See: <https://github.com/micromark/micromark-extension-mdx-jsx/issues/7>.
+ if let Some(b'\t' | b'\n' | b' ') = tokenizer.current {
+ State::Nok
+ } else {
+ tokenizer.attempt(State::Next(StateName::MdxJsxTextNameBefore), State::Nok);
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+}
+
+/// Before name, self slash, or end of tag for fragments.
+///
+/// ```markdown
+/// > | a <B> c
+/// ^
+/// > | a </B> c
+/// ^
+/// > | a <> b
+/// ^
+/// ```
+pub fn name_before(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Closing tag.
+ Some(b'/') => {
+ tokenizer.enter(Name::MdxJsxTextTagClosingMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagClosingMarker);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextClosingTagNameBefore),
+ State::Nok,
+ );
+ State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Fragment opening tag.
+ Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+ _ => {
+ // To do: unicode.
+ if id_start(tokenizer.current) {
+ tokenizer.enter(Name::MdxJsxTextTagName);
+ tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextPrimaryName)
+ } else {
+ crash(
+ tokenizer,
+ "before name",
+ &format!(
+ "a character that can start a name, such as a letter, `$`, or `_`{}",
+ if tokenizer.current == Some(b'!') {
+ " (note: to create a comment in MDX, use `{/* text */}`)"
+ } else {
+ ""
+ }
+ ),
+ )
+ }
+ }
+ }
+}
+
+/// Before name of closing tag or end of closing fragment tag.
+///
+/// ```markdown
+/// > | a </> b
+/// ^
+/// > | a </B> c
+/// ^
+/// ```
+pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Fragment closing tag.
+ Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+ // Start of a closing tag name.
+ _ => {
+ // To do: unicode.
+ if id_start(tokenizer.current) {
+ tokenizer.enter(Name::MdxJsxTextTagName);
+ tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextPrimaryName)
+ } else {
+ crash(
+ tokenizer,
+ "before name",
+ &format!(
+ "a character that can start a name, such as a letter, `$`, or `_`{}",
+ if matches!(tokenizer.current, Some(b'*' | b'/')) {
+ " (note: JS comments in JSX tags are not supported in MDX)"
+ } else {
+ ""
+ }
+ ),
+ )
+ }
+ }
+ }
+}
+
+/// In primary name.
+///
+/// ```markdown
+/// > | a <Bc> d
+/// ^
+/// ```
+pub fn primary_name(tokenizer: &mut Tokenizer) -> State {
+ // End of name.
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ || matches!(tokenizer.current, Some(b'.' | b'/' | b':' | b'>' | b'{'))
+ {
+ tokenizer.exit(Name::MdxJsxTextTagNamePrimary);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextPrimaryNameAfter),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Continuation of name: remain.
+ // To do: unicode.
+ else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextPrimaryName)
+ } else {
+ crash(
+ tokenizer,
+ "in name",
+ &format!(
+ "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}",
+ if tokenizer.current == Some(b'@') {
+ " (note: to create a link in MDX, use `[text](url)`)"
+ } else {
+ ""
+ }
+ ),
+ )
+ }
+}
+
+/// After primary name.
+///
+/// ```markdown
+/// > | a <b.c> d
+/// ^
+/// > | a <b:c> d
+/// ^
+/// ```
+pub fn primary_name_after(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Start of a member name.
+ Some(b'.') => {
+ tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextMemberNameBefore),
+ State::Nok,
+ );
+ State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Start of a local name.
+ Some(b':') => {
+ tokenizer.enter(Name::MdxJsxTextTagNamePrefixMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagNamePrefixMarker);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextLocalNameBefore),
+ State::Nok,
+ );
+ State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // End of name.
+ _ => {
+ if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+ {
+ tokenizer.exit(Name::MdxJsxTextTagName);
+ State::Retry(StateName::MdxJsxTextAttributeBefore)
+ } else {
+ crash(
+ tokenizer,
+ "after name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+ )
+ }
+ }
+ }
+}
+
+/// Before member name.
+///
+/// ```markdown
+/// > | a <b.c> d
+/// ^
+/// ```
+pub fn member_name_before(tokenizer: &mut Tokenizer) -> State {
+ // Start of a member name.
+ if id_start(tokenizer.current) {
+ tokenizer.enter(Name::MdxJsxTextTagNameMember);
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextMemberName)
+ } else {
+ crash(
+ tokenizer,
+ "before member name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+ )
+ }
+}
+
+/// In member name.
+///
+/// ```markdown
+/// > | a <b.cd> e
+/// ^
+/// ```
+pub fn member_name(tokenizer: &mut Tokenizer) -> State {
+ // End of name.
+ // Note: no `:` allowed here.
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ || matches!(tokenizer.current, Some(b'.' | b'/' | b'>' | b'{'))
+ {
+ tokenizer.exit(Name::MdxJsxTextTagNameMember);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextMemberNameAfter),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Continuation of name: remain.
+ // To do: unicode.
+ else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextMemberName)
+ } else {
+ crash(
+ tokenizer,
+ "in member name",
+ &format!(
+ "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}",
+ if tokenizer.current == Some(b'@') {
+ " (note: to create a link in MDX, use `[text](url)`)"
+ } else {
+ ""
+ }
+ ),
+ )
+ }
+}
+
+/// After member name.
+///
+/// ```markdown
+/// > | a <b.c> d
+/// ^
+/// > | a <b.c.d> e
+/// ^
+/// ```
+pub fn member_name_after(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Start of another member name.
+ Some(b'.') => {
+ tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextMemberNameBefore),
+ State::Nok,
+ );
+ State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // End of name.
+ _ => {
+ if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+ {
+ tokenizer.exit(Name::MdxJsxTextTagName);
+ State::Retry(StateName::MdxJsxTextAttributeBefore)
+ } else {
+ crash(
+ tokenizer,
+ "after member name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+ )
+ }
+ }
+ }
+}
+
+/// Before local name.
+///
+/// ```markdown
+/// > | a <b:c> d
+/// ^
+/// ```
+pub fn local_name_before(tokenizer: &mut Tokenizer) -> State {
+ // Start of a local name.
+ if id_start(tokenizer.current) {
+ tokenizer.enter(Name::MdxJsxTextTagNameLocal);
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextLocalName)
+ } else {
+ crash(
+ tokenizer,
+ "before local name",
+ &format!(
+ "a character that can start a name, such as a letter, `$`, or `_`{}",
+ if matches!(tokenizer.current, Some(b'+' | b'/'..=b'9')) {
+ " (note: to create a link in MDX, use `[text](url)`)"
+ } else {
+ ""
+ }
+ ),
+ )
+ }
+}
+
+/// In local name.
+///
+/// ```markdown
+/// > | a <b:cd> e
+/// ^
+/// ```
+pub fn local_name(tokenizer: &mut Tokenizer) -> State {
+ // End of local name (note that we don’t expect another colon, or a member).
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ || matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+ {
+ tokenizer.exit(Name::MdxJsxTextTagNameLocal);
+ tokenizer.attempt(State::Next(StateName::MdxJsxTextLocalNameAfter), State::Nok);
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Continuation of name: remain.
+ // To do: unicode.
+ else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextLocalName)
+ } else {
+ crash(
+ tokenizer,
+ "in local name",
+ "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag"
+ )
+ }
+}
+
+/// After local name.
+///
+/// This is like `primary_name_after`, but we don’t expect colons or
+/// periods.
+///
+/// ```markdown
+/// > | a <b.c> d
+/// ^
+/// > | a <b.c.d> e
+/// ^
+/// ```
+pub fn local_name_after(tokenizer: &mut Tokenizer) -> State {
+ // End of name.
+ if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) {
+ tokenizer.exit(Name::MdxJsxTextTagName);
+ State::Retry(StateName::MdxJsxTextAttributeBefore)
+ } else {
+ crash(
+ tokenizer,
+ "after local name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+ )
+ }
+}
+
+/// Before attribute.
+///
+/// ```markdown
+/// > | a <b /> c
+/// ^
+/// > | a <b > c
+/// ^
+/// > | a <b {...c}> d
+/// ^
+/// > | a <b c> d
+/// ^
+/// ```
+pub fn attribute_before(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Self-closing.
+ Some(b'/') => {
+ tokenizer.enter(Name::MdxJsxTextTagSelfClosingMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagSelfClosingMarker);
+ tokenizer.attempt(State::Next(StateName::MdxJsxTextSelfClosing), State::Nok);
+ State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // End of tag.
+ Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+ // Attribute expression.
+ Some(b'{') => unreachable!("to do: attribute expression"),
+ _ => {
+ // Start of an attribute name.
+ if id_start(tokenizer.current) {
+ tokenizer.enter(Name::MdxJsxTextTagAttribute);
+ tokenizer.enter(Name::MdxJsxTextTagAttributeName);
+ tokenizer.enter(Name::MdxJsxTextTagAttributePrimaryName);
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextAttributePrimaryName)
+ } else {
+ crash(
+ tokenizer,
+ "before attribute name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+ )
+ }
+ }
+ }
+}
+
+/// In primary attribute name.
+///
+/// ```markdown
+/// > | a <b cd/> e
+/// ^
+/// > | a <b c:d> e
+/// ^
+/// > | a <b c=d> e
+/// ^
+/// ```
+pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State {
+ // End of attribute name or tag.
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ || matches!(tokenizer.current, Some(b'/' | b':' | b'=' | b'>' | b'{'))
+ {
+ tokenizer.exit(Name::MdxJsxTextTagAttributePrimaryName);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributePrimaryNameAfter),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Continuation of the attribute name: remain.
+ // To do: unicode.
+ else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextAttributePrimaryName)
+ } else {
+ crash(
+ tokenizer,
+ "in attribute name",
+ "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag"
+ )
+ }
+}
+
+/// After primary attribute name.
+///
+/// ```markdown
+/// > | a <b c/> d
+/// ^
+/// > | a <b c:d> e
+/// ^
+/// > | a <b c=d> e
+/// ^
+/// ```
+pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Start of a local name.
+ Some(b':') => {
+ tokenizer.enter(Name::MdxJsxTextTagAttributeNamePrefixMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagAttributeNamePrefixMarker);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributeLocalNameBefore),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Initializer: start of an attribute value.
+ Some(b'=') => {
+ tokenizer.exit(Name::MdxJsxTextTagAttributeName);
+ tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributeValueBefore),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ _ => {
+ // End of tag / new attribute.
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
+ || matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+ || id_start(tokenizer.current)
+ {
+ tokenizer.exit(Name::MdxJsxTextTagAttributeName);
+ tokenizer.exit(Name::MdxJsxTextTagAttribute);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributeBefore),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ } else {
+ crash(
+ tokenizer,
+ "after attribute name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag"
+ )
+ }
+ }
+ }
+}
+
+/// Before local attribute name.
+///
+/// ```markdown
+/// > | a <b c:d/> e
+/// ^
+/// ```
+pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State {
+ // Start of a local name.
+ if id_start(tokenizer.current) {
+ tokenizer.enter(Name::MdxJsxTextTagAttributeNameLocal);
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextAttributeLocalName)
+ } else {
+ crash(
+ tokenizer,
+ "before local attribute name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag"
+ )
+ }
+}
+
+/// In local attribute name.
+///
+/// ```markdown
+/// > | a <b c:de/> f
+/// ^
+/// > | a <b c:d=e/> f
+/// ^
+/// ```
+pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State {
+ // End of local name (note that we don’t expect another colon).
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+ || matches!(tokenizer.current, Some(b'/' | b'=' | b'>' | b'{'))
+ {
+ tokenizer.exit(Name::MdxJsxTextTagAttributeNameLocal);
+ tokenizer.exit(Name::MdxJsxTextTagAttributeName);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributeLocalNameAfter),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Continuation of local name: remain.
+ // To do: unicode.
+ else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextAttributeLocalName)
+ } else {
+ crash(
+ tokenizer,
+ "in local attribute name",
+ "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag"
+ )
+ }
+}
+
+/// After local attribute name.
+///
+/// ```markdown
+/// > | a <b c:d/> f
+/// ^
+/// > | a <b c:d=e/> f
+/// ^
+/// ```
+pub fn attribute_local_name_after(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Start of an attribute value.
+ Some(b'=') => {
+ tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributeValueBefore),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ _ => {
+ // End of name.
+ if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+ {
+ tokenizer.exit(Name::MdxJsxTextTagAttribute);
+ State::Retry(StateName::MdxJsxTextAttributeBefore)
+ } else {
+ crash(
+ tokenizer,
+ "after local attribute name",
+ "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag"
+ )
+ }
+ }
+ }
+}
+
+/// After `=`, before value.
+///
+/// ```markdown
+/// > | a <b c="d"/> e
+/// ^
+/// > | a <b c={d}/> e
+/// ^
+/// ```
+pub fn attribute_value_before(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ // Start of a double- or single-quoted value.
+ Some(b'"' | b'\'') => {
+ tokenizer.tokenize_state.marker = tokenizer.current.unwrap();
+ tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteral);
+ tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+ State::Next(StateName::MdxJsxTextAttributeValueQuotedStart)
+ }
+ // Attribute value expression.
+ Some(b'{') => unreachable!("to do: attribute value expression"),
+ _ => crash(
+ tokenizer,
+ "before attribute value",
+ &format!(
+ "a character that can start an attribute value, such as `\"`, `'`, or `{{`{}",
+ if tokenizer.current == Some(b'<') {
+ " (note: to use an element or fragment as a prop value in MDX, use `{<element />}`)"
+ } else {
+ ""
+ }
+ ),
+ ),
+ }
+}
+
+/// Before quoted literal attribute value.
+///
+/// ```markdown
+/// > | a <b c="d"/> e
+/// ^
+/// ```
+pub fn attribute_value_quoted_start(tokenizer: &mut Tokenizer) -> State {
+ if let Some(byte) = tokenizer.current {
+ if byte == tokenizer.tokenize_state.marker {
+ tokenizer.tokenize_state.marker = 0;
+ tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+ tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteral);
+ tokenizer.exit(Name::MdxJsxTextTagAttribute);
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributeBefore),
+ State::Nok,
+ );
+ State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+ } else if byte == b'\n' {
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextAttributeValueQuotedStart),
+ State::Nok,
+ );
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ } else {
+ tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralValue);
+ State::Retry(StateName::MdxJsxTextAttributeValueQuoted)
+ }
+ } else {
+ crash(
+ tokenizer,
+ "in attribute value",
+ &format!(
+ "a corresponding closing quote {}",
+ format_byte(tokenizer.tokenize_state.marker)
+ ),
+ )
+ }
+}
+
+/// In quoted literal attribute value.
+///
+/// ```markdown
+/// > | a <b c="d"/> e
+/// ^
+/// ```
+pub fn attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
+ // To do: doesn’t this break for:
+ // ```markdown
+ // a <b c="d"
+ // "f">
+ if tokenizer.current == Some(tokenizer.tokenize_state.marker)
+ || matches!(tokenizer.current, None | Some(b'\n'))
+ {
+ tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralValue);
+ State::Retry(StateName::MdxJsxTextAttributeValueQuoted)
+ } else {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextAttributeValueQuoted)
+ }
+}
+
+/// After self-closing slash.
+///
+/// ```markdown
+/// > | a <b/> c
+/// ^
+/// ```
+pub fn self_closing(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+ _ => crash(
+ tokenizer,
+ "after self-closing slash",
+ &format!(
+ "`>` to end the tag{}",
+ if matches!(tokenizer.current, Some(b'*' | b'/')) {
+ " (note: JS comments in JSX tags are not supported in MDX)"
+ } else {
+ ""
+ }
+ ),
+ ),
+ }
+}
+
+/// At final `>`.
+///
+/// ```markdown
+/// > | a <b> c
+/// ^
+/// ```
+pub fn tag_end(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ Some(b'>') => {
+ tokenizer.enter(Name::MdxJsxTextTagMarker);
+ tokenizer.consume();
+ tokenizer.exit(Name::MdxJsxTextTagMarker);
+ tokenizer.exit(Name::MdxJsxTextTag);
+ State::Ok
+ }
+ _ => unreachable!("expected `>`"),
+ }
+}
+
+/// Before optional ECMAScript whitespace.
+///
+/// ```markdown
+/// > | a <a b> c
+/// ^
+/// ```
+pub fn es_whitespace_start(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ Some(b'\n') => {
+ // To do: check if this works for blank lines?
+ tokenizer.attempt(
+ State::Next(StateName::MdxJsxTextEsWhitespaceStart),
+ State::Nok,
+ );
+ State::Retry(space_or_tab_eol(tokenizer))
+ }
+ _ => {
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
+ {
+ tokenizer.enter(Name::MdxJsxTextEsWhitespace);
+ State::Retry(StateName::MdxJsxTextEsWhitespaceInside)
+ } else {
+ State::Ok
+ }
+ }
+ }
+}
+
+/// In ECMAScript whitespace.
+///
+/// ```markdown
+/// > | a <a b> c
+/// ^
+/// ```
+pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ Some(b'\n') => {
+ tokenizer.exit(Name::MdxJsxTextEsWhitespace);
+ State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+ }
+ // Allow continuation bytes.
+ Some(0x80..=0xBF) => {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextEsWhitespaceInside)
+ }
+ _ => {
+ if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+ == CharacterKind::Whitespace
+ {
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextEsWhitespaceInside)
+ } else {
+ tokenizer.exit(Name::MdxJsxTextEsWhitespace);
+ State::Ok
+ }
+ }
+ }
+}
+
+// To do: unicode.
+fn id_start(code: Option<u8>) -> bool {
+ matches!(code, Some(b'$' | b'_' | b'A'..=b'Z' | b'a'..=b'z'))
+}
+
+// To do: unicode.
+fn id_cont(code: Option<u8>) -> bool {
+ matches!(
+ code,
+ Some(b'$' | b'_' | b'A'..=b'Z' | b'0'..=b'9' | b'a'..=b'z')
+ )
+}
+
+fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> ! {
+ // To do: externalize this, and the print mechanism in the tokenizer,
+ // to one proper formatter.
+ // To do: figure out how Rust does errors?
+ let actual = match tokenizer.current {
+ None => "end of file".to_string(),
+ Some(byte) => format_byte(byte),
+ };
+
+ unreachable!(
+ "{}:{}: Unexpected {} {}, expected {}",
+ tokenizer.point.line, tokenizer.point.column, actual, at, expect
+ )
+}
+
+fn format_byte(byte: u8) -> String {
+ match byte {
+ b'`' => "`` ` ``".to_string(),
+ b' '..=b'~' => format!("`{}`", str::from_utf8(&[byte]).unwrap()),
+ _ => format!("U+{:>04X}", byte),
+ }
+}
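Taken together, the states above accept JSX text tags with fragment, member, namespaced, and attribute syntax, while attribute expressions (`{...a}`) and attribute value expressions (`c={d}`) are still `unreachable!` placeholders. An informal sketch of inputs the new state machine recognizes (assuming `mdx_jsx_text` is enabled):

```markdown
a <>fragment</> b
a <b.c.d /> e
a <svg:rect /> f
a <Component key attr="value" other:name='x' /> g
```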
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index de88174..d2843c3 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -30,14 +30,13 @@
//!
//! The following constructs are found in markdown (CommonMark):
//!
-//! * [attention (strong, emphasis)][attention]
+//! * [attention][attention] (strong, emphasis, extension: GFM strikethrough)
//! * [autolink][]
//! * [blank line][blank_line]
//! * [block quote][block_quote]
//! * [character escape][character_escape]
//! * [character reference][character_reference]
//! * [code (indented)][code_indented]
-//! * [code (text)][raw_text]
//! * [definition][]
//! * [hard break (escape)][hard_break_escape]
//! * [heading (atx)][heading_atx]
@@ -49,7 +48,8 @@
//! * [label start (link)][label_start_link]
//! * [list item][list_item]
//! * [paragraph][]
-//! * [raw (flow)][raw_flow] (code (fenced), math (flow))
+//! * [raw (flow)][raw_flow] (code (fenced), extensions: math (flow))
+//! * [raw (text)][raw_text] (code (text), extensions: math (text))
//! * [thematic break][thematic_break]
//!
//! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
@@ -60,11 +60,10 @@
//! * [frontmatter][]
//! * [gfm autolink literal][gfm_autolink_literal]
//! * [gfm footnote definition][gfm_footnote_definition]
+//! * [gfm label start footnote][gfm_label_start_footnote]
//! * [gfm table][gfm_table]
//! * [gfm task list item check][gfm_task_list_item_check]
-//! * [gfm label start footnote][gfm_label_start_footnote]
-//! * math (text) (in `raw_text`)
-//! * gfm strikethrough (in attention)
+//! * [mdx jsx (text)][mdx_jsx_text]
//!
//! There are also several small subroutines typically used in different places:
//!
@@ -163,6 +162,7 @@ pub mod label_end;
pub mod label_start_image;
pub mod label_start_link;
pub mod list_item;
+pub mod mdx_jsx_text;
pub mod paragraph;
pub mod partial_bom;
pub mod partial_data;
diff --git a/src/construct/text.rs b/src/construct/text.rs
index 0168d02..b59fe65 100644
--- a/src/construct/text.rs
+++ b/src/construct/text.rs
@@ -18,6 +18,7 @@
//! * [Label start (image)][crate::construct::label_start_image]
//! * [Label start (link)][crate::construct::label_start_link]
//! * [Label end][crate::construct::label_end]
+//! * [MDX: JSX (text)][crate::construct::mdx_jsx_text]
//!
//! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
//! > [whitespace][crate::construct::partial_whitespace].
@@ -34,7 +35,7 @@ const MARKERS: [u8; 15] = [
b'$', // `raw_text` (math (text))
b'&', // `character_reference`
b'*', // `attention` (emphasis, strong)
- b'<', // `autolink`, `html_text`
+ b'<', // `autolink`, `html_text`, `mdx_jsx_text`
b'H', // `gfm_autolink_literal` (`protocol` kind)
b'W', // `gfm_autolink_literal` (`www.` kind)
b'[', // `label_start_link`
@@ -109,7 +110,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {
);
State::Retry(StateName::AttentionStart)
}
- // `autolink`, `html_text` (order does not matter)
+ // `autolink`, `html_text` (order between those two does not matter), then `mdx_jsx_text` (which must come after them).
Some(b'<') => {
tokenizer.attempt(
State::Next(StateName::TextBefore),
@@ -167,11 +168,27 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {
pub fn before_html(tokenizer: &mut Tokenizer) -> State {
tokenizer.attempt(
State::Next(StateName::TextBefore),
- State::Next(StateName::TextBeforeData),
+ State::Next(StateName::TextBeforeMdxJsx),
);
State::Retry(StateName::HtmlTextStart)
}
+/// Before mdx jsx (text).
+///
+/// At `<`, which wasn’t an autolink or html.
+///
+/// ```markdown
+/// > | a <b>
+/// ^
+/// ```
+pub fn before_mdx_jsx(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.attempt(
+ State::Next(StateName::TextBefore),
+ State::Next(StateName::TextBeforeData),
+ );
+ State::Retry(StateName::MdxJsxTextStart)
+}
+
/// Before hard break escape.
///
/// At `\`, which wasn’t a character escape.
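With this change a `<` in text is tried against autolink, then HTML (text), then MDX JSX (text), and finally falls back to data. Roughly, with the CommonMark constructs on and `mdx_jsx_text` enabled, the first construct that matches wins:

```markdown
<https://example.com>   → autolink
<!-- note -->           → html (text)
<b.c.d />               → mdx jsx (text); autolink and html reject the `.`
< b                     → data (no construct matches)
```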
diff --git a/src/event.rs b/src/event.rs
index fad2c64..b476d45 100644
--- a/src/event.rs
+++ b/src/event.rs
@@ -2730,6 +2730,28 @@ pub enum Name {
/// ^ ^ ^
/// ```
ThematicBreakSequence,
+
+ // To do: sort.
+ MdxJsxTextTag,
+ MdxJsxTextTagMarker, // void
+ MdxJsxTextEsWhitespace, // void
+ MdxJsxTextTagClosingMarker, // void
+ MdxJsxTextTagName,
+ MdxJsxTextTagNamePrimary, // void?
+ MdxJsxTextTagNameMemberMarker, // void
+ MdxJsxTextTagNamePrefixMarker, // void
+ MdxJsxTextTagNameMember, // void
+ MdxJsxTextTagNameLocal, // void
+ MdxJsxTextTagSelfClosingMarker, // void
+ MdxJsxTextTagAttribute,
+ MdxJsxTextTagAttributeName,
+ MdxJsxTextTagAttributePrimaryName,
+ MdxJsxTextTagAttributeNamePrefixMarker, // void
+ MdxJsxTextTagAttributeInitializerMarker, // void
+ MdxJsxTextTagAttributeNameLocal, // void
+ MdxJsxTextTagAttributeValueLiteral,
+ MdxJsxTextTagAttributeValueLiteralMarker, // void
+ MdxJsxTextTagAttributeValueLiteralValue,
}
/// List of void events, used to make sure everything is working well.
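For orientation, here is roughly how these events nest for input such as `a <b c="d" /> e`; this is a sketch derived from the enter/exit calls in `mdx_jsx_text.rs`, not output copied from a test:

```text
MdxJsxTextTag
├── MdxJsxTextTagMarker                       "<"
├── MdxJsxTextTagName
│   └── MdxJsxTextTagNamePrimary              "b"
├── MdxJsxTextEsWhitespace                    " "
├── MdxJsxTextTagAttribute
│   ├── MdxJsxTextTagAttributeName
│   │   └── MdxJsxTextTagAttributePrimaryName "c"
│   ├── MdxJsxTextTagAttributeInitializerMarker "="
│   └── MdxJsxTextTagAttributeValueLiteral
│       ├── MdxJsxTextTagAttributeValueLiteralMarker "\""
│       ├── MdxJsxTextTagAttributeValueLiteralValue  "d"
│       └── MdxJsxTextTagAttributeValueLiteralMarker "\""
├── MdxJsxTextEsWhitespace                    " "
├── MdxJsxTextTagSelfClosingMarker            "/"
└── MdxJsxTextTagMarker                       ">"
```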
diff --git a/src/lib.rs b/src/lib.rs
index e3fdfcb..7fd705b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -301,6 +301,13 @@ pub struct Constructs {
/// ^^^
/// ```
pub math_text: bool,
+ /// MDX: JSX (text).
+ ///
+ /// ```markdown
+ /// > | a <Component /> c
+ /// ^^^^^^^^^^^^^
+ /// ```
+ pub mdx_jsx_text: bool,
/// Thematic break.
///
/// ```markdown
@@ -342,6 +349,7 @@ impl Default for Constructs {
list_item: true,
math_flow: false,
math_text: false,
+ mdx_jsx_text: false,
thematic_break: true,
}
}
@@ -350,6 +358,8 @@ impl Default for Constructs {
impl Constructs {
/// GFM.
///
+ /// <https://github.github.com/gfm/>
+ ///
/// This turns on `CommonMark` + GFM.
#[must_use]
pub fn gfm() -> Self {
@@ -363,6 +373,25 @@ impl Constructs {
..Self::default()
}
}
+
+ /// MDX.
+ ///
+ /// <https://mdxjs.com>
+ ///
+ /// This turns on `CommonMark`, turns off some conflicting constructs
+ /// (autolinks, code (indented), html), and turns on MDX (JSX,
+ /// expressions, ESM).
+ #[must_use]
+ pub fn mdx() -> Self {
+ Self {
+ autolink: false,
+ code_indented: false,
+ html_flow: false,
+ html_text: false,
+ mdx_jsx_text: true,
+ ..Self::default()
+ }
+ }
}
/// Configuration (optional).
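A minimal usage sketch for the new preset, assuming the crate's existing `micromark_with_options`/`Options` API; compiling/serializing the new JSX events is not part of this diff, so this only exercises parsing:

```rust
use micromark::{micromark_with_options, Constructs, Options};

fn main() {
    // Enable the MDX preset added above; everything else stays at the default.
    let options = Options {
        constructs: Constructs::mdx(),
        ..Options::default()
    };
    // The `<B />` tag is tokenized by the new `mdx_jsx_text` construct.
    let html = micromark_with_options("a <B /> c", &options);
    println!("{}", html);
}
```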
diff --git a/src/state.rs b/src/state.rs
index d7c0c8a..3294a2f 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -358,6 +358,7 @@ pub enum Name {
TextStart,
TextBefore,
TextBeforeHtml,
+ TextBeforeMdxJsx,
TextBeforeHardBreakEscape,
TextBeforeLabelStartLink,
TextBeforeData,
@@ -374,12 +375,74 @@ pub enum Name {
TitleAtBlankLine,
TitleEscape,
TitleInside,
+
+ // To do: sort.
+ MdxJsxTextEsWhitespaceStart,
+ MdxJsxTextEsWhitespaceInside,
+ MdxJsxTextStart,
+ MdxJsxTextStartAfter,
+ MdxJsxTextNameBefore,
+ MdxJsxTextClosingTagNameBefore,
+ MdxJsxTextTagEnd,
+ MdxJsxTextPrimaryName,
+ MdxJsxTextPrimaryNameAfter,
+ MdxJsxTextMemberNameBefore,
+ MdxJsxTextMemberName,
+ MdxJsxTextMemberNameAfter,
+ MdxJsxTextLocalNameBefore,
+ MdxJsxTextLocalName,
+ MdxJsxTextLocalNameAfter,
+ MdxJsxTextAttributeBefore,
+ MdxJsxTextSelfClosing,
+ MdxJsxTextAttributePrimaryName,
+ MdxJsxTextAttributePrimaryNameAfter,
+ MdxJsxTextAttributeLocalNameBefore,
+ MdxJsxTextAttributeLocalName,
+ MdxJsxTextAttributeLocalNameAfter,
+ MdxJsxTextAttributeValueBefore,
+ MdxJsxTextAttributeValueQuotedStart,
+ MdxJsxTextAttributeValueQuoted,
}
#[allow(clippy::too_many_lines)]
/// Call the corresponding state for a state name.
pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State {
let func = match name {
+ // To do: sort.
+ Name::MdxJsxTextEsWhitespaceStart => construct::mdx_jsx_text::es_whitespace_start,
+ Name::MdxJsxTextEsWhitespaceInside => construct::mdx_jsx_text::es_whitespace_inside,
+ Name::MdxJsxTextStart => construct::mdx_jsx_text::start,
+ Name::MdxJsxTextStartAfter => construct::mdx_jsx_text::start_after,
+ Name::MdxJsxTextNameBefore => construct::mdx_jsx_text::name_before,
+ Name::MdxJsxTextClosingTagNameBefore => construct::mdx_jsx_text::closing_tag_name_before,
+ Name::MdxJsxTextTagEnd => construct::mdx_jsx_text::tag_end,
+ Name::MdxJsxTextPrimaryName => construct::mdx_jsx_text::primary_name,
+ Name::MdxJsxTextPrimaryNameAfter => construct::mdx_jsx_text::primary_name_after,
+ Name::MdxJsxTextMemberNameBefore => construct::mdx_jsx_text::member_name_before,
+ Name::MdxJsxTextMemberName => construct::mdx_jsx_text::member_name,
+ Name::MdxJsxTextMemberNameAfter => construct::mdx_jsx_text::member_name_after,
+ Name::MdxJsxTextLocalNameBefore => construct::mdx_jsx_text::local_name_before,
+ Name::MdxJsxTextLocalName => construct::mdx_jsx_text::local_name,
+ Name::MdxJsxTextLocalNameAfter => construct::mdx_jsx_text::local_name_after,
+ Name::MdxJsxTextAttributeBefore => construct::mdx_jsx_text::attribute_before,
+ Name::MdxJsxTextSelfClosing => construct::mdx_jsx_text::self_closing,
+ Name::MdxJsxTextAttributePrimaryName => construct::mdx_jsx_text::attribute_primary_name,
+ Name::MdxJsxTextAttributePrimaryNameAfter => {
+ construct::mdx_jsx_text::attribute_primary_name_after
+ }
+ Name::MdxJsxTextAttributeLocalNameBefore => {
+ construct::mdx_jsx_text::attribute_local_name_before
+ }
+ Name::MdxJsxTextAttributeLocalName => construct::mdx_jsx_text::attribute_local_name,
+ Name::MdxJsxTextAttributeLocalNameAfter => {
+ construct::mdx_jsx_text::attribute_local_name_after
+ }
+ Name::MdxJsxTextAttributeValueBefore => construct::mdx_jsx_text::attribute_value_before,
+ Name::MdxJsxTextAttributeValueQuotedStart => {
+ construct::mdx_jsx_text::attribute_value_quoted_start
+ }
+ Name::MdxJsxTextAttributeValueQuoted => construct::mdx_jsx_text::attribute_value_quoted,
+
Name::AttentionStart => construct::attention::start,
Name::AttentionInside => construct::attention::inside,
@@ -776,6 +839,7 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State {
Name::TextStart => construct::text::start,
Name::TextBefore => construct::text::before,
Name::TextBeforeHtml => construct::text::before_html,
+ Name::TextBeforeMdxJsx => construct::text::before_mdx_jsx,
Name::TextBeforeHardBreakEscape => construct::text::before_hard_break_escape,
Name::TextBeforeLabelStartLink => construct::text::before_label_start_link,
Name::TextBeforeData => construct::text::before_data,
diff --git a/src/util/slice.rs b/src/util/slice.rs
index d02a526..54524c3 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -1,7 +1,10 @@
//! Deal with bytes.
use crate::event::{Event, Kind, Point};
-use crate::util::constant::TAB_SIZE;
+use crate::util::{
+ classify_character::{classify_opt, Kind as CharacterKind},
+ constant::TAB_SIZE,
+};
use alloc::string::String;
use core::str;
@@ -27,6 +30,26 @@ pub fn char_after_index(bytes: &[u8], index: usize) -> Option<char> {
String::from_utf8_lossy(&bytes[index..end]).chars().next()
}
+/// Classify a byte (or `char`).
+pub fn byte_to_kind(bytes: &[u8], index: usize) -> CharacterKind {
+ if index == bytes.len() {
+ CharacterKind::Whitespace
+ } else {
+ let byte = bytes[index];
+ if byte.is_ascii_whitespace() {
+ CharacterKind::Whitespace
+ } else if byte.is_ascii_punctuation() {
+ CharacterKind::Punctuation
+ } else if byte.is_ascii_alphanumeric() {
+ CharacterKind::Other
+ } else {
+ // Otherwise: seems to be an ASCII control, so it seems to be a
+ // non-ASCII `char`.
+ classify_opt(char_after_index(bytes, index))
+ }
+ }
+}
+
/// A range between two points.
#[derive(Debug)]
pub struct Position<'a> {