Add initial states for MDX JSX (text)

author: Titus Wormer <tituswormer@gmail.com> 2022-09-06 18:30:40 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-09-06 18:30:40 +0200
commit: 6af582ee16d9c54c9719144caabc7705a324c40b (patch)
tree: d80cd71964a38fb4cd1b4c1df8acfc256d4cbcba /src/construct
parent: 537bf2d6b7b3a2f7855f7628159aecaea2acdb0f (diff)
download: markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.tar.gz
markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.tar.bz2
markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.zip
4 files changed, 911 insertions, 47 deletions
diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs
index 62f18ef..ae483a7 100644
--- a/src/construct/gfm_autolink_literal.rs
+++ b/src/construct/gfm_autolink_literal.rs
@@ -148,8 +148,8 @@ use crate::event::{Event, Kind, Name};
 use crate::state::{Name as StateName, State};
 use crate::tokenizer::Tokenizer;
 use crate::util::{
-    classify_character::{classify_opt, Kind as CharacterKind},
-    slice::{char_after_index, Position, Slice},
+    classify_character::Kind as CharacterKind,
+    slice::{byte_to_kind, Position, Slice},
 };
 use alloc::vec::Vec;
 
@@ -366,11 +366,8 @@ pub fn domain_inside(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
-            if byte_to_kind(
-                tokenizer.parse_state.bytes,
-                tokenizer.point.index,
-                tokenizer.current,
-            ) == CharacterKind::Other
+            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+                == CharacterKind::Other
             {
                 tokenizer.tokenize_state.seen = true;
                 tokenizer.consume();
@@ -473,11 +470,8 @@ pub fn path_inside(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
-            if byte_to_kind(
-                tokenizer.parse_state.bytes,
-                tokenizer.point.index,
-                tokenizer.current,
-            ) == CharacterKind::Whitespace
+            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+                == CharacterKind::Whitespace
             {
                 State::Retry(StateName::GfmAutolinkLiteralPathAfter)
             } else {
@@ -549,11 +543,8 @@ pub fn trail(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             // Whitespace is the end of the URL, anything else is continuation.
-            if byte_to_kind(
-                tokenizer.parse_state.bytes,
-                tokenizer.point.index,
-                tokenizer.current,
-            ) == CharacterKind::Whitespace
+            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+                == CharacterKind::Whitespace
             {
                 State::Ok
             } else {
@@ -937,24 +928,3 @@ fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {
 
     split
 }
-
-/// Classify a byte (or `char`).
-fn byte_to_kind(bytes: &[u8], index: usize, byte: Option<u8>) -> CharacterKind {
-    match byte {
-        None => CharacterKind::Whitespace,
-        Some(byte) => {
-            if byte.is_ascii_whitespace() {
-                CharacterKind::Whitespace
-            } else if byte.is_ascii_punctuation() {
-                CharacterKind::Punctuation
-            } else if byte.is_ascii_alphanumeric() {
-                CharacterKind::Other
-            } else {
-                // Otherwise: seems to be an ASCII control, so it seems to be a
-                // non-ASCII `char`.
-                let char = char_after_index(bytes, index);
-                classify_opt(char)
-            }
-        }
-    }
-}
diff --git a/src/construct/mdx_jsx_text.rs b/src/construct/mdx_jsx_text.rs
new file mode 100644
index 0000000..7a33499
--- /dev/null
+++ b/src/construct/mdx_jsx_text.rs
@@ -0,0 +1,877 @@
+//! To do.
+
+use crate::construct::partial_space_or_tab_eol::space_or_tab_eol;
+use crate::event::Name;
+use crate::state::{Name as StateName, State};
+use crate::tokenizer::Tokenizer;
+use crate::util::{classify_character::Kind as CharacterKind, slice::byte_to_kind};
+use alloc::{
+    format,
+    string::{String, ToString},
+};
+use core::str;
+
+/// Start of MDX: JSX (text).
+///
+/// ```markdown
+/// > | a <B /> c
+///       ^
+/// ```
+pub fn start(tokenizer: &mut Tokenizer) -> State {
+    if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.mdx_jsx_text {
+        tokenizer.enter(Name::MdxJsxTextTag);
+        tokenizer.enter(Name::MdxJsxTextTagMarker);
+        tokenizer.consume();
+        tokenizer.exit(Name::MdxJsxTextTagMarker);
+        State::Next(StateName::MdxJsxTextStartAfter)
+    } else {
+        State::Nok
+    }
+}
+
+/// After `<`.
+///
+/// ```markdown
+/// > | a <B /> c
+///        ^
+/// ```
+pub fn start_after(tokenizer: &mut Tokenizer) -> State {
+    // Deviate from JSX, which allows arbitrary whitespace.
+    // See: <https://github.com/micromark/micromark-extension-mdx-jsx/issues/7>.
+    if let Some(b'\t' | b'\n' | b' ') = tokenizer.current {
+        State::Nok
+    } else {
+        tokenizer.attempt(State::Next(StateName::MdxJsxTextNameBefore), State::Nok);
+        State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+    }
+}
+
+/// Before name, self slash, or end of tag for fragments.
+///
+/// ```markdown
+/// > | a <B> c
+///        ^
+/// > | a </B> c
+///        ^
+/// > | a <> b
+///        ^
+/// ```
+pub fn name_before(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Closing tag.
+        Some(b'/') => {
+            tokenizer.enter(Name::MdxJsxTextTagClosingMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagClosingMarker);
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextClosingTagNameBefore),
+                State::Nok,
+            );
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        // Fragment opening tag.
+        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+        _ => {
+            // To do: unicode.
+            if id_start(tokenizer.current) {
+                tokenizer.enter(Name::MdxJsxTextTagName);
+                tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
+                tokenizer.consume();
+                State::Next(StateName::MdxJsxTextPrimaryName)
+            } else {
+                crash(
+                    tokenizer,
+                    "before name",
+                    &format!(
+                        "a character that can start a name, such as a letter, `$`, or `_`{}",
+                        if tokenizer.current == Some(b'!') {
+                            " (note: to create a comment in MDX, use `{/* text */}`)"
+                        } else {
+                            ""
+                        }
+                    ),
+                )
+            }
+        }
+    }
+}
+
+/// Before name of closing tag or end of closing fragment tag.
+///
+/// ```markdown
+/// > | a </> b
+///         ^
+/// > | a </B> c
+///         ^
+/// ```
+///
+/// Dispatches on the current byte: `>` is handed to the tag end state (closing
+/// fragment), a byte accepted by `id_start` opens the tag name, and anything
+/// else is reported through `crash`.
+pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Fragment closing tag.
+        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+        // Start of a closing tag name.
+        _ => {
+            // To do: unicode.
+            if id_start(tokenizer.current) {
+                tokenizer.enter(Name::MdxJsxTextTagName);
+                tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
+                tokenizer.consume();
+                State::Next(StateName::MdxJsxTextPrimaryName)
+            } else {
+                crash(
+                    tokenizer,
+                    "before name",
+                    &format!(
+                        "a character that can start a name, such as a letter, `$`, or `_`{}",
+                        // Must be a pattern match: in expression position
+                        // `b'*' | b'/'` is a bitwise OR (equal to `b'/'`), so an
+                        // `==` comparison would never show the hint for `*`.
+                        if matches!(tokenizer.current, Some(b'*' | b'/')) {
+                            " (note: JS comments in JSX tags are not supported in MDX)"
+                        } else {
+                            ""
+                        }
+                    ),
+                )
+            }
+        }
+    }
+}
+
+/// In primary name.
+///
+/// ```markdown
+/// > | a <Bc> d
+///         ^
+/// ```
+pub fn primary_name(tokenizer: &mut Tokenizer) -> State {
+    // End of name.
+    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+        || matches!(tokenizer.current, Some(b'.' | b'/' | b':' | b'>' | b'{'))
+    {
+        tokenizer.exit(Name::MdxJsxTextTagNamePrimary);
+        tokenizer.attempt(
+            State::Next(StateName::MdxJsxTextPrimaryNameAfter),
+            State::Nok,
+        );
+        State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+    }
+    // Continuation of name: remain.
+    // To do: unicode.
+    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextPrimaryName)
+    } else {
+        crash(
+            tokenizer,
+            "in name",
+            &format!(
+                "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}",
+                if tokenizer.current == Some(b'@') {
+                    " (note: to create a link in MDX, use `[text](url)`)"
+                } else {
+                    ""
+                }
+            ),
+        )
+    }
+}
+
+/// After primary name.
+///
+/// ```markdown
+/// > | a <b.c> d
+///         ^
+/// > | a <b:c> d
+///         ^
+/// ```
+pub fn primary_name_after(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Start of a member name.
+        Some(b'.') => {
+            tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker);
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextMemberNameBefore),
+                State::Nok,
+            );
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        // Start of a local name.
+        Some(b':') => {
+            tokenizer.enter(Name::MdxJsxTextTagNamePrefixMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagNamePrefixMarker);
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextLocalNameBefore),
+                State::Nok,
+            );
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        // End of name.
+        _ => {
+            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+            {
+                tokenizer.exit(Name::MdxJsxTextTagName);
+                State::Retry(StateName::MdxJsxTextAttributeBefore)
+            } else {
+                crash(
+                    tokenizer,
+                    "after name",
+                    "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+                )
+            }
+        }
+    }
+}
+
+/// Before member name.
+///
+/// ```markdown
+/// > | a <b.c> d
+///          ^
+/// ```
+pub fn member_name_before(tokenizer: &mut Tokenizer) -> State {
+    // Start of a member name.
+    if id_start(tokenizer.current) {
+        tokenizer.enter(Name::MdxJsxTextTagNameMember);
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextMemberName)
+    } else {
+        crash(
+            tokenizer,
+            "before member name",
+            "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+        )
+    }
+}
+
+/// In member name.
+///
+/// ```markdown
+/// > | a <b.cd> e
+///           ^
+/// ```
+pub fn member_name(tokenizer: &mut Tokenizer) -> State {
+    // End of name.
+    // Note: no `:` allowed here.
+    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+        || matches!(tokenizer.current, Some(b'.' | b'/' | b'>' | b'{'))
+    {
+        tokenizer.exit(Name::MdxJsxTextTagNameMember);
+        tokenizer.attempt(
+            State::Next(StateName::MdxJsxTextMemberNameAfter),
+            State::Nok,
+        );
+        State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+    }
+    // Continuation of name: remain.
+    // To do: unicode.
+    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextMemberName)
+    } else {
+        crash(
+            tokenizer,
+            "in member name",
+            &format!(
+                "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}",
+                if tokenizer.current == Some(b'@') {
+                    " (note: to create a link in MDX, use `[text](url)`)"
+                } else {
+                    ""
+                }
+            ),
+        )
+    }
+}
+
+/// After member name.
+///
+/// ```markdown
+/// > | a <b.c> d
+///           ^
+/// > | a <b.c.d> e
+///           ^
+/// ```
+pub fn member_name_after(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Start of another member name.
+        Some(b'.') => {
+            tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker);
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextMemberNameBefore),
+                State::Nok,
+            );
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        // End of name.
+        _ => {
+            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+            {
+                tokenizer.exit(Name::MdxJsxTextTagName);
+                State::Retry(StateName::MdxJsxTextAttributeBefore)
+            } else {
+                crash(
+                    tokenizer,
+                    "after member name",
+                    "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+                )
+            }
+        }
+    }
+}
+
+/// Before local name.
+///
+/// ```markdown
+/// > | a <b:c> d
+///          ^
+/// ```
+pub fn local_name_before(tokenizer: &mut Tokenizer) -> State {
+    // Start of a local name.
+    if id_start(tokenizer.current) {
+        tokenizer.enter(Name::MdxJsxTextTagNameLocal);
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextLocalName)
+    } else {
+        crash(
+            tokenizer,
+            "before local name",
+            &format!(
+                "a character that can start a name, such as a letter, `$`, or `_`{}",
+                if matches!(tokenizer.current, Some(b'+' | b'/'..=b'9')) {
+                    " (note: to create a link in MDX, use `[text](url)`)"
+                } else {
+                    ""
+                }
+            ),
+        )
+    }
+}
+
+/// In local name.
+///
+/// ```markdown
+/// > | a <b:cd> e
+///           ^
+/// ```
+pub fn local_name(tokenizer: &mut Tokenizer) -> State {
+    // End of local name (note that we don’t expect another colon, or a member).
+    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+        || matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+    {
+        tokenizer.exit(Name::MdxJsxTextTagNameLocal);
+        tokenizer.attempt(State::Next(StateName::MdxJsxTextLocalNameAfter), State::Nok);
+        State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+    }
+    // Continuation of name: remain.
+    // To do: unicode.
+    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextLocalName)
+    } else {
+        crash(
+            tokenizer,
+            "in local name",
+            "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag"
+        )
+    }
+}
+
+/// After local name.
+///
+/// This is like `primary_name_after`, but we don’t expect colons or
+/// periods.
+///
+/// ```markdown
+/// > | a <b:c> d
+///           ^
+/// ```
+pub fn local_name_after(tokenizer: &mut Tokenizer) -> State {
+    // End of name.
+    if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) {
+        tokenizer.exit(Name::MdxJsxTextTagName);
+        State::Retry(StateName::MdxJsxTextAttributeBefore)
+    } else {
+        crash(
+            tokenizer,
+            "after local name",
+            "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+        )
+    }
+}
+
+/// Before attribute.
+///
+/// ```markdown
+/// > | a <b /> c
+///          ^
+/// > | a <b > c
+///          ^
+/// > | a <b {...c}> d
+///          ^
+/// > | a <b c> d
+///          ^
+/// ```
+pub fn attribute_before(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Self-closing.
+        Some(b'/') => {
+            tokenizer.enter(Name::MdxJsxTextTagSelfClosingMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagSelfClosingMarker);
+            tokenizer.attempt(State::Next(StateName::MdxJsxTextSelfClosing), State::Nok);
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        // End of tag.
+        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+        // Attribute expression.
+        Some(b'{') => unreachable!("to do: attribute expression"),
+        _ => {
+            // Start of an attribute name.
+            if id_start(tokenizer.current) {
+                tokenizer.enter(Name::MdxJsxTextTagAttribute);
+                tokenizer.enter(Name::MdxJsxTextTagAttributeName);
+                tokenizer.enter(Name::MdxJsxTextTagAttributePrimaryName);
+                tokenizer.consume();
+                State::Next(StateName::MdxJsxTextAttributePrimaryName)
+            } else {
+                crash(
+                    tokenizer,
+                    "before attribute name",
+                    "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag"
+                )
+            }
+        }
+    }
+}
+
+/// In primary attribute name.
+///
+/// ```markdown
+/// > | a <b cd/> e
+///           ^
+/// > | a <b c:d> e
+///           ^
+/// > | a <b c=d> e
+///           ^
+/// ```
+///
+/// Whitespace or one of `/`, `:`, `=`, `>`, `{` ends the primary name; `-` or
+/// a byte accepted by `id_cont` continues it; anything else is reported
+/// through `crash`.
+pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State {
+    // End of attribute name or tag.
+    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+        || matches!(tokenizer.current, Some(b'/' | b':' | b'=' | b'>' | b'{'))
+    {
+        tokenizer.exit(Name::MdxJsxTextTagAttributePrimaryName);
+        tokenizer.attempt(
+            State::Next(StateName::MdxJsxTextAttributePrimaryNameAfter),
+            State::Nok,
+        );
+        State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+    }
+    // Continuation of the attribute name: remain.
+    // To do: unicode.
+    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+        tokenizer.consume();
+        // Stay in the attribute primary name state; going to the tag’s local
+        // name state here would close the wrong event at the end of the name.
+        State::Next(StateName::MdxJsxTextAttributePrimaryName)
+    } else {
+        crash(
+            tokenizer,
+            "in attribute name",
+            "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag"
+        )
+    }
+}
+
+/// After primary attribute name.
+///
+/// ```markdown
+/// > | a <b c/> d
+///           ^
+/// > | a <b c:d> e
+///           ^
+/// > | a <b c=d> e
+///           ^
+/// ```
+///
+/// `:` starts a local attribute name, `=` starts an initializer, and
+/// whitespace, `/`, `>`, `{`, or a byte accepted by `id_start` closes the
+/// attribute.
+/// Anything else is reported through `crash`.
+pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Start of a local name.
+        Some(b':') => {
+            tokenizer.enter(Name::MdxJsxTextTagAttributeNamePrefixMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagAttributeNamePrefixMarker);
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextAttributeLocalNameBefore),
+                State::Nok,
+            );
+            // A byte was consumed, so move on with `Next` (not `Retry`), like
+            // the other consuming branches in this file.
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        // Initializer: start of an attribute value.
+        Some(b'=') => {
+            tokenizer.exit(Name::MdxJsxTextTagAttributeName);
+            tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker);
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextAttributeValueBefore),
+                State::Nok,
+            );
+            // A byte was consumed: `Next`, not `Retry`.
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        _ => {
+            // End of tag / new attribute.
+            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+                == CharacterKind::Whitespace
+                || matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+                || id_start(tokenizer.current)
+            {
+                tokenizer.exit(Name::MdxJsxTextTagAttributeName);
+                tokenizer.exit(Name::MdxJsxTextTagAttribute);
+                tokenizer.attempt(
+                    State::Next(StateName::MdxJsxTextAttributeBefore),
+                    State::Nok,
+                );
+                // Nothing was consumed here, so `Retry` is correct.
+                State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+            } else {
+                crash(
+                    tokenizer,
+                    "after attribute name",
+                    "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag"
+                )
+            }
+        }
+    }
+}
+
+/// Before local attribute name.
+///
+/// ```markdown
+/// > | a <b c:d/> e
+///            ^
+/// ```
+pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State {
+    // Start of a local name.
+    if id_start(tokenizer.current) {
+        tokenizer.enter(Name::MdxJsxTextTagAttributeNameLocal);
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextAttributeLocalName)
+    } else {
+        crash(
+            tokenizer,
+            "before local attribute name",
+            "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag"
+        )
+    }
+}
+
+/// In local attribute name.
+///
+/// ```markdown
+/// > | a <b c:de/> f
+///             ^
+/// > | a <b c:d=e/> f
+///             ^
+/// ```
+pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State {
+    // End of local name (note that we don’t expect another colon).
+    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace
+        || matches!(tokenizer.current, Some(b'/' | b'=' | b'>' | b'{'))
+    {
+        tokenizer.exit(Name::MdxJsxTextTagAttributeNameLocal);
+        tokenizer.exit(Name::MdxJsxTextTagAttributeName);
+        tokenizer.attempt(
+            State::Next(StateName::MdxJsxTextAttributeLocalNameAfter),
+            State::Nok,
+        );
+        State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+    }
+    // Continuation of local name: remain.
+    // To do: unicode.
+    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextAttributeLocalName)
+    } else {
+        crash(
+            tokenizer,
+            "in local attribute name",
+            "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag"
+        )
+    }
+}
+
+/// After local attribute name.
+///
+/// ```markdown
+/// > | a <b c:d/> f
+///             ^
+/// > | a <b c:d=e/> f
+///             ^
+/// ```
+///
+/// `=` starts an initializer; `/`, `>`, `{`, or a byte accepted by `id_start`
+/// closes the attribute; anything else is reported through `crash`.
+pub fn attribute_local_name_after(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Start of an attribute value.
+        Some(b'=') => {
+            tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker);
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextAttributeValueBefore),
+                State::Nok,
+            );
+            // A byte was consumed, so move on with `Next` (not `Retry`), like
+            // the other consuming branches in this file.
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        _ => {
+            // End of name.
+            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+            {
+                tokenizer.exit(Name::MdxJsxTextTagAttribute);
+                State::Retry(StateName::MdxJsxTextAttributeBefore)
+            } else {
+                crash(
+                    tokenizer,
+                    "after local attribute name",
+                    "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag"
+                )
+            }
+        }
+    }
+}
+
+/// After `=`, before value.
+///
+/// ```markdown
+/// > | a <b c="d"/> e
+///            ^
+/// > | a <b c={d}/> e
+///            ^
+/// ```
+pub fn attribute_value_before(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        // Start of double- or single quoted value.
+        Some(b'"' | b'\'') => {
+            tokenizer.tokenize_state.marker = tokenizer.current.unwrap();
+            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteral);
+            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+            State::Next(StateName::MdxJsxTextAttributeValueQuotedStart)
+        }
+        // Attribute value expression.
+        Some(b'{') => unreachable!("to do: attribute value expression"),
+        _ => crash(
+            tokenizer,
+            "before attribute value",
+            &format!(
+                "a character that can start an attribute value, such as `\"`, `'`, or `{{`{}",
+                if tokenizer.current == Some(b'<') {
+                    " (note: to use an element or fragment as a prop value in MDX, use `{<element />}`)"
+                } else {
+                    ""
+                }
+            ),
+        ),
+    }
+}
+
+/// Before quoted literal attribute value.
+///
+/// ```markdown
+/// > | a <b c="d"/> e
+///            ^
+/// ```
+///
+/// Handles three cases for the current byte: the closing quote (equal to
+/// `tokenize_state.marker`), a line ending inside the value, and any other
+/// byte, which starts a run of value data.
+/// At the end of the input, the missing closing quote is reported through
+/// `crash`.
+pub fn attribute_value_quoted_start(tokenizer: &mut Tokenizer) -> State {
+    if let Some(byte) = tokenizer.current {
+        if byte == tokenizer.tokenize_state.marker {
+            tokenizer.tokenize_state.marker = 0;
+            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker);
+            // The closing quote ends the literal and, with it, the attribute:
+            // close both events (the literal was entered in
+            // `attribute_value_before`, the attribute in `attribute_before`).
+            tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteral);
+            tokenizer.exit(Name::MdxJsxTextTagAttribute);
+            // Next up is another attribute or the end of the tag, not
+            // another value.
+            tokenizer.attempt(State::Next(StateName::MdxJsxTextAttributeBefore), State::Nok);
+            State::Next(StateName::MdxJsxTextEsWhitespaceStart)
+        } else if byte == b'\n' {
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextAttributeValueQuotedStart),
+                State::Nok,
+            );
+            State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+        } else {
+            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralValue);
+            State::Retry(StateName::MdxJsxTextAttributeValueQuoted)
+        }
+    } else {
+        crash(
+            tokenizer,
+            "in attribute value",
+            &format!(
+                "a corresponding closing quote {}",
+                format_byte(tokenizer.tokenize_state.marker)
+            ),
+        )
+    }
+}
+
+/// In quoted literal attribute value.
+///
+/// ```markdown
+/// > | a <b c="d"/> e
+///             ^
+/// ```
+///
+/// Consumes value bytes until the closing quote (`tokenize_state.marker`), a
+/// line ending, or the end of the input, then closes the value event and
+/// hands the unconsumed byte back to the quoted start state.
+pub fn attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
+    // To do: doesn’t this break for:
+    // ```markdown
+    // a <b c="d"
+    // "f">
+    if tokenizer.current == Some(tokenizer.tokenize_state.marker)
+        || matches!(tokenizer.current, None | Some(b'\n'))
+    {
+        tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralValue);
+        // Go back to the start state, which deals with the quote, the line
+        // ending, and the end of the input.
+        // Retrying this state instead would take this same branch again on
+        // the same byte.
+        State::Retry(StateName::MdxJsxTextAttributeValueQuotedStart)
+    } else {
+        tokenizer.consume();
+        State::Next(StateName::MdxJsxTextAttributeValueQuoted)
+    }
+}
+
+/// After self-closing slash.
+///
+/// ```markdown
+/// > | a <b/> c
+///          ^
+/// ```
+///
+/// Only `>` is accepted here; anything else is reported through `crash`,
+/// with an extra hint when the byte looks like the start of a JS comment.
+pub fn self_closing(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
+        _ => crash(
+            tokenizer,
+            "after self-closing slash",
+            &format!(
+                "`>` to end the tag{}",
+                // Must be a pattern match: in expression position
+                // `b'*' | b'/'` is a bitwise OR (equal to `b'/'`), so an `==`
+                // comparison would never show the hint for `*`.
+                if matches!(tokenizer.current, Some(b'*' | b'/')) {
+                    " (note: JS comments in JSX tags are not supported in MDX)"
+                } else {
+                    ""
+                }
+            ),
+        ),
+    }
+}
+
+/// At final `>`.
+///
+/// ```markdown
+/// > | a <b> c
+///         ^
+/// ```
+pub fn tag_end(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        Some(b'>') => {
+            tokenizer.enter(Name::MdxJsxTextTagMarker);
+            tokenizer.consume();
+            tokenizer.exit(Name::MdxJsxTextTagMarker);
+            tokenizer.exit(Name::MdxJsxTextTag);
+            State::Ok
+        }
+        _ => unreachable!("expected `>`"),
+    }
+}
+
+/// Before optional ECMAScript whitespace.
+///
+/// ```markdown
+/// > | a <a b> c
+///         ^
+/// ```
+pub fn es_whitespace_start(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        Some(b'\n') => {
+            // To do: check if this works for blank lines?
+            tokenizer.attempt(
+                State::Next(StateName::MdxJsxTextEsWhitespaceStart),
+                State::Nok,
+            );
+            State::Retry(space_or_tab_eol(tokenizer))
+        }
+        _ => {
+            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+                == CharacterKind::Whitespace
+            {
+                tokenizer.enter(Name::MdxJsxTextEsWhitespace);
+                State::Retry(StateName::MdxJsxTextEsWhitespaceInside)
+            } else {
+                State::Ok
+            }
+        }
+    }
+}
+
+/// In ECMAScript whitespace.
+///
+/// ```markdown
+/// > | a <a  b> c
+///          ^
+/// ```
+pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State {
+    match tokenizer.current {
+        Some(b'\n') => {
+            tokenizer.exit(Name::MdxJsxTextEsWhitespace);
+            State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
+        }
+        // Allow continuation bytes.
+        Some(0x80..=0xBF) => {
+            tokenizer.consume();
+            State::Next(StateName::MdxJsxTextEsWhitespaceInside)
+        }
+        _ => {
+            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
+                == CharacterKind::Whitespace
+            {
+                tokenizer.consume();
+                State::Next(StateName::MdxJsxTextEsWhitespaceInside)
+            } else {
+                tokenizer.exit(Name::MdxJsxTextEsWhitespace);
+                State::Ok
+            }
+        }
+    }
+}
+
+// To do: unicode.
+fn id_start(code: Option<u8>) -> bool {
+    matches!(code, Some(b'$' | b'_' | b'A'..=b'Z' | b'a'..=b'z'))
+}
+
+// To do: unicode.
+fn id_cont(code: Option<u8>) -> bool {
+    matches!(
+        code,
+        Some(b'$' | b'_' | b'A'..=b'Z' | b'0'..=b'9' | b'a'..=b'z')
+    )
+}
+
+fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> ! {
+    // To do: externalize this, and the print mechanism in the tokenizer,
+    // to one proper formatter.
+    // To do: figure out how Rust does errors?
+    let actual = match tokenizer.current {
+        None => "end of file".to_string(),
+        Some(byte) => format_byte(byte),
+    };
+
+    unreachable!(
+        "{}:{}: Unexpected {} {}, expected {}",
+        tokenizer.point.line, tokenizer.point.column, actual, at, expect
+    )
+}
+
+fn format_byte(byte: u8) -> String {
+    match byte {
+        b'`' => "`` ` ``".to_string(),
+        b' '..=b'~' => format!("`{}`", str::from_utf8(&[byte]).unwrap()),
+        _ => format!("U+{:>04X}", byte),
+    }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index de88174..d2843c3 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -30,14 +30,13 @@
 //!
 //! The following constructs are found in markdown (CommonMark):
 //!
-//! *   [attention (strong, emphasis)][attention]
+//! *   [attention][attention] (strong, emphasis, extension: GFM strikethrough)
 //! *   [autolink][]
 //! *   [blank line][blank_line]
 //! *   [block quote][block_quote]
 //! *   [character escape][character_escape]
 //! *   [character reference][character_reference]
 //! *   [code (indented)][code_indented]
-//! *   [code (text)][raw_text]
 //! *   [definition][]
 //! *   [hard break (escape)][hard_break_escape]
 //! *   [heading (atx)][heading_atx]
@@ -49,7 +48,8 @@
 //! *   [label start (link)][label_start_link]
 //! *   [list item][list_item]
 //! *   [paragraph][]
-//! *   [raw (flow)][raw_flow] (code (fenced), math (flow))
+//! *   [raw (flow)][raw_flow] (code (fenced), extensions: math (flow))
+//! *   [raw (text)][raw_text] (code (text), extensions: math (text))
 //! *   [thematic break][thematic_break]
 //!
 //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
@@ -60,11 +60,10 @@
 //! *   [frontmatter][]
 //! *   [gfm autolink literal][gfm_autolink_literal]
 //! *   [gfm footnote definition][gfm_footnote_definition]
+//! *   [gfm label start footnote][gfm_label_start_footnote]
 //! *   [gfm table][gfm_table]
 //! *   [gfm task list item check][gfm_task_list_item_check]
-//! *   [gfm label start footnote][gfm_label_start_footnote]
-//! *   math (text) (in `raw_text`)
-//! *   gfm strikethrough (in attention)
+//! *   [mdx jsx (text)][mdx_jsx_text]
 //!
 //! There are also several small subroutines typically used in different places:
 //!
@@ -163,6 +162,7 @@ pub mod label_end;
 pub mod label_start_image;
 pub mod label_start_link;
 pub mod list_item;
+pub mod mdx_jsx_text;
 pub mod paragraph;
 pub mod partial_bom;
 pub mod partial_data;
diff --git a/src/construct/text.rs b/src/construct/text.rs
index 0168d02..b59fe65 100644
--- a/src/construct/text.rs
+++ b/src/construct/text.rs
@@ -18,6 +18,7 @@
 //! *   [Label start (image)][crate::construct::label_start_image]
 //! *   [Label start (link)][crate::construct::label_start_link]
 //! *   [Label end][crate::construct::label_end]
+//! *   [MDX: JSX (text)][crate::construct::mdx_jsx_text]
 //!
 //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
 //! > [whitespace][crate::construct::partial_whitespace].
@@ -34,7 +35,7 @@ const MARKERS: [u8; 15] = [
     b'$',  // `raw_text` (math (text))
     b'&',  // `character_reference`
     b'*',  // `attention` (emphasis, strong)
-    b'<',  // `autolink`, `html_text`
+    b'<',  // `autolink`, `html_text`, `mdx_jsx_text`
     b'H',  // `gfm_autolink_literal` (`protocol` kind)
     b'W',  // `gfm_autolink_literal` (`www.` kind)
     b'[',  // `label_start_link`
@@ -109,7 +110,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {
             );
             State::Retry(StateName::AttentionStart)
         }
-        // `autolink`, `html_text` (order does not matter)
+        // `autolink`, `html_text` (order does not matter), `mdx_jsx_text` (order matters).
         Some(b'<') => {
             tokenizer.attempt(
                 State::Next(StateName::TextBefore),
@@ -167,11 +168,27 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {
 pub fn before_html(tokenizer: &mut Tokenizer) -> State {
+    // If html (text) matches, go on at `TextBefore`; if it does not, the `<`
+    // is handed to `TextBeforeMdxJsx` so that mdx jsx (text) is tried next.
     tokenizer.attempt(
         State::Next(StateName::TextBefore),
-        State::Next(StateName::TextBeforeData),
+        State::Next(StateName::TextBeforeMdxJsx),
     );
     State::Retry(StateName::HtmlTextStart)
 }
 
+/// Before mdx jsx (text).
+///
+/// Reached at a `<` for which neither autolink nor html (text) matched.
+/// Mdx jsx (text) is tried from here, with `TextBeforeData` as the other
+/// outcome.
+///
+/// ```markdown
+/// > | a <b>
+///       ^
+/// ```
+pub fn before_mdx_jsx(tokenizer: &mut Tokenizer) -> State {
+    let ok = State::Next(StateName::TextBefore);
+    let nok = State::Next(StateName::TextBeforeData);
+    tokenizer.attempt(ok, nok);
+    State::Retry(StateName::MdxJsxTextStart)
+}
+
 /// Before hard break escape.
 ///
 /// At `\`, which wasn’t a character escape.
author	Titus Wormer <tituswormer@gmail.com>	2022-09-06 18:30:40 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-09-06 18:30:40 +0200
commit	6af582ee16d9c54c9719144caabc7705a324c40b (patch)
tree	d80cd71964a38fb4cd1b4c1df8acfc256d4cbcba /src/construct
parent	537bf2d6b7b3a2f7855f7628159aecaea2acdb0f (diff)
download	markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.tar.gz markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.tar.bz2 markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.zip