diff options
| author | 2022-09-06 18:30:40 +0200 | |
|---|---|---|
| committer | 2022-09-06 18:30:40 +0200 | |
| commit | 6af582ee16d9c54c9719144caabc7705a324c40b (patch) | |
| tree | d80cd71964a38fb4cd1b4c1df8acfc256d4cbcba | |
| parent | 537bf2d6b7b3a2f7855f7628159aecaea2acdb0f (diff) | |
| download | markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.tar.gz markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.tar.bz2 markdown-rs-6af582ee16d9c54c9719144caabc7705a324c40b.zip  | |
Add initial states for MDX JSX (text)
Diffstat (limited to '')
| -rw-r--r-- | src/construct/gfm_autolink_literal.rs | 46 | ||||
| -rw-r--r-- | src/construct/mdx_jsx_text.rs | 877 | ||||
| -rw-r--r-- | src/construct/mod.rs | 12 | ||||
| -rw-r--r-- | src/construct/text.rs | 23 | ||||
| -rw-r--r-- | src/event.rs | 22 | ||||
| -rw-r--r-- | src/lib.rs | 29 | ||||
| -rw-r--r-- | src/state.rs | 64 | ||||
| -rw-r--r-- | src/util/slice.rs | 25 | 
8 files changed, 1050 insertions, 48 deletions
diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index 62f18ef..ae483a7 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -148,8 +148,8 @@ use crate::event::{Event, Kind, Name};  use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer;  use crate::util::{ -    classify_character::{classify_opt, Kind as CharacterKind}, -    slice::{char_after_index, Position, Slice}, +    classify_character::Kind as CharacterKind, +    slice::{byte_to_kind, Position, Slice},  };  use alloc::vec::Vec; @@ -366,11 +366,8 @@ pub fn domain_inside(tokenizer: &mut Tokenizer) -> State {          }          _ => {              // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. -            if byte_to_kind( -                tokenizer.parse_state.bytes, -                tokenizer.point.index, -                tokenizer.current, -            ) == CharacterKind::Other +            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +                == CharacterKind::Other              {                  tokenizer.tokenize_state.seen = true;                  tokenizer.consume(); @@ -473,11 +470,8 @@ pub fn path_inside(tokenizer: &mut Tokenizer) -> State {          }          _ => {              // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>. -            if byte_to_kind( -                tokenizer.parse_state.bytes, -                tokenizer.point.index, -                tokenizer.current, -            ) == CharacterKind::Whitespace +            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +                == CharacterKind::Whitespace              {                  State::Retry(StateName::GfmAutolinkLiteralPathAfter)              } else { @@ -549,11 +543,8 @@ pub fn trail(tokenizer: &mut Tokenizer) -> State {          }          _ => {              // Whitespace is the end of the URL, anything else is continuation. -            if byte_to_kind( -                tokenizer.parse_state.bytes, -                tokenizer.point.index, -                tokenizer.current, -            ) == CharacterKind::Whitespace +            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +                == CharacterKind::Whitespace              {                  State::Ok              } else { @@ -937,24 +928,3 @@ fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize {      split  } - -/// Classify a byte (or `char`). -fn byte_to_kind(bytes: &[u8], index: usize, byte: Option<u8>) -> CharacterKind { -    match byte { -        None => CharacterKind::Whitespace, -        Some(byte) => { -            if byte.is_ascii_whitespace() { -                CharacterKind::Whitespace -            } else if byte.is_ascii_punctuation() { -                CharacterKind::Punctuation -            } else if byte.is_ascii_alphanumeric() { -                CharacterKind::Other -            } else { -                // Otherwise: seems to be an ASCII control, so it seems to be a -                // non-ASCII `char`. -                let char = char_after_index(bytes, index); -                classify_opt(char) -            } -        } -    } -} diff --git a/src/construct/mdx_jsx_text.rs b/src/construct/mdx_jsx_text.rs new file mode 100644 index 0000000..7a33499 --- /dev/null +++ b/src/construct/mdx_jsx_text.rs @@ -0,0 +1,877 @@ +//! To do. + +use crate::construct::partial_space_or_tab_eol::space_or_tab_eol; +use crate::event::Name; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{classify_character::Kind as CharacterKind, slice::byte_to_kind}; +use alloc::{ +    format, +    string::{String, ToString}, +}; +use core::str; + +/// Start of MDX: JSX (text). +/// +/// ```markdown +/// > | a <B /> c +///       ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { +    if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.mdx_jsx_text { +        tokenizer.enter(Name::MdxJsxTextTag); +        tokenizer.enter(Name::MdxJsxTextTagMarker); +        tokenizer.consume(); +        tokenizer.exit(Name::MdxJsxTextTagMarker); +        State::Next(StateName::MdxJsxTextStartAfter) +    } else { +        State::Nok +    } +} + +/// After `<`. +/// +/// ```markdown +/// > | a <B /> c +///        ^ +/// ``` +pub fn start_after(tokenizer: &mut Tokenizer) -> State { +    // Deviate from JSX, which allows arbitrary whitespace. +    // See: <https://github.com/micromark/micromark-extension-mdx-jsx/issues/7>. +    if let Some(b'\t' | b'\n' | b' ') = tokenizer.current { +        State::Nok +    } else { +        tokenizer.attempt(State::Next(StateName::MdxJsxTextNameBefore), State::Nok); +        State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +    } +} + +/// Before name, self slash, or end of tag for fragments. +/// +/// ```markdown +/// > | a <B> c +///        ^ +/// > | a </B> c +///        ^ +/// > | a <> b +///        ^ +/// ``` +pub fn name_before(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Closing tag. +        Some(b'/') => { +            tokenizer.enter(Name::MdxJsxTextTagClosingMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagClosingMarker); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextClosingTagNameBefore), +                State::Nok, +            ); +            State::Next(StateName::MdxJsxTextEsWhitespaceStart) +        } +        // Fragment opening tag. +        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), +        _ => { +            // To do: unicode. +            if id_start(tokenizer.current) { +                tokenizer.enter(Name::MdxJsxTextTagName); +                tokenizer.enter(Name::MdxJsxTextTagNamePrimary); +                tokenizer.consume(); +                State::Next(StateName::MdxJsxTextPrimaryName) +            } else { +                crash( +                    tokenizer, +                    "before name", +                    &format!( +                        "a character that can start a name, such as a letter, `$`, or `_`{}", +                        if tokenizer.current == Some(b'!') { +                            " (note: to create a comment in MDX, use `{/* text */}`)" +                        } else { +                            "" +                        } +                    ), +                ) +            } +        } +    } +} + +/// Before name of closing tag or end of closing fragment tag. +/// +/// ```markdown +/// > | a </> b +///         ^ +/// > | a </B> c +///         ^ +/// ``` +pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Fragment closing tag. +        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), +        // Start of a closing tag name. +        _ => { +            // To do: unicode. +            if id_start(tokenizer.current) { +                tokenizer.enter(Name::MdxJsxTextTagName); +                tokenizer.enter(Name::MdxJsxTextTagNamePrimary); +                tokenizer.consume(); +                State::Next(StateName::MdxJsxTextPrimaryName) +            } else { +                crash( +                    tokenizer, +                    "before name", +                    &format!( +                        "a character that can start a name, such as a letter, `$`, or `_`{}", +                        if tokenizer.current == Some(b'*' | b'/') { +                            " (note: JS comments in JSX tags are not supported in MDX)" +                        } else { +                            "" +                        } +                    ), +                ) +            } +        } +    } +} + +/// In primary name. +/// +/// ```markdown +/// > | a <Bc> d +///         ^ +/// ``` +pub fn primary_name(tokenizer: &mut Tokenizer) -> State { +    // End of name. +    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +        || matches!(tokenizer.current, Some(b'.' | b'/' | b':' | b'>' | b'{')) +    { +        tokenizer.exit(Name::MdxJsxTextTagNamePrimary); +        tokenizer.attempt( +            State::Next(StateName::MdxJsxTextPrimaryNameAfter), +            State::Nok, +        ); +        State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +    } +    // Continuation of name: remain. +    // To do: unicode. +    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextPrimaryName) +    } else { +        crash( +            tokenizer, +            "in name", +            &format!( +                "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}", +                if tokenizer.current == Some(b'@') { +                    " (note: to create a link in MDX, use `[text](url)`)" +                } else { +                    "" +                } +            ), +        ) +    } +} + +/// After primary name. +/// +/// ```markdown +/// > | a <b.c> d +///         ^ +/// > | a <b:c> d +///         ^ +/// ``` +pub fn primary_name_after(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Start of a member name. +        Some(b'.') => { +            tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextMemberNameBefore), +                State::Nok, +            ); +            State::Next(StateName::MdxJsxTextEsWhitespaceStart) +        } +        // Start of a local name. +        Some(b':') => { +            tokenizer.enter(Name::MdxJsxTextTagNamePrefixMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagNamePrefixMarker); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextLocalNameBefore), +                State::Nok, +            ); +            State::Next(StateName::MdxJsxTextEsWhitespaceStart) +        } +        // End of name. +        _ => { +            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) +            { +                tokenizer.exit(Name::MdxJsxTextTagName); +                State::Retry(StateName::MdxJsxTextAttributeBefore) +            } else { +                crash( +                    tokenizer, +                    "after name", +                    "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" +                ) +            } +        } +    } +} + +/// Before member name. +/// +/// ```markdown +/// > | a <b.c> d +///          ^ +/// ``` +pub fn member_name_before(tokenizer: &mut Tokenizer) -> State { +    // Start of a member name. +    if id_start(tokenizer.current) { +        tokenizer.enter(Name::MdxJsxTextTagNameMember); +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextMemberName) +    } else { +        crash( +            tokenizer, +            "before member name", +            "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" +        ) +    } +} + +/// In member name. +/// +/// ```markdown +/// > | a <b.cd> e +///           ^ +/// ``` +pub fn member_name(tokenizer: &mut Tokenizer) -> State { +    // End of name. +    // Note: no `:` allowed here. +    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +        || matches!(tokenizer.current, Some(b'.' | b'/' | b'>' | b'{')) +    { +        tokenizer.exit(Name::MdxJsxTextTagNameMember); +        tokenizer.attempt( +            State::Next(StateName::MdxJsxTextMemberNameAfter), +            State::Nok, +        ); +        State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +    } +    // Continuation of name: remain. +    // To do: unicode. +    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextMemberName) +    } else { +        crash( +            tokenizer, +            "in member name", +            &format!( +                "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}", +                if tokenizer.current == Some(b'@') { +                    " (note: to create a link in MDX, use `[text](url)`)" +                } else { +                    "" +                } +            ), +        ) +    } +} + +/// After member name. +/// +/// ```markdown +/// > | a <b.c> d +///           ^ +/// > | a <b.c.d> e +///           ^ +/// ``` +pub fn member_name_after(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Start of another member name. +        Some(b'.') => { +            tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextMemberNameBefore), +                State::Nok, +            ); +            State::Next(StateName::MdxJsxTextEsWhitespaceStart) +        } +        // End of name. +        _ => { +            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) +            { +                tokenizer.exit(Name::MdxJsxTextTagName); +                State::Retry(StateName::MdxJsxTextAttributeBefore) +            } else { +                crash( +                    tokenizer, +                    "after member name", +                    "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" +                ) +            } +        } +    } +} + +/// Local member name. +/// +/// ```markdown +/// > | a <b:c> d +///          ^ +/// ``` +pub fn local_name_before(tokenizer: &mut Tokenizer) -> State { +    // Start of a local name. +    if id_start(tokenizer.current) { +        tokenizer.enter(Name::MdxJsxTextTagNameLocal); +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextLocalName) +    } else { +        crash( +            tokenizer, +            "before local name", +            &format!( +                "a character that can start a name, such as a letter, `$`, or `_`{}", +                if matches!(tokenizer.current, Some(b'+' | b'/'..=b'9')) { +                    " (note: to create a link in MDX, use `[text](url)`)" +                } else { +                    "" +                } +            ), +        ) +    } +} + +/// In local name. +/// +/// ```markdown +/// > | a <b:cd> e +///           ^ +/// ``` +pub fn local_name(tokenizer: &mut Tokenizer) -> State { +    // End of local name (note that we don’t expect another colon, or a member). +    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +        || matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) +    { +        tokenizer.exit(Name::MdxJsxTextTagNameLocal); +        tokenizer.attempt(State::Next(StateName::MdxJsxTextLocalNameAfter), State::Nok); +        State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +    } +    // Continuation of name: remain. +    // To do: unicode. +    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextLocalName) +    } else { +        crash( +            tokenizer, +            "in local name", +            "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag" +        ) +    } +} + +/// After local name. +/// +/// This is like as `primary_name_after`, but we don’t expect colons or +/// periods. +/// +/// ```markdown +/// > | a <b.c> d +///           ^ +/// > | a <b.c.d> e +///           ^ +/// ``` +pub fn local_name_after(tokenizer: &mut Tokenizer) -> State { +    // End of name. +    if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) { +        tokenizer.exit(Name::MdxJsxTextTagName); +        State::Retry(StateName::MdxJsxTextAttributeBefore) +    } else { +        crash( +            tokenizer, +            "after local name", +            "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" +        ) +    } +} + +/// Before attribute. +/// +/// ```markdown +/// > | a <b /> c +///          ^ +/// > | a <b > c +///          ^ +/// > | a <b {...c}> d +///          ^ +/// > | a <b c> d +///          ^ +/// ``` +pub fn attribute_before(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Self-closing. +        Some(b'/') => { +            tokenizer.enter(Name::MdxJsxTextTagSelfClosingMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagSelfClosingMarker); +            tokenizer.attempt(State::Next(StateName::MdxJsxTextSelfClosing), State::Nok); +            State::Next(StateName::MdxJsxTextEsWhitespaceStart) +        } +        // End of tag. +        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), +        // Attribute expression. +        Some(b'{') => unreachable!("to do: attribute expression"), +        _ => { +            // Start of an attribute name. +            if id_start(tokenizer.current) { +                tokenizer.enter(Name::MdxJsxTextTagAttribute); +                tokenizer.enter(Name::MdxJsxTextTagAttributeName); +                tokenizer.enter(Name::MdxJsxTextTagAttributePrimaryName); +                tokenizer.consume(); +                State::Next(StateName::MdxJsxTextAttributePrimaryName) +            } else { +                crash( +                    tokenizer, +                    "before attribute name", +                    "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" +                ) +            } +        } +    } +} + +/// In primary attribute name. +/// +/// ```markdown +/// > | a <b cd/> e +///           ^ +/// > | a <b c:d> e +///           ^ +/// > | a <b c=d> e +///           ^ +/// ``` +pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State { +    // End of attribute name or tag. +    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +        || matches!(tokenizer.current, Some(b'/' | b':' | b'=' | b'>' | b'{')) +    { +        tokenizer.exit(Name::MdxJsxTextTagAttributePrimaryName); +        tokenizer.attempt( +            State::Next(StateName::MdxJsxTextAttributePrimaryNameAfter), +            State::Nok, +        ); +        State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +    } +    // Continuation of the attribute name: remain. +    // To do: unicode. +    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextLocalName) +    } else { +        crash( +            tokenizer, +            "in attribute name", +            "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag" +        ) +    } +} + +/// After primary attribute name. +/// +/// ```markdown +/// > | a <b c/> d +///           ^ +/// > | a <b c:d> e +///           ^ +/// > | a <b c=d> e +///           ^ +/// ``` +pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Start of a local name. +        Some(b':') => { +            tokenizer.enter(Name::MdxJsxTextTagAttributeNamePrefixMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagAttributeNamePrefixMarker); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextAttributeLocalNameBefore), +                State::Nok, +            ); +            State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +        } +        // Initializer: start of an attribute value. +        Some(b'=') => { +            tokenizer.exit(Name::MdxJsxTextTagAttributeName); +            tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextAttributeValueBefore), +                State::Nok, +            ); +            State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +        } +        _ => { +            // End of tag / new attribute. +            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +                == CharacterKind::Whitespace +                || matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) +                || id_start(tokenizer.current) +            { +                tokenizer.exit(Name::MdxJsxTextTagAttributeName); +                tokenizer.exit(Name::MdxJsxTextTagAttribute); +                tokenizer.attempt( +                    State::Next(StateName::MdxJsxTextAttributeBefore), +                    State::Nok, +                ); +                State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +            } else { +                crash( +                    tokenizer, +                    "after attribute name", +                    "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag" +                ) +            } +        } +    } +} + +/// Before local attribute name. +/// +/// ```markdown +/// > | a <b c:d/> e +///            ^ +/// ``` +pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State { +    // Start of a local name. +    if id_start(tokenizer.current) { +        tokenizer.enter(Name::MdxJsxTextTagAttributeNameLocal); +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextAttributeLocalName) +    } else { +        crash( +            tokenizer, +            "before local attribute name", +            "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag" +        ) +    } +} + +/// In local attribute name. +/// +/// ```markdown +/// > | a <b c:de/> f +///             ^ +/// > | a <b c:d=e/> f +///             ^ +/// ``` +pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State { +    // End of local name (note that we don’t expect another colon). +    if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace +        || matches!(tokenizer.current, Some(b'/' | b'=' | b'>' | b'{')) +    { +        tokenizer.exit(Name::MdxJsxTextTagAttributeNameLocal); +        tokenizer.exit(Name::MdxJsxTextTagAttributeName); +        tokenizer.attempt( +            State::Next(StateName::MdxJsxTextAttributeLocalNameAfter), +            State::Nok, +        ); +        State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +    } +    // Continuation of local name: remain. +    // To do: unicode. +    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextAttributeLocalName) +    } else { +        crash( +            tokenizer, +            "in local attribute name", +            "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag" +        ) +    } +} + +/// After local attribute name. +/// +/// ```markdown +/// > | a <b c:d/> f +///             ^ +/// > | a <b c:d=e/> f +///             ^ +/// ``` +pub fn attribute_local_name_after(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Start of an attribute value. +        Some(b'=') => { +            tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextAttributeValueBefore), +                State::Nok, +            ); +            State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +        } +        _ => { +            // End of name. +            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) +            { +                tokenizer.exit(Name::MdxJsxTextTagAttribute); +                State::Retry(StateName::MdxJsxTextAttributeBefore) +            } else { +                crash( +                    tokenizer, +                    "after local attribute name", +                    "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag" +                ) +            } +        } +    } +} + +/// After `=`, before value. +/// +/// ```markdown +/// > | a <b c="d"/> e +///            ^ +/// > | a <b c={d}/> e +///            ^ +/// ``` +pub fn attribute_value_before(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        // Start of double- or single quoted value. +        Some(b'"' | b'\'') => { +            tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); +            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteral); +            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker); +            State::Next(StateName::MdxJsxTextAttributeValueQuotedStart) +        } +        // Attribute value expression. +        Some(b'{') => unreachable!("to do: attribute value expression"), +        _ => crash( +            tokenizer, +            "before attribute value", +            &format!( +                "a character that can start an attribute value, such as `\"`, `'`, or `{{`{}", +                if tokenizer.current == Some(b'<') { +                    " (note: to use an element or fragment as a prop value in MDX, use `{<element />}`)" +                } else { +                    "" +                } +            ), +        ), +    } +} + +/// Before quoted literal attribute value. +/// +/// ```markdown +/// > | a <b c="d"/> e +///            ^ +/// ``` +pub fn attribute_value_quoted_start(tokenizer: &mut Tokenizer) -> State { +    if let Some(byte) = tokenizer.current { +        if byte == tokenizer.tokenize_state.marker { +            tokenizer.tokenize_state.marker = 0; +            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker); +            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteral); +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextAttributeValueBefore), +                State::Nok, +            ); +            State::Next(StateName::MdxJsxTextEsWhitespaceStart) +        } else if byte == b'\n' { +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextAttributeValueQuotedStart), +                State::Nok, +            ); +            State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +        } else { +            tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralValue); +            State::Retry(StateName::MdxJsxTextAttributeValueQuoted) +        } +    } else { +        crash( +            tokenizer, +            "in attribute value", +            &format!( +                "a corresponding closing quote {}", +                format_byte(tokenizer.tokenize_state.marker) +            ), +        ) +    } +} + +/// In quoted literal attribute value. +/// +/// ```markdown +/// > | a <b c="d"/> e +///             ^ +/// ``` +pub fn attribute_value_quoted(tokenizer: &mut Tokenizer) -> State { +    // To do: doesn’t this break for: +    // ```markdown +    // a <b c="d" +    // "f"> +    if tokenizer.current == Some(tokenizer.tokenize_state.marker) +        || matches!(tokenizer.current, None | Some(b'\n')) +    { +        tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralValue); +        State::Retry(StateName::MdxJsxTextAttributeValueQuoted) +    } else { +        tokenizer.consume(); +        State::Next(StateName::MdxJsxTextAttributeValueQuoted) +    } +} + +/// After self-closing slash. +/// +/// ```markdown +/// > | a <b/> c +///          ^ +/// ``` +pub fn self_closing(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), +        _ => crash( +            tokenizer, +            "after self-closing slash", +            &format!( +                "`>` to end the tag{}", +                if tokenizer.current == Some(b'*' | b'/') { +                    " (note: JS comments in JSX tags are not supported in MDX)" +                } else { +                    "" +                } +            ), +        ), +    } +} + +/// At final `>`. +/// +/// ```markdown +/// > | a <b> c +///         ^ +/// ``` +pub fn tag_end(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'>') => { +            tokenizer.enter(Name::MdxJsxTextTagMarker); +            tokenizer.consume(); +            tokenizer.exit(Name::MdxJsxTextTagMarker); +            tokenizer.exit(Name::MdxJsxTextTag); +            State::Ok +        } +        _ => unreachable!("expected `>`"), +    } +} + +/// Before optional ECMAScript whitespace. +/// +/// ```markdown +/// > | a <a b> c +///         ^ +/// ``` +pub fn es_whitespace_start(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'\n') => { +            // To do: check if this works for blank lines? +            tokenizer.attempt( +                State::Next(StateName::MdxJsxTextEsWhitespaceStart), +                State::Nok, +            ); +            State::Retry(space_or_tab_eol(tokenizer)) +        } +        _ => { +            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +                == CharacterKind::Whitespace +            { +                tokenizer.enter(Name::MdxJsxTextEsWhitespace); +                State::Retry(StateName::MdxJsxTextEsWhitespaceInside) +            } else { +                State::Ok +            } +        } +    } +} + +/// In ECMAScript whitespace. +/// +/// ```markdown +/// > | a <a  b> c +///          ^ +/// ``` +pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State { +    match tokenizer.current { +        Some(b'\n') => { +            tokenizer.exit(Name::MdxJsxTextEsWhitespace); +            State::Retry(StateName::MdxJsxTextEsWhitespaceStart) +        } +        // Allow continuation bytes. +        Some(0x80..=0xBF) => { +            tokenizer.consume(); +            State::Next(StateName::MdxJsxTextEsWhitespaceInside) +        } +        _ => { +            if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) +                == CharacterKind::Whitespace +            { +                tokenizer.consume(); +                State::Next(StateName::MdxJsxTextEsWhitespaceInside) +            } else { +                tokenizer.exit(Name::MdxJsxTextEsWhitespace); +                State::Ok +            } +        } +    } +} + +// To do: unicode. +fn id_start(code: Option<u8>) -> bool { +    matches!(code, Some(b'$' | b'_' | b'A'..=b'Z' | b'a'..=b'z')) +} + +// To do: unicode. +fn id_cont(code: Option<u8>) -> bool { +    matches!( +        code, +        Some(b'$' | b'_' | b'A'..=b'Z' | b'0'..=b'9' | b'a'..=b'z') +    ) +} + +fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> ! { +    // To do: externalize this, and the print mechanism in the tokenizer, +    // to one proper formatter. +    // To do: figure out how Rust does errors? +    let actual = match tokenizer.current { +        None => "end of file".to_string(), +        Some(byte) => format_byte(byte), +    }; + +    unreachable!( +        "{}:{}: Unexpected {} {}, expected {}", +        tokenizer.point.line, tokenizer.point.column, actual, at, expect +    ) +} + +fn format_byte(byte: u8) -> String { +    match byte { +        b'`' => "`` ` ``".to_string(), +        b' '..=b'~' => format!("`{}`", str::from_utf8(&[byte]).unwrap()), +        _ => format!("U+{:>04X}", byte), +    } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index de88174..d2843c3 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -30,14 +30,13 @@  //!  //! The following constructs are found in markdown (CommonMark):  //! -//! *   [attention (strong, emphasis)][attention] +//! *   [attention][attention] (strong, emphasis, extension: GFM strikethrough)  //! *   [autolink][]  //! *   [blank line][blank_line]  //! *   [block quote][block_quote]  //! *   [character escape][character_escape]  //! *   [character reference][character_reference]  //! *   [code (indented)][code_indented] -//! *   [code (text)][raw_text]  //! *   [definition][]  //! *   [hard break (escape)][hard_break_escape]  //! *   [heading (atx)][heading_atx] @@ -49,7 +48,8 @@  //! *   [label start (link)][label_start_link]  //! *   [list item][list_item]  //! *   [paragraph][] -//! *   [raw (flow)][raw_flow] (code (fenced), math (flow)) +//! *   [raw (flow)][raw_flow] (code (fenced), extensions: math (flow)) +//! *   [raw (text)][raw_text] (code (text), extensions: math (text))  //! *   [thematic break][thematic_break]  //!  //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by @@ -60,11 +60,10 @@  //! *   [frontmatter][]  //! *   [gfm autolink literal][gfm_autolink_literal]  //! *   [gfm footnote definition][gfm_footnote_definition] +//! *   [gfm label start footnote][gfm_label_start_footnote]  //! *   [gfm table][gfm_table]  //! *   [gfm task list item check][gfm_task_list_item_check] -//! *   [gfm label start footnote][gfm_label_start_footnote] -//! *   math (text) (in `raw_text`) -//! *   gfm strikethrough (in attention) +//! *   [mdx jsx (text)][mdx_jsx_text]  //!  //! There are also several small subroutines typically used in different places:  //! @@ -163,6 +162,7 @@ pub mod label_end;  pub mod label_start_image;  pub mod label_start_link;  pub mod list_item; +pub mod mdx_jsx_text;  pub mod paragraph;  pub mod partial_bom;  pub mod partial_data; diff --git a/src/construct/text.rs b/src/construct/text.rs index 0168d02..b59fe65 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -18,6 +18,7 @@  //! *   [Label start (image)][crate::construct::label_start_image]  //! *   [Label start (link)][crate::construct::label_start_link]  //! *   [Label end][crate::construct::label_end] +//! *   [MDX: JSX (text)][crate::construct::mdx_jsx_text]  //!  //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by  //! > [whitespace][crate::construct::partial_whitespace]. @@ -34,7 +35,7 @@ const MARKERS: [u8; 15] = [      b'$',  // `raw_text` (math (text))      b'&',  // `character_reference`      b'*',  // `attention` (emphasis, strong) -    b'<',  // `autolink`, `html_text` +    b'<',  // `autolink`, `html_text`, `mdx_jsx_text`      b'H',  // `gfm_autolink_literal` (`protocol` kind)      b'W',  // `gfm_autolink_literal` (`www.` kind)      b'[',  // `label_start_link` @@ -109,7 +110,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {              );              State::Retry(StateName::AttentionStart)          } -        // `autolink`, `html_text` (order does not matter) +        // `autolink`, `html_text` (order does not matter), `mdx_jsx_text` (order matters).          Some(b'<') => {              tokenizer.attempt(                  State::Next(StateName::TextBefore), @@ -167,11 +168,27 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {  pub fn before_html(tokenizer: &mut Tokenizer) -> State {      tokenizer.attempt(          State::Next(StateName::TextBefore), -        State::Next(StateName::TextBeforeData), +        State::Next(StateName::TextBeforeMdxJsx),      );      State::Retry(StateName::HtmlTextStart)  } +/// Before mdx jsx (text). +/// +/// At `<`, which wasn’t an autolink or html. +/// +/// ```markdown +/// > | a <b> +///       ^ +/// ``` +pub fn before_mdx_jsx(tokenizer: &mut Tokenizer) -> State { +    tokenizer.attempt( +        State::Next(StateName::TextBefore), +        State::Next(StateName::TextBeforeData), +    ); +    State::Retry(StateName::MdxJsxTextStart) +} +  /// Before hard break escape.  ///  /// At `\`, which wasn’t a character escape. diff --git a/src/event.rs b/src/event.rs index fad2c64..b476d45 100644 --- a/src/event.rs +++ b/src/event.rs @@ -2730,6 +2730,28 @@ pub enum Name {      ///     ^ ^ ^      /// ```      ThematicBreakSequence, + +    // To do: sort. +    MdxJsxTextTag, +    MdxJsxTextTagMarker,        // void +    MdxJsxTextEsWhitespace,     // void +    MdxJsxTextTagClosingMarker, // void +    MdxJsxTextTagName, +    MdxJsxTextTagNamePrimary,       // void? +    MdxJsxTextTagNameMemberMarker,  // void +    MdxJsxTextTagNamePrefixMarker,  // void +    MdxJsxTextTagNameMember,        // void +    MdxJsxTextTagNameLocal,         // void +    MdxJsxTextTagSelfClosingMarker, // void +    MdxJsxTextTagAttribute, +    MdxJsxTextTagAttributeName, +    MdxJsxTextTagAttributePrimaryName, +    MdxJsxTextTagAttributeNamePrefixMarker,  // void +    MdxJsxTextTagAttributeInitializerMarker, // void +    MdxJsxTextTagAttributeNameLocal,         // void +    MdxJsxTextTagAttributeValueLiteral, +    MdxJsxTextTagAttributeValueLiteralMarker, // void +    MdxJsxTextTagAttributeValueLiteralValue,  }  /// List of void events, used to make sure everything is working well. @@ -301,6 +301,13 @@ pub struct Constructs {      ///       ^^^      /// ```      pub math_text: bool, +    /// MDX: JSX (text). +    /// +    /// ```markdown +    /// > | a <Component /> c +    ///       ^^^^^^^^^^^^^ +    /// ``` +    pub mdx_jsx_text: bool,      /// Thematic break.      ///      /// ```markdown @@ -342,6 +349,7 @@ impl Default for Constructs {              list_item: true,              math_flow: false,              math_text: false, +            mdx_jsx_text: false,              thematic_break: true,          }      } @@ -350,6 +358,8 @@ impl Default for Constructs {  impl Constructs {      /// GFM.      /// +    /// <https://github.github.com/gfm/> +    ///      /// This turns on `CommonMark` + GFM.      #[must_use]      pub fn gfm() -> Self { @@ -363,6 +373,25 @@ impl Constructs {              ..Self::default()          }      } + +    /// MDX. +    /// +    /// <https://mdxjs.com> +    /// +    /// This turns on `CommonMark`, turns off some conflicting constructs +    /// (autolinks, code (indented), html), and turns on MDX (JSX, +    /// expressions, ESM). +    #[must_use] +    pub fn mdx() -> Self { +        Self { +            autolink: false, +            code_indented: false, +            html_flow: false, +            html_text: false, +            mdx_jsx_text: true, +            ..Self::default() +        } +    }  }  /// Configuration (optional). diff --git a/src/state.rs b/src/state.rs index d7c0c8a..3294a2f 100644 --- a/src/state.rs +++ b/src/state.rs @@ -358,6 +358,7 @@ pub enum Name {      TextStart,      TextBefore,      TextBeforeHtml, +    TextBeforeMdxJsx,      TextBeforeHardBreakEscape,      TextBeforeLabelStartLink,      TextBeforeData, @@ -374,12 +375,74 @@ pub enum Name {      TitleAtBlankLine,      TitleEscape,      TitleInside, + +    // To do: sort. +    MdxJsxTextEsWhitespaceStart, +    MdxJsxTextEsWhitespaceInside, +    MdxJsxTextStart, +    MdxJsxTextStartAfter, +    MdxJsxTextNameBefore, +    MdxJsxTextClosingTagNameBefore, +    MdxJsxTextTagEnd, +    MdxJsxTextPrimaryName, +    MdxJsxTextPrimaryNameAfter, +    MdxJsxTextMemberNameBefore, +    MdxJsxTextMemberName, +    MdxJsxTextMemberNameAfter, +    MdxJsxTextLocalNameBefore, +    MdxJsxTextLocalName, +    MdxJsxTextLocalNameAfter, +    MdxJsxTextAttributeBefore, +    MdxJsxTextSelfClosing, +    MdxJsxTextAttributePrimaryName, +    MdxJsxTextAttributePrimaryNameAfter, +    MdxJsxTextAttributeLocalNameBefore, +    MdxJsxTextAttributeLocalName, +    MdxJsxTextAttributeLocalNameAfter, +    MdxJsxTextAttributeValueBefore, +    MdxJsxTextAttributeValueQuotedStart, +    MdxJsxTextAttributeValueQuoted,  }  #[allow(clippy::too_many_lines)]  /// Call the corresponding state for a state name.  pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State {      let func = match name { +        // To do: sort. +        Name::MdxJsxTextEsWhitespaceStart => construct::mdx_jsx_text::es_whitespace_start, +        Name::MdxJsxTextEsWhitespaceInside => construct::mdx_jsx_text::es_whitespace_inside, +        Name::MdxJsxTextStart => construct::mdx_jsx_text::start, +        Name::MdxJsxTextStartAfter => construct::mdx_jsx_text::start_after, +        Name::MdxJsxTextNameBefore => construct::mdx_jsx_text::name_before, +        Name::MdxJsxTextClosingTagNameBefore => construct::mdx_jsx_text::closing_tag_name_before, +        Name::MdxJsxTextTagEnd => construct::mdx_jsx_text::tag_end, +        Name::MdxJsxTextPrimaryName => construct::mdx_jsx_text::primary_name, +        Name::MdxJsxTextPrimaryNameAfter => construct::mdx_jsx_text::primary_name_after, +        Name::MdxJsxTextMemberNameBefore => construct::mdx_jsx_text::member_name_before, +        Name::MdxJsxTextMemberName => construct::mdx_jsx_text::member_name, +        Name::MdxJsxTextMemberNameAfter => construct::mdx_jsx_text::member_name_after, +        Name::MdxJsxTextLocalNameBefore => construct::mdx_jsx_text::local_name_before, +        Name::MdxJsxTextLocalName => construct::mdx_jsx_text::local_name, +        Name::MdxJsxTextLocalNameAfter => construct::mdx_jsx_text::local_name_after, +        Name::MdxJsxTextAttributeBefore => construct::mdx_jsx_text::attribute_before, +        Name::MdxJsxTextSelfClosing => construct::mdx_jsx_text::self_closing, +        Name::MdxJsxTextAttributePrimaryName => construct::mdx_jsx_text::attribute_primary_name, +        Name::MdxJsxTextAttributePrimaryNameAfter => { +            construct::mdx_jsx_text::attribute_primary_name_after +        } +        Name::MdxJsxTextAttributeLocalNameBefore => { +            construct::mdx_jsx_text::attribute_local_name_before +        } +        Name::MdxJsxTextAttributeLocalName => construct::mdx_jsx_text::attribute_local_name, +        Name::MdxJsxTextAttributeLocalNameAfter => { +            construct::mdx_jsx_text::attribute_local_name_after +        } +        Name::MdxJsxTextAttributeValueBefore => construct::mdx_jsx_text::attribute_value_before, +        Name::MdxJsxTextAttributeValueQuotedStart => { +            construct::mdx_jsx_text::attribute_value_quoted_start +        } +        Name::MdxJsxTextAttributeValueQuoted => construct::mdx_jsx_text::attribute_value_quoted, +          Name::AttentionStart => construct::attention::start,          Name::AttentionInside => construct::attention::inside, @@ -776,6 +839,7 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State {          Name::TextStart => construct::text::start,          Name::TextBefore => construct::text::before,          Name::TextBeforeHtml => construct::text::before_html, +        Name::TextBeforeMdxJsx => construct::text::before_mdx_jsx,          Name::TextBeforeHardBreakEscape => construct::text::before_hard_break_escape,          Name::TextBeforeLabelStartLink => construct::text::before_label_start_link,          Name::TextBeforeData => construct::text::before_data, diff --git a/src/util/slice.rs b/src/util/slice.rs index d02a526..54524c3 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -1,7 +1,10 @@  //! Deal with bytes.  use crate::event::{Event, Kind, Point}; -use crate::util::constant::TAB_SIZE; +use crate::util::{ +    classify_character::{classify_opt, Kind as CharacterKind}, +    constant::TAB_SIZE, +};  use alloc::string::String;  use core::str; @@ -27,6 +30,26 @@ pub fn char_after_index(bytes: &[u8], index: usize) -> Option<char> {      String::from_utf8_lossy(&bytes[index..end]).chars().next()  } +/// Classify a byte (or `char`). +pub fn byte_to_kind(bytes: &[u8], index: usize) -> CharacterKind { +    if index == bytes.len() { +        CharacterKind::Whitespace +    } else { +        let byte = bytes[index]; +        if byte.is_ascii_whitespace() { +            CharacterKind::Whitespace +        } else if byte.is_ascii_punctuation() { +            CharacterKind::Punctuation +        } else if byte.is_ascii_alphanumeric() { +            CharacterKind::Other +        } else { +            // Otherwise: seems to be an ASCII control, so it seems to be a +            // non-ASCII `char`. +            classify_opt(char_after_index(bytes, index)) +        } +    } +} +  /// A range between two points.  #[derive(Debug)]  pub struct Position<'a> {  | 
