From 6af582ee16d9c54c9719144caabc7705a324c40b Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Tue, 6 Sep 2022 18:30:40 +0200 Subject: Add initial states for MDX JSX (text) --- src/construct/gfm_autolink_literal.rs | 46 +- src/construct/mdx_jsx_text.rs | 877 ++++++++++++++++++++++++++++++++++ src/construct/mod.rs | 12 +- src/construct/text.rs | 23 +- src/event.rs | 22 + src/lib.rs | 29 ++ src/state.rs | 64 +++ src/util/slice.rs | 25 +- 8 files changed, 1050 insertions(+), 48 deletions(-) create mode 100644 src/construct/mdx_jsx_text.rs diff --git a/src/construct/gfm_autolink_literal.rs b/src/construct/gfm_autolink_literal.rs index 62f18ef..ae483a7 100644 --- a/src/construct/gfm_autolink_literal.rs +++ b/src/construct/gfm_autolink_literal.rs @@ -148,8 +148,8 @@ use crate::event::{Event, Kind, Name}; use crate::state::{Name as StateName, State}; use crate::tokenizer::Tokenizer; use crate::util::{ - classify_character::{classify_opt, Kind as CharacterKind}, - slice::{char_after_index, Position, Slice}, + classify_character::Kind as CharacterKind, + slice::{byte_to_kind, Position, Slice}, }; use alloc::vec::Vec; @@ -366,11 +366,8 @@ pub fn domain_inside(tokenizer: &mut Tokenizer) -> State { } _ => { // Source: . - if byte_to_kind( - tokenizer.parse_state.bytes, - tokenizer.point.index, - tokenizer.current, - ) == CharacterKind::Other + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Other { tokenizer.tokenize_state.seen = true; tokenizer.consume(); @@ -473,11 +470,8 @@ pub fn path_inside(tokenizer: &mut Tokenizer) -> State { } _ => { // Source: . - if byte_to_kind( - tokenizer.parse_state.bytes, - tokenizer.point.index, - tokenizer.current, - ) == CharacterKind::Whitespace + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace { State::Retry(StateName::GfmAutolinkLiteralPathAfter) } else { @@ -549,11 +543,8 @@ pub fn trail(tokenizer: &mut Tokenizer) -> State { } _ => { // Whitespace is the end of the URL, anything else is continuation. - if byte_to_kind( - tokenizer.parse_state.bytes, - tokenizer.point.index, - tokenizer.current, - ) == CharacterKind::Whitespace + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace { State::Ok } else { @@ -937,24 +928,3 @@ fn peek_bytes_truncate(bytes: &[u8], start: usize, mut end: usize) -> usize { split } - -/// Classify a byte (or `char`). -fn byte_to_kind(bytes: &[u8], index: usize, byte: Option) -> CharacterKind { - match byte { - None => CharacterKind::Whitespace, - Some(byte) => { - if byte.is_ascii_whitespace() { - CharacterKind::Whitespace - } else if byte.is_ascii_punctuation() { - CharacterKind::Punctuation - } else if byte.is_ascii_alphanumeric() { - CharacterKind::Other - } else { - // Otherwise: seems to be an ASCII control, so it seems to be a - // non-ASCII `char`. - let char = char_after_index(bytes, index); - classify_opt(char) - } - } - } -} diff --git a/src/construct/mdx_jsx_text.rs b/src/construct/mdx_jsx_text.rs new file mode 100644 index 0000000..7a33499 --- /dev/null +++ b/src/construct/mdx_jsx_text.rs @@ -0,0 +1,877 @@ +//! To do. + +use crate::construct::partial_space_or_tab_eol::space_or_tab_eol; +use crate::event::Name; +use crate::state::{Name as StateName, State}; +use crate::tokenizer::Tokenizer; +use crate::util::{classify_character::Kind as CharacterKind, slice::byte_to_kind}; +use alloc::{ + format, + string::{String, ToString}, +}; +use core::str; + +/// Start of MDX: JSX (text). +/// +/// ```markdown +/// > | a c +/// ^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.mdx_jsx_text { + tokenizer.enter(Name::MdxJsxTextTag); + tokenizer.enter(Name::MdxJsxTextTagMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagMarker); + State::Next(StateName::MdxJsxTextStartAfter) + } else { + State::Nok + } +} + +/// After `<`. +/// +/// ```markdown +/// > | a c +/// ^ +/// ``` +pub fn start_after(tokenizer: &mut Tokenizer) -> State { + // Deviate from JSX, which allows arbitrary whitespace. + // See: . + if let Some(b'\t' | b'\n' | b' ') = tokenizer.current { + State::Nok + } else { + tokenizer.attempt(State::Next(StateName::MdxJsxTextNameBefore), State::Nok); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } +} + +/// Before name, self slash, or end of tag for fragments. +/// +/// ```markdown +/// > | a c +/// ^ +/// > | a c +/// ^ +/// > | a <> b +/// ^ +/// ``` +pub fn name_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Closing tag. + Some(b'/') => { + tokenizer.enter(Name::MdxJsxTextTagClosingMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagClosingMarker); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextClosingTagNameBefore), + State::Nok, + ); + State::Next(StateName::MdxJsxTextEsWhitespaceStart) + } + // Fragment opening tag. + Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), + _ => { + // To do: unicode. + if id_start(tokenizer.current) { + tokenizer.enter(Name::MdxJsxTextTagName); + tokenizer.enter(Name::MdxJsxTextTagNamePrimary); + tokenizer.consume(); + State::Next(StateName::MdxJsxTextPrimaryName) + } else { + crash( + tokenizer, + "before name", + &format!( + "a character that can start a name, such as a letter, `$`, or `_`{}", + if tokenizer.current == Some(b'!') { + " (note: to create a comment in MDX, use `{/* text */}`)" + } else { + "" + } + ), + ) + } + } + } +} + +/// Before name of closing tag or end of closing fragment tag. +/// +/// ```markdown +/// > | a b +/// ^ +/// > | a c +/// ^ +/// ``` +pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Fragment closing tag. + Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), + // Start of a closing tag name. + _ => { + // To do: unicode. + if id_start(tokenizer.current) { + tokenizer.enter(Name::MdxJsxTextTagName); + tokenizer.enter(Name::MdxJsxTextTagNamePrimary); + tokenizer.consume(); + State::Next(StateName::MdxJsxTextPrimaryName) + } else { + crash( + tokenizer, + "before name", + &format!( + "a character that can start a name, such as a letter, `$`, or `_`{}", + if tokenizer.current == Some(b'*' | b'/') { + " (note: JS comments in JSX tags are not supported in MDX)" + } else { + "" + } + ), + ) + } + } + } +} + +/// In primary name. +/// +/// ```markdown +/// > | a d +/// ^ +/// ``` +pub fn primary_name(tokenizer: &mut Tokenizer) -> State { + // End of name. + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + || matches!(tokenizer.current, Some(b'.' | b'/' | b':' | b'>' | b'{')) + { + tokenizer.exit(Name::MdxJsxTextTagNamePrimary); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextPrimaryNameAfter), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + // Continuation of name: remain. + // To do: unicode. + else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextPrimaryName) + } else { + crash( + tokenizer, + "in name", + &format!( + "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}", + if tokenizer.current == Some(b'@') { + " (note: to create a link in MDX, use `[text](url)`)" + } else { + "" + } + ), + ) + } +} + +/// After primary name. +/// +/// ```markdown +/// > | a d +/// ^ +/// > | a d +/// ^ +/// ``` +pub fn primary_name_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Start of a member name. + Some(b'.') => { + tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextMemberNameBefore), + State::Nok, + ); + State::Next(StateName::MdxJsxTextEsWhitespaceStart) + } + // Start of a local name. + Some(b':') => { + tokenizer.enter(Name::MdxJsxTextTagNamePrefixMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagNamePrefixMarker); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextLocalNameBefore), + State::Nok, + ); + State::Next(StateName::MdxJsxTextEsWhitespaceStart) + } + // End of name. + _ => { + if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) + { + tokenizer.exit(Name::MdxJsxTextTagName); + State::Retry(StateName::MdxJsxTextAttributeBefore) + } else { + crash( + tokenizer, + "after name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" + ) + } + } + } +} + +/// Before member name. +/// +/// ```markdown +/// > | a d +/// ^ +/// ``` +pub fn member_name_before(tokenizer: &mut Tokenizer) -> State { + // Start of a member name. + if id_start(tokenizer.current) { + tokenizer.enter(Name::MdxJsxTextTagNameMember); + tokenizer.consume(); + State::Next(StateName::MdxJsxTextMemberName) + } else { + crash( + tokenizer, + "before member name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" + ) + } +} + +/// In member name. +/// +/// ```markdown +/// > | a e +/// ^ +/// ``` +pub fn member_name(tokenizer: &mut Tokenizer) -> State { + // End of name. + // Note: no `:` allowed here. + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + || matches!(tokenizer.current, Some(b'.' | b'/' | b'>' | b'{')) + { + tokenizer.exit(Name::MdxJsxTextTagNameMember); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextMemberNameAfter), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + // Continuation of name: remain. + // To do: unicode. + else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextMemberName) + } else { + crash( + tokenizer, + "in member name", + &format!( + "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag{}", + if tokenizer.current == Some(b'@') { + " (note: to create a link in MDX, use `[text](url)`)" + } else { + "" + } + ), + ) + } +} + +/// After member name. +/// +/// ```markdown +/// > | a d +/// ^ +/// > | a e +/// ^ +/// ``` +pub fn member_name_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Start of another member name. + Some(b'.') => { + tokenizer.enter(Name::MdxJsxTextTagNameMemberMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagNameMemberMarker); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextMemberNameBefore), + State::Nok, + ); + State::Next(StateName::MdxJsxTextEsWhitespaceStart) + } + // End of name. + _ => { + if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) + { + tokenizer.exit(Name::MdxJsxTextTagName); + State::Retry(StateName::MdxJsxTextAttributeBefore) + } else { + crash( + tokenizer, + "after member name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" + ) + } + } + } +} + +/// Local member name. +/// +/// ```markdown +/// > | a d +/// ^ +/// ``` +pub fn local_name_before(tokenizer: &mut Tokenizer) -> State { + // Start of a local name. + if id_start(tokenizer.current) { + tokenizer.enter(Name::MdxJsxTextTagNameLocal); + tokenizer.consume(); + State::Next(StateName::MdxJsxTextLocalName) + } else { + crash( + tokenizer, + "before local name", + &format!( + "a character that can start a name, such as a letter, `$`, or `_`{}", + if matches!(tokenizer.current, Some(b'+' | b'/'..=b'9')) { + " (note: to create a link in MDX, use `[text](url)`)" + } else { + "" + } + ), + ) + } +} + +/// In local name. +/// +/// ```markdown +/// > | a e +/// ^ +/// ``` +pub fn local_name(tokenizer: &mut Tokenizer) -> State { + // End of local name (note that we don’t expect another colon, or a member). + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + || matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) + { + tokenizer.exit(Name::MdxJsxTextTagNameLocal); + tokenizer.attempt(State::Next(StateName::MdxJsxTextLocalNameAfter), State::Nok); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + // Continuation of name: remain. + // To do: unicode. + else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextLocalName) + } else { + crash( + tokenizer, + "in local name", + "a name character such as letters, digits, `$`, or `_`; whitespace before attributes; or the end of the tag" + ) + } +} + +/// After local name. +/// +/// This is like as `primary_name_after`, but we don’t expect colons or +/// periods. +/// +/// ```markdown +/// > | a d +/// ^ +/// > | a e +/// ^ +/// ``` +pub fn local_name_after(tokenizer: &mut Tokenizer) -> State { + // End of name. + if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) { + tokenizer.exit(Name::MdxJsxTextTagName); + State::Retry(StateName::MdxJsxTextAttributeBefore) + } else { + crash( + tokenizer, + "after local name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" + ) + } +} + +/// Before attribute. +/// +/// ```markdown +/// > | a c +/// ^ +/// > | a c +/// ^ +/// > | a d +/// ^ +/// > | a d +/// ^ +/// ``` +pub fn attribute_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Self-closing. + Some(b'/') => { + tokenizer.enter(Name::MdxJsxTextTagSelfClosingMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagSelfClosingMarker); + tokenizer.attempt(State::Next(StateName::MdxJsxTextSelfClosing), State::Nok); + State::Next(StateName::MdxJsxTextEsWhitespaceStart) + } + // End of tag. + Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), + // Attribute expression. + Some(b'{') => unreachable!("to do: attribute expression"), + _ => { + // Start of an attribute name. + if id_start(tokenizer.current) { + tokenizer.enter(Name::MdxJsxTextTagAttribute); + tokenizer.enter(Name::MdxJsxTextTagAttributeName); + tokenizer.enter(Name::MdxJsxTextTagAttributePrimaryName); + tokenizer.consume(); + State::Next(StateName::MdxJsxTextAttributePrimaryName) + } else { + crash( + tokenizer, + "before attribute name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; whitespace before attributes; or the end of the tag" + ) + } + } + } +} + +/// In primary attribute name. +/// +/// ```markdown +/// > | a e +/// ^ +/// > | a e +/// ^ +/// > | a e +/// ^ +/// ``` +pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State { + // End of attribute name or tag. + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + || matches!(tokenizer.current, Some(b'/' | b':' | b'=' | b'>' | b'{')) + { + tokenizer.exit(Name::MdxJsxTextTagAttributePrimaryName); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributePrimaryNameAfter), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + // Continuation of the attribute name: remain. + // To do: unicode. + else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextLocalName) + } else { + crash( + tokenizer, + "in attribute name", + "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag" + ) + } +} + +/// After primary attribute name. +/// +/// ```markdown +/// > | a d +/// ^ +/// > | a e +/// ^ +/// > | a e +/// ^ +/// ``` +pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Start of a local name. + Some(b':') => { + tokenizer.enter(Name::MdxJsxTextTagAttributeNamePrefixMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagAttributeNamePrefixMarker); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributeLocalNameBefore), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + // Initializer: start of an attribute value. + Some(b'=') => { + tokenizer.exit(Name::MdxJsxTextTagAttributeName); + tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributeValueBefore), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + _ => { + // End of tag / new attribute. + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace + || matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) + || id_start(tokenizer.current) + { + tokenizer.exit(Name::MdxJsxTextTagAttributeName); + tokenizer.exit(Name::MdxJsxTextTagAttribute); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributeBefore), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } else { + crash( + tokenizer, + "after attribute name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag" + ) + } + } + } +} + +/// Before local attribute name. +/// +/// ```markdown +/// > | a e +/// ^ +/// ``` +pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State { + // Start of a local name. + if id_start(tokenizer.current) { + tokenizer.enter(Name::MdxJsxTextTagAttributeNameLocal); + tokenizer.consume(); + State::Next(StateName::MdxJsxTextAttributeLocalName) + } else { + crash( + tokenizer, + "before local attribute name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag" + ) + } +} + +/// In local attribute name. +/// +/// ```markdown +/// > | a f +/// ^ +/// > | a f +/// ^ +/// ``` +pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State { + // End of local name (note that we don’t expect another colon). + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) == CharacterKind::Whitespace + || matches!(tokenizer.current, Some(b'/' | b'=' | b'>' | b'{')) + { + tokenizer.exit(Name::MdxJsxTextTagAttributeNameLocal); + tokenizer.exit(Name::MdxJsxTextTagAttributeName); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributeLocalNameAfter), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + // Continuation of local name: remain. + // To do: unicode. + else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextAttributeLocalName) + } else { + crash( + tokenizer, + "in local attribute name", + "an attribute name character such as letters, digits, `$`, or `_`; `=` to initialize a value; whitespace before attributes; or the end of the tag" + ) + } +} + +/// After local attribute name. +/// +/// ```markdown +/// > | a f +/// ^ +/// > | a f +/// ^ +/// ``` +pub fn attribute_local_name_after(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Start of an attribute value. + Some(b'=') => { + tokenizer.enter(Name::MdxJsxTextTagAttributeInitializerMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagAttributeInitializerMarker); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributeValueBefore), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + _ => { + // End of name. + if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) + { + tokenizer.exit(Name::MdxJsxTextTagAttribute); + State::Retry(StateName::MdxJsxTextAttributeBefore) + } else { + crash( + tokenizer, + "after local attribute name", + "a character that can start an attribute name, such as a letter, `$`, or `_`; `=` to initialize a value; or the end of the tag" + ) + } + } + } +} + +/// After `=`, before value. +/// +/// ```markdown +/// > | a e +/// ^ +/// > | a e +/// ^ +/// ``` +pub fn attribute_value_before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + // Start of double- or single quoted value. + Some(b'"' | b'\'') => { + tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); + tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteral); + tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker); + State::Next(StateName::MdxJsxTextAttributeValueQuotedStart) + } + // Attribute value expression. + Some(b'{') => unreachable!("to do: attribute value expression"), + _ => crash( + tokenizer, + "before attribute value", + &format!( + "a character that can start an attribute value, such as `\"`, `'`, or `{{`{}", + if tokenizer.current == Some(b'<') { + " (note: to use an element or fragment as a prop value in MDX, use `{}`)" + } else { + "" + } + ), + ), + } +} + +/// Before quoted literal attribute value. +/// +/// ```markdown +/// > | a e +/// ^ +/// ``` +pub fn attribute_value_quoted_start(tokenizer: &mut Tokenizer) -> State { + if let Some(byte) = tokenizer.current { + if byte == tokenizer.tokenize_state.marker { + tokenizer.tokenize_state.marker = 0; + tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralMarker); + tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteral); + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributeValueBefore), + State::Nok, + ); + State::Next(StateName::MdxJsxTextEsWhitespaceStart) + } else if byte == b'\n' { + tokenizer.attempt( + State::Next(StateName::MdxJsxTextAttributeValueQuotedStart), + State::Nok, + ); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } else { + tokenizer.enter(Name::MdxJsxTextTagAttributeValueLiteralValue); + State::Retry(StateName::MdxJsxTextAttributeValueQuoted) + } + } else { + crash( + tokenizer, + "in attribute value", + &format!( + "a corresponding closing quote {}", + format_byte(tokenizer.tokenize_state.marker) + ), + ) + } +} + +/// In quoted literal attribute value. +/// +/// ```markdown +/// > | a e +/// ^ +/// ``` +pub fn attribute_value_quoted(tokenizer: &mut Tokenizer) -> State { + // To do: doesn’t this break for: + // ```markdown + // a + if tokenizer.current == Some(tokenizer.tokenize_state.marker) + || matches!(tokenizer.current, None | Some(b'\n')) + { + tokenizer.exit(Name::MdxJsxTextTagAttributeValueLiteralValue); + State::Retry(StateName::MdxJsxTextAttributeValueQuoted) + } else { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextAttributeValueQuoted) + } +} + +/// After self-closing slash. +/// +/// ```markdown +/// > | a c +/// ^ +/// ``` +pub fn self_closing(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd), + _ => crash( + tokenizer, + "after self-closing slash", + &format!( + "`>` to end the tag{}", + if tokenizer.current == Some(b'*' | b'/') { + " (note: JS comments in JSX tags are not supported in MDX)" + } else { + "" + } + ), + ), + } +} + +/// At final `>`. +/// +/// ```markdown +/// > | a c +/// ^ +/// ``` +pub fn tag_end(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'>') => { + tokenizer.enter(Name::MdxJsxTextTagMarker); + tokenizer.consume(); + tokenizer.exit(Name::MdxJsxTextTagMarker); + tokenizer.exit(Name::MdxJsxTextTag); + State::Ok + } + _ => unreachable!("expected `>`"), + } +} + +/// Before optional ECMAScript whitespace. +/// +/// ```markdown +/// > | a c +/// ^ +/// ``` +pub fn es_whitespace_start(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\n') => { + // To do: check if this works for blank lines? + tokenizer.attempt( + State::Next(StateName::MdxJsxTextEsWhitespaceStart), + State::Nok, + ); + State::Retry(space_or_tab_eol(tokenizer)) + } + _ => { + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace + { + tokenizer.enter(Name::MdxJsxTextEsWhitespace); + State::Retry(StateName::MdxJsxTextEsWhitespaceInside) + } else { + State::Ok + } + } + } +} + +/// In ECMAScript whitespace. +/// +/// ```markdown +/// > | a c +/// ^ +/// ``` +pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(b'\n') => { + tokenizer.exit(Name::MdxJsxTextEsWhitespace); + State::Retry(StateName::MdxJsxTextEsWhitespaceStart) + } + // Allow continuation bytes. + Some(0x80..=0xBF) => { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextEsWhitespaceInside) + } + _ => { + if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index) + == CharacterKind::Whitespace + { + tokenizer.consume(); + State::Next(StateName::MdxJsxTextEsWhitespaceInside) + } else { + tokenizer.exit(Name::MdxJsxTextEsWhitespace); + State::Ok + } + } + } +} + +// To do: unicode. +fn id_start(code: Option) -> bool { + matches!(code, Some(b'$' | b'_' | b'A'..=b'Z' | b'a'..=b'z')) +} + +// To do: unicode. +fn id_cont(code: Option) -> bool { + matches!( + code, + Some(b'$' | b'_' | b'A'..=b'Z' | b'0'..=b'9' | b'a'..=b'z') + ) +} + +fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> ! { + // To do: externalize this, and the print mechanism in the tokenizer, + // to one proper formatter. + // To do: figure out how Rust does errors? + let actual = match tokenizer.current { + None => "end of file".to_string(), + Some(byte) => format_byte(byte), + }; + + unreachable!( + "{}:{}: Unexpected {} {}, expected {}", + tokenizer.point.line, tokenizer.point.column, actual, at, expect + ) +} + +fn format_byte(byte: u8) -> String { + match byte { + b'`' => "`` ` ``".to_string(), + b' '..=b'~' => format!("`{}`", str::from_utf8(&[byte]).unwrap()), + _ => format!("U+{:>04X}", byte), + } +} diff --git a/src/construct/mod.rs b/src/construct/mod.rs index de88174..d2843c3 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -30,14 +30,13 @@ //! //! The following constructs are found in markdown (CommonMark): //! -//! * [attention (strong, emphasis)][attention] +//! * [attention][attention] (strong, emphasis, extension: GFM strikethrough) //! * [autolink][] //! * [blank line][blank_line] //! * [block quote][block_quote] //! * [character escape][character_escape] //! * [character reference][character_reference] //! * [code (indented)][code_indented] -//! * [code (text)][raw_text] //! * [definition][] //! * [hard break (escape)][hard_break_escape] //! * [heading (atx)][heading_atx] @@ -49,7 +48,8 @@ //! * [label start (link)][label_start_link] //! * [list item][list_item] //! * [paragraph][] -//! * [raw (flow)][raw_flow] (code (fenced), math (flow)) +//! * [raw (flow)][raw_flow] (code (fenced), extensions: math (flow)) +//! * [raw (text)][raw_text] (code (text), extensions: math (text)) //! * [thematic break][thematic_break] //! //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by @@ -60,11 +60,10 @@ //! * [frontmatter][] //! * [gfm autolink literal][gfm_autolink_literal] //! * [gfm footnote definition][gfm_footnote_definition] +//! * [gfm label start footnote][gfm_label_start_footnote] //! * [gfm table][gfm_table] //! * [gfm task list item check][gfm_task_list_item_check] -//! * [gfm label start footnote][gfm_label_start_footnote] -//! * math (text) (in `raw_text`) -//! * gfm strikethrough (in attention) +//! * [mdx jsx (text)][mdx_jsx_text] //! //! There are also several small subroutines typically used in different places: //! @@ -163,6 +162,7 @@ pub mod label_end; pub mod label_start_image; pub mod label_start_link; pub mod list_item; +pub mod mdx_jsx_text; pub mod paragraph; pub mod partial_bom; pub mod partial_data; diff --git a/src/construct/text.rs b/src/construct/text.rs index 0168d02..b59fe65 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -18,6 +18,7 @@ //! * [Label start (image)][crate::construct::label_start_image] //! * [Label start (link)][crate::construct::label_start_link] //! * [Label end][crate::construct::label_end] +//! * [MDX: JSX (text)][crate::construct::mdx_jsx_text] //! //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by //! > [whitespace][crate::construct::partial_whitespace]. @@ -34,7 +35,7 @@ const MARKERS: [u8; 15] = [ b'$', // `raw_text` (math (text)) b'&', // `character_reference` b'*', // `attention` (emphasis, strong) - b'<', // `autolink`, `html_text` + b'<', // `autolink`, `html_text`, `mdx_jsx_text` b'H', // `gfm_autolink_literal` (`protocol` kind) b'W', // `gfm_autolink_literal` (`www.` kind) b'[', // `label_start_link` @@ -109,7 +110,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { ); State::Retry(StateName::AttentionStart) } - // `autolink`, `html_text` (order does not matter) + // `autolink`, `html_text` (order does not matter), `mdx_jsx_text` (order matters). Some(b'<') => { tokenizer.attempt( State::Next(StateName::TextBefore), @@ -167,11 +168,27 @@ pub fn before(tokenizer: &mut Tokenizer) -> State { pub fn before_html(tokenizer: &mut Tokenizer) -> State { tokenizer.attempt( State::Next(StateName::TextBefore), - State::Next(StateName::TextBeforeData), + State::Next(StateName::TextBeforeMdxJsx), ); State::Retry(StateName::HtmlTextStart) } +/// Before mdx jsx (text). +/// +/// At `<`, which wasn’t an autolink or html. +/// +/// ```markdown +/// > | a +/// ^ +/// ``` +pub fn before_mdx_jsx(tokenizer: &mut Tokenizer) -> State { + tokenizer.attempt( + State::Next(StateName::TextBefore), + State::Next(StateName::TextBeforeData), + ); + State::Retry(StateName::MdxJsxTextStart) +} + /// Before hard break escape. /// /// At `\`, which wasn’t a character escape. diff --git a/src/event.rs b/src/event.rs index fad2c64..b476d45 100644 --- a/src/event.rs +++ b/src/event.rs @@ -2730,6 +2730,28 @@ pub enum Name { /// ^ ^ ^ /// ``` ThematicBreakSequence, + + // To do: sort. + MdxJsxTextTag, + MdxJsxTextTagMarker, // void + MdxJsxTextEsWhitespace, // void + MdxJsxTextTagClosingMarker, // void + MdxJsxTextTagName, + MdxJsxTextTagNamePrimary, // void? + MdxJsxTextTagNameMemberMarker, // void + MdxJsxTextTagNamePrefixMarker, // void + MdxJsxTextTagNameMember, // void + MdxJsxTextTagNameLocal, // void + MdxJsxTextTagSelfClosingMarker, // void + MdxJsxTextTagAttribute, + MdxJsxTextTagAttributeName, + MdxJsxTextTagAttributePrimaryName, + MdxJsxTextTagAttributeNamePrefixMarker, // void + MdxJsxTextTagAttributeInitializerMarker, // void + MdxJsxTextTagAttributeNameLocal, // void + MdxJsxTextTagAttributeValueLiteral, + MdxJsxTextTagAttributeValueLiteralMarker, // void + MdxJsxTextTagAttributeValueLiteralValue, } /// List of void events, used to make sure everything is working well. diff --git a/src/lib.rs b/src/lib.rs index e3fdfcb..7fd705b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -301,6 +301,13 @@ pub struct Constructs { /// ^^^ /// ``` pub math_text: bool, + /// MDX: JSX (text). + /// + /// ```markdown + /// > | a c + /// ^^^^^^^^^^^^^ + /// ``` + pub mdx_jsx_text: bool, /// Thematic break. /// /// ```markdown @@ -342,6 +349,7 @@ impl Default for Constructs { list_item: true, math_flow: false, math_text: false, + mdx_jsx_text: false, thematic_break: true, } } @@ -350,6 +358,8 @@ impl Default for Constructs { impl Constructs { /// GFM. /// + /// + /// /// This turns on `CommonMark` + GFM. #[must_use] pub fn gfm() -> Self { @@ -363,6 +373,25 @@ impl Constructs { ..Self::default() } } + + /// MDX. + /// + /// + /// + /// This turns on `CommonMark`, turns off some conflicting constructs + /// (autolinks, code (indented), html), and turns on MDX (JSX, + /// expressions, ESM). + #[must_use] + pub fn mdx() -> Self { + Self { + autolink: false, + code_indented: false, + html_flow: false, + html_text: false, + mdx_jsx_text: true, + ..Self::default() + } + } } /// Configuration (optional). diff --git a/src/state.rs b/src/state.rs index d7c0c8a..3294a2f 100644 --- a/src/state.rs +++ b/src/state.rs @@ -358,6 +358,7 @@ pub enum Name { TextStart, TextBefore, TextBeforeHtml, + TextBeforeMdxJsx, TextBeforeHardBreakEscape, TextBeforeLabelStartLink, TextBeforeData, @@ -374,12 +375,74 @@ pub enum Name { TitleAtBlankLine, TitleEscape, TitleInside, + + // To do: sort. + MdxJsxTextEsWhitespaceStart, + MdxJsxTextEsWhitespaceInside, + MdxJsxTextStart, + MdxJsxTextStartAfter, + MdxJsxTextNameBefore, + MdxJsxTextClosingTagNameBefore, + MdxJsxTextTagEnd, + MdxJsxTextPrimaryName, + MdxJsxTextPrimaryNameAfter, + MdxJsxTextMemberNameBefore, + MdxJsxTextMemberName, + MdxJsxTextMemberNameAfter, + MdxJsxTextLocalNameBefore, + MdxJsxTextLocalName, + MdxJsxTextLocalNameAfter, + MdxJsxTextAttributeBefore, + MdxJsxTextSelfClosing, + MdxJsxTextAttributePrimaryName, + MdxJsxTextAttributePrimaryNameAfter, + MdxJsxTextAttributeLocalNameBefore, + MdxJsxTextAttributeLocalName, + MdxJsxTextAttributeLocalNameAfter, + MdxJsxTextAttributeValueBefore, + MdxJsxTextAttributeValueQuotedStart, + MdxJsxTextAttributeValueQuoted, } #[allow(clippy::too_many_lines)] /// Call the corresponding state for a state name. pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { let func = match name { + // To do: sort. + Name::MdxJsxTextEsWhitespaceStart => construct::mdx_jsx_text::es_whitespace_start, + Name::MdxJsxTextEsWhitespaceInside => construct::mdx_jsx_text::es_whitespace_inside, + Name::MdxJsxTextStart => construct::mdx_jsx_text::start, + Name::MdxJsxTextStartAfter => construct::mdx_jsx_text::start_after, + Name::MdxJsxTextNameBefore => construct::mdx_jsx_text::name_before, + Name::MdxJsxTextClosingTagNameBefore => construct::mdx_jsx_text::closing_tag_name_before, + Name::MdxJsxTextTagEnd => construct::mdx_jsx_text::tag_end, + Name::MdxJsxTextPrimaryName => construct::mdx_jsx_text::primary_name, + Name::MdxJsxTextPrimaryNameAfter => construct::mdx_jsx_text::primary_name_after, + Name::MdxJsxTextMemberNameBefore => construct::mdx_jsx_text::member_name_before, + Name::MdxJsxTextMemberName => construct::mdx_jsx_text::member_name, + Name::MdxJsxTextMemberNameAfter => construct::mdx_jsx_text::member_name_after, + Name::MdxJsxTextLocalNameBefore => construct::mdx_jsx_text::local_name_before, + Name::MdxJsxTextLocalName => construct::mdx_jsx_text::local_name, + Name::MdxJsxTextLocalNameAfter => construct::mdx_jsx_text::local_name_after, + Name::MdxJsxTextAttributeBefore => construct::mdx_jsx_text::attribute_before, + Name::MdxJsxTextSelfClosing => construct::mdx_jsx_text::self_closing, + Name::MdxJsxTextAttributePrimaryName => construct::mdx_jsx_text::attribute_primary_name, + Name::MdxJsxTextAttributePrimaryNameAfter => { + construct::mdx_jsx_text::attribute_primary_name_after + } + Name::MdxJsxTextAttributeLocalNameBefore => { + construct::mdx_jsx_text::attribute_local_name_before + } + Name::MdxJsxTextAttributeLocalName => construct::mdx_jsx_text::attribute_local_name, + Name::MdxJsxTextAttributeLocalNameAfter => { + construct::mdx_jsx_text::attribute_local_name_after + } + Name::MdxJsxTextAttributeValueBefore => construct::mdx_jsx_text::attribute_value_before, + Name::MdxJsxTextAttributeValueQuotedStart => { + construct::mdx_jsx_text::attribute_value_quoted_start + } + Name::MdxJsxTextAttributeValueQuoted => construct::mdx_jsx_text::attribute_value_quoted, + Name::AttentionStart => construct::attention::start, Name::AttentionInside => construct::attention::inside, @@ -776,6 +839,7 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State { Name::TextStart => construct::text::start, Name::TextBefore => construct::text::before, Name::TextBeforeHtml => construct::text::before_html, + Name::TextBeforeMdxJsx => construct::text::before_mdx_jsx, Name::TextBeforeHardBreakEscape => construct::text::before_hard_break_escape, Name::TextBeforeLabelStartLink => construct::text::before_label_start_link, Name::TextBeforeData => construct::text::before_data, diff --git a/src/util/slice.rs b/src/util/slice.rs index d02a526..54524c3 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -1,7 +1,10 @@ //! Deal with bytes. use crate::event::{Event, Kind, Point}; -use crate::util::constant::TAB_SIZE; +use crate::util::{ + classify_character::{classify_opt, Kind as CharacterKind}, + constant::TAB_SIZE, +}; use alloc::string::String; use core::str; @@ -27,6 +30,26 @@ pub fn char_after_index(bytes: &[u8], index: usize) -> Option { String::from_utf8_lossy(&bytes[index..end]).chars().next() } +/// Classify a byte (or `char`). +pub fn byte_to_kind(bytes: &[u8], index: usize) -> CharacterKind { + if index == bytes.len() { + CharacterKind::Whitespace + } else { + let byte = bytes[index]; + if byte.is_ascii_whitespace() { + CharacterKind::Whitespace + } else if byte.is_ascii_punctuation() { + CharacterKind::Punctuation + } else if byte.is_ascii_alphanumeric() { + CharacterKind::Other + } else { + // Otherwise: seems to be an ASCII control, so it seems to be a + // non-ASCII `char`. + classify_opt(char_after_index(bytes, index)) + } + } +} + /// A range between two points. #[derive(Debug)] pub struct Position<'a> { -- cgit