Add support for unicode identifiers in JSX

author: Titus Wormer <tituswormer@gmail.com> 2022-09-07 11:07:41 +0200
committer: Titus Wormer <tituswormer@gmail.com> 2022-09-07 11:07:41 +0200
commit: e6018e52ee6ad9a8f8a0672b75bf515faf74af1f (patch)
tree: ca556a799ed20ef2d9e0ae9109a9b7819da02c6c
parent: 6af582ee16d9c54c9719144caabc7705a324c40b (diff)
download: markdown-rs-e6018e52ee6ad9a8f8a0672b75bf515faf74af1f.tar.gz
markdown-rs-e6018e52ee6ad9a8f8a0672b75bf515faf74af1f.tar.bz2
markdown-rs-e6018e52ee6ad9a8f8a0672b75bf515faf74af1f.zip
2 files changed, 97 insertions, 24 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 9120430..53b2a62 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,8 @@ harness = false
 
 [dependencies]
 log = "0.4"
+unicode-id = { version = "0.3", features = ["no_std"] }
+
 
 [dev-dependencies]
 env_logger = "0.9"
diff --git a/src/construct/mdx_jsx_text.rs b/src/construct/mdx_jsx_text.rs
index 7a33499..deeb3e9 100644
--- a/src/construct/mdx_jsx_text.rs
+++ b/src/construct/mdx_jsx_text.rs
@@ -4,12 +4,16 @@ use crate::construct::partial_space_or_tab_eol::space_or_tab_eol;
 use crate::event::Name;
 use crate::state::{Name as StateName, State};
 use crate::tokenizer::Tokenizer;
-use crate::util::{classify_character::Kind as CharacterKind, slice::byte_to_kind};
+use crate::util::{
+    classify_character::Kind as CharacterKind,
+    slice::{byte_to_kind, char_after_index},
+};
 use alloc::{
     format,
     string::{String, ToString},
 };
 use core::str;
+use unicode_id::UnicodeID;
 
 /// Start of MDX: JSX (text).
 ///
@@ -73,7 +77,9 @@ pub fn name_before(tokenizer: &mut Tokenizer) -> State {
         Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
         _ => {
             // To do: unicode.
-            if id_start(tokenizer.current) {
+            let char_opt = char_after_index(tokenizer.parse_state.bytes, tokenizer.point.index);
+
+            if id_start(char_opt) {
                 tokenizer.enter(Name::MdxJsxTextTagName);
                 tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
                 tokenizer.consume();
@@ -111,7 +117,9 @@ pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State {
         // Start of a closing tag name.
         _ => {
             // To do: unicode.
-            if id_start(tokenizer.current) {
+            let char_opt = char_after_index(tokenizer.parse_state.bytes, tokenizer.point.index);
+
+            if id_start(char_opt) {
                 tokenizer.enter(Name::MdxJsxTextTagName);
                 tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
                 tokenizer.consume();
@@ -153,8 +161,14 @@ pub fn primary_name(tokenizer: &mut Tokenizer) -> State {
         State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
     }
     // Continuation of name: remain.
+    // Allow continuation bytes.
     // To do: unicode.
-    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+    else if matches!(tokenizer.current, Some(0x80..=0xBF))
+        || id_cont(char_after_index(
+            tokenizer.parse_state.bytes,
+            tokenizer.point.index,
+        ))
+    {
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextPrimaryName)
     } else {
@@ -207,7 +221,11 @@ pub fn primary_name_after(tokenizer: &mut Tokenizer) -> State {
         }
         // End of name.
         _ => {
-            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+                || id_start(char_after_index(
+                    tokenizer.parse_state.bytes,
+                    tokenizer.point.index,
+                ))
             {
                 tokenizer.exit(Name::MdxJsxTextTagName);
                 State::Retry(StateName::MdxJsxTextAttributeBefore)
@@ -230,7 +248,10 @@ pub fn primary_name_after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 pub fn member_name_before(tokenizer: &mut Tokenizer) -> State {
     // Start of a member name.
-    if id_start(tokenizer.current) {
+    if id_start(char_after_index(
+        tokenizer.parse_state.bytes,
+        tokenizer.point.index,
+    )) {
         tokenizer.enter(Name::MdxJsxTextTagNameMember);
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextMemberName)
@@ -264,7 +285,12 @@ pub fn member_name(tokenizer: &mut Tokenizer) -> State {
     }
     // Continuation of name: remain.
     // To do: unicode.
-    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+    else if matches!(tokenizer.current, Some(0x80..=0xBF))
+        || id_cont(char_after_index(
+            tokenizer.parse_state.bytes,
+            tokenizer.point.index,
+        ))
+    {
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextMemberName)
     } else {
@@ -306,7 +332,11 @@ pub fn member_name_after(tokenizer: &mut Tokenizer) -> State {
         }
         // End of name.
         _ => {
-            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+                || id_start(char_after_index(
+                    tokenizer.parse_state.bytes,
+                    tokenizer.point.index,
+                ))
             {
                 tokenizer.exit(Name::MdxJsxTextTagName);
                 State::Retry(StateName::MdxJsxTextAttributeBefore)
@@ -329,7 +359,10 @@ pub fn member_name_after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 pub fn local_name_before(tokenizer: &mut Tokenizer) -> State {
     // Start of a local name.
-    if id_start(tokenizer.current) {
+    if id_start(char_after_index(
+        tokenizer.parse_state.bytes,
+        tokenizer.point.index,
+    )) {
         tokenizer.enter(Name::MdxJsxTextTagNameLocal);
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextLocalName)
@@ -366,7 +399,12 @@ pub fn local_name(tokenizer: &mut Tokenizer) -> State {
     }
     // Continuation of name: remain.
     // To do: unicode.
-    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+    else if matches!(tokenizer.current, Some(0x80..=0xBF))
+        || id_cont(char_after_index(
+            tokenizer.parse_state.bytes,
+            tokenizer.point.index,
+        ))
+    {
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextLocalName)
     } else {
@@ -391,7 +429,12 @@ pub fn local_name(tokenizer: &mut Tokenizer) -> State {
 /// ```
 pub fn local_name_after(tokenizer: &mut Tokenizer) -> State {
     // End of name.
-    if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current) {
+    if matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+        || id_start(char_after_index(
+            tokenizer.parse_state.bytes,
+            tokenizer.point.index,
+        ))
+    {
         tokenizer.exit(Name::MdxJsxTextTagName);
         State::Retry(StateName::MdxJsxTextAttributeBefore)
     } else {
@@ -431,7 +474,10 @@ pub fn attribute_before(tokenizer: &mut Tokenizer) -> State {
         Some(b'{') => unreachable!("to do: attribute expression"),
         _ => {
             // Start of an attribute name.
-            if id_start(tokenizer.current) {
+            if id_start(char_after_index(
+                tokenizer.parse_state.bytes,
+                tokenizer.point.index,
+            )) {
                 tokenizer.enter(Name::MdxJsxTextTagAttribute);
                 tokenizer.enter(Name::MdxJsxTextTagAttributeName);
                 tokenizer.enter(Name::MdxJsxTextTagAttributePrimaryName);
@@ -472,7 +518,12 @@ pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State {
     }
     // Continuation of the attribute name: remain.
     // To do: unicode.
-    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+    else if matches!(tokenizer.current, Some(0x80..=0xBF))
+        || id_cont(char_after_index(
+            tokenizer.parse_state.bytes,
+            tokenizer.point.index,
+        ))
+    {
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextLocalName)
     } else {
@@ -524,7 +575,10 @@ pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State {
             if byte_to_kind(tokenizer.parse_state.bytes, tokenizer.point.index)
                 == CharacterKind::Whitespace
                 || matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
-                || id_start(tokenizer.current)
+                || id_start(char_after_index(
+                    tokenizer.parse_state.bytes,
+                    tokenizer.point.index,
+                ))
             {
                 tokenizer.exit(Name::MdxJsxTextTagAttributeName);
                 tokenizer.exit(Name::MdxJsxTextTagAttribute);
@@ -552,7 +606,10 @@ pub fn attribute_primary_name_after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 pub fn attribute_local_name_before(tokenizer: &mut Tokenizer) -> State {
     // Start of a local name.
-    if id_start(tokenizer.current) {
+    if id_start(char_after_index(
+        tokenizer.parse_state.bytes,
+        tokenizer.point.index,
+    )) {
         tokenizer.enter(Name::MdxJsxTextTagAttributeNameLocal);
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextAttributeLocalName)
@@ -588,7 +645,12 @@ pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State {
     }
     // Continuation of local name: remain.
     // To do: unicode.
-    else if matches!(tokenizer.current, Some(b'-')) || id_cont(tokenizer.current) {
+    else if matches!(tokenizer.current, Some(0x80..=0xBF))
+        || id_cont(char_after_index(
+            tokenizer.parse_state.bytes,
+            tokenizer.point.index,
+        ))
+    {
         tokenizer.consume();
         State::Next(StateName::MdxJsxTextAttributeLocalName)
     } else {
@@ -623,7 +685,11 @@ pub fn attribute_local_name_after(tokenizer: &mut Tokenizer) -> State {
         }
         _ => {
             // End of name.
-            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{')) || id_start(tokenizer.current)
+            if matches!(tokenizer.current, Some(b'/' | b'>' | b'{'))
+                || id_start(char_after_index(
+                    tokenizer.parse_state.bytes,
+                    tokenizer.point.index,
+                ))
             {
                 tokenizer.exit(Name::MdxJsxTextTagAttribute);
                 State::Retry(StateName::MdxJsxTextAttributeBefore)
@@ -841,16 +907,21 @@ pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State {
 }
 
 // To do: unicode.
-fn id_start(code: Option<u8>) -> bool {
-    matches!(code, Some(b'$' | b'_' | b'A'..=b'Z' | b'a'..=b'z'))
+fn id_start(code: Option<char>) -> bool {
+    if let Some(char) = code {
+        UnicodeID::is_id_start(char) || matches!(char, '$' | '_')
+    } else {
+        false
+    }
 }
 
 // To do: unicode.
-fn id_cont(code: Option<u8>) -> bool {
-    matches!(
-        code,
-        Some(b'$' | b'_' | b'A'..=b'Z' | b'0'..=b'9' | b'a'..=b'z')
-    )
+fn id_cont(code: Option<char>) -> bool {
+    if let Some(char) = code {
+        UnicodeID::is_id_continue(char) || matches!(char, '-' | '\u{200c}' | '\u{200d}')
+    } else {
+        false
+    }
 }
 
 fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> ! {
author	Titus Wormer <tituswormer@gmail.com>	2022-09-07 11:07:41 +0200
committer	Titus Wormer <tituswormer@gmail.com>	2022-09-07 11:07:41 +0200
commit	e6018e52ee6ad9a8f8a0672b75bf515faf74af1f (patch)
tree	ca556a799ed20ef2d9e0ae9109a9b7819da02c6c
parent	6af582ee16d9c54c9719144caabc7705a324c40b (diff)
download	markdown-rs-e6018e52ee6ad9a8f8a0672b75bf515faf74af1f.tar.gz markdown-rs-e6018e52ee6ad9a8f8a0672b75bf515faf74af1f.tar.bz2 markdown-rs-e6018e52ee6ad9a8f8a0672b75bf515faf74af1f.zip