aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-09-07 15:53:06 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-09-07 15:53:06 +0200
commit1d92666865b35341e076efbefddf6e73b5e1542e (patch)
tree11c05985ec7679f73473e7ea2c769465698e2f08 /src/construct
parente6018e52ee6ad9a8f8a0672b75bf515faf74af1f (diff)
downloadmarkdown-rs-1d92666865b35341e076efbefddf6e73b5e1542e.tar.gz
markdown-rs-1d92666865b35341e076efbefddf6e73b5e1542e.tar.bz2
markdown-rs-1d92666865b35341e076efbefddf6e73b5e1542e.zip
Add support for recoverable syntax errors
Diffstat (limited to 'src/construct')
-rw-r--r--src/construct/document.rs21
-rw-r--r--src/construct/mdx_jsx_text.rs88
2 files changed, 56 insertions, 53 deletions
diff --git a/src/construct/document.rs b/src/construct/document.rs
index e31e58d..57c5f3a 100644
--- a/src/construct/document.rs
+++ b/src/construct/document.rs
@@ -14,7 +14,7 @@ use crate::state::{Name as StateName, State};
use crate::subtokenize::divide_events;
use crate::tokenizer::{Container, ContainerState, Tokenizer};
use crate::util::skip;
-use alloc::{boxed::Box, vec::Vec};
+use alloc::{boxed::Box, string::String, vec::Vec};
/// Phases where we can exit containers.
#[derive(Debug, PartialEq)]
@@ -266,7 +266,9 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
if tokenizer.tokenize_state.document_continued
!= tokenizer.tokenize_state.document_container_stack.len()
{
- exit_containers(tokenizer, &Phase::Prefix);
+ if let Err(message) = exit_containers(tokenizer, &Phase::Prefix) {
+ return State::Error(message);
+ }
}
// We are “piercing” into the flow with a new container.
@@ -361,6 +363,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
let state = tokenizer
.tokenize_state
.document_child_state
+ .take()
.unwrap_or(State::Next(StateName::FlowStart));
tokenizer.tokenize_state.document_exits.push(None);
@@ -439,13 +442,17 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
if tokenizer.tokenize_state.document_continued
!= tokenizer.tokenize_state.document_container_stack.len()
{
- exit_containers(tokenizer, &Phase::After);
+ if let Err(message) = exit_containers(tokenizer, &Phase::After) {
+ return State::Error(message);
+ }
}
match tokenizer.current {
None => {
tokenizer.tokenize_state.document_continued = 0;
- exit_containers(tokenizer, &Phase::Eof);
+ if let Err(message) = exit_containers(tokenizer, &Phase::Eof) {
+ return State::Error(message);
+ }
resolve(tokenizer);
State::Ok
}
@@ -461,7 +468,7 @@ pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
}
/// Close containers (and flow if needed).
-fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
+fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) -> Result<(), String> {
let mut stack_close = tokenizer
.tokenize_state
.document_container_stack
@@ -477,7 +484,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
.take()
.unwrap_or(State::Next(StateName::FlowStart));
- child.flush(state, false);
+ child.flush(state, false)?;
}
if !stack_close.is_empty() {
@@ -524,6 +531,8 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
}
child.interrupt = false;
+
+ Ok(())
}
// Inject everything together.
diff --git a/src/construct/mdx_jsx_text.rs b/src/construct/mdx_jsx_text.rs
index deeb3e9..4c71fec 100644
--- a/src/construct/mdx_jsx_text.rs
+++ b/src/construct/mdx_jsx_text.rs
@@ -76,10 +76,10 @@ pub fn name_before(tokenizer: &mut Tokenizer) -> State {
// Fragment opening tag.
Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
_ => {
- // To do: unicode.
- let char_opt = char_after_index(tokenizer.parse_state.bytes, tokenizer.point.index);
-
- if id_start(char_opt) {
+ if id_start(char_after_index(
+ tokenizer.parse_state.bytes,
+ tokenizer.point.index,
+ )) {
tokenizer.enter(Name::MdxJsxTextTagName);
tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
tokenizer.consume();
@@ -111,34 +111,32 @@ pub fn name_before(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
pub fn closing_tag_name_before(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- // Fragment closing tag.
- Some(b'>') => State::Retry(StateName::MdxJsxTextTagEnd),
- // Start of a closing tag name.
- _ => {
- // To do: unicode.
- let char_opt = char_after_index(tokenizer.parse_state.bytes, tokenizer.point.index);
-
- if id_start(char_opt) {
- tokenizer.enter(Name::MdxJsxTextTagName);
- tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
- tokenizer.consume();
- State::Next(StateName::MdxJsxTextPrimaryName)
- } else {
- crash(
- tokenizer,
- "before name",
- &format!(
- "a character that can start a name, such as a letter, `$`, or `_`{}",
- if tokenizer.current == Some(b'*' | b'/') {
- " (note: JS comments in JSX tags are not supported in MDX)"
- } else {
- ""
- }
- ),
- )
- }
- }
+ // Fragment closing tag.
+ if let Some(b'>') = tokenizer.current {
+ State::Retry(StateName::MdxJsxTextTagEnd)
+ }
+ // Start of a closing tag name.
+ else if id_start(char_after_index(
+ tokenizer.parse_state.bytes,
+ tokenizer.point.index,
+ )) {
+ tokenizer.enter(Name::MdxJsxTextTagName);
+ tokenizer.enter(Name::MdxJsxTextTagNamePrimary);
+ tokenizer.consume();
+ State::Next(StateName::MdxJsxTextPrimaryName)
+ } else {
+ crash(
+ tokenizer,
+ "before name",
+ &format!(
+ "a character that can start a name, such as a letter, `$`, or `_`{}",
+ if tokenizer.current == Some(b'*' | b'/') {
+ " (note: JS comments in JSX tags are not supported in MDX)"
+ } else {
+ ""
+ }
+ ),
+ )
}
}
@@ -162,7 +160,6 @@ pub fn primary_name(tokenizer: &mut Tokenizer) -> State {
}
// Continuation of name: remain.
// Allow continuation bytes.
- // To do: unicode.
else if matches!(tokenizer.current, Some(0x80..=0xBF))
|| id_cont(char_after_index(
tokenizer.parse_state.bytes,
@@ -284,7 +281,7 @@ pub fn member_name(tokenizer: &mut Tokenizer) -> State {
State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
}
// Continuation of name: remain.
- // To do: unicode.
+ // Allow continuation bytes.
else if matches!(tokenizer.current, Some(0x80..=0xBF))
|| id_cont(char_after_index(
tokenizer.parse_state.bytes,
@@ -398,7 +395,7 @@ pub fn local_name(tokenizer: &mut Tokenizer) -> State {
State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
}
// Continuation of name: remain.
- // To do: unicode.
+ // Allow continuation bytes.
else if matches!(tokenizer.current, Some(0x80..=0xBF))
|| id_cont(char_after_index(
tokenizer.parse_state.bytes,
@@ -516,8 +513,8 @@ pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State {
);
State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
}
- // Continuation of the attribute name: remain.
- // To do: unicode.
+ // Continuation of name: remain.
+ // Allow continuation bytes.
else if matches!(tokenizer.current, Some(0x80..=0xBF))
|| id_cont(char_after_index(
tokenizer.parse_state.bytes,
@@ -525,7 +522,7 @@ pub fn attribute_primary_name(tokenizer: &mut Tokenizer) -> State {
))
{
tokenizer.consume();
- State::Next(StateName::MdxJsxTextLocalName)
+ State::Next(StateName::MdxJsxTextAttributePrimaryName)
} else {
crash(
tokenizer,
@@ -643,8 +640,8 @@ pub fn attribute_local_name(tokenizer: &mut Tokenizer) -> State {
);
State::Retry(StateName::MdxJsxTextEsWhitespaceStart)
}
- // Continuation of local name: remain.
- // To do: unicode.
+ // Continuation of name: remain.
+ // Allow continuation bytes.
else if matches!(tokenizer.current, Some(0x80..=0xBF))
|| id_cont(char_after_index(
tokenizer.parse_state.bytes,
@@ -906,7 +903,6 @@ pub fn es_whitespace_inside(tokenizer: &mut Tokenizer) -> State {
}
}
-// To do: unicode.
fn id_start(code: Option<char>) -> bool {
if let Some(char) = code {
UnicodeID::is_id_start(char) || matches!(char, '$' | '_')
@@ -915,7 +911,6 @@ fn id_start(code: Option<char>) -> bool {
}
}
-// To do: unicode.
fn id_cont(code: Option<char>) -> bool {
if let Some(char) = code {
UnicodeID::is_id_continue(char) || matches!(char, '-' | '\u{200c}' | '\u{200d}')
@@ -924,25 +919,24 @@ fn id_cont(code: Option<char>) -> bool {
}
}
-fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> ! {
+fn crash(tokenizer: &Tokenizer, at: &str, expect: &str) -> State {
// To do: externalize this, and the print mechanism in the tokenizer,
// to one proper formatter.
- // To do: figure out how Rust does errors?
let actual = match tokenizer.current {
None => "end of file".to_string(),
Some(byte) => format_byte(byte),
};
- unreachable!(
+ State::Error(format!(
"{}:{}: Unexpected {} {}, expected {}",
tokenizer.point.line, tokenizer.point.column, actual, at, expect
- )
+ ))
}
fn format_byte(byte: u8) -> String {
match byte {
b'`' => "`` ` ``".to_string(),
b' '..=b'~' => format!("`{}`", str::from_utf8(&[byte]).unwrap()),
- _ => format!("U+{:>04X}", byte),
+ _ => format!("character U+{:>04X}", byte),
}
}