author     Titus Wormer <tituswormer@gmail.com>  2022-08-11 13:31:20 +0200
committer  Titus Wormer <tituswormer@gmail.com>  2022-08-11 13:31:20 +0200
commit     2d35cbfceace81a217cd0fbdae7a8777c7a6465e (patch)
tree       e5e69d44c5c00d1dc70f4e3a227f67fd5c771389
parent     053a2603e4bd5ec9caf40617b52136e5ef3fcf0a (diff)
Refactor internal docs, code style of tokenizer
Diffstat
-rw-r--r--  src/construct/code_fenced.rs            22
-rw-r--r--  src/construct/code_text.rs               8
-rw-r--r--  src/construct/definition.rs             13
-rw-r--r--  src/construct/html_flow.rs              10
-rw-r--r--  src/construct/html_text.rs              86
-rw-r--r--  src/construct/label_end.rs              36
-rw-r--r--  src/construct/label_start_image.rs       2
-rw-r--r--  src/construct/label_start_link.rs        2
-rw-r--r--  src/construct/partial_data.rs            8
-rw-r--r--  src/construct/partial_destination.rs     2
-rw-r--r--  src/construct/partial_label.rs          17
-rw-r--r--  src/construct/partial_title.rs          17
-rw-r--r--  src/content/document.rs                 42
-rw-r--r--  src/content/string.rs                    2
-rw-r--r--  src/content/text.rs                     10
-rw-r--r--  src/subtokenize.rs                       6
-rw-r--r--  src/tokenizer.rs                       518
17 files changed, 420 insertions, 381 deletions
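Most of the churn below is mechanical renaming plus moving state onto `TokenizeState`. A condensed sketch of the affected fields, pieced together from the hunks that follow (not the complete struct, and field order is illustrative):

```rust
// Condensed sketch of the renamed and relocated fields on `TokenizeState`,
// pieced together from the hunks below; the real struct has many more fields.
pub struct TokenizeState<'a> {
    /// Was `child_tokenizer` on this struct.
    pub document_child: Option<Box<Tokenizer<'a>>>,
    /// Moved here from `Tokenizer`: label/media bookkeeping for text content.
    pub label_start_stack: Vec<LabelStart>,
    pub label_start_list_loose: Vec<LabelStart>,
    pub media_list: Vec<Media>,
    /// Was `marker_other`.
    pub marker_b: u8,
    /// Was `stop`.
    pub markers: &'static [u8],
    /// Was `size_other`.
    pub size_b: usize,
    /// Was `prefix`.
    pub size_c: usize,
    // ...
}
```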
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 0d4345a..26e1148 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -162,7 +162,7 @@ pub fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
if let Some(b'`' | b'~') = tokenizer.current {
tokenizer.tokenize_state.marker = tokenizer.current.unwrap();
- tokenizer.tokenize_state.prefix = prefix;
+ tokenizer.tokenize_state.size_c = prefix;
tokenizer.enter(Token::CodeFencedFenceSequence);
State::Retry(StateName::CodeFencedSequenceOpen)
} else {
@@ -196,7 +196,7 @@ pub fn sequence_open(tokenizer: &mut Tokenizer) -> State {
}
_ => {
tokenizer.tokenize_state.marker = 0;
- tokenizer.tokenize_state.prefix = 0;
+ tokenizer.tokenize_state.size_c = 0;
tokenizer.tokenize_state.size = 0;
State::Nok
}
@@ -259,7 +259,7 @@ pub fn info(tokenizer: &mut Tokenizer) -> State {
Some(b'`') if tokenizer.tokenize_state.marker == b'`' => {
tokenizer.concrete = false;
tokenizer.tokenize_state.marker = 0;
- tokenizer.tokenize_state.prefix = 0;
+ tokenizer.tokenize_state.size_c = 0;
tokenizer.tokenize_state.size = 0;
State::Nok
}
@@ -307,7 +307,7 @@ pub fn meta(tokenizer: &mut Tokenizer) -> State {
Some(b'`') if tokenizer.tokenize_state.marker == b'`' => {
tokenizer.concrete = false;
tokenizer.tokenize_state.marker = 0;
- tokenizer.tokenize_state.prefix = 0;
+ tokenizer.tokenize_state.size_c = 0;
tokenizer.tokenize_state.size = 0;
State::Nok
}
@@ -410,14 +410,14 @@ pub fn before_sequence_close(tokenizer: &mut Tokenizer) -> State {
pub fn sequence_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
Some(b'`' | b'~') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => {
- tokenizer.tokenize_state.size_other += 1;
+ tokenizer.tokenize_state.size_b += 1;
tokenizer.consume();
State::Next(StateName::CodeFencedSequenceClose)
}
- _ if tokenizer.tokenize_state.size_other >= CODE_FENCED_SEQUENCE_SIZE_MIN
- && tokenizer.tokenize_state.size_other >= tokenizer.tokenize_state.size =>
+ _ if tokenizer.tokenize_state.size_b >= CODE_FENCED_SEQUENCE_SIZE_MIN
+ && tokenizer.tokenize_state.size_b >= tokenizer.tokenize_state.size =>
{
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
tokenizer.exit(Token::CodeFencedFenceSequence);
let name = space_or_tab(tokenizer);
tokenizer.attempt(
@@ -427,7 +427,7 @@ pub fn sequence_close(tokenizer: &mut Tokenizer) -> State {
)
}
_ => {
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
State::Nok
}
}
@@ -474,7 +474,7 @@ pub fn content_before(tokenizer: &mut Tokenizer) -> State {
/// | ~~~
/// ```
pub fn content_start(tokenizer: &mut Tokenizer) -> State {
- let name = space_or_tab_min_max(tokenizer, 0, tokenizer.tokenize_state.prefix);
+ let name = space_or_tab_min_max(tokenizer, 0, tokenizer.tokenize_state.size_c);
tokenizer.attempt(
name,
State::Next(StateName::CodeFencedBeforeContentChunk),
@@ -536,7 +536,7 @@ pub fn content_chunk(tokenizer: &mut Tokenizer) -> State {
pub fn after(tokenizer: &mut Tokenizer) -> State {
tokenizer.exit(Token::CodeFenced);
tokenizer.tokenize_state.marker = 0;
- tokenizer.tokenize_state.prefix = 0;
+ tokenizer.tokenize_state.size_c = 0;
tokenizer.tokenize_state.size = 0;
// Feel free to interrupt.
tokenizer.interrupt = false;
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index 2c8faf3..d7ada3d 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -185,16 +185,16 @@ pub fn data(tokenizer: &mut Tokenizer) -> State {
pub fn sequence_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
Some(b'`') => {
- tokenizer.tokenize_state.size_other += 1;
+ tokenizer.tokenize_state.size_b += 1;
tokenizer.consume();
State::Next(StateName::CodeTextSequenceClose)
}
_ => {
- if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_other {
+ if tokenizer.tokenize_state.size == tokenizer.tokenize_state.size_b {
tokenizer.exit(Token::CodeTextSequence);
tokenizer.exit(Token::CodeText);
tokenizer.tokenize_state.size = 0;
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
State::Ok
} else {
let index = tokenizer.events.len();
@@ -202,7 +202,7 @@ pub fn sequence_close(tokenizer: &mut Tokenizer) -> State {
// More or less accents: mark as data.
tokenizer.events[index - 1].token_type = Token::CodeTextData;
tokenizer.events[index].token_type = Token::CodeTextData;
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
State::Retry(StateName::CodeTextBetween)
}
}
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 62d0f3b..5db611b 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -174,7 +174,12 @@ pub fn label_after(tokenizer: &mut Tokenizer) -> State {
}
}
-/// To do.
+/// After the marker.
+///
+/// ```markdown
+/// > | [a]: b "c"
+/// ^
+/// ```
pub fn marker_after(tokenizer: &mut Tokenizer) -> State {
let name = space_or_tab_eol(tokenizer);
tokenizer.attempt(
@@ -196,7 +201,7 @@ pub fn destination_before(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.token_3 = Token::DefinitionDestinationLiteralMarker;
tokenizer.tokenize_state.token_4 = Token::DefinitionDestinationRaw;
tokenizer.tokenize_state.token_5 = Token::DefinitionDestinationString;
- tokenizer.tokenize_state.size_other = usize::MAX;
+ tokenizer.tokenize_state.size_b = usize::MAX;
tokenizer.attempt(
StateName::DestinationStart,
State::Next(StateName::DefinitionDestinationAfter),
@@ -216,7 +221,7 @@ pub fn destination_after(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.token_3 = Token::Data;
tokenizer.tokenize_state.token_4 = Token::Data;
tokenizer.tokenize_state.token_5 = Token::Data;
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
tokenizer.attempt(
StateName::DefinitionTitleBefore,
State::Next(StateName::DefinitionAfter),
@@ -231,7 +236,7 @@ pub fn destination_missing(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.token_3 = Token::Data;
tokenizer.tokenize_state.token_4 = Token::Data;
tokenizer.tokenize_state.token_5 = Token::Data;
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
State::Nok
}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index b49b231..7a346e9 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -508,7 +508,7 @@ pub fn complete_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
State::Next(StateName::HtmlFlowCompleteAttributeValueBefore)
}
Some(b'"' | b'\'') => {
- tokenizer.tokenize_state.marker_other = tokenizer.current.unwrap();
+ tokenizer.tokenize_state.marker_b = tokenizer.current.unwrap();
tokenizer.consume();
State::Next(StateName::HtmlFlowCompleteAttributeValueQuoted)
}
@@ -528,13 +528,11 @@ pub fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
None | Some(b'\n') => {
tokenizer.tokenize_state.marker = 0;
- tokenizer.tokenize_state.marker_other = 0;
+ tokenizer.tokenize_state.marker_b = 0;
State::Nok
}
- Some(b'"' | b'\'')
- if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker_other =>
- {
- tokenizer.tokenize_state.marker_other = 0;
+ Some(b'"' | b'\'') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker_b => {
+ tokenizer.tokenize_state.marker_b = 0;
tokenizer.consume();
State::Next(StateName::HtmlFlowCompleteAttributeValueQuotedAfter)
}
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index df6bd99..7474dbf 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -207,10 +207,11 @@ pub fn comment_start_dash(tokenizer: &mut Tokenizer) -> State {
pub fn comment(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
None => State::Nok,
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextComment);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextComment),
+ State::Nok,
+ ),
Some(b'-') => {
tokenizer.consume();
State::Next(StateName::HtmlTextCommentClose)
@@ -269,10 +270,11 @@ pub fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State {
pub fn cdata(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
None => State::Nok,
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextCdata);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextCdata),
+ State::Nok,
+ ),
Some(b']') => {
tokenizer.consume();
State::Next(StateName::HtmlTextCdataClose)
@@ -323,10 +325,11 @@ pub fn cdata_end(tokenizer: &mut Tokenizer) -> State {
pub fn declaration(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
None | Some(b'>') => State::Retry(StateName::HtmlTextEnd),
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextDeclaration);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextDeclaration),
+ State::Nok,
+ ),
_ => {
tokenizer.consume();
State::Next(StateName::HtmlTextDeclaration)
@@ -343,10 +346,11 @@ pub fn declaration(tokenizer: &mut Tokenizer) -> State {
pub fn instruction(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
None => State::Nok,
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextInstruction);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextInstruction),
+ State::Nok,
+ ),
Some(b'?') => {
tokenizer.consume();
State::Next(StateName::HtmlTextInstructionClose)
@@ -413,10 +417,11 @@ pub fn tag_close(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextTagCloseBetween);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextTagCloseBetween),
+ State::Nok,
+ ),
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Next(StateName::HtmlTextTagCloseBetween)
@@ -451,10 +456,11 @@ pub fn tag_open(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state = Some(StateName::HtmlTextTagOpenBetween);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextTagOpenBetween),
+ State::Nok,
+ ),
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Next(StateName::HtmlTextTagOpenBetween)
@@ -498,11 +504,11 @@ pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state =
- Some(StateName::HtmlTextTagOpenAttributeNameAfter);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextTagOpenAttributeNameAfter),
+ State::Nok,
+ ),
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Next(StateName::HtmlTextTagOpenAttributeNameAfter)
@@ -525,11 +531,11 @@ pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state =
- Some(StateName::HtmlTextTagOpenAttributeValueBefore);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextTagOpenAttributeValueBefore),
+ State::Nok,
+ ),
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Next(StateName::HtmlTextTagOpenAttributeValueBefore)
@@ -558,11 +564,11 @@ pub fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.marker = 0;
State::Nok
}
- Some(b'\n') => {
- tokenizer.tokenize_state.return_state =
- Some(StateName::HtmlTextTagOpenAttributeValueQuoted);
- State::Retry(StateName::HtmlTextLineEndingBefore)
- }
+ Some(b'\n') => tokenizer.attempt(
+ StateName::HtmlTextLineEndingBefore,
+ State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted),
+ State::Nok,
+ ),
Some(b'"' | b'\'') if tokenizer.current.unwrap() == tokenizer.tokenize_state.marker => {
tokenizer.tokenize_state.marker = 0;
tokenizer.consume();
@@ -678,5 +684,5 @@ pub fn line_ending_after(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn line_ending_after_prefix(tokenizer: &mut Tokenizer) -> State {
tokenizer.enter(Token::HtmlTextData);
- State::Retry(tokenizer.tokenize_state.return_state.take().unwrap())
+ State::Ok
}
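Every `html_text` state that hits a line ending now follows the same shape: attempt `HtmlTextLineEndingBefore` and name itself as the continuation, instead of stashing a `return_state` and retrying. As a sketch, the `comment` state after this change (the final catch-all arm is assumed from context, since the hunk cuts off before it):

```rust
// Sketch of `comment` after the change; the `_` arm is assumed from context.
pub fn comment(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None => State::Nok,
        // Attempt the shared line-ending states, then come back here,
        // rather than storing `return_state` and retrying.
        Some(b'\n') => tokenizer.attempt(
            StateName::HtmlTextLineEndingBefore,
            State::Next(StateName::HtmlTextComment),
            State::Nok,
        ),
        Some(b'-') => {
            tokenizer.consume();
            State::Next(StateName::HtmlTextCommentClose)
        }
        _ => {
            tokenizer.consume();
            State::Next(StateName::HtmlTextComment)
        }
    }
}
```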
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 3337cec..a25f917 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -170,12 +170,12 @@ use crate::util::{
pub fn start(tokenizer: &mut Tokenizer) -> State {
if Some(b']') == tokenizer.current && tokenizer.parse_state.constructs.label_end {
let mut label_start_index = None;
- let mut index = tokenizer.label_start_stack.len();
+ let mut index = tokenizer.tokenize_state.label_start_stack.len();
while index > 0 {
index -= 1;
- if !tokenizer.label_start_stack[index].balanced {
+ if !tokenizer.tokenize_state.label_start_stack[index].balanced {
label_start_index = Some(index);
break;
}
@@ -184,6 +184,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
// If there is an okay opening:
if let Some(label_start_index) = label_start_index {
let label_start = tokenizer
+ .tokenize_state
.label_start_stack
.get_mut(label_start_index)
.unwrap();
@@ -221,7 +222,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
pub fn after(tokenizer: &mut Tokenizer) -> State {
- let start = &tokenizer.label_start_stack[tokenizer.tokenize_state.start];
+ let start = &tokenizer.tokenize_state.label_start_stack[tokenizer.tokenize_state.start];
let defined = tokenizer
.parse_state
.definitions
@@ -298,17 +299,23 @@ pub fn reference_not_full(tokenizer: &mut Tokenizer) -> State {
pub fn ok(tokenizer: &mut Tokenizer) -> State {
let label_start_index = tokenizer.tokenize_state.start;
// Remove this one and everything after it.
- let mut left = tokenizer.label_start_stack.split_off(label_start_index);
+ let mut left = tokenizer
+ .tokenize_state
+ .label_start_stack
+ .split_off(label_start_index);
// Remove this one from `left`, as we’ll move it to `media_list`.
let label_start = left.remove(0);
- tokenizer.label_start_list_loose.append(&mut left);
+ tokenizer
+ .tokenize_state
+ .label_start_list_loose
+ .append(&mut left);
let is_link = tokenizer.events[label_start.start.0].token_type == Token::LabelLink;
if is_link {
let mut index = 0;
- while index < tokenizer.label_start_stack.len() {
- let label_start = &mut tokenizer.label_start_stack[index];
+ while index < tokenizer.tokenize_state.label_start_stack.len() {
+ let label_start = &mut tokenizer.tokenize_state.label_start_stack[index];
if tokenizer.events[label_start.start.0].token_type == Token::LabelLink {
label_start.inactive = true;
}
@@ -316,7 +323,7 @@ pub fn ok(tokenizer: &mut Tokenizer) -> State {
}
}
- tokenizer.media_list.push(Media {
+ tokenizer.tokenize_state.media_list.push(Media {
start: label_start.start,
end: (tokenizer.tokenize_state.end, tokenizer.events.len() - 1),
});
@@ -340,6 +347,7 @@ pub fn ok(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn nok(tokenizer: &mut Tokenizer) -> State {
tokenizer
+ .tokenize_state
.label_start_stack
.get_mut(tokenizer.tokenize_state.start)
.unwrap()
@@ -398,7 +406,7 @@ pub fn resource_open(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.token_3 = Token::ResourceDestinationLiteralMarker;
tokenizer.tokenize_state.token_4 = Token::ResourceDestinationRaw;
tokenizer.tokenize_state.token_5 = Token::ResourceDestinationString;
- tokenizer.tokenize_state.size_other = RESOURCE_DESTINATION_BALANCE_MAX;
+ tokenizer.tokenize_state.size_b = RESOURCE_DESTINATION_BALANCE_MAX;
tokenizer.attempt(
StateName::DestinationStart,
@@ -420,7 +428,7 @@ pub fn resource_destination_after(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.token_3 = Token::Data;
tokenizer.tokenize_state.token_4 = Token::Data;
tokenizer.tokenize_state.token_5 = Token::Data;
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
let name = space_or_tab_eol(tokenizer);
tokenizer.attempt(
name,
@@ -436,7 +444,7 @@ pub fn resource_destination_missing(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.token_3 = Token::Data;
tokenizer.tokenize_state.token_4 = Token::Data;
tokenizer.tokenize_state.token_5 = Token::Data;
- tokenizer.tokenize_state.size_other = 0;
+ tokenizer.tokenize_state.size_b = 0;
State::Nok
}
@@ -605,9 +613,9 @@ pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State {
/// images, or turns them back into data.
#[allow(clippy::too_many_lines)]
pub fn resolve_media(tokenizer: &mut Tokenizer) {
- let mut left = tokenizer.label_start_list_loose.split_off(0);
- let mut left_2 = tokenizer.label_start_stack.split_off(0);
- let media = tokenizer.media_list.split_off(0);
+ let mut left = tokenizer.tokenize_state.label_start_list_loose.split_off(0);
+ let mut left_2 = tokenizer.tokenize_state.label_start_stack.split_off(0);
+ let media = tokenizer.tokenize_state.media_list.split_off(0);
left.append(&mut left_2);
let events = &tokenizer.events;
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index 1730fc3..629e836 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -64,7 +64,7 @@ pub fn open(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
tokenizer.exit(Token::LabelMarker);
tokenizer.exit(Token::LabelImage);
- tokenizer.label_start_stack.push(LabelStart {
+ tokenizer.tokenize_state.label_start_stack.push(LabelStart {
start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1),
balanced: false,
inactive: false,
diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs
index c47941c..6eb7b40 100644
--- a/src/construct/label_start_link.rs
+++ b/src/construct/label_start_link.rs
@@ -46,7 +46,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
tokenizer.exit(Token::LabelMarker);
tokenizer.exit(Token::LabelLink);
- tokenizer.label_start_stack.push(LabelStart {
+ tokenizer.tokenize_state.label_start_stack.push(LabelStart {
start: (start, tokenizer.events.len() - 1),
balanced: false,
inactive: false,
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index a68f359..0ad67c5 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -17,8 +17,8 @@ use crate::tokenizer::{EventType, State, StateName, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- // Make sure to eat the first `stop`.
- Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => {
+ // Make sure to eat the first `markers`.
+ Some(byte) if tokenizer.tokenize_state.markers.contains(&byte) => {
tokenizer.enter(Token::Data);
tokenizer.consume();
State::Next(StateName::DataInside)
@@ -42,7 +42,7 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
tokenizer.exit(Token::LineEnding);
State::Next(StateName::DataAtBreak)
}
- Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => {
+ Some(byte) if tokenizer.tokenize_state.markers.contains(&byte) => {
tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data));
State::Ok
}
@@ -62,7 +62,7 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
pub fn inside(tokenizer: &mut Tokenizer) -> State {
let done = match tokenizer.current {
None | Some(b'\n') => true,
- Some(byte) if tokenizer.tokenize_state.stop.contains(&byte) => true,
+ Some(byte) if tokenizer.tokenize_state.markers.contains(&byte) => true,
_ => false,
};
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 26fadc4..735fb38 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -182,7 +182,7 @@ pub fn raw(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.size = 0;
State::Ok
}
- Some(b'(') if tokenizer.tokenize_state.size < tokenizer.tokenize_state.size_other => {
+ Some(b'(') if tokenizer.tokenize_state.size < tokenizer.tokenize_state.size_b => {
tokenizer.consume();
tokenizer.tokenize_state.size += 1;
State::Next(StateName::DestinationRaw)
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index a151841..6447961 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -142,13 +142,26 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
}
}
-/// To do.
+/// In a label, after whitespace.
+///
+/// ```markdown
+/// | [a␊
+/// > | b]
+/// ^
+/// ```
pub fn eol_after(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.connect = true;
State::Retry(StateName::LabelAtBreak)
}
-/// To do.
+/// In a label, at a blank line.
+///
+/// ```markdown
+/// | [a␊
+/// > | ␊
+/// ^
+/// | b]
+/// ```
pub fn at_blank_line(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.marker = 0;
tokenizer.tokenize_state.connect = false;
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index 0b81418..209240e 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -133,13 +133,26 @@ pub fn at_break(tokenizer: &mut Tokenizer) -> State {
}
}
-/// To do.
+/// In a title, after whitespace.
+///
+/// ```markdown
+/// | "a␊
+/// > | b"
+/// ^
+/// ```
pub fn after_eol(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.connect = true;
State::Retry(StateName::TitleAtBreak)
}
-/// To do.
+/// In a title, at a blank line.
+///
+/// ```markdown
+/// | "a␊
+/// > | ␊
+/// ^
+/// | b"
+/// ```
pub fn at_blank_line(tokenizer: &mut Tokenizer) -> State {
tokenizer.tokenize_state.marker = 0;
tokenizer.tokenize_state.connect = false;
diff --git a/src/content/document.rs b/src/content/document.rs
index 98f8a7d..49ca919 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -59,7 +59,7 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
let state = tokenizer.push(
(0, 0),
(parse_state.bytes.len(), 0),
- StateName::DocumentStart,
+ State::Next(StateName::DocumentStart),
);
tokenizer.flush(state, true);
@@ -105,7 +105,7 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- tokenizer.tokenize_state.child_tokenizer = Some(Box::new(Tokenizer::new(
+ tokenizer.tokenize_state.document_child = Some(Box::new(Tokenizer::new(
tokenizer.point.clone(),
tokenizer.parse_state,
)));
@@ -173,7 +173,7 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
if tokenizer.tokenize_state.document_continued
== tokenizer.tokenize_state.document_container_stack.len()
{
- let child = tokenizer.tokenize_state.child_tokenizer.as_ref().unwrap();
+ let child = tokenizer.tokenize_state.document_child.as_ref().unwrap();
tokenizer.interrupt = child.interrupt;
@@ -209,7 +209,12 @@ pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
)
}
-/// To do.
+/// Maybe before a new container, but not a block quote.
+//
+/// ```markdown
+/// > | * a
+/// ^
+/// ```
pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {
// List item?
// We replace the empty block quote container for this new list one.
@@ -227,7 +232,12 @@ pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State
)
}
-/// To do.
+/// Maybe before a new container, but not a list.
+//
+/// ```markdown
+/// > | a
+/// ^
+/// ```
pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State {
// It wasn’t a new block quote or a list.
// Swap the new container (in the middle) with the existing one (at the end).
@@ -283,7 +293,7 @@ pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
- let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
+ let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();
child.lazy = tokenizer.tokenize_state.document_continued
!= tokenizer.tokenize_state.document_container_stack.len();
@@ -312,7 +322,12 @@ pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
}
}
-/// To do.
+/// In flow.
+//
+/// ```markdown
+/// > | * ab
+/// ^
+/// ```
pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
None => {
@@ -340,23 +355,18 @@ pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
/// ^ ^
/// ```
pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
- let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
+ let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();
let state = tokenizer
.tokenize_state
.document_child_state
.unwrap_or(State::Next(StateName::FlowStart));
- let name = match state {
- State::Next(name) => name,
- _ => unreachable!("expected state name"),
- };
-
tokenizer.tokenize_state.document_exits.push(None);
let state = child.push(
(child.point.index, child.point.vs),
(tokenizer.point.index, tokenizer.point.vs),
- name,
+ state,
);
let paragraph = matches!(state, State::Next(StateName::ParagraphInside))
@@ -403,7 +413,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
.document_container_stack
.split_off(tokenizer.tokenize_state.document_continued);
- let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
+ let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();
// Flush if needed.
if *phase != Phase::After {
@@ -463,7 +473,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {
// Inject everything together.
fn resolve(tokenizer: &mut Tokenizer) {
- let child = tokenizer.tokenize_state.child_tokenizer.as_mut().unwrap();
+ let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();
// First, add the container exits into `child`.
let mut child_index = 0;
diff --git a/src/content/string.rs b/src/content/string.rs
index 75cd56a..5dfceb0 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -20,7 +20,7 @@ const MARKERS: [u8; 2] = [b'&', b'\\'];
/// Start of string.
pub fn start(tokenizer: &mut Tokenizer) -> State {
tokenizer.register_resolver("whitespace".to_string(), Box::new(resolve));
- tokenizer.tokenize_state.stop = &MARKERS;
+ tokenizer.tokenize_state.markers = &MARKERS;
State::Retry(StateName::StringBefore)
}
diff --git a/src/content/text.rs b/src/content/text.rs
index ee70f33..4e93779 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -38,7 +38,7 @@ const MARKERS: [u8; 9] = [
/// Start of text.
pub fn start(tokenizer: &mut Tokenizer) -> State {
tokenizer.register_resolver("whitespace".to_string(), Box::new(resolve));
- tokenizer.tokenize_state.stop = &MARKERS;
+ tokenizer.tokenize_state.markers = &MARKERS;
State::Retry(StateName::TextBefore)
}
@@ -91,7 +91,7 @@ pub fn before(tokenizer: &mut Tokenizer) -> State {
}
}
-/// To do.
+/// At `<`, which wasn’t an autolink: before HTML?
pub fn before_html(tokenizer: &mut Tokenizer) -> State {
tokenizer.attempt(
StateName::HtmlTextStart,
@@ -100,7 +100,7 @@ pub fn before_html(tokenizer: &mut Tokenizer) -> State {
)
}
-/// To do.
+/// At `\`, which wasn’t a character escape: before a hard break?
pub fn before_hard_break_escape(tokenizer: &mut Tokenizer) -> State {
tokenizer.attempt(
StateName::HardBreakEscapeStart,
@@ -110,10 +110,6 @@ pub fn before_hard_break_escape(tokenizer: &mut Tokenizer) -> State {
}
/// At data.
-///
-/// ```markdown
-/// |qwe
-/// ```
pub fn before_data(tokenizer: &mut Tokenizer) -> State {
tokenizer.attempt(
StateName::DataStart,
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index 3d923d3..bf6a106 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -99,10 +99,7 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool {
state = tokenizer.push(
(enter.point.index, enter.point.vs),
(end.index, end.vs),
- match state {
- State::Next(func) => func,
- _ => unreachable!("cannot be ok/nok"),
- },
+ state,
);
link_index = link_curr.next;
@@ -112,7 +109,6 @@ pub fn subtokenize(events: &mut Vec<Event>, parse_state: &ParseState) -> bool {
divide_events(&mut map, events, index, &mut tokenizer.events);
- // To do: check `tokenizer.events` if there is a deep content type?
done = false;
}
}
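The call-site changes in `document.rs` and `subtokenize.rs` both come from the new `Tokenizer::push` signature: it takes a `State` instead of a bare `StateName`, so continuation states are forwarded as-is and the `unreachable!` unwrapping disappears. A sketch of a call, mirroring the document start above:

```rust
// Sketch mirroring the `document.rs` hunk: `push` now accepts a `State`,
// so `State::Next(...)` is passed directly and forwarded between pushes.
let state = tokenizer.push(
    (0, 0),
    (parse_state.bytes.len(), 0),
    State::Next(StateName::DocumentStart),
);
tokenizer.flush(state, true);
```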
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3cdd2d3..04a8cc3 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -29,11 +29,16 @@ pub enum ContentType {
Text,
}
-/// To do.
+/// How to handle a byte.
#[derive(Debug, PartialEq)]
pub enum ByteAction {
+ /// This is a normal byte.
+ ///
+ /// Includes replaced bytes.
Normal(u8),
+ /// This is a new byte.
Insert(u8),
+ /// This byte must be ignored.
Ignore,
}
@@ -84,22 +89,6 @@ pub struct Event {
pub link: Option<Link>,
}
-#[derive(Debug, PartialEq)]
-enum AttemptKind {
- Attempt,
- Check,
-}
-
-/// To do.
-#[derive(Debug)]
-struct Attempt {
- /// To do.
- ok: State,
- nok: State,
- kind: AttemptKind,
- state: Option<InternalState>,
-}
-
/// Callback that can be registered and is called when the tokenizer is done.
///
/// Resolvers are supposed to change the list of events, because parsing is
@@ -107,6 +96,7 @@ struct Attempt {
/// the compiler and other users.
pub type Resolver = dyn FnOnce(&mut Tokenizer);
+/// Names of functions to move to.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum StateName {
AttentionStart,
@@ -447,62 +437,73 @@ pub struct ContainerState {
pub size: usize,
}
+/// Different kinds of attempts.
+#[derive(Debug, PartialEq)]
+enum AttemptKind {
+ /// Discard what was tokenizer when unsuccessful.
+ Attempt,
+ /// Discard always.
+ Check,
+}
+
+/// How to handle [`State::Ok`][] or [`State::Nok`][].
+#[derive(Debug)]
+struct Attempt {
+ /// Where to go to when successful.
+ ok: State,
+ /// Where to go to when unsuccessful.
+ nok: State,
+ /// Kind of attempt.
+ kind: AttemptKind,
+ /// If needed, the progress to revert to.
+ ///
+ /// It is not needed to discard an [`AttemptKind::Attempt`] that has a
+ /// `nok` of [`State::Nok`][], because that means it is used in *another*
+ /// attempt, which will receive that `Nok`, and has to handle it.
+ progress: Option<Progress>,
+}
+
/// The internal state of a tokenizer, not to be confused with states from the
/// state machine, this instead is all the information about where we currently
/// are and what’s going on.
#[derive(Debug, Clone)]
-struct InternalState {
- /// Length of `events`. We only add to events, so reverting will just pop stuff off.
+struct Progress {
+ /// Length of `events`.
+ ///
+ /// It’s not allowed to remove events, so reverting will just pop stuff off.
events_len: usize,
- /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt.
+ /// Length of the stack.
+ ///
+ /// It’s not allowed to decrease the stack in an attempt.
stack_len: usize,
/// Previous code.
previous: Option<u8>,
/// Current code.
current: Option<u8>,
- /// Current relative and absolute position in the file.
+ /// Current place in the file.
point: Point,
}
-/// To do
+/// A lot of shared fields used to tokenize things.
#[allow(clippy::struct_excessive_bools)]
pub struct TokenizeState<'a> {
- /// To do.
- pub connect: bool,
- /// To do.
+ // Couple complex fields used to tokenize the document.
+ /// Tokenizer, used to tokenize flow in document.
+ pub document_child: Option<Box<Tokenizer<'a>>>,
+ /// State, used to tokenize containers.
+ pub document_child_state: Option<State>,
+ /// Stack of currently active containers.
pub document_container_stack: Vec<ContainerState>,
- /// To do.
- pub document_exits: Vec<Option<Vec<Event>>>,
- /// To do.
+ /// How many active containers continued.
pub document_continued: usize,
- /// To do.
- pub document_paragraph_before: bool,
- /// To do.
+ /// Index of last `data`.
pub document_data_index: Option<usize>,
- /// To do.
- pub document_child_state: Option<State>,
- /// To do.
- pub child_tokenizer: Option<Box<Tokenizer<'a>>>,
- /// To do.
- pub marker: u8,
- /// To do.
- pub marker_other: u8,
- /// To do.
- pub prefix: usize,
- /// To do.
- pub return_state: Option<StateName>,
- /// To do.
- pub seen: bool,
- /// To do.
- pub size: usize,
- /// To do.
- pub size_other: usize,
- /// To do.
- pub start: usize,
- /// To do.
- pub end: usize,
- /// To do.
- pub stop: &'static [u8],
+ /// Container exits by line number.
+ pub document_exits: Vec<Option<Vec<Event>>>,
+ /// Whether the previous flow was a paragraph.
+ pub document_paragraph_before: bool,
+
+ // Couple of very frequent settings for parsing whitespace.
pub space_or_tab_eol_content_type: Option<ContentType>,
pub space_or_tab_eol_connect: bool,
pub space_or_tab_eol_ok: bool,
@@ -512,11 +513,50 @@ pub struct TokenizeState<'a> {
pub space_or_tab_max: usize,
pub space_or_tab_size: usize,
pub space_or_tab_token: Token,
- /// To do.
+
+ // Couple of media related fields.
+ /// Stack of label (start) that could form images and links.
+ ///
+ /// Used when tokenizing [text content][crate::content::text].
+ pub label_start_stack: Vec<LabelStart>,
+ /// Stack of label (start) that cannot form images and links.
+ ///
+ /// Used when tokenizing [text content][crate::content::text].
+ pub label_start_list_loose: Vec<LabelStart>,
+ /// Stack of images and links.
+ ///
+ /// Used when tokenizing [text content][crate::content::text].
+ pub media_list: Vec<Media>,
+
+ /// Whether to connect tokens.
+ pub connect: bool,
+ /// Marker.
+ pub marker: u8,
+ /// Secondary marker.
+ pub marker_b: u8,
+ /// Several markers.
+ pub markers: &'static [u8],
+ /// Whether something was seen.
+ pub seen: bool,
+ /// Size.
+ pub size: usize,
+ /// Secondary size.
+ pub size_b: usize,
+ /// Tertiary size.
+ pub size_c: usize,
+ /// Index.
+ pub start: usize,
+ /// Index.
+ pub end: usize,
+ /// Slot for a token type.
pub token_1: Token,
+ /// Slot for a token type.
pub token_2: Token,
+ /// Slot for a token type.
pub token_3: Token,
+ /// Slot for a token type.
pub token_4: Token,
+ /// Slot for a token type.
pub token_5: Token,
}
@@ -525,9 +565,9 @@ pub struct TokenizeState<'a> {
pub struct Tokenizer<'a> {
/// Jump between line endings.
column_start: Vec<(usize, usize)>,
- // First line.
+ // First line where this tokenizer starts.
first_line: usize,
- /// First point after the last line ending.
+ /// Current point after the last line ending (excluding jump).
line_start: Point,
/// Track whether the current byte is already consumed (`true`) or expected
/// to be consumed (`false`).
@@ -536,7 +576,7 @@ pub struct Tokenizer<'a> {
consumed: bool,
/// Track whether this tokenizer is done.
resolved: bool,
- /// To do.
+ /// Stack of how to handle attempts.
attempts: Vec<Attempt>,
/// Current byte.
pub current: Option<u8>,
@@ -544,7 +584,7 @@ pub struct Tokenizer<'a> {
pub previous: Option<u8>,
/// Current relative and absolute place in the file.
pub point: Point,
- /// Semantic labels of one or more codes in `codes`.
+ /// Semantic labels.
pub events: Vec<Event>,
/// Hierarchy of semantic labels.
///
@@ -559,20 +599,8 @@ pub struct Tokenizer<'a> {
pub resolver_ids: Vec<String>,
/// Shared parsing state across tokenizers.
pub parse_state: &'a ParseState<'a>,
- /// To do.
+ /// A lot of shared fields used to tokenize things.
pub tokenize_state: TokenizeState<'a>,
- /// Stack of label (start) that could form images and links.
- ///
- /// Used when tokenizing [text content][crate::content::text].
- pub label_start_stack: Vec<LabelStart>,
- /// Stack of label (start) that cannot form images and links.
- ///
- /// Used when tokenizing [text content][crate::content::text].
- pub label_start_list_loose: Vec<LabelStart>,
- /// Stack of images and links.
- ///
- /// Used when tokenizing [text content][crate::content::text].
- pub media_list: Vec<Media>,
/// Whether we would be interrupting something.
///
/// Used when tokenizing [flow content][crate::content::flow].
@@ -613,17 +641,19 @@ impl<'a> Tokenizer<'a> {
document_paragraph_before: false,
document_data_index: None,
document_child_state: None,
- child_tokenizer: None,
+ document_child: None,
marker: 0,
- marker_other: 0,
- prefix: 0,
+ marker_b: 0,
+ markers: &[],
seen: false,
size: 0,
- size_other: 0,
+ size_b: 0,
+ size_c: 0,
start: 0,
end: 0,
- stop: &[],
- return_state: None,
+ label_start_stack: vec![],
+ label_start_list_loose: vec![],
+ media_list: vec![],
space_or_tab_eol_content_type: None,
space_or_tab_eol_connect: false,
space_or_tab_eol_ok: false,
@@ -640,15 +670,11 @@ impl<'a> Tokenizer<'a> {
token_5: Token::Data,
},
map: EditMap::new(),
- label_start_stack: vec![],
- label_start_list_loose: vec![],
- media_list: vec![],
interrupt: false,
concrete: false,
lazy: false,
- // Assume about 10 resolvers.
- resolvers: Vec::with_capacity(10),
- resolver_ids: Vec::with_capacity(10),
+ resolvers: vec![],
+ resolver_ids: vec![],
}
}
@@ -698,7 +724,7 @@ impl<'a> Tokenizer<'a> {
}
/// Prepare for a next code to get consumed.
- pub fn expect(&mut self, byte: Option<u8>) {
+ fn expect(&mut self, byte: Option<u8>) {
debug_assert!(self.consumed, "expected previous byte to be consumed");
self.consumed = false;
self.current = byte;
@@ -721,7 +747,7 @@ impl<'a> Tokenizer<'a> {
}
/// Move to the next (virtual) byte.
- pub fn move_one(&mut self) {
+ fn move_one(&mut self) {
match byte_action(self.parse_state.bytes, &self.point) {
ByteAction::Ignore => {
self.point.index += 1;
@@ -756,7 +782,7 @@ impl<'a> Tokenizer<'a> {
}
/// Move (virtual) bytes.
- pub fn move_to(&mut self, to: (usize, usize)) {
+ fn move_to(&mut self, to: (usize, usize)) {
let (to_index, to_vs) = to;
while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs {
self.move_one();
@@ -838,9 +864,9 @@ impl<'a> Tokenizer<'a> {
});
}
- /// Capture the internal state.
- fn capture(&mut self) -> InternalState {
- InternalState {
+ /// Capture the tokenizer progress.
+ fn capture(&mut self) -> Progress {
+ Progress {
previous: self.previous,
current: self.current,
point: self.point.clone(),
@@ -849,8 +875,8 @@ impl<'a> Tokenizer<'a> {
}
}
- /// Apply the internal state.
- fn free(&mut self, previous: InternalState) {
+ /// Apply tokenizer progress.
+ fn free(&mut self, previous: Progress) {
self.previous = previous.previous;
self.current = previous.current;
self.point = previous.point;
@@ -866,123 +892,168 @@ impl<'a> Tokenizer<'a> {
self.stack.truncate(previous.stack_len);
}
- /// Parse with `name` and its future states, to check if it result in
- /// [`State::Ok`][] or [`State::Nok`][], revert on both cases, and then
- /// call `done` with whether it was successful or not.
- ///
- /// This captures the current state of the tokenizer, returns a wrapped
- /// state that captures all codes and feeds them to `name` and its
- /// future states until it yields `State::Ok` or `State::Nok`.
- /// It then applies the captured state, calls `done`, and feeds all
- /// captured codes to its future states.
+ /// Parse with `name` and its future states, to see if that results in
+ /// [`State::Ok`][] or [`State::Nok`][], then revert in both cases.
pub fn check(&mut self, name: StateName, ok: State, nok: State) -> State {
- attempt_impl(self, name, ok, nok, AttemptKind::Check)
+ // Always capture (and restore) when checking.
+ // No need to capture (and restore) when `nok` is `State::Nok`, because the
+ // parent attempt will do it.
+ let progress = Some(self.capture());
+
+ self.attempts.push(Attempt {
+ kind: AttemptKind::Check,
+ progress,
+ ok,
+ nok,
+ });
+
+ call_impl(self, name)
}
- /// Parse with `name` and its future states, to check if it results in
- /// [`State::Ok`][] or [`State::Nok`][], revert on the case of
- /// `State::Nok`, and then call `done` with whether it was successful or
- /// not.
- ///
- /// This captures the current state of the tokenizer, returns a wrapped
- /// state that captures all codes and feeds them to `name` and its
- /// future states until it yields `State::Ok`, at which point it calls
- /// `done` and yields its result.
- /// If instead `State::Nok` was yielded, the captured state is applied,
- /// `done` is called, and all captured codes are fed to its future states.
+ /// Parse with `name` and its future states, to see if that results in
+ /// [`State::Ok`][] or [`State::Nok`][], revert in the case of
+ /// `State::Nok`.
pub fn attempt(&mut self, name: StateName, ok: State, nok: State) -> State {
- attempt_impl(self, name, ok, nok, AttemptKind::Attempt)
- }
+ // Always capture (and restore) when checking.
+ // No need to capture (and restore) when `nok` is `State::Nok`, because the
+ // parent attempt will do it.
+ let progress = if nok == State::Nok {
+ None
+ } else {
+ Some(self.capture())
+ };
- /// Feed a list of `codes` into `start`.
- ///
- /// This is set up to support repeatedly calling `feed`, and thus streaming
- /// markdown into the state machine, and normally pauses after feeding.
- // Note: if needed: accept `vs`?
- pub fn push(&mut self, min: (usize, usize), max: (usize, usize), name: StateName) -> State {
- debug_assert!(!self.resolved, "cannot feed after drain");
+ self.attempts.push(Attempt {
+ kind: AttemptKind::Attempt,
+ progress,
+ ok,
+ nok,
+ });
- // debug_assert!(min >= self.point.index, "cannot move backwards");
+ call_impl(self, name)
+ }
- if min.0 > self.point.index || (min.0 == self.point.index && min.1 > self.point.vs) {
- self.move_to(min);
- }
+ /// Tokenize.
+ pub fn push(&mut self, from: (usize, usize), to: (usize, usize), state: State) -> State {
+ push_impl(self, from, to, state, false)
+ }
- let mut state = State::Next(name);
+ /// Flush.
+ pub fn flush(&mut self, state: State, resolve: bool) {
+ let to = (self.point.index, self.point.vs);
+ push_impl(self, to, to, state, true);
- while self.point.index < max.0 || (self.point.index == max.0 && self.point.vs < max.1) {
- match state {
- State::Ok | State::Nok => {
- if let Some(attempt) = self.attempts.pop() {
- state = attempt_done_impl(self, attempt, state);
- } else {
- break;
- }
- }
- State::Next(name) => {
- let action = byte_action(self.parse_state.bytes, &self.point);
- state = feed_action_impl(self, &Some(action), name);
- }
- State::Retry(name) => {
- log::debug!(" retry {:?}", name);
- state = call_impl(self, name);
- }
+ if resolve {
+ self.resolved = true;
+
+ while !self.resolvers.is_empty() {
+ let resolver = self.resolvers.remove(0);
+ resolver(self);
}
+
+ self.map.consume(&mut self.events);
}
+ }
+}
- state
+/// Move back past ignored bytes.
+fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) {
+ while point.index > 0 {
+ point.index -= 1;
+ let action = byte_action(tokenizer.parse_state.bytes, point);
+ if !matches!(action, ByteAction::Ignore) {
+ point.index += 1;
+ break;
+ }
}
+}
- /// Flush the tokenizer.
- pub fn flush(&mut self, mut state: State, resolve: bool) {
- let max = self.point.index;
+/// Run the tokenizer.
+fn push_impl(
+ tokenizer: &mut Tokenizer,
+ from: (usize, usize),
+ to: (usize, usize),
+ mut state: State,
+ flush: bool,
+) -> State {
+ debug_assert!(!tokenizer.resolved, "cannot feed after drain");
+ debug_assert!(
+ from.0 > tokenizer.point.index
+ || (from.0 == tokenizer.point.index && from.1 >= tokenizer.point.vs),
+ "cannot move backwards"
+ );
+
+ tokenizer.move_to(from);
+
+ loop {
+ match state {
+ State::Ok | State::Nok => {
+ if let Some(attempt) = tokenizer.attempts.pop() {
+ if attempt.kind == AttemptKind::Check || state == State::Nok {
+ if let Some(progress) = attempt.progress {
+ tokenizer.free(progress);
+ }
+ }
- self.consumed = true;
+ tokenizer.consumed = true;
- loop {
- match state {
- State::Ok | State::Nok => {
- if let Some(attempt) = self.attempts.pop() {
- state = attempt_done_impl(self, attempt, state);
+ let next = if state == State::Ok {
+ attempt.ok
} else {
- break;
- }
- }
- State::Next(name) => {
- // We sometimes move back when flushing, so then we use those codes.
- state = feed_action_impl(
- self,
- &if self.point.index == max {
- None
- } else {
- Some(byte_action(self.parse_state.bytes, &self.point))
- },
- name,
- );
- }
- State::Retry(name) => {
- log::debug!(" retry {:?}", name);
- state = call_impl(self, name);
+ attempt.nok
+ };
+
+ log::debug!("attempt: `{:?}` -> `{:?}`", state, next);
+ state = next;
+ } else {
+ break;
}
}
- }
-
- self.consumed = true;
- debug_assert!(matches!(state, State::Ok), "must be ok");
+ State::Next(name) => {
+ let action = if tokenizer.point.index < to.0
+ || (tokenizer.point.index == to.0 && tokenizer.point.vs < to.1)
+ {
+ Some(byte_action(tokenizer.parse_state.bytes, &tokenizer.point))
+ } else if flush {
+ None
+ } else {
+ break;
+ };
- if resolve {
- self.resolved = true;
+ if let Some(ByteAction::Ignore) = action {
+ tokenizer.move_one();
+ } else {
+ let byte =
+ if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action {
+ Some(byte)
+ } else {
+ None
+ };
- while !self.resolvers.is_empty() {
- let resolver = self.resolvers.remove(0);
- resolver(self);
+ log::debug!("feed: `{:?}` to {:?}", byte, name);
+ tokenizer.expect(byte);
+ state = call_impl(tokenizer, name);
+ };
+ }
+ State::Retry(name) => {
+ log::debug!("retry: {:?}", name);
+ state = call_impl(tokenizer, name);
}
-
- self.map.consume(&mut self.events);
}
}
+
+ tokenizer.consumed = true;
+
+ if flush {
+ debug_assert!(matches!(state, State::Ok), "must be ok");
+ } else {
+ debug_assert!(matches!(state, State::Next(_)), "must have a next state");
+ }
+
+ state
}
+/// Figure out how to handle a byte.
fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
if point.index < bytes.len() {
let byte = bytes[point.index];
@@ -1024,73 +1095,8 @@ fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
}
}
-/// Internal utility to wrap states to also capture codes.
-///
-/// Recurses into itself.
-/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
-fn attempt_impl(
- tokenizer: &mut Tokenizer,
- name: StateName,
- ok: State,
- nok: State,
- kind: AttemptKind,
-) -> State {
- // Always capture (and restore) when checking.
- // No need to capture (and restore) when `nok` is `State::Nok`, because the
- // parent attempt will do it.
- let state = if kind == AttemptKind::Check || nok != State::Nok {
- Some(tokenizer.capture())
- } else {
- None
- };
-
- tokenizer.attempts.push(Attempt {
- ok,
- nok,
- kind,
- state,
- });
-
- call_impl(tokenizer, name)
-}
-
-fn attempt_done_impl(tokenizer: &mut Tokenizer, attempt: Attempt, state: State) -> State {
- if attempt.kind == AttemptKind::Check || state == State::Nok {
- if let Some(state) = attempt.state {
- tokenizer.free(state);
- }
- }
-
- tokenizer.consumed = true;
- if state == State::Ok {
- attempt.ok
- } else {
- attempt.nok
- }
-}
-
-fn feed_action_impl(
- tokenizer: &mut Tokenizer,
- action: &Option<ByteAction>,
- name: StateName,
-) -> State {
- if let Some(ByteAction::Ignore) = action {
- tokenizer.move_one();
- State::Next(name)
- } else {
- let byte = if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) = action {
- Some(*byte)
- } else {
- None
- };
-
- log::debug!("feed: `{:?}` to {:?}", byte, name);
- tokenizer.expect(byte);
- call_impl(tokenizer, name)
- }
-}
-
#[allow(clippy::too_many_lines)]
+/// Call the corresponding function for a state name.
fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State {
let func = match name {
StateName::AttentionStart => construct::attention::start,
@@ -1422,15 +1428,3 @@ fn call_impl(tokenizer: &mut Tokenizer, name: StateName) -> State {
func(tokenizer)
}
-
-fn move_point_back(tokenizer: &mut Tokenizer, point: &mut Point) {
- // Move back past ignored bytes.
- while point.index > 0 {
- point.index -= 1;
- let action = byte_action(tokenizer.parse_state.bytes, point);
- if !matches!(action, ByteAction::Ignore) {
- point.index += 1;
- break;
- }
- }
-}
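
Taken together, `attempt` and `check` now just push an `Attempt` record (with optionally captured `Progress`), and the shared `push_impl` loop pops it once a state settles on `Ok` or `Nok`. A condensed sketch of that flow, assembled from the hunks above as two excerpts (byte feeding and flushing elided):

```rust
// Excerpt 1, condensed from the hunks above: an attempt only captures
// progress when a revert could be needed here; otherwise the parent
// attempt is responsible for restoring.
pub fn attempt(&mut self, name: StateName, ok: State, nok: State) -> State {
    let progress = if nok == State::Nok { None } else { Some(self.capture()) };
    self.attempts.push(Attempt { kind: AttemptKind::Attempt, progress, ok, nok });
    call_impl(self, name)
}

// Excerpt 2: inside the `push_impl` loop, a settled state pops the innermost
// attempt. Checks always revert, attempts revert only on `Nok`, and the walk
// continues with the attempt's `ok` or `nok` state.
State::Ok | State::Nok => {
    if let Some(attempt) = tokenizer.attempts.pop() {
        if attempt.kind == AttemptKind::Check || state == State::Nok {
            if let Some(progress) = attempt.progress {
                tokenizer.free(progress);
            }
        }
        tokenizer.consumed = true;
        state = if state == State::Ok { attempt.ok } else { attempt.nok };
    } else {
        break;
    }
}
```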