author    Titus Wormer <tituswormer@gmail.com>  2022-07-29 18:22:59 +0200
committer Titus Wormer <tituswormer@gmail.com>  2022-07-29 18:22:59 +0200
commit 0eeff9148e327183e532752f46421a75506dd7a6 (patch)
tree   4f0aed04f90aa759ce96a2e87aa719e7fa95c450
parent 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff)
Refactor to improve states
* Remove custom kind wrappers, use plain bytes instead
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
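Editorial note: the main pattern in this commit is dropping small `Kind` enums that only wrapped a single marker byte, and passing the byte itself around instead. A minimal sketch of the before/after shape (hypothetical simplified types, not the actual tokenizer):

```rust
// Before: a wrapper enum plus conversions (sketch of the removed pattern).
#[derive(Debug, PartialEq)]
enum MarkerKind {
    Asterisk,
    Underscore,
}

impl MarkerKind {
    fn as_byte(&self) -> u8 {
        match self {
            MarkerKind::Asterisk => b'*',
            MarkerKind::Underscore => b'_',
        }
    }
}

// After: pass the marker byte directly and match on byte patterns.
fn is_same_marker(current: Option<u8>, marker: u8) -> bool {
    matches!(current, Some(b'*' | b'_') if current.unwrap() == marker)
}

fn main() {
    assert_eq!(MarkerKind::Asterisk.as_byte(), b'*');
    assert!(is_same_marker(Some(b'*'), b'*'));
    assert!(!is_same_marker(Some(b'_'), b'*'));
}
```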
-rw-r--r--  build.rs                                    2
-rw-r--r--  src/compiler.rs                           540
-rw-r--r--  src/constant.rs                             9
-rw-r--r--  src/construct/attention.rs                 88
-rw-r--r--  src/construct/autolink.rs                  57
-rw-r--r--  src/construct/character_escape.rs           3
-rw-r--r--  src/construct/character_reference.rs      132
-rw-r--r--  src/construct/code_fenced.rs              123
-rw-r--r--  src/construct/code_indented.rs             37
-rw-r--r--  src/construct/code_text.rs                  7
-rw-r--r--  src/construct/definition.rs                21
-rw-r--r--  src/construct/hard_break_escape.rs          4
-rw-r--r--  src/construct/heading_atx.rs               28
-rw-r--r--  src/construct/heading_setext.rs            96
-rw-r--r--  src/construct/html_flow.rs                212
-rw-r--r--  src/construct/html_text.rs                 46
-rw-r--r--  src/construct/label_end.rs                 47
-rw-r--r--  src/construct/label_start_image.rs          3
-rw-r--r--  src/construct/list.rs                     135
-rw-r--r--  src/construct/paragraph.rs                  3
-rw-r--r--  src/construct/partial_bom.rs               37
-rw-r--r--  src/construct/partial_destination.rs       53
-rw-r--r--  src/construct/partial_label.rs            101
-rw-r--r--  src/construct/partial_title.rs             93
-rw-r--r--  src/construct/partial_whitespace.rs        18
-rw-r--r--  src/construct/thematic_break.rs            85
-rw-r--r--  src/content/document.rs                     7
-rw-r--r--  src/lib.rs                                  4
-rw-r--r--  src/parser.rs                               5
-rw-r--r--  src/unicode.rs                              2
-rw-r--r--  src/util/decode_character_reference.rs     42
-rw-r--r--  src/util/encode.rs                         48
-rw-r--r--  src/util/normalize_identifier.rs           39
-rw-r--r--  src/util/sanitize_uri.rs                    2
-rw-r--r--  src/util/slice.rs                          36
-rw-r--r--  tests/code_fenced.rs                        6

36 files changed, 941 insertions(+), 1230 deletions(-)
diff --git a/build.rs b/build.rs
index 4397c58..1a3549b 100644
--- a/build.rs
+++ b/build.rs
@@ -130,7 +130,7 @@ async fn punctuation() {
/// > It is generated from the latest Unicode data.
///
/// Rust does not contain an `is_punctuation` method on `char`, while it does
-/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).
+/// support [`is_ascii_alphanumeric`](char::is_ascii_alphanumeric).
///
/// `CommonMark` handles attention (emphasis, strong) markers based on what
/// comes before or after them.
diff --git a/src/compiler.rs b/src/compiler.rs
index de76142..e0ab1e9 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -1,6 +1,5 @@
//! Turn events into a string of HTML.
use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC};
-use crate::construct::character_reference::Kind as CharacterReferenceKind;
use crate::token::Token;
use crate::tokenizer::{Event, EventType};
use crate::util::normalize_identifier::normalize_identifier;
@@ -68,14 +67,14 @@ struct CompileContext<'a> {
pub code_flow_seen_data: Option<bool>,
pub code_fenced_fences_count: Option<usize>,
pub code_text_inside: bool,
- pub character_reference_kind: Option<CharacterReferenceKind>,
+ pub character_reference_marker: Option<u8>,
pub expect_first_item: Option<bool>,
pub media_stack: Vec<Media>,
pub definitions: Vec<(String, Definition)>,
pub tight_stack: Vec<bool>,
/// Fields used to influence the current compilation.
pub slurp_one_line_ending: bool,
- pub tags: bool,
+ pub in_image_alt: bool,
pub encode_html: bool,
pub last_was_tag: bool,
/// Configuration
@@ -104,13 +103,13 @@ impl<'a> CompileContext<'a> {
code_flow_seen_data: None,
code_fenced_fences_count: None,
code_text_inside: false,
- character_reference_kind: None,
+ character_reference_marker: None,
expect_first_item: None,
media_stack: vec![],
definitions: vec![],
tight_stack: vec![],
slurp_one_line_ending: false,
- tags: true,
+ in_image_alt: false,
encode_html: true,
last_was_tag: false,
protocol_href: if options.allow_dangerous_protocol {
@@ -140,8 +139,7 @@ impl<'a> CompileContext<'a> {
self.buffers.pop().expect("Cannot resume w/o buffer")
}
- pub fn push<'x, S: Into<&'x str>>(&mut self, value: S) {
- let value = value.into();
+ pub fn push(&mut self, value: &str) {
self.buffers
.last_mut()
.expect("Cannot push w/o buffer")
@@ -149,17 +147,8 @@ impl<'a> CompileContext<'a> {
self.last_was_tag = false;
}
- pub fn push_raw<'x, S: Into<&'x str>>(&mut self, value: S) {
- let value = value.into();
- self.push(&*encode(value, self.encode_html));
- }
-
- pub fn tag<'x, S: Into<&'x str>>(&mut self, value: S) {
- if self.tags {
- let value = value.into();
- self.push(&*encode(value, false));
- self.last_was_tag = true;
- }
+ pub fn push_raw(&mut self, value: &str) {
+ self.push(&encode(value, self.encode_html));
}
/// Get the current buffer.
@@ -172,7 +161,7 @@ impl<'a> CompileContext<'a> {
/// Add a line ending.
pub fn line_ending(&mut self) {
let eol = self.line_ending_default.as_str().to_string();
- self.push(&*eol);
+ self.push(&eol);
}
/// Add a line ending if needed (as in, there’s no eol/eof already).
@@ -210,7 +199,7 @@ pub fn compile(events: &[Event], bytes: &[u8], options: &Options) -> String {
&& (event.token_type == Token::BlankLineEnding || event.token_type == Token::LineEnding)
{
line_ending_inferred = Some(LineEnding::from_str(
- &Slice::from_position(bytes, &Position::from_exit_event(events, index)).serialize(),
+ Slice::from_position(bytes, &Position::from_exit_event(events, index)).as_str(),
));
break;
}
@@ -398,14 +387,16 @@ fn on_enter_buffer(context: &mut CompileContext) {
fn on_enter_block_quote(context: &mut CompileContext) {
context.tight_stack.push(false);
context.line_ending_if_needed();
- context.tag("<blockquote>");
+ context.push("<blockquote>");
+ context.last_was_tag = true;
}
/// Handle [`Enter`][EventType::Enter]:[`CodeIndented`][Token::CodeIndented].
fn on_enter_code_indented(context: &mut CompileContext) {
context.code_flow_seen_data = Some(false);
context.line_ending_if_needed();
- context.tag("<pre><code>");
+ context.push("<pre><code>");
+ context.last_was_tag = true;
}
/// Handle [`Enter`][EventType::Enter]:[`CodeFenced`][Token::CodeFenced].
@@ -413,14 +404,18 @@ fn on_enter_code_fenced(context: &mut CompileContext) {
context.code_flow_seen_data = Some(false);
context.line_ending_if_needed();
// Note that no `>` is used, which is added later.
- context.tag("<pre><code");
+ context.push("<pre><code");
+ context.last_was_tag = true;
context.code_fenced_fences_count = Some(0);
}
/// Handle [`Enter`][EventType::Enter]:[`CodeText`][Token::CodeText].
fn on_enter_code_text(context: &mut CompileContext) {
context.code_text_inside = true;
- context.tag("<code>");
+ if !context.in_image_alt {
+ context.push("<code>");
+ context.last_was_tag = true;
+ }
context.buffer();
}
@@ -445,7 +440,10 @@ fn on_enter_definition_destination_string(context: &mut CompileContext) {
/// Handle [`Enter`][EventType::Enter]:[`Emphasis`][Token::Emphasis].
fn on_enter_emphasis(context: &mut CompileContext) {
- context.tag("<em>");
+ if !context.in_image_alt {
+ context.push("<em>");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Enter`][EventType::Enter]:[`HtmlFlow`][Token::HtmlFlow].
@@ -473,7 +471,7 @@ fn on_enter_image(context: &mut CompileContext) {
destination: None,
title: None,
});
- context.tags = false; // Disallow tags.
+ context.in_image_alt = true; // Disallow tags.
}
/// Handle [`Enter`][EventType::Enter]:[`Link`][Token::Link].
@@ -546,14 +544,12 @@ fn on_enter_list(context: &mut CompileContext) {
context.tight_stack.push(!loose);
context.line_ending_if_needed();
// Note: no `>`.
- context.tag(&*format!(
- "<{}",
- if *token_type == Token::ListOrdered {
- "ol"
- } else {
- "ul"
- }
- ));
+ context.push(if *token_type == Token::ListOrdered {
+ "<ol"
+ } else {
+ "<ul"
+ });
+ context.last_was_tag = true;
context.expect_first_item = Some(true);
}
@@ -562,11 +558,14 @@ fn on_enter_list_item_marker(context: &mut CompileContext) {
let expect_first_item = context.expect_first_item.take().unwrap();
if expect_first_item {
- context.tag(">");
+ context.push(">");
+ context.last_was_tag = true;
}
context.line_ending_if_needed();
- context.tag("<li>");
+
+ context.push("<li>");
+ context.last_was_tag = true;
context.expect_first_item = Some(false);
// “Hack” to prevent a line ending from showing up if the item is empty.
context.last_was_tag = false;
@@ -578,15 +577,15 @@ fn on_enter_paragraph(context: &mut CompileContext) {
if !tight {
context.line_ending_if_needed();
- context.tag("<p>");
+ context.push("<p>");
+ context.last_was_tag = true;
}
}
/// Handle [`Enter`][EventType::Enter]:[`Resource`][Token::Resource].
fn on_enter_resource(context: &mut CompileContext) {
context.buffer(); // We can have line endings in the resource, ignore them.
- let media = context.media_stack.last_mut().unwrap();
- media.destination = Some("".to_string());
+ context.media_stack.last_mut().unwrap().destination = Some("".to_string());
}
/// Handle [`Enter`][EventType::Enter]:[`ResourceDestinationString`][Token::ResourceDestinationString].
@@ -599,47 +598,67 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) {
/// Handle [`Enter`][EventType::Enter]:[`Strong`][Token::Strong].
fn on_enter_strong(context: &mut CompileContext) {
- context.tag("<strong>");
+ if !context.in_image_alt {
+ context.push("<strong>");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][Token::AutolinkEmail].
fn on_exit_autolink_email(context: &mut CompileContext) {
- let value = Slice::from_position(
+ let slice = Slice::from_position(
context.bytes,
&Position::from_exit_event(context.events, context.index),
- )
- .serialize();
+ );
+ let value = slice.as_str();
- context.tag(&*format!(
- "<a href=\"{}\">",
- sanitize_uri(
- format!("mailto:{}", value.as_str()).as_str(),
- &context.protocol_href
- )
- ));
- context.push_raw(&*value);
- context.tag("</a>");
+ if !context.in_image_alt {
+ context.push("<a href=\"");
+ context.push(&sanitize_uri(
+ &format!("mailto:{}", value),
+ &context.protocol_href,
+ ));
+ context.push("\">");
+ context.last_was_tag = true;
+ }
+
+ context.push_raw(value);
+
+ if !context.in_image_alt {
+ context.push("</a>");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Exit`][EventType::Exit]:[`AutolinkProtocol`][Token::AutolinkProtocol].
fn on_exit_autolink_protocol(context: &mut CompileContext) {
- let value = Slice::from_position(
+ let slice = Slice::from_position(
context.bytes,
&Position::from_exit_event(context.events, context.index),
- )
- .serialize();
+ );
+ let value = slice.as_str();
- context.tag(&*format!(
- "<a href=\"{}\">",
- sanitize_uri(value.as_str(), &context.protocol_href)
- ));
- context.push_raw(&*value);
- context.tag("</a>");
+ if !context.in_image_alt {
+ context.push("<a href=\"");
+ context.push(&sanitize_uri(value, &context.protocol_href));
+ context.push("\">");
+ context.last_was_tag = true;
+ }
+
+ context.push_raw(value);
+
+ if !context.in_image_alt {
+ context.push("</a>");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Exit`][EventType::Exit]:{[`HardBreakEscape`][Token::HardBreakEscape],[`HardBreakTrailing`][Token::HardBreakTrailing]}.
fn on_exit_break(context: &mut CompileContext) {
- context.tag("<br />");
+ if !context.in_image_alt {
+ context.push("<br />");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Exit`][EventType::Exit]:[`BlankLineEnding`][Token::BlankLineEnding].
@@ -654,56 +673,58 @@ fn on_exit_block_quote(context: &mut CompileContext) {
context.tight_stack.pop();
context.line_ending_if_needed();
context.slurp_one_line_ending = false;
- context.tag("</blockquote>");
+ context.push("</blockquote>");
+ context.last_was_tag = true;
}
/// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarker`][Token::CharacterReferenceMarker].
fn on_exit_character_reference_marker(context: &mut CompileContext) {
- context.character_reference_kind = Some(CharacterReferenceKind::Named);
+ context.character_reference_marker = Some(b'&');
}
/// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarkerHexadecimal`][Token::CharacterReferenceMarkerHexadecimal].
fn on_exit_character_reference_marker_hexadecimal(context: &mut CompileContext) {
- context.character_reference_kind = Some(CharacterReferenceKind::Hexadecimal);
+ context.character_reference_marker = Some(b'x');
}
/// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarkerNumeric`][Token::CharacterReferenceMarkerNumeric].
fn on_exit_character_reference_marker_numeric(context: &mut CompileContext) {
- context.character_reference_kind = Some(CharacterReferenceKind::Decimal);
+ context.character_reference_marker = Some(b'#');
}
/// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceValue`][Token::CharacterReferenceValue].
fn on_exit_character_reference_value(context: &mut CompileContext) {
- let kind = context
- .character_reference_kind
+ let marker = context
+ .character_reference_marker
.take()
.expect("expected `character_reference_kind` to be set");
- let reference = Slice::from_position(
+ let slice = Slice::from_position(
context.bytes,
&Position::from_exit_event(context.events, context.index),
- )
- .serialize();
+ );
+ let value = slice.as_str();
- let ref_string = reference.as_str();
- let value = match kind {
- CharacterReferenceKind::Decimal => decode_numeric(ref_string, 10).to_string(),
- CharacterReferenceKind::Hexadecimal => decode_numeric(ref_string, 16).to_string(),
- CharacterReferenceKind::Named => decode_named(ref_string),
+ let value = match marker {
+ b'#' => decode_numeric(value, 10),
+ b'x' => decode_numeric(value, 16),
+ b'&' => decode_named(value),
+ _ => panic!("impossible"),
};
- context.push_raw(&*value);
+ context.push_raw(&value);
}
/// Handle [`Exit`][EventType::Exit]:[`CodeFlowChunk`][Token::CodeFlowChunk].
fn on_exit_code_flow_chunk(context: &mut CompileContext) {
- let value = Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .serialize();
-
context.code_flow_seen_data = Some(true);
- context.push_raw(&*value);
+ context.push_raw(
+ &Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ // Must serialize to get virtual spaces.
+ .serialize(),
+ );
}
/// Handle [`Exit`][EventType::Exit]:[`CodeFencedFence`][Token::CodeFencedFence].
@@ -715,7 +736,8 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) {
};
if count == 0 {
- context.tag(">");
+ context.push(">");
+ context.last_was_tag = true;
context.slurp_one_line_ending = true;
}
@@ -725,7 +747,10 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:[`CodeFencedFenceInfo`][Token::CodeFencedFenceInfo].
fn on_exit_code_fenced_fence_info(context: &mut CompileContext) {
let value = context.resume();
- context.tag(&*format!(" class=\"language-{}\"", value));
+ context.push(" class=\"language-");
+ context.push(&value);
+ context.push("\"");
+ context.last_was_tag = true;
}
/// Handle [`Exit`][EventType::Exit]:{[`CodeFenced`][Token::CodeFenced],[`CodeIndented`][Token::CodeIndented]}.
@@ -752,7 +777,8 @@ fn on_exit_code_flow(context: &mut CompileContext) {
context.line_ending_if_needed();
}
- context.tag("</code></pre>");
+ context.push("</code></pre>");
+ context.last_was_tag = true;
if let Some(count) = context.code_fenced_fences_count.take() {
if count < 2 {
@@ -781,12 +807,16 @@ fn on_exit_code_text(context: &mut CompileContext) {
}
context.code_text_inside = false;
- context.push(&*if trim {
+ context.push(&if trim {
result[1..(result.len() - 1)].to_string()
} else {
result
});
- context.tag("</code>");
+
+ if !context.in_image_alt {
+ context.push("</code>");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Exit`][EventType::Exit]:*.
@@ -798,72 +828,63 @@ fn on_exit_drop(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:{[`CodeTextData`][Token::CodeTextData],[`Data`][Token::Data],[`CharacterEscapeValue`][Token::CharacterEscapeValue]}.
fn on_exit_data(context: &mut CompileContext) {
- let value = Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .serialize();
-
- // Just output it.
- context.push_raw(&*value);
+ context.push_raw(
+ Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .as_str(),
+ );
}
/// Handle [`Exit`][EventType::Exit]:[`Definition`][Token::Definition].
fn on_exit_definition(context: &mut CompileContext) {
- let definition = context.media_stack.pop().unwrap();
- let reference_id = normalize_identifier(&definition.reference_id.unwrap());
- let destination = definition.destination;
- let title = definition.title;
-
context.resume();
-
- let mut index = 0;
-
- while index < context.definitions.len() {
- if context.definitions[index].0 == reference_id {
- return;
- }
-
- index += 1;
- }
-
- context
- .definitions
- .push((reference_id, Definition { destination, title }));
+ let media = context.media_stack.pop().unwrap();
+ let id = normalize_identifier(&media.reference_id.unwrap());
+
+ context.definitions.push((
+ id,
+ Definition {
+ destination: media.destination,
+ title: media.title,
+ },
+ ));
}
/// Handle [`Exit`][EventType::Exit]:[`DefinitionDestinationString`][Token::DefinitionDestinationString].
fn on_exit_definition_destination_string(context: &mut CompileContext) {
let buf = context.resume();
- let definition = context.media_stack.last_mut().unwrap();
- definition.destination = Some(buf);
+ context.media_stack.last_mut().unwrap().destination = Some(buf);
context.encode_html = true;
}
/// Handle [`Exit`][EventType::Exit]:[`DefinitionLabelString`][Token::DefinitionLabelString].
fn on_exit_definition_label_string(context: &mut CompileContext) {
- let value = Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .serialize();
-
// Discard label, use the source content instead.
context.resume();
- let definition = context.media_stack.last_mut().unwrap();
- definition.reference_id = Some(value);
+ context.media_stack.last_mut().unwrap().reference_id = Some(
+ // To do: lifetimes, reference bytes?
+ Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize(),
+ );
}
/// Handle [`Exit`][EventType::Exit]:[`DefinitionTitleString`][Token::DefinitionTitleString].
fn on_exit_definition_title_string(context: &mut CompileContext) {
let buf = context.resume();
- let definition = context.media_stack.last_mut().unwrap();
- definition.title = Some(buf);
+ context.media_stack.last_mut().unwrap().title = Some(buf);
}
/// Handle [`Exit`][EventType::Exit]:[`Emphasis`][Token::Emphasis].
fn on_exit_emphasis(context: &mut CompileContext) {
- context.tag("</em>");
+ if !context.in_image_alt {
+ context.push("</em>");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Exit`][EventType::Exit]:[`HeadingAtx`][Token::HeadingAtx].
@@ -873,7 +894,10 @@ fn on_exit_heading_atx(context: &mut CompileContext) {
.take()
.expect("`atx_opening_sequence_size` must be set in headings");
- context.tag(&*format!("</h{}>", rank));
+ context.push("</h");
+ context.push(&rank.to_string());
+ context.push(">");
+ context.last_was_tag = true;
}
/// Handle [`Exit`][EventType::Exit]:[`HeadingAtxSequence`][Token::HeadingAtxSequence].
@@ -884,17 +908,20 @@ fn on_exit_heading_atx_sequence(context: &mut CompileContext) {
context.bytes,
&Position::from_exit_event(context.events, context.index),
)
- .size();
+ .len();
context.line_ending_if_needed();
context.atx_opening_sequence_size = Some(rank);
- context.tag(&*format!("<h{}>", rank));
+ context.push("<h");
+ context.push(&rank.to_string());
+ context.push(">");
+ context.last_was_tag = true;
}
}
/// Handle [`Exit`][EventType::Exit]:[`HeadingAtxText`][Token::HeadingAtxText].
fn on_exit_heading_atx_text(context: &mut CompileContext) {
let value = context.resume();
- context.push(&*value);
+ context.push(&value);
}
/// Handle [`Exit`][EventType::Exit]:[`HeadingSetextText`][Token::HeadingSetextText].
@@ -915,12 +942,18 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {
&Position::from_exit_event(context.events, context.index),
)
.head();
- let level = if head == Some(b'-') { 2 } else { 1 };
+ let rank = if head == Some(b'-') { "2" } else { "1" };
context.line_ending_if_needed();
- context.tag(&*format!("<h{}>", level));
- context.push(&*text);
- context.tag(&*format!("</h{}>", level));
+ context.push("<h");
+ context.push(rank);
+ context.push(">");
+ context.last_was_tag = true;
+ context.push(&text);
+ context.push("</h");
+ context.push(rank);
+ context.push(">");
+ context.last_was_tag = true;
}
/// Handle [`Exit`][EventType::Exit]:{[`HtmlFlow`][Token::HtmlFlow],[`HtmlText`][Token::HtmlText]}.
@@ -930,32 +963,31 @@ fn on_exit_html(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:{[`HtmlFlowData`][Token::HtmlFlowData],[`HtmlTextData`][Token::HtmlTextData]}.
fn on_exit_html_data(context: &mut CompileContext) {
- let value = Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .serialize();
-
- context.push_raw(&*value);
+ context.push_raw(
+ Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .as_str(),
+ );
}
/// Handle [`Exit`][EventType::Exit]:[`Label`][Token::Label].
fn on_exit_label(context: &mut CompileContext) {
let buf = context.resume();
- let media = context.media_stack.last_mut().unwrap();
- media.label = Some(buf);
+ context.media_stack.last_mut().unwrap().label = Some(buf);
}
/// Handle [`Exit`][EventType::Exit]:[`LabelText`][Token::LabelText].
fn on_exit_label_text(context: &mut CompileContext) {
- let value = Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .serialize();
-
- let media = context.media_stack.last_mut().unwrap();
- media.label_id = Some(value);
+ context.media_stack.last_mut().unwrap().label_id = Some(
+ // To do: lifetimes, reference bytes?
+ Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize(),
+ );
}
/// Handle [`Exit`][EventType::Exit]:[`LineEnding`][Token::LineEnding].
@@ -965,26 +997,28 @@ fn on_exit_line_ending(context: &mut CompileContext) {
} else if context.slurp_one_line_ending {
context.slurp_one_line_ending = false;
} else {
- let value = Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .serialize();
-
- context.push_raw(&*value);
+ context.push_raw(
+ Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .as_str(),
+ );
}
}
/// Handle [`Exit`][EventType::Exit]:{[`ListOrdered`][Token::ListOrdered],[`ListUnordered`][Token::ListUnordered]}.
fn on_exit_list(context: &mut CompileContext) {
- let tag_name = if context.events[context.index].token_type == Token::ListOrdered {
- "ol"
- } else {
- "ul"
- };
context.tight_stack.pop();
context.line_ending();
- context.tag(&*format!("</{}>", tag_name));
+ context.push(
+ if context.events[context.index].token_type == Token::ListOrdered {
+ "</ol>"
+ } else {
+ "</ul>"
+ },
+ );
+ context.last_was_tag = true;
}
/// Handle [`Exit`][EventType::Exit]:[`ListItem`][Token::ListItem].
@@ -1010,7 +1044,8 @@ fn on_exit_list_item(context: &mut CompileContext) {
context.line_ending_if_needed();
}
- context.tag("</li>");
+ context.push("</li>");
+ context.last_was_tag = true;
}
/// Handle [`Exit`][EventType::Exit]:[`ListItemValue`][Token::ListItemValue].
@@ -1018,17 +1053,17 @@ fn on_exit_list_item_value(context: &mut CompileContext) {
let expect_first_item = context.expect_first_item.unwrap();
if expect_first_item {
- let value = Slice::from_position(
+ let slice = Slice::from_position(
context.bytes,
&Position::from_exit_event(context.events, context.index),
- )
- .serialize();
- let value = value.parse::<u32>().ok().unwrap();
+ );
+ let value = slice.as_str().parse::<u32>().ok().unwrap();
if value != 1 {
- context.tag(" start=\"");
- context.tag(&*value.to_string());
- context.tag("\"");
+ context.push(" start=\"");
+ context.push(&value.to_string());
+ context.push("\"");
+ context.last_was_tag = true;
}
}
}
@@ -1048,68 +1083,98 @@ fn on_exit_media(context: &mut CompileContext) {
index += 1;
}
- context.tags = !is_in_image;
+ context.in_image_alt = is_in_image;
let media = context.media_stack.pop().unwrap();
+ let label = media.label.unwrap();
+ let in_image_alt = context.in_image_alt;
let id = media
.reference_id
.or(media.label_id)
.map(|id| normalize_identifier(&id));
- let label = media.label.unwrap();
- let mut definition = None;
- if let Some(id) = id {
- let mut index = 0;
+ let definition_index = if media.destination.is_none() {
+ id.and_then(|id| {
+ let mut index = 0;
- while index < context.definitions.len() {
- if context.definitions[index].0 == id {
- definition = Some(&context.definitions[index].1);
- break;
- }
+ while index < context.definitions.len() {
+ if context.definitions[index].0 == id {
+ return Some(index);
+ }
- index += 1;
- }
- }
+ index += 1;
+ }
- let destination = if media.destination.is_some() {
- &media.destination
+ None
+ })
} else {
- &definition.unwrap().destination
- };
- let title = if media.destination.is_some() {
- &media.title
- } else {
- &definition.unwrap().title
+ None
};
- let destination = if let Some(destination) = destination {
- destination
- } else {
- ""
- };
+ if !in_image_alt {
+ if media.image {
+ context.push("<img src=\"");
+ } else {
+ context.push("<a href=\"");
+ };
- let title = if let Some(title) = title {
- format!(" title=\"{}\"", title)
- } else {
- "".to_string()
- };
+ let destination = if let Some(index) = definition_index {
+ context.definitions[index].1.destination.as_ref()
+ } else {
+ media.destination.as_ref()
+ };
+
+ if let Some(destination) = destination {
+ context.push(&sanitize_uri(
+ destination,
+ if media.image {
+ &context.protocol_src
+ } else {
+ &context.protocol_href
+ },
+ ));
+ }
+
+ if media.image {
+ context.push("\" alt=\"");
+ };
+ }
if media.image {
- context.tag(&*format!(
- "<img src=\"{}\" alt=\"",
- sanitize_uri(destination, &context.protocol_src),
- ));
- context.push(&*label);
- context.tag(&*format!("\"{} />", title));
- } else {
- context.tag(&*format!(
- "<a href=\"{}\"{}>",
- sanitize_uri(destination, &context.protocol_href),
- title,
- ));
- context.push(&*label);
- context.tag("</a>");
- };
+ context.push(&label);
+ }
+
+ if !in_image_alt {
+ context.push("\"");
+
+ let title = if let Some(index) = definition_index {
+ context.definitions[index].1.title.clone()
+ } else {
+ media.title
+ };
+
+ if let Some(title) = title {
+ context.push(" title=\"");
+ context.push(&title);
+ context.push("\"");
+ };
+
+ if media.image {
+ context.push(" /");
+ }
+
+ context.push(">");
+ context.last_was_tag = true;
+ }
+
+ if !media.image {
+ context.push(&label);
+
+ if !in_image_alt {
+ context.push("</a>");
+ context.last_was_tag = true;
+ }
+ }
}
/// Handle [`Exit`][EventType::Exit]:[`Paragraph`][Token::Paragraph].
@@ -1119,46 +1184,49 @@ fn on_exit_paragraph(context: &mut CompileContext) {
if *tight {
context.slurp_one_line_ending = true;
} else {
- context.tag("</p>");
+ context.push("</p>");
+ context.last_was_tag = true;
}
}
/// Handle [`Exit`][EventType::Exit]:[`ReferenceString`][Token::ReferenceString].
fn on_exit_reference_string(context: &mut CompileContext) {
- let value = Slice::from_position(
- context.bytes,
- &Position::from_exit_event(context.events, context.index),
- )
- .serialize();
-
// Drop stuff.
context.resume();
- let media = context.media_stack.last_mut().unwrap();
- media.reference_id = Some(value);
+ // To do: lifetimes, reference bytes.
+ context.media_stack.last_mut().unwrap().reference_id = Some(
+ Slice::from_position(
+ context.bytes,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize(),
+ );
}
/// Handle [`Exit`][EventType::Exit]:[`ResourceDestinationString`][Token::ResourceDestinationString].
fn on_exit_resource_destination_string(context: &mut CompileContext) {
let buf = context.resume();
- let media = context.media_stack.last_mut().unwrap();
- media.destination = Some(buf);
+ context.media_stack.last_mut().unwrap().destination = Some(buf);
context.encode_html = true;
}
/// Handle [`Exit`][EventType::Exit]:[`ResourceTitleString`][Token::ResourceTitleString].
fn on_exit_resource_title_string(context: &mut CompileContext) {
let buf = context.resume();
- let media = context.media_stack.last_mut().unwrap();
- media.title = Some(buf);
+ context.media_stack.last_mut().unwrap().title = Some(buf);
}
/// Handle [`Exit`][EventType::Exit]:[`Strong`][Token::Strong].
fn on_exit_strong(context: &mut CompileContext) {
- context.tag("</strong>");
+ if !context.in_image_alt {
+ context.push("</strong>");
+ context.last_was_tag = true;
+ }
}
/// Handle [`Exit`][EventType::Exit]:[`ThematicBreak`][Token::ThematicBreak].
fn on_exit_thematic_break(context: &mut CompileContext) {
context.line_ending_if_needed();
- context.tag("<hr />");
+ context.push("<hr />");
+ context.last_was_tag = true;
}
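Editorial note: throughout `compiler.rs`, the `tags` flag and the `tag()` helper are replaced by an explicit `in_image_alt` flag, so tags are pushed directly and suppressed while serializing an image's alt text. A minimal sketch of that shape, using a hypothetical simplified context:

```rust
// Hypothetical, simplified version of the compiler pattern above:
// tag output is skipped while inside an image `alt` attribute.
struct Context {
    out: String,
    in_image_alt: bool,
    last_was_tag: bool,
}

impl Context {
    fn push(&mut self, value: &str) {
        self.out.push_str(value);
        self.last_was_tag = false;
    }

    fn push_tag(&mut self, value: &str) {
        if !self.in_image_alt {
            self.push(value);
            self.last_was_tag = true;
        }
    }
}

fn main() {
    let mut context = Context {
        out: String::new(),
        in_image_alt: false,
        last_was_tag: false,
    };
    context.push_tag("<em>");
    context.push("a");
    context.push_tag("</em>");
    context.in_image_alt = true;
    context.push_tag("<em>"); // Dropped: inside an image alt.
    context.push("b");
    assert_eq!(context.out, "<em>a</em>b");
}
```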
diff --git a/src/constant.rs b/src/constant.rs
index d84dda5..6ef851c 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -165,6 +165,15 @@ pub const HTML_BLOCK_NAMES: [&str; 61] = [
"ul",
];
+/// Magic string of CDATA (after `<![`).
+///
+/// Used in the **cdata** production of [HTML (flow)][html_flow] and
+/// [HTML (text)][html_text].
+///
+/// [html_flow]: crate::construct::html_flow
+/// [html_text]: crate::construct::html_text
+pub const HTML_CDATA_PREFIX: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
+
/// List of HTML tag names that form the **raw** production of
/// [HTML (flow)][html_flow].
///
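Editorial note: the shared `HTML_CDATA_PREFIX` lets both HTML constructs match `CDATA[` byte by byte while tracking how far they got. A sketch of that incremental comparison (assumed, simplified from the tokenizer's state machine):

```rust
const HTML_CDATA_PREFIX: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];

// Feed bytes one at a time; `size` is how much of the prefix matched so far.
fn step(byte: u8, size: usize) -> Option<usize> {
    if byte == HTML_CDATA_PREFIX[size] {
        Some(size + 1) // Matched another byte; done when this reaches 6.
    } else {
        None // Not CDATA.
    }
}

fn main() {
    let mut size = 0;
    for &byte in b"CDATA[" {
        size = step(byte, size).expect("prefix byte");
    }
    assert_eq!(size, HTML_CDATA_PREFIX.len());
}
```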
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index b042645..583fde2 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -88,54 +88,11 @@ enum GroupKind {
Other,
}
-/// Type of sequence.
-#[derive(Debug, PartialEq)]
-enum MarkerKind {
- /// In a run with asterisks.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// *a*
- /// ```
- Asterisk,
- /// In a run with underscores.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// _a_
- /// ```
- Underscore,
-}
-
-impl MarkerKind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- MarkerKind::Asterisk => b'*',
- MarkerKind::Underscore => b'_',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `*` or `_`.
- fn from_byte(byte: u8) -> MarkerKind {
- match byte {
- b'*' => MarkerKind::Asterisk,
- b'_' => MarkerKind::Underscore,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// Attention sequence that we can take markers from.
#[derive(Debug)]
struct Sequence {
- /// Marker used in this sequence.
- marker: MarkerKind,
+ /// Marker as a byte (`u8`) used in this sequence.
+ marker: u8,
/// The depth in events where this sequence resides.
balance: usize,
/// The index into events where this sequence’s `Enter` currently resides.
@@ -160,9 +117,9 @@ struct Sequence {
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if tokenizer.parse_state.constructs.attention && matches!(byte, b'*' | b'_') => {
+ Some(b'*' | b'_') if tokenizer.parse_state.constructs.attention => {
tokenizer.enter(Token::AttentionSequence);
- inside(tokenizer, MarkerKind::from_byte(byte))
+ inside(tokenizer, tokenizer.current.unwrap())
}
_ => State::Nok,
}
@@ -174,14 +131,17 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// > | **
/// ^^
/// ```
-fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State {
- if tokenizer.current == Some(marker.as_byte()) {
- tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, marker)))
- } else {
- tokenizer.exit(Token::AttentionSequence);
- tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
- State::Ok
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
+ match tokenizer.current {
+ Some(b'*' | b'_') if tokenizer.current.unwrap() == marker => {
+ tokenizer.consume();
+ State::Fn(Box::new(move |t| inside(t, marker)))
+ }
+ _ => {
+ tokenizer.exit(Token::AttentionSequence);
+ tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
+ State::Ok
+ }
}
}
@@ -219,16 +179,10 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]);
let char_after = string_after.chars().next();
- let marker = MarkerKind::from_byte(
- Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
- .head()
- .unwrap(),
- );
- let before = classify_character(if enter.point.index > 0 {
- char_before
- } else {
- None
- });
+ let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point)
+ .head()
+ .unwrap();
+ let before = classify_character(char_before);
let after = classify_character(char_after);
let open = after == GroupKind::Other
|| (after == GroupKind::Punctuation && before != GroupKind::Other);
@@ -245,12 +199,12 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
start_point: enter.point.clone(),
end_point: exit.point.clone(),
size: exit.point.index - enter.point.index,
- open: if marker == MarkerKind::Asterisk {
+ open: if marker == b'*' {
open
} else {
open && (before != GroupKind::Other || !close)
},
- close: if marker == MarkerKind::Asterisk {
+ close: if marker == b'*' {
close
} else {
close && (after != GroupKind::Other || !open)
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
index b843af8..c0514ae 100644
--- a/src/construct/autolink.rs
+++ b/src/construct/autolink.rs
@@ -137,12 +137,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_alphabetic() => {
+ // ASCII alphabetic.
+ Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(scheme_or_email_atext))
}
- Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer),
- _ => State::Nok,
+ _ => email_atext(tokenizer),
}
}
@@ -199,8 +199,8 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State {
tokenizer.exit(Token::AutolinkProtocol);
end(tokenizer)
}
- Some(byte) if byte.is_ascii_control() => State::Nok,
- None | Some(b' ') => State::Nok,
+ // ASCII control or space.
+ None | Some(b'\0'..=0x1F | b' ' | 0x7F) => State::Nok,
Some(_) => {
tokenizer.consume();
State::Fn(Box::new(url_inside))
@@ -220,7 +220,26 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
}
- Some(byte) if is_ascii_atext(byte) => {
+ // ASCII atext.
+ //
+ // atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or
+ // a byte in the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027
+ // APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`),
+ // U+002D DASH (`-`), U+002F SLASH (`/`), U+003D EQUALS TO (`=`),
+ // U+003F QUESTION MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE
+ // ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE
+ // (`~`).
+ //
+ // See:
+ // **\[RFC5322]**:
+ // [Internet Message Format](https://tools.ietf.org/html/rfc5322).
+ // P. Resnick.
+ // IETF.
+ //
+ // [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
+ Some(
+ b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~',
+ ) => {
tokenizer.consume();
State::Fn(Box::new(email_atext))
}
@@ -236,7 +255,8 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
/// ```
fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size),
+ // ASCII alphanumeric.
+ Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size),
_ => State::Nok,
}
}
@@ -279,7 +299,8 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {
tokenizer.consume();
State::Fn(Box::new(move |t| email_value(t, size + 1)))
}
- Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
+ // ASCII alphanumeric.
+ Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |t| email_label(t, size + 1)))
}
@@ -307,23 +328,3 @@ fn end(tokenizer: &mut Tokenizer) -> State {
_ => unreachable!("expected `>`"),
}
}
-
-/// Check whether the character code represents an ASCII atext.
-///
-/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in
-/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`),
-/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F
-/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E
-/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE
-/// (`{`) to U+007E TILDE (`~`).
-///
-/// See:
-/// **\[RFC5322]**:
-/// [Internet Message Format](https://tools.ietf.org/html/rfc5322).
-/// P. Resnick.
-/// IETF.
-///
-/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric
-fn is_ascii_atext(byte: u8) -> bool {
- matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~')
-}
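Editorial note: the `is_ascii_atext` helper is gone; its byte ranges now live directly in the `match` above. The inlined pattern is equivalent to this predicate (the removed helper, restated):

```rust
// Equivalent to the byte ranges inlined into the match above.
fn is_ascii_atext(byte: u8) -> bool {
    matches!(
        byte,
        b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~'
    )
}

fn main() {
    assert!(is_ascii_atext(b'#'));
    assert!(is_ascii_atext(b'~'));
    assert!(!is_ascii_atext(b' '));
    assert!(!is_ascii_atext(b'@')); // `@` separates atext from the domain.
}
```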
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 02e8b62..4419d7a 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -63,7 +63,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_punctuation() => {
+ // ASCII punctuation.
+ Some(b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~') => {
tokenizer.enter(Token::CharacterEscapeValue);
tokenizer.consume();
tokenizer.exit(Token::CharacterEscapeValue);
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index 90763c1..cd489a4 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -66,67 +66,18 @@ use crate::constant::{
CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
};
use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-/// Kind of a character reference.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
- /// Numeric decimal character reference.
- ///
- /// ```markdown
- /// > | a&#x9;b
- /// ^^^^^
- /// ```
- Decimal,
- /// Numeric hexadecimal character reference.
- ///
- /// ```markdown
- /// > | a&#123;b
- /// ^^^^^^
- /// ```
- Hexadecimal,
- /// Named character reference.
- ///
- /// ```markdown
- /// > | a&amp;b
- /// ^^^^^
- /// ```
- Named,
-}
-
-impl Kind {
- /// Get the maximum size of characters allowed in the value of a character
- /// reference.
- fn max(&self) -> usize {
- match self {
- Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
- Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
- Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX,
- }
- }
-
- /// Check if a byte ([`u8`]) is allowed.
- fn allowed(&self, byte: u8) -> bool {
- let check = match self {
- Kind::Hexadecimal => u8::is_ascii_hexdigit,
- Kind::Decimal => u8::is_ascii_digit,
- Kind::Named => u8::is_ascii_alphanumeric,
- };
-
- check(&byte)
- }
-}
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;
/// State needed to parse character references.
#[derive(Debug, Clone)]
struct Info {
- /// Place of value start.
- start: Point,
- /// Size of value.
- size: usize,
- /// Kind of character reference.
- kind: Kind,
+ /// Index of where value starts.
+ start: usize,
+ /// Marker of character reference.
+ marker: u8,
+ /// Maximum number of characters in the value for this kind.
+ max: usize,
}
/// Start of a character reference.
@@ -174,9 +125,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {
value(
tokenizer,
Info {
- start: tokenizer.point.clone(),
- size: 0,
- kind: Kind::Named,
+ start: tokenizer.point.index,
+ marker: b'&',
+ max: CHARACTER_REFERENCE_NAMED_SIZE_MAX,
},
)
}
@@ -198,17 +149,17 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal);
tokenizer.enter(Token::CharacterReferenceValue);
let info = Info {
- start: tokenizer.point.clone(),
- size: 0,
- kind: Kind::Hexadecimal,
+ start: tokenizer.point.index,
+ marker: b'x',
+ max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,
};
State::Fn(Box::new(|t| value(t, info)))
} else {
tokenizer.enter(Token::CharacterReferenceValue);
let info = Info {
- start: tokenizer.point.clone(),
- size: 0,
- kind: Kind::Decimal,
+ start: tokenizer.point.index,
+ marker: b'#',
+ max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,
};
value(tokenizer, info)
}
@@ -227,21 +178,22 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
/// > | a&#x9;b
/// ^
/// ```
-fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn value(tokenizer: &mut Tokenizer, info: Info) -> State {
+ let size = tokenizer.point.index - info.start;
+
match tokenizer.current {
- Some(b';') if info.size > 0 => {
- if Kind::Named == info.kind {
- // To do: fix slice.
- let value = Slice::from_position(
+ Some(b';') if size > 0 => {
+ // Named.
+ if info.marker == b'&' {
+ // Guaranteed to be valid ASCII bytes.
+ let slice = Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &info.start,
- end: &tokenizer.point,
- },
- )
- .serialize();
+ info.start,
+ tokenizer.point.index,
+ );
+ let name = slice.as_str();
- if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) {
+ if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) {
return State::Nok;
}
}
@@ -253,14 +205,22 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.exit(Token::CharacterReference);
State::Ok
}
- Some(byte) => {
- if info.size < info.kind.max() && info.kind.allowed(byte) {
- info.size += 1;
- tokenizer.consume();
- State::Fn(Box::new(|t| value(t, info)))
- } else {
- State::Nok
- }
+ // ASCII digit, for named, decimal, and hexadecimal references.
+ Some(b'0'..=b'9') if size < info.max => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| value(t, info)))
+ }
+ // ASCII hex letters, for named and hexadecimal references.
+ Some(b'A'..=b'F' | b'a'..=b'f')
+ if matches!(info.marker, b'&' | b'x') && size < info.max =>
+ {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| value(t, info)))
+ }
+ // Non-hex ASCII alphabeticals, for named references.
+ Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| value(t, info)))
}
_ => State::Nok,
}
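Editorial note: instead of carrying a `size` counter in `Info`, the character reference parser now stores only the start index and derives the current length from the tokenizer's position. Roughly (hypothetical names, sketch only):

```rust
// Sketch: with only a start index stored, the consumed length is derived.
struct Info {
    start: usize, // Index where the value began.
    max: usize,   // Maximum allowed length for this kind of reference.
}

fn may_consume(current_index: usize, info: &Info) -> bool {
    let size = current_index - info.start;
    size < info.max
}

fn main() {
    let info = Info { start: 3, max: 7 };
    assert!(may_consume(9, &info)); // size 6 < 7
    assert!(!may_consume(10, &info)); // size 7, at the limit
}
```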
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 21e9259..c4c3e86 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -110,53 +110,6 @@ use crate::token::Token;
use crate::tokenizer::{ContentType, State, Tokenizer};
use crate::util::slice::{Position, Slice};
-/// Kind of fences.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
- /// Grave accent (tick) code.
- ///
- /// ## Example
- ///
- /// ````markdown
- /// ```rust
- /// println!("I <3 🦀");
- /// ```
- /// ````
- GraveAccent,
- /// Tilde code.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ~~~rust
- /// println!("I <3 🦀");
- /// ~~~
- /// ```
- Tilde,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- Kind::GraveAccent => b'`',
- Kind::Tilde => b'~',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `~` or `` ` ``.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'`' => Kind::GraveAccent,
- b'~' => Kind::Tilde,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse code (fenced).
#[derive(Debug, Clone)]
struct Info {
@@ -165,8 +118,8 @@ struct Info {
/// Number of tabs or spaces of indentation before the opening fence
/// sequence.
prefix: usize,
- /// Kind of fences.
- kind: Kind,
+ /// Marker of fences (`u8`).
+ marker: u8,
}
/// Start of fenced code.
@@ -178,15 +131,20 @@ struct Info {
/// | ~~~
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
if tokenizer.parse_state.constructs.code_fenced {
tokenizer.enter(Token::CodeFenced);
tokenizer.enter(Token::CodeFencedFence);
- tokenizer.go(space_or_tab_min_max(0, max), before_sequence_open)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before_sequence_open,
+ )(tokenizer)
} else {
State::Nok
}
@@ -210,23 +168,22 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
tokenizer.parse_state.bytes,
&Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),
)
- .size();
+ .len();
}
}
- match tokenizer.current {
- Some(byte) if matches!(byte, b'`' | b'~') => {
- tokenizer.enter(Token::CodeFencedFenceSequence);
- sequence_open(
- tokenizer,
- Info {
- prefix,
- size: 0,
- kind: Kind::from_byte(byte),
- },
- )
- }
- _ => State::Nok,
+ if let Some(b'`' | b'~') = tokenizer.current {
+ tokenizer.enter(Token::CodeFencedFenceSequence);
+ sequence_open(
+ tokenizer,
+ Info {
+ prefix,
+ size: 0,
+ marker: tokenizer.current.unwrap(),
+ },
+ )
+ } else {
+ State::Nok
}
}
@@ -240,7 +197,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
tokenizer.consume();
State::Fn(Box::new(|t| {
info.size += 1;
@@ -302,7 +259,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
tokenizer.exit(Token::CodeFencedFenceInfo);
tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)
}
- Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+ Some(b'`') if info.marker == b'`' => State::Nok,
Some(_) => {
tokenizer.consume();
State::Fn(Box::new(|t| info_inside(t, info)))
@@ -352,7 +309,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
tokenizer.concrete = true;
at_break(tokenizer, info)
}
- Some(b'`') if info.kind == Kind::GraveAccent => State::Nok,
+ Some(b'`') if info.marker == b'`' => State::Nok,
_ => {
tokenizer.consume();
State::Fn(Box::new(|t| meta(t, info)))
@@ -432,14 +389,18 @@ fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ^
/// ```
fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
tokenizer.enter(Token::CodeFencedFence);
- tokenizer.go(space_or_tab_min_max(0, max), |t| close_before(t, info))(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ |t| close_before(t, info),
+ )(tokenizer)
}
/// In a closing fence, after optional whitespace, before sequence.
@@ -452,7 +413,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
tokenizer.enter(Token::CodeFencedFenceSequence);
close_sequence(tokenizer, info, 0)
}
@@ -470,7 +431,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {
tokenizer.consume();
State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))
}
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 4a3a9f6..81a3080 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -62,11 +62,11 @@ use crate::tokenizer::{State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
// Do not interrupt paragraphs.
- if tokenizer.interrupt || !tokenizer.parse_state.constructs.code_indented {
- State::Nok
- } else {
+ if !tokenizer.interrupt && tokenizer.parse_state.constructs.code_indented {
tokenizer.enter(Token::CodeIndented);
tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer)
+ } else {
+ State::Nok
}
}
@@ -129,29 +129,26 @@ fn after(tokenizer: &mut Tokenizer) -> State {
/// | bbb
/// ```
fn further_start(tokenizer: &mut Tokenizer) -> State {
- if tokenizer.lazy {
- State::Nok
- } else {
- match tokenizer.current {
- Some(b'\n') => {
- tokenizer.enter(Token::LineEnding);
- tokenizer.consume();
- tokenizer.exit(Token::LineEnding);
- State::Fn(Box::new(further_start))
- }
- _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
- Box::new(if ok { further_end } else { further_begin })
- })(tokenizer),
+ match tokenizer.current {
+ Some(b'\n') if !tokenizer.lazy => {
+ tokenizer.enter(Token::LineEnding);
+ tokenizer.consume();
+ tokenizer.exit(Token::LineEnding);
+ State::Fn(Box::new(further_start))
}
+ _ if !tokenizer.lazy => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
+ Box::new(if ok { further_end } else { further_begin })
+ })(tokenizer),
+ _ => State::Nok,
}
}
-/// After a proper indent.
+/// At an eol, which is followed by an indented line.
///
/// ```markdown
-/// | aaa
-/// > | bbb
-/// ^
+/// > | aaa
+/// ^
+/// | bbb
/// ```
fn further_end(_tokenizer: &mut Tokenizer) -> State {
State::Ok
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index b36a208..d70fbc2 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -95,14 +95,13 @@ use crate::tokenizer::{State, Tokenizer};
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let len = tokenizer.events.len();
-
match tokenizer.current {
Some(b'`')
if tokenizer.parse_state.constructs.code_text
&& (tokenizer.previous != Some(b'`')
- || (len > 0
- && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) =>
+ || (!tokenizer.events.is_empty()
+ && tokenizer.events[tokenizer.events.len() - 1].token_type
+ == Token::CharacterEscape)) =>
{
tokenizer.enter(Token::CodeText);
tokenizer.enter(Token::CodeTextSequence);
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index 14755c9..bd7df82 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -110,17 +110,18 @@ use crate::util::skip::opt_back as skip_opt_back;
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let definition_before = !tokenizer.events.is_empty()
- && tokenizer.events[skip_opt_back(
- &tokenizer.events,
- tokenizer.events.len() - 1,
- &[Token::LineEnding, Token::SpaceOrTab],
- )]
- .token_type
- == Token::Definition;
-
// Do not interrupt paragraphs (but do follow definitions).
- if (!tokenizer.interrupt || definition_before) && tokenizer.parse_state.constructs.definition {
+ let possible = !tokenizer.interrupt
+ || (!tokenizer.events.is_empty()
+ && tokenizer.events[skip_opt_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Token::LineEnding, Token::SpaceOrTab],
+ )]
+ .token_type
+ == Token::Definition);
+
+ if possible && tokenizer.parse_state.constructs.definition {
tokenizer.enter(Token::Definition);
// Note: arbitrary whitespace allowed even if code (indented) is on.
tokenizer.attempt_opt(space_or_tab(), before)(tokenizer)
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index cdbc192..d09bf54 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -54,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => {
tokenizer.enter(Token::HardBreakEscape);
tokenizer.consume();
- State::Fn(Box::new(inside))
+ State::Fn(Box::new(after))
}
_ => State::Nok,
}
@@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// | b
/// ```
-fn inside(tokenizer: &mut Tokenizer) -> State {
+fn after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
Some(b'\n') => {
tokenizer.exit(Token::HardBreakEscape);
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 9a73b77..aa388ee 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -66,15 +66,19 @@ use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer};
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.heading_atx {
tokenizer.enter(Token::HeadingAtx);
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -101,19 +105,19 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// > | ## aa
/// ^
/// ```
-fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
+fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- None | Some(b'\n') if rank > 0 => {
+ None | Some(b'\n') if size > 0 => {
tokenizer.exit(Token::HeadingAtxSequence);
at_break(tokenizer)
}
- Some(b'#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+ Some(b'#') if size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |tokenizer| {
- sequence_open(tokenizer, rank + 1)
+ sequence_open(tokenizer, size + 1)
}))
}
- _ if rank > 0 => {
+ _ if size > 0 => {
tokenizer.exit(Token::HeadingAtxSequence);
tokenizer.go(space_or_tab(), at_break)(tokenizer)
}
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index 2a4adbf..98d7843 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -63,52 +63,6 @@ use crate::token::Token;
use crate::tokenizer::{EventType, State, Tokenizer};
use crate::util::skip::opt_back as skip_opt_back;
-/// Kind of underline.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Kind {
- /// Dash (rank 2) heading.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// alpha
- /// -----
- /// ```
- Dash,
-
- /// Equals to (rank 1) heading.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// alpha
- /// =====
- /// ```
- EqualsTo,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- Kind::Dash => b'-',
- Kind::EqualsTo => b'=',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `-` or `=`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'-' => Kind::Dash,
- b'=' => Kind::EqualsTo,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// At a line ending, presumably an underline.
///
/// ```markdown
@@ -117,23 +71,29 @@ impl Kind {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
- let paragraph_before = !tokenizer.events.is_empty()
- && tokenizer.events[skip_opt_back(
- &tokenizer.events,
- tokenizer.events.len() - 1,
- &[Token::LineEnding, Token::SpaceOrTab],
- )]
- .token_type
- == Token::Paragraph;
-
- // Require a paragraph before and do not allow on a lazy line.
- if paragraph_before && !tokenizer.lazy && tokenizer.parse_state.constructs.heading_setext {
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ if tokenizer.parse_state.constructs.heading_setext
+ && !tokenizer.lazy
+ // Require a paragraph before.
+ && (!tokenizer.events.is_empty()
+ && tokenizer.events[skip_opt_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Token::LineEnding, Token::SpaceOrTab],
+ )]
+ .token_type
+ == Token::Paragraph)
+ {
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -148,9 +108,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if matches!(byte, b'-' | b'=') => {
+ Some(b'-' | b'=') => {
tokenizer.enter(Token::HeadingSetextUnderline);
- inside(tokenizer, Kind::from_byte(byte))
+ inside(tokenizer, tokenizer.current.unwrap())
}
_ => State::Nok,
}
@@ -163,11 +123,11 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// > | ==
/// ^
/// ```
-fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
+fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {
match tokenizer.current {
- Some(byte) if byte == kind.as_byte() => {
+ Some(b'-' | b'=') if tokenizer.current.unwrap() == marker => {
tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, kind)))
+ State::Fn(Box::new(move |t| inside(t, marker)))
}
_ => {
tokenizer.exit(Token::HeadingSetextUnderline);
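
The removed `Kind` enum round-tripped through `as_byte`/`from_byte`; the patch carries the marker byte itself instead. A sketch of the idea (free function with illustrative names, not crate code):

```rust
// The byte is the kind: no enum, no conversion, no unreachable!() arm.
fn underline_marker(byte: u8) -> Option<u8> {
    match byte {
        b'-' | b'=' => Some(byte),
        _ => None,
    }
}

fn main() {
    assert_eq!(underline_marker(b'='), Some(b'='));
    assert_eq!(underline_marker(b'-'), Some(b'-'));
    assert_eq!(underline_marker(b'x'), None);
}
```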
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 5860c5d..064da35 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -98,17 +98,17 @@
//! [html_block_names]: crate::constant::HTML_BLOCK_NAMES
//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
-use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE};
+use crate::constant::{
+ HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE,
+};
use crate::construct::{
blank_line::start as blank_line,
partial_non_lazy_continuation::start as partial_non_lazy_continuation,
partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
};
use crate::token::Token;
-use crate::tokenizer::{Point, State, Tokenizer};
-use crate::util::slice::{Position, Slice};
-
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
+use crate::tokenizer::{State, Tokenizer};
+use crate::util::slice::Slice;
/// Kind of HTML (flow).
#[derive(Debug, PartialEq)]
@@ -129,49 +129,6 @@ enum Kind {
Complete,
}
-/// Type of quote, if we’re in a quoted attribute, in complete (condition 7).
-#[derive(Debug, PartialEq)]
-enum QuoteKind {
- /// In a double quoted (`"`) attribute value.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// <a b="c" />
- /// ```
- Double,
- /// In a single quoted (`'`) attribute value.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// <a b='c' />
- /// ```
- Single,
-}
-
-impl QuoteKind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- QuoteKind::Double => b'"',
- QuoteKind::Single => b'\'',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `"` or `'`.
- fn from_byte(byte: u8) -> QuoteKind {
- match byte {
- b'"' => QuoteKind::Double,
- b'\'' => QuoteKind::Single,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse HTML (flow).
#[derive(Debug)]
struct Info {
@@ -179,12 +136,10 @@ struct Info {
kind: Kind,
/// Whether this is a start tag (`<` not followed by `/`).
start_tag: bool,
- /// Used depending on `kind` to collect all parsed bytes.
- start: Option<Point>,
- /// Collected index, for various reasons.
- size: usize,
+ /// Start index of a tag name or cdata prefix.
+ start: usize,
/// Current quote, when in a double or single quoted attribute value.
- quote: Option<QuoteKind>,
+ quote: u8,
}
/// Start of HTML (flow), before optional whitespace.
@@ -194,19 +149,17 @@ struct Info {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.html_flow {
tokenizer.enter(Token::HtmlFlow);
tokenizer.go(
space_or_tab_with_options(SpaceOrTabOptions {
kind: Token::HtmlFlowData,
min: 0,
- max,
+ max: if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
connect: false,
content_type: None,
}),
@@ -249,9 +202,8 @@ fn open(tokenizer: &mut Tokenizer) -> State {
kind: Kind::Basic,
// Assume closing tag (or no tag).
start_tag: false,
- start: None,
- size: 0,
- quote: None,
+ start: 0,
+ quote: 0,
};
match tokenizer.current {
@@ -261,7 +213,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
}
Some(b'/') => {
tokenizer.consume();
- info.start = Some(tokenizer.point.clone());
+ info.start = tokenizer.point.index;
State::Fn(Box::new(|t| tag_close_start(t, info)))
}
Some(b'?') => {
@@ -273,9 +225,10 @@ fn open(tokenizer: &mut Tokenizer) -> State {
// right now, so we do need to search for `>`, similar to declarations.
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
info.start_tag = true;
- info.start = Some(tokenizer.point.clone());
+ info.start = tokenizer.point.index;
tag_name(tokenizer, info)
}
_ => State::Nok,
@@ -299,12 +252,6 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
info.kind = Kind::Comment;
State::Fn(Box::new(|t| comment_open_inside(t, info)))
}
- Some(b'[') => {
- tokenizer.consume();
- info.kind = Kind::Cdata;
- info.size = 0;
- State::Fn(Box::new(|t| cdata_open_inside(t, info)))
- }
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
info.kind = Kind::Declaration;
@@ -312,6 +259,12 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.concrete = true;
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
+ Some(b'[') => {
+ tokenizer.consume();
+ info.kind = Kind::Cdata;
+ info.start = tokenizer.point.index;
+ State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+ }
_ => State::Nok,
}
}
@@ -342,12 +295,11 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == CDATA_SEARCH[info.size] => {
- info.size += 1;
+ Some(byte) if byte == HTML_CDATA_PREFIX[tokenizer.point.index - info.start] => {
tokenizer.consume();
- if info.size == CDATA_SEARCH.len() {
- info.size = 0;
+ if tokenizer.point.index - info.start == HTML_CDATA_PREFIX.len() {
+ info.start = 0;
// Do not form containers.
tokenizer.concrete = true;
State::Fn(Box::new(|t| continuation(t, info)))
@@ -367,6 +319,7 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| tag_name(t, info)))
@@ -387,17 +340,18 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => {
let slash = matches!(tokenizer.current, Some(b'/'));
- let start = info.start.take().unwrap();
- let name = Slice::from_position(
+ // Guaranteed to be valid ASCII bytes.
+ let slice = Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &start,
- end: &tokenizer.point,
- },
- )
- .serialize()
- .trim()
- .to_lowercase();
+ info.start,
+ tokenizer.point.index,
+ );
+ let name = slice
+ .as_str()
+ // The line ending case might result in a `\r` that is already accounted for.
+ .trim()
+ .to_ascii_lowercase();
+ info.start = 0;
if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
info.kind = Kind::Raw;
@@ -427,6 +381,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
}
}
}
+ // ASCII alphanumerical and `-`.
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| tag_name(t, info)))
@@ -490,18 +445,19 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ Some(b'\t' | b' ') => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
+ }
Some(b'/') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_end(t, info)))
}
+ // ASCII alphanumerical and `:` and `_`.
Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
}
- Some(b'\t' | b' ') => {
- tokenizer.consume();
- State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
- }
_ => complete_end(tokenizer, info),
}
}
@@ -518,6 +474,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat
/// ```
fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ // ASCII alphanumerical and `-`, `.`, `:`, and `_`.
Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
@@ -537,14 +494,14 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(b'=') => {
- tokenizer.consume();
- State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
- }
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))
}
+ Some(b'=') => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
+ }
_ => complete_attribute_name_before(tokenizer, info),
}
}
@@ -561,15 +518,15 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State
fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
- Some(byte) if matches!(byte, b'"' | b'\'') => {
- info.quote = Some(QuoteKind::from_byte(byte));
- tokenizer.consume();
- State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
- }
Some(b'\t' | b' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
}
+ Some(b'"' | b'\'') => {
+ info.quote = tokenizer.current.unwrap();
+ tokenizer.consume();
+ State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
+ }
_ => complete_attribute_value_unquoted(tokenizer, info),
}
}
@@ -585,7 +542,7 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) ->
fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
None | Some(b'\n') => State::Nok,
- Some(byte) if byte == info.quote.as_ref().unwrap().as_byte() => {
+ Some(b'"' | b'\'') if tokenizer.current.unwrap() == info.quote => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))
}
@@ -673,6 +630,21 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
+ tokenizer.exit(Token::HtmlFlowData);
+ tokenizer.check(blank_line_before, |ok| {
+ if ok {
+ Box::new(continuation_after)
+ } else {
+ Box::new(move |t| continuation_start(t, info))
+ }
+ })(tokenizer)
+ }
+ // Note: important that this is after the basic/complete case.
+ None | Some(b'\n') => {
+ tokenizer.exit(Token::HtmlFlowData);
+ continuation_start(tokenizer, info)
+ }
Some(b'-') if info.kind == Kind::Comment => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_comment_inside(t, info)))
@@ -693,20 +665,6 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))
}
- Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
- tokenizer.exit(Token::HtmlFlowData);
- tokenizer.check(blank_line_before, |ok| {
- if ok {
- Box::new(continuation_after)
- } else {
- Box::new(move |t| continuation_start(t, info))
- }
- })(tokenizer)
- }
- None | Some(b'\n') => {
- tokenizer.exit(Token::HtmlFlowData);
- continuation_start(tokenizer, info)
- }
_ => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation(t, info)))
@@ -793,7 +751,7 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
match tokenizer.current {
Some(b'/') => {
tokenizer.consume();
- info.start = Some(tokenizer.point.clone());
+ info.start = tokenizer.point.index;
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => continuation(tokenizer, info),
@@ -809,18 +767,15 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State
fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
Some(b'>') => {
- info.size = 0;
-
- let start = info.start.take().unwrap();
- let name = Slice::from_position(
+ // Guaranteed to be valid ASCII bytes.
+ let slice = Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &start,
- end: &tokenizer.point,
- },
- )
- .serialize()
- .to_lowercase();
+ info.start,
+ tokenizer.point.index,
+ );
+ let name = slice.as_str().to_ascii_lowercase();
+
+ info.start = 0;
if HTML_RAW_NAMES.contains(&name.as_str()) {
tokenizer.consume();
@@ -829,13 +784,14 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State
continuation(tokenizer, info)
}
}
- Some(b'A'..=b'Z' | b'a'..=b'z') if info.size < HTML_RAW_SIZE_MAX => {
+ Some(b'A'..=b'Z' | b'a'..=b'z')
+ if tokenizer.point.index - info.start < HTML_RAW_SIZE_MAX =>
+ {
tokenizer.consume();
- info.size += 1;
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => {
- info.size = 0;
+ info.start = 0;
continuation(tokenizer, info)
}
}
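
Both CDATA sites now share one constant and derive the match offset from `tokenizer.point.index - info.start` instead of a separate `size` counter. A sketch, under the assumption that `HTML_CDATA_PREFIX` in `src/constant.rs` is the byte string `CDATA[` (its definition is not shown in this diff):

```rust
// Hypothetical stand-in for the shared constant added in `src/constant.rs`.
const HTML_CDATA_PREFIX: &[u8] = b"CDATA[";

// Match the prefix by offset from a recorded `start` index, as the new
// `cdata_open_inside` does.
fn matches_prefix_at(start: usize, index: usize, byte: u8) -> bool {
    let offset = index - start;
    offset < HTML_CDATA_PREFIX.len() && HTML_CDATA_PREFIX[offset] == byte
}

fn main() {
    let start = 10;
    assert!(matches_prefix_at(start, 10, b'C'));
    assert!(matches_prefix_at(start, 15, b'['));
    assert!(!matches_prefix_at(start, 11, b'C')); // expects `D` here
}
```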
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index f10a476..51beda5 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -54,12 +54,11 @@
//! [html_flow]: crate::construct::html_flow
//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
+use crate::constant::HTML_CDATA_PREFIX;
use crate::construct::partial_space_or_tab::space_or_tab;
use crate::token::Token;
use crate::tokenizer::{State, StateFn, Tokenizer};
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
-
/// Start of HTML (text)
///
/// ```markdown
@@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(instruction))
}
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open))
@@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(comment_open_inside))
}
- Some(b'[') => {
- tokenizer.consume();
- State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
- }
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(declaration))
}
+ Some(b'[') => {
+ tokenizer.consume();
+ State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
+ }
_ => State::Nok,
}
}
@@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
/// > | a <![CDATA[>&<]]> b
/// ^^^^^^
/// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
- match tokenizer.current {
- Some(byte) if byte == CDATA_SEARCH[index] => {
- tokenizer.consume();
+fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State {
+ if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) {
+ tokenizer.consume();
- if index + 1 == CDATA_SEARCH.len() {
- State::Fn(Box::new(cdata))
- } else {
- State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1)))
- }
+ if size + 1 == HTML_CDATA_PREFIX.len() {
+ State::Fn(Box::new(cdata))
+ } else {
+ State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1)))
}
- _ => State::Nok,
+ } else {
+ State::Nok
}
}
@@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphabetical.
Some(b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_close))
@@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphanumerical and `-`.
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_close))
@@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphanumerical and `-`.
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open))
@@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(end))
}
+ // ASCII alphabetical and `:` and `_`.
Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_name))
@@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
+ // ASCII alphabetical and `-`, `.`, `:`, and `_`.
Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_name))
@@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_before))
}
- Some(byte) if byte == b'"' || byte == b'\'' => {
+ Some(b'"' | b'\'') => {
+ let marker = tokenizer.current.unwrap();
tokenizer.consume();
- State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte)))
+ State::Fn(Box::new(move |t| {
+ tag_open_attribute_value_quoted(t, marker)
+ }))
}
Some(_) => {
tokenizer.consume();
@@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta
tokenizer,
Box::new(move |t| tag_open_attribute_value_quoted(t, marker)),
),
- Some(byte) if byte == marker => {
+ Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_quoted_after))
}
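
`html_text` applies the same byte-marker idea to quoted attribute values: remember the opening quote byte and scan until the same byte recurs. A standalone sketch with illustrative names:

```rust
// Find the index of the closing quote matching the quote at `open`.
fn quoted_value_end(bytes: &[u8], open: usize) -> Option<usize> {
    let marker = bytes[open]; // b'"' or b'\''
    bytes[open + 1..]
        .iter()
        .position(|&b| b == marker)
        .map(|i| open + 1 + i)
}

fn main() {
    let html = b"<a b='c\"d'>";
    // The `"` inside the single-quoted value does not terminate it.
    assert_eq!(quoted_value_end(html, 5), Some(9));
}
```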
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 6399f81..a1ec8d9 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -214,16 +214,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
media: Media {
start: label_start.start,
end: (label_end_start, label_end_start + 3),
- // To do: virtual spaces not needed, create a `to_str`?
id: normalize_identifier(
- &Slice::from_position(
+ // We don’t care about virtual spaces, so `indices` and `as_str` are fine.
+ Slice::from_indices(
tokenizer.parse_state.bytes,
- &Position {
- start: &tokenizer.events[label_start.start.1].point,
- end: &tokenizer.events[label_end_start - 1].point,
- },
+ tokenizer.events[label_start.start.1].point.index,
+ tokenizer.events[label_end_start - 1].point.index,
)
- .serialize(),
+ .as_str(),
),
},
};
@@ -366,11 +364,11 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ^
/// ```
fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State {
- let label_start = tokenizer
+ tokenizer
.label_start_stack
.get_mut(label_start_index)
- .unwrap();
- label_start.balanced = true;
+ .unwrap()
+ .balanced = true;
State::Nok
}
@@ -529,23 +527,24 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
- let end = skip::to_back(
- &tokenizer.events,
- tokenizer.events.len() - 1,
- &[Token::ReferenceString],
- );
-
- // To do: virtual spaces not needed, create a `to_str`?
- let id = Slice::from_position(
- tokenizer.parse_state.bytes,
- &Position::from_exit_event(&tokenizer.events, end),
- )
- .serialize();
-
if tokenizer
.parse_state
.definitions
- .contains(&normalize_identifier(&id))
+ // We don’t care about virtual spaces, so `as_str` is fine.
+ .contains(&normalize_identifier(
+ Slice::from_position(
+ tokenizer.parse_state.bytes,
+ &Position::from_exit_event(
+ &tokenizer.events,
+ skip::to_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Token::ReferenceString],
+ ),
+ ),
+ )
+ .as_str(),
+ ))
{
State::Ok
} else {
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index d30b8dd..4a3508e 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -64,9 +64,8 @@ pub fn open(tokenizer: &mut Tokenizer) -> State {
tokenizer.consume();
tokenizer.exit(Token::LabelMarker);
tokenizer.exit(Token::LabelImage);
- let end = tokenizer.events.len() - 1;
tokenizer.label_start_stack.push(LabelStart {
- start: (end - 5, end),
+ start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1),
balanced: false,
inactive: false,
});
diff --git a/src/construct/list.rs b/src/construct/list.rs
index 9b59130..d5a9899 100644
--- a/src/construct/list.rs
+++ b/src/construct/list.rs
@@ -56,69 +56,6 @@ use crate::util::{
slice::{Position, Slice},
};
-/// Type of list.
-#[derive(Debug, PartialEq)]
-enum Kind {
- /// In a dot (`.`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// 1. a
- /// ```
- Dot,
- /// In a paren (`)`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// 1) a
- /// ```
- Paren,
- /// In an asterisk (`*`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// * a
- /// ```
- Asterisk,
- /// In a plus (`+`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// + a
- /// ```
- Plus,
- /// In a dash (`-`) list item.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// - a
- /// ```
- Dash,
-}
-
-impl Kind {
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `.`, `)`, `*`, `+`, or `-`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'.' => Kind::Dot,
- b')' => Kind::Paren,
- b'*' => Kind::Asterisk,
- b'+' => Kind::Plus,
- b'-' => Kind::Dash,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// Start of list item.
///
/// ```markdown
@@ -126,15 +63,19 @@ impl Kind {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.list {
tokenizer.enter(Token::ListItem);
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -149,15 +90,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
// Unordered.
- Some(b'*' | b'+' | b'-') => tokenizer.check(thematic_break, |ok| {
+ Some(b'*' | b'-') => tokenizer.check(thematic_break, |ok| {
Box::new(if ok { nok } else { before_unordered })
})(tokenizer),
+ Some(b'+') => before_unordered(tokenizer),
// Ordered.
- Some(byte) if byte.is_ascii_digit() && (!tokenizer.interrupt || byte == b'1') => {
- tokenizer.enter(Token::ListItemPrefix);
- tokenizer.enter(Token::ListItemValue);
- inside(tokenizer, 0)
- }
+ Some(b'0'..=b'9') if !tokenizer.interrupt => before_ordered(tokenizer),
+ Some(b'1') => before_ordered(tokenizer),
_ => State::Nok,
}
}
@@ -175,6 +114,18 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
marker(tokenizer)
}
+/// Start of an ordered list item.
+///
+/// ```markdown
+/// > | 1. a
+/// ^
+/// ```
+fn before_ordered(tokenizer: &mut Tokenizer) -> State {
+ tokenizer.enter(Token::ListItemPrefix);
+ tokenizer.enter(Token::ListItemValue);
+ inside(tokenizer, 0)
+}
+
/// In an ordered list item value.
///
/// ```markdown
@@ -183,14 +134,14 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Some(byte) if byte.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
- tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, size + 1)))
- }
Some(b'.' | b')') if !tokenizer.interrupt || size < 2 => {
tokenizer.exit(Token::ListItemValue);
marker(tokenizer)
}
+ Some(b'0'..=b'9') if size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
+ tokenizer.consume();
+ State::Fn(Box::new(move |t| inside(t, size + 1)))
+ }
_ => State::Nok,
}
}
@@ -262,7 +213,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
- if matches!(tokenizer.current, Some(b'\t' | b' ')) {
+ if let Some(b'\t' | b' ') = tokenizer.current {
State::Nok
} else {
State::Ok
@@ -309,7 +260,7 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State {
end: &tokenizer.point,
},
)
- .size();
+ .len();
if blank {
prefix += 1;
@@ -389,8 +340,8 @@ fn nok(_tokenizer: &mut Tokenizer) -> State {
pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
let mut index = 0;
let mut balance = 0;
- let mut lists_wip: Vec<(Kind, usize, usize, usize)> = vec![];
- let mut lists: Vec<(Kind, usize, usize, usize)> = vec![];
+ let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![];
+ let mut lists: Vec<(u8, usize, usize, usize)> = vec![];
// Merge list items.
while index < tokenizer.events.len() {
@@ -400,12 +351,14 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
if event.event_type == EventType::Enter {
let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1;
let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]);
- let kind = Kind::from_byte(
- Slice::from_point(tokenizer.parse_state.bytes, &tokenizer.events[marker].point)
- .head()
- .unwrap(),
- );
- let current = (kind, balance, index, end);
+ // Guaranteed to be a valid ASCII byte.
+ let marker = Slice::from_index(
+ tokenizer.parse_state.bytes,
+ tokenizer.events[marker].point.index,
+ )
+ .head()
+ .unwrap();
+ let current = (marker, balance, index, end);
let mut list_index = lists_wip.len();
let mut matched = false;
@@ -475,7 +428,7 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
let mut list_start = tokenizer.events[list_item.2].clone();
let mut list_end = tokenizer.events[list_item.3].clone();
let token_type = match list_item.0 {
- Kind::Paren | Kind::Dot => Token::ListOrdered,
+ b'.' | b')' => Token::ListOrdered,
_ => Token::ListUnordered,
};
list_start.token_type = token_type.clone();
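
Two behavioral details in the new `before` are worth spelling out: `+` skips the thematic-break check because thematic breaks only use `*`, `-`, and `_`, and when interrupting a paragraph only an ordered item starting with `1` may open a list. A sketch of those guards (illustrative helpers, not crate API):

```rust
// Mirrors the old combined guard that the patch splits into two arms.
fn can_start_ordered(byte: u8, interrupt: bool) -> bool {
    byte.is_ascii_digit() && (!interrupt || byte == b'1')
}

// Deciding ordered vs. unordered from the plain marker byte, as
// `resolve_list_item` now does with its `(u8, ..)` tuples.
fn is_ordered_marker(marker: u8) -> bool {
    matches!(marker, b'.' | b')')
}

fn main() {
    assert!(can_start_ordered(b'3', false));
    assert!(!can_start_ordered(b'3', true)); // interrupting: only `1.`
    assert!(can_start_ordered(b'1', true));
    assert!(is_ordered_marker(b')'));
    assert!(!is_ordered_marker(b'*'));
}
```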
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index 146dc40..ec5669c 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -81,10 +81,9 @@ fn inside(tokenizer: &mut Tokenizer) -> State {
/// Merge “`Paragraph`”s, which currently span a single line, into actual
/// `Paragraph`s that span multiple lines.
pub fn resolve(tokenizer: &mut Tokenizer) {
- let len = tokenizer.events.len();
let mut index = 0;
- while index < len {
+ while index < tokenizer.events.len() {
let event = &tokenizer.events[index];
if event.event_type == EventType::Enter && event.token_type == Token::Paragraph {
diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs
index be8d6c8..155a1a3 100644
--- a/src/construct/partial_bom.rs
+++ b/src/construct/partial_bom.rs
@@ -10,13 +10,12 @@ use crate::tokenizer::{State, Tokenizer};
/// ^^^^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Some(0xEF) => {
- tokenizer.enter(Token::ByteOrderMark);
- tokenizer.consume();
- State::Fn(Box::new(cont))
- }
- _ => State::Nok,
+ if tokenizer.current == Some(0xEF) {
+ tokenizer.enter(Token::ByteOrderMark);
+ tokenizer.consume();
+ State::Fn(Box::new(cont))
+ } else {
+ State::Nok
}
}
@@ -27,12 +26,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^^^^
/// ```
fn cont(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Some(0xBB) => {
- tokenizer.consume();
- State::Fn(Box::new(end))
- }
- _ => State::Nok,
+ if tokenizer.current == Some(0xBB) {
+ tokenizer.consume();
+ State::Fn(Box::new(end))
+ } else {
+ State::Nok
}
}
@@ -43,12 +41,11 @@ fn cont(tokenizer: &mut Tokenizer) -> State {
/// ^^^^
/// ```
fn end(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Some(0xBF) => {
- tokenizer.consume();
- tokenizer.exit(Token::ByteOrderMark);
- State::Ok
- }
- _ => State::Nok,
+ if tokenizer.current == Some(0xBF) {
+ tokenizer.consume();
+ tokenizer.exit(Token::ByteOrderMark);
+ State::Ok
+ } else {
+ State::Nok
}
}
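
The three states above just match the UTF-8 byte order mark, byte by byte. The equivalent one-shot check over a whole buffer, as a sketch:

```rust
// 0xEF 0xBB 0xBF is the UTF-8 encoding of U+FEFF.
fn starts_with_bom(bytes: &[u8]) -> bool {
    bytes.starts_with(&[0xEF, 0xBB, 0xBF])
}

fn main() {
    assert!(starts_with_bom(b"\xEF\xBB\xBFhello"));
    assert!(!starts_with_bom(b"hello"));
}
```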
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 0a3721c..809aa27 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -125,8 +125,8 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
tokenizer.exit(info.options.marker.clone());
State::Fn(Box::new(|t| enclosed_before(t, info)))
}
- None | Some(b' ' | b')') => State::Nok,
- Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok,
+ // ASCII control, space, closing paren, but *not* `\0`.
+ None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok,
Some(_) => {
tokenizer.enter(info.options.destination.clone());
tokenizer.enter(info.options.raw.clone());
@@ -166,12 +166,12 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
+ None | Some(b'\n' | b'<') => State::Nok,
Some(b'>') => {
tokenizer.exit(Token::Data);
tokenizer.exit(info.options.string.clone());
enclosed_before(tokenizer, info)
}
- None | Some(b'\n' | b'<') => State::Nok,
Some(b'\\') => {
tokenizer.consume();
State::Fn(Box::new(|t| enclosed_escape(t, info)))
@@ -207,40 +207,25 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(b'(') => {
- if info.balance >= info.options.limit {
- State::Nok
- } else {
- tokenizer.consume();
- info.balance += 1;
- State::Fn(Box::new(move |t| raw(t, info)))
- }
+ None | Some(b'\t' | b'\n' | b' ' | b')') if info.balance == 0 => {
+ tokenizer.exit(Token::Data);
+ tokenizer.exit(info.options.string.clone());
+ tokenizer.exit(info.options.raw.clone());
+ tokenizer.exit(info.options.destination);
+ State::Ok
}
- Some(b')') => {
- if info.balance == 0 {
- tokenizer.exit(Token::Data);
- tokenizer.exit(info.options.string.clone());
- tokenizer.exit(info.options.raw.clone());
- tokenizer.exit(info.options.destination);
- State::Ok
- } else {
- tokenizer.consume();
- info.balance -= 1;
- State::Fn(Box::new(move |t| raw(t, info)))
- }
+ Some(b'(') if info.balance < info.options.limit => {
+ tokenizer.consume();
+ info.balance += 1;
+ State::Fn(Box::new(move |t| raw(t, info)))
}
- None | Some(b'\t' | b'\n' | b' ') => {
- if info.balance > 0 {
- State::Nok
- } else {
- tokenizer.exit(Token::Data);
- tokenizer.exit(info.options.string.clone());
- tokenizer.exit(info.options.raw.clone());
- tokenizer.exit(info.options.destination);
- State::Ok
- }
+ // ASCII control (but *not* `\0`) and space and `(`.
+ None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => State::Nok,
+ Some(b')') => {
+ tokenizer.consume();
+ info.balance -= 1;
+ State::Fn(Box::new(move |t| raw(t, info)))
}
- Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok,
Some(b'\\') => {
tokenizer.consume();
State::Fn(Box::new(move |t| raw_escape(t, info)))
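
The inlined range pattern `0x01..=0x1F | 0x7F` is meant to equal the old guard `byte != b'\0' && byte.is_ascii_control()`. A quick exhaustive check (standalone, not crate code):

```rust
fn old(byte: u8) -> bool {
    byte != b'\0' && byte.is_ascii_control()
}

fn new(byte: u8) -> bool {
    matches!(byte, 0x01..=0x1F | 0x7F)
}

fn main() {
    // `is_ascii_control` covers 0x00..=0x1F and 0x7F; dropping NUL leaves
    // exactly the inlined ranges.
    for byte in 0..=u8::MAX {
        assert_eq!(old(byte), new(byte), "disagree on {:#04x}", byte);
    }
}
```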
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index 7e40a2d..6fdb70d 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -123,39 +123,43 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
/// ^
/// ```
fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
- match tokenizer.current {
- None | Some(b'[') => State::Nok,
- Some(b']') if !info.data => State::Nok,
- _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok,
- Some(b']') => {
- tokenizer.exit(info.options.string.clone());
- tokenizer.enter(info.options.marker.clone());
- tokenizer.consume();
- tokenizer.exit(info.options.marker.clone());
- tokenizer.exit(info.options.label);
- State::Ok
- }
- Some(b'\n') => tokenizer.go(
- space_or_tab_eol_with_options(EolOptions {
- content_type: Some(ContentType::String),
- connect: info.connect,
- }),
- |t| {
- info.connect = true;
- at_break(t, info)
- },
- )(tokenizer),
- _ => {
- tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
-
- if info.connect {
- let index = tokenizer.events.len() - 1;
- link(&mut tokenizer.events, index);
- } else {
- info.connect = true;
+ if info.size > LINK_REFERENCE_SIZE_MAX
+ || matches!(tokenizer.current, None | Some(b'['))
+ || (matches!(tokenizer.current, Some(b']')) && !info.data)
+ {
+ State::Nok
+ } else {
+ match tokenizer.current {
+ Some(b'\n') => tokenizer.go(
+ space_or_tab_eol_with_options(EolOptions {
+ content_type: Some(ContentType::String),
+ connect: info.connect,
+ }),
+ |t| {
+ info.connect = true;
+ at_break(t, info)
+ },
+ )(tokenizer),
+ Some(b']') => {
+ tokenizer.exit(info.options.string.clone());
+ tokenizer.enter(info.options.marker.clone());
+ tokenizer.consume();
+ tokenizer.exit(info.options.marker.clone());
+ tokenizer.exit(info.options.label);
+ State::Ok
}
+ _ => {
+ tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
+
+ if info.connect {
+ let index = tokenizer.events.len() - 1;
+ link(&mut tokenizer.events, index);
+ } else {
+ info.connect = true;
+ }
- label(tokenizer, info)
+ label(tokenizer, info)
+ }
}
}
}
@@ -172,30 +176,19 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- _ if info.size > LINK_REFERENCE_SIZE_MAX => {
- tokenizer.exit(Token::Data);
- at_break(tokenizer, info)
- }
- Some(b'\t' | b' ') => {
- tokenizer.consume();
- info.size += 1;
- State::Fn(Box::new(|t| label(t, info)))
- }
- Some(b'\\') => {
- tokenizer.consume();
- info.size += 1;
- if !info.data {
- info.data = true;
- }
- State::Fn(Box::new(|t| escape(t, info)))
- }
- Some(_) => {
- tokenizer.consume();
- info.size += 1;
- if !info.data {
- info.data = true;
+ Some(byte) => {
+ if info.size > LINK_REFERENCE_SIZE_MAX {
+ tokenizer.exit(Token::Data);
+ at_break(tokenizer, info)
+ } else {
+ let func = if matches!(byte, b'\\') { escape } else { label };
+ tokenizer.consume();
+ info.size += 1;
+ if !info.data && !matches!(byte, b'\t' | b' ') {
+ info.data = true;
+ }
+ State::Fn(Box::new(move |t| func(t, info)))
}
- State::Fn(Box::new(|t| label(t, info)))
}
}
}
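
The collapsed `label` arm picks the follow-up state as a value and only flips `data` on non-blank bytes. A reduced model of that branch, with strings standing in for the state functions:

```rust
// Sketch: choose the next state and track whether real data was seen.
fn step(byte: u8, data: &mut bool) -> &'static str {
    let next = if byte == b'\\' { "escape" } else { "label" };
    if !*data && !matches!(byte, b'\t' | b' ') {
        *data = true;
    }
    next
}

fn main() {
    let mut data = false;
    assert_eq!(step(b' ', &mut data), "label");
    assert!(!data); // whitespace alone is not data
    assert_eq!(step(b'\\', &mut data), "escape");
    assert!(data);
}
```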
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index 80861af..9cf2f14 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -48,70 +48,13 @@ pub struct Options {
pub string: Token,
}
-/// Type of title.
-#[derive(Debug, PartialEq)]
-enum Kind {
- /// In a parenthesized (`(` and `)`) title.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// (a)
- /// ```
- Paren,
- /// In a double quoted (`"`) title.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// "a"
- /// ```
- Double,
- /// In a single quoted (`'`) title.
- ///
- /// ## Example
- ///
- /// ```markdown
- /// 'a'
- /// ```
- Single,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- ///
- /// > 👉 **Note**: a closing paren is used for `Kind::Paren`.
- fn as_byte(&self) -> u8 {
- match self {
- Kind::Paren => b')',
- Kind::Double => b'"',
- Kind::Single => b'\'',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `(`, `"`, or `'`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'(' => Kind::Paren,
- b'"' => Kind::Double,
- b'\'' => Kind::Single,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse titles.
#[derive(Debug)]
struct Info {
/// Whether we’ve seen data.
connect: bool,
- /// Kind of title.
- kind: Kind,
+ /// Closing marker.
+ marker: u8,
/// Configuration.
options: Options,
}
@@ -124,10 +67,11 @@ struct Info {
/// ```
pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
match tokenizer.current {
- Some(byte) if matches!(byte, b'"' | b'\'' | b'(') => {
+ Some(b'"' | b'\'' | b'(') => {
+ let marker = tokenizer.current.unwrap();
let info = Info {
connect: false,
- kind: Kind::from_byte(byte),
+ marker: if marker == b'(' { b')' } else { marker },
options,
};
tokenizer.enter(info.options.title.clone());
@@ -150,7 +94,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
/// ```
fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {
tokenizer.enter(info.options.marker.clone());
tokenizer.consume();
tokenizer.exit(info.options.marker.clone());
@@ -172,10 +116,6 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
- tokenizer.exit(info.options.string.clone());
- begin(tokenizer, info)
- }
None => State::Nok,
Some(b'\n') => tokenizer.go(
space_or_tab_eol_with_options(EolOptions {
@@ -187,7 +127,11 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
at_break(t, info)
},
)(tokenizer),
- _ => {
+ Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {
+ tokenizer.exit(info.options.string.clone());
+ begin(tokenizer, info)
+ }
+ Some(_) => {
tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
if info.connect {
@@ -210,21 +154,18 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn title(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ None | Some(b'\n') => {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- None | Some(b'\n') => {
+ Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- Some(b'\\') => {
+ Some(byte) => {
+ let func = if matches!(byte, b'\\') { escape } else { title };
tokenizer.consume();
- State::Fn(Box::new(|t| escape(t, info)))
- }
- _ => {
- tokenizer.consume();
- State::Fn(Box::new(|t| title(t, info)))
+ State::Fn(Box::new(move |t| func(t, info)))
}
}
}
@@ -237,7 +178,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn escape(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'"' | b'\'' | b')') => {
tokenizer.consume();
State::Fn(Box::new(|t| title(t, info)))
}
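
For titles, the `Kind` enum is replaced by storing the closing marker up front: `(` maps to `)`, quotes map to themselves. The mapping, extracted as a sketch:

```rust
fn closing_marker(opening: u8) -> u8 {
    if opening == b'(' { b')' } else { opening }
}

fn main() {
    assert_eq!(closing_marker(b'('), b')');
    assert_eq!(closing_marker(b'"'), b'"');
    assert_eq!(closing_marker(b'\''), b'\'');
}
```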
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index 13815cb..4f872ba 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -92,8 +92,7 @@ fn trim_data(
if trim_end {
let mut index = slice.bytes.len();
- let vs = slice.after;
- let mut spaces_only = vs == 0;
+ let mut spaces_only = slice.after == 0;
while index > 0 {
match slice.bytes[index - 1] {
b' ' => {}
@@ -105,10 +104,10 @@ fn trim_data(
}
let diff = slice.bytes.len() - index;
- let token_type = if spaces_only
- && hard_break
- && exit_index + 1 < tokenizer.events.len()
+ let token_type = if hard_break
+ && spaces_only
&& diff >= HARD_BREAK_PREFIX_SIZE_MIN
+ && exit_index + 1 < tokenizer.events.len()
{
Token::HardBreakTrailing
} else {
@@ -123,7 +122,7 @@ fn trim_data(
return;
}
- if diff > 0 || vs > 0 {
+ if diff > 0 || slice.after > 0 {
let exit_point = tokenizer.events[exit_index].point.clone();
let mut enter_point = exit_point.clone();
enter_point.index -= diff;
@@ -156,14 +155,11 @@ fn trim_data(
if trim_start {
let mut index = 0;
- let vs = slice.before;
while index < slice.bytes.len() {
match slice.bytes[index] {
- b' ' | b'\t' => {}
+ b' ' | b'\t' => index += 1,
_ => break,
}
-
- index += 1;
}
// The whole data is whitespace.
@@ -174,7 +170,7 @@ fn trim_data(
return;
}
- if index > 0 || vs > 0 {
+ if index > 0 || slice.before > 0 {
let enter_point = tokenizer.events[exit_index - 1].point.clone();
let mut exit_point = enter_point.clone();
exit_point.index += index;
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 4fc4dc4..785d132 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -53,64 +53,11 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN};
use crate::token::Token;
use crate::tokenizer::{State, Tokenizer};
-/// Type of thematic break.
-#[derive(Debug, PartialEq)]
-enum Kind {
- /// In a thematic break using asterisks (`*`).
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ***
- /// ```
- Asterisk,
- /// In a thematic break using dashes (`-`).
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ---
- /// ```
- Dash,
- /// In a thematic break using underscores (`_`).
- ///
- /// ## Example
- ///
- /// ```markdown
- /// ___
- /// ```
- Underscore,
-}
-
-impl Kind {
- /// Turn the kind into a byte ([u8]).
- fn as_byte(&self) -> u8 {
- match self {
- Kind::Asterisk => b'*',
- Kind::Dash => b'-',
- Kind::Underscore => b'_',
- }
- }
- /// Turn a byte ([u8]) into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `byte` is not `*`, `-`, or `_`.
- fn from_byte(byte: u8) -> Kind {
- match byte {
- b'*' => Kind::Asterisk,
- b'-' => Kind::Dash,
- b'_' => Kind::Underscore,
- _ => unreachable!("invalid byte"),
- }
- }
-}
-
/// State needed to parse thematic breaks.
#[derive(Debug)]
struct Info {
- /// Kind of marker.
- kind: Kind,
+ /// Marker.
+ marker: u8,
/// Number of markers.
size: usize,
}
@@ -122,15 +69,19 @@ struct Info {
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- let max = if tokenizer.parse_state.constructs.code_indented {
- TAB_SIZE - 1
- } else {
- usize::MAX
- };
-
if tokenizer.parse_state.constructs.thematic_break {
tokenizer.enter(Token::ThematicBreak);
- tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+ tokenizer.go(
+ space_or_tab_min_max(
+ 0,
+ if tokenizer.parse_state.constructs.code_indented {
+ TAB_SIZE - 1
+ } else {
+ usize::MAX
+ },
+ ),
+ before,
+ )(tokenizer)
} else {
State::Nok
}
@@ -144,10 +95,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Some(byte) if matches!(byte, b'*' | b'-' | b'_') => at_break(
+ Some(b'*' | b'-' | b'_') => at_break(
tokenizer,
Info {
- kind: Kind::from_byte(byte),
+ marker: tokenizer.current.unwrap(),
size: 0,
},
),
@@ -163,13 +114,13 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- None | Some(b'\n' | b'\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
+ None | Some(b'\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
tokenizer.exit(Token::ThematicBreak);
// Feel free to interrupt.
tokenizer.interrupt = false;
State::Ok
}
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {
tokenizer.enter(Token::ThematicBreakSequence);
sequence(tokenizer, info)
}
@@ -185,7 +136,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Some(byte) if byte == info.kind.as_byte() => {
+ Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {
tokenizer.consume();
info.size += 1;
State::Fn(Box::new(|t| sequence(t, info)))
diff --git a/src/content/document.rs b/src/content/document.rs
index 828431d..76d510a 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -89,14 +89,13 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
let event = &tokenizer.events[index];
if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString {
- // To do: when we operate on u8, we can use a `to_str` here as we
- // don‘t need virtual spaces.
+ // Note: we don’t care about virtual spaces, so `as_str` is fine.
let id = normalize_identifier(
- &Slice::from_position(
+ Slice::from_position(
tokenizer.parse_state.bytes,
&Position::from_exit_event(&tokenizer.events, index),
)
- .serialize(),
+ .as_str(),
);
if !definitions.contains(&id) {
diff --git a/src/lib.rs b/src/lib.rs
index 750ca36..9bdf7e3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -423,6 +423,6 @@ pub fn micromark(value: &str) -> String {
/// ```
#[must_use]
pub fn micromark_with_options(value: &str, options: &Options) -> String {
- let (events, result) = parse(value, options);
- compile(&events, result.bytes, options)
+ let (events, bytes) = parse(value, options);
+ compile(&events, bytes, options)
}
diff --git a/src/parser.rs b/src/parser.rs
index 613b206..23afb37 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -20,7 +20,7 @@ pub struct ParseState<'a> {
/// Turn a string of markdown into events.
///
/// Passes the bytes back so the compiler can access the source.
-pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, ParseState<'a>) {
+pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, &'a [u8]) {
let mut parse_state = ParseState {
constructs: &options.constructs,
bytes: value.as_bytes(),
@@ -37,6 +37,5 @@ pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, ParseStat
},
);
- // To do: return bytes only?
- (events, parse_state)
+ (events, parse_state.bytes)
}
diff --git a/src/unicode.rs b/src/unicode.rs
index a8445f9..764d4c7 100644
--- a/src/unicode.rs
+++ b/src/unicode.rs
@@ -6,7 +6,7 @@
/// > It is generated from the latest Unicode data.
///
/// Rust does not contain an `is_punctuation` method on `char`, while it does
-/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).
+/// support [`is_ascii_punctuation`](char::is_ascii_punctuation).
///
/// `CommonMark` handles attention (emphasis, strong) markers based on what
/// comes before or after them.
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs
index 5277f90..f8fd18f 100644
--- a/src/util/decode_character_reference.rs
+++ b/src/util/decode_character_reference.rs
@@ -57,9 +57,9 @@ pub fn decode_named(value: &str) -> String {
/// ```rust ignore
/// use micromark::util::decode_character_reference::decode_numeric;
///
-/// assert_eq!(decode_numeric("123", 10), '{');
-/// assert_eq!(decode_numeric("9", 16), '\t');
-/// assert_eq!(decode_numeric("0", 10), '�'); // Not allowed.
+/// assert_eq!(decode_numeric("123", 10), "{");
+/// assert_eq!(decode_numeric("9", 16), "\t");
+/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
/// ```
///
/// ## Panics
@@ -74,27 +74,19 @@ pub fn decode_named(value: &str) -> String {
///
/// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
/// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_numeric(value: &str, radix: u32) -> char {
- let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int");
-
- if
- // C0 except for HT, LF, FF, CR, space
- code < 0x09 ||
- code == 0x0B ||
- (code > 0x0D && code < 0x20) ||
- // Control character (DEL) of the basic block and C1 controls.
- (code > 0x7E && code < 0xA0) ||
- // Lone high surrogates and low surrogates.
- (code > 0xd7ff && code < 0xe000) ||
- // Noncharacters.
- (code > 0xfdcf && code < 0xfdf0) ||
- ((code & 0xffff) == 0xffff) ||
- ((code & 0xffff) == 0xfffe) ||
- // Out of range
- code > 0x0010_ffff
- {
- char::REPLACEMENT_CHARACTER
- } else {
- char::from_u32(code).expect("expected valid `code`")
+pub fn decode_numeric(value: &str, radix: u32) -> String {
+ if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
+ if !matches!(char,
+ // C0 except for HT, LF, FF, CR, space
+ '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
+ // Control character (DEL) of C0, and C1 controls.
+ '\u{7F}'..='\u{9F}'
+ // Lone surrogates and out-of-range values are rejected by
+ // `char::from_u32`; noncharacters are not special-cased here.
+ ) {
+ return char.to_string();
+ }
}
+
+ char::REPLACEMENT_CHARACTER.to_string()
}
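
The rewritten `decode_numeric` leans on `char::from_u32`, which rejects lone surrogates and values above U+10FFFF on its own; only the control ranges are filtered by hand. A standalone check of the facts relied on:

```rust
fn main() {
    assert_eq!(char::from_u32(0xD800), None); // lone surrogate
    assert_eq!(char::from_u32(0x0011_0000), None); // above U+10FFFF
    let code = u32::from_str_radix("123", 10).unwrap();
    assert_eq!(char::from_u32(code), Some('{')); // as in the doc example
}
```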
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 91c5462..d37a2de 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -20,37 +20,33 @@
/// ## References
///
/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
-pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String {
- let check = if encode_html { check_all } else { check_nil };
- let mut value = value.into();
-
+pub fn encode(value: &str, encode_html: bool) -> String {
// It’ll grow a bit bigger for each dangerous character.
let mut result = String::with_capacity(value.len());
+ let bytes = value.as_bytes();
+ let mut index = 0;
+ let mut start = 0;
- while let Some(indice) = value.find(check) {
- let after = value.split_off(indice + 1);
- let dangerous = value.pop().unwrap();
- result.push_str(&value);
- result.push_str(match dangerous {
- '\0' => "�",
- '&' => "&amp;",
- '"' => "&quot;",
- '<' => "&lt;",
- '>' => "&gt;",
- _ => unreachable!("xxx"),
- });
- value = after;
- }
+ while index < bytes.len() {
+ let byte = bytes[index];
+ if matches!(byte, b'\0') || (encode_html && matches!(byte, b'&' | b'"' | b'<' | b'>')) {
+ result.push_str(&value[start..index]);
+ result.push_str(match byte {
+ b'\0' => "�",
+ b'&' => "&amp;",
+ b'"' => "&quot;",
+ b'<' => "&lt;",
+ b'>' => "&gt;",
+ _ => unreachable!("impossible"),
+ });
- result.push_str(&value);
+ start = index + 1;
+ }
- result
-}
+ index += 1;
+ }
-fn check_all(char: char) -> bool {
- matches!(char, '\0' | '&' | '"' | '<' | '>')
-}
+ result.push_str(&value[start..]);
-fn check_nil(char: char) -> bool {
- matches!(char, '\0')
+ result
}
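
For reference, the new byte-scanning `encode`, copied out as a runnable snippet (comments trimmed) together with a usage check: NUL is always replaced, the HTML-dangerous bytes only when `encode_html` is set.

```rust
pub fn encode(value: &str, encode_html: bool) -> String {
    let mut result = String::with_capacity(value.len());
    let bytes = value.as_bytes();
    let mut index = 0;
    let mut start = 0;

    while index < bytes.len() {
        let byte = bytes[index];
        if byte == b'\0' || (encode_html && matches!(byte, b'&' | b'"' | b'<' | b'>')) {
            // Push the clean run, then the replacement.
            result.push_str(&value[start..index]);
            result.push_str(match byte {
                b'\0' => "\u{FFFD}",
                b'&' => "&amp;",
                b'"' => "&quot;",
                b'<' => "&lt;",
                b'>' => "&gt;",
                _ => unreachable!(),
            });
            start = index + 1;
        }
        index += 1;
    }

    result.push_str(&value[start..]);
    result
}

fn main() {
    assert_eq!(encode("<a>&\"\0", true), "&lt;a&gt;&amp;&quot;\u{FFFD}");
    assert_eq!(encode("<a>&\"\0", false), "<a>&\"\u{FFFD}");
}
```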
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index 42a2bb0..f5b12d0 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -34,25 +34,34 @@
pub fn normalize_identifier(value: &str) -> String {
// Note: it’ll grow a bit smaller for consecutive whitespace.
let mut result = String::with_capacity(value.len());
- let mut at_start = true;
- let mut at_whitespace = true;
+ let bytes = value.as_bytes();
+ let mut in_whitespace = true;
+ let mut index = 0;
+ let mut start = 0;
- // Collapse markdown whitespace and trim it.
- for char in value.chars() {
- match char {
- '\t' | '\n' | '\r' | ' ' => {
- at_whitespace = true;
+ while index < bytes.len() {
+ if matches!(bytes[index], b'\t' | b'\n' | b'\r' | b' ') {
+ // First whitespace we see after non-whitespace.
+ if !in_whitespace {
+ result.push_str(&value[start..index]);
+ in_whitespace = true;
}
- _ => {
- if at_whitespace && !at_start {
- result.push(' ');
- }
-
- result.push(char);
- at_start = false;
- at_whitespace = false;
+ }
+ // First non-whitespace we see after whitespace.
+ else if in_whitespace {
+ if !result.is_empty() {
+ result.push(' ');
}
+
+ start = index;
+ in_whitespace = false;
}
+
+ index += 1;
+ }
+
+ if !in_whitespace {
+ result.push_str(&value[start..]);
}
// Some characters are considered “uppercase”, but if their lowercase
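
The rewritten `normalize_identifier` collapses runs of markdown whitespace to a single space and trims both edges; with the `!result.is_empty()` guard, a separator is only inserted before the second and later runs. A standalone model of just the whitespace handling (case folding omitted):

```rust
fn collapse(value: &str) -> String {
    let mut result = String::with_capacity(value.len());
    let bytes = value.as_bytes();
    let mut in_whitespace = true;
    let mut index = 0;
    let mut start = 0;

    while index < bytes.len() {
        if matches!(bytes[index], b'\t' | b'\n' | b'\r' | b' ') {
            // First whitespace after non-whitespace: flush the run.
            if !in_whitespace {
                result.push_str(&value[start..index]);
                in_whitespace = true;
            }
        } else if in_whitespace {
            // First non-whitespace after whitespace: separate runs.
            if !result.is_empty() {
                result.push(' ');
            }
            start = index;
            in_whitespace = false;
        }
        index += 1;
    }

    if !in_whitespace {
        result.push_str(&value[start..]);
    }

    result
}

fn main() {
    assert_eq!(collapse("  a\t\nb  "), "a b");
    assert_eq!(collapse("a  b"), "a b");
    assert_eq!(collapse("   "), "");
}
```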
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 8c09549..051e1e1 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -32,7 +32,7 @@ use crate::util::encode::encode;
///
/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
- let value = encode(normalize_uri(value), true);
+ let value = encode(&*normalize_uri(value), true);
if let Some(protocols) = protocols {
let end = value.find(|c| matches!(c, '?' | '#' | '/'));
diff --git a/src/util/slice.rs b/src/util/slice.rs
index cd3641e..d899dac 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -2,6 +2,7 @@
use crate::constant::TAB_SIZE;
use crate::tokenizer::{Event, EventType, Point};
+use std::str;
/// A range between two places.
#[derive(Debug)]
@@ -78,6 +79,15 @@ impl<'a> Slice<'a> {
}
}
+ /// Get a slice of a single byte at `index`.
+ pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> {
+ Slice {
+ bytes: &bytes[index..=index],
+ before: 0,
+ after: 0,
+ }
+ }
+
/// Get the slice belonging to a position.
pub fn from_position(bytes: &'a [u8], position: &Position) -> Slice<'a> {
let mut before = position.start.vs;
@@ -107,14 +117,18 @@ impl<'a> Slice<'a> {
}
/// Get a slice for the given range of indices.
- // To do: rename to `len`?
- pub fn size(&self) -> usize {
- self.bytes.len() + self.before + self.after
+ pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
+ Slice {
+ bytes: &bytes[start..end],
+ before: 0,
+ after: 0,
+ }
}
- // To do:
- // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html>
- // to implement an `as_str`.
+ /// Get the length of the slice in bytes, including virtual spaces.
+ pub fn len(&self) -> usize {
+ self.bytes.len() + self.before + self.after
+ }
/// To do.
pub fn head(&self) -> Option<u8> {
@@ -127,16 +141,20 @@ impl<'a> Slice<'a> {
}
}
+ /// View the bytes as a string slice (assumes valid UTF-8; ignores virtual spaces).
+ pub fn as_str(&self) -> &str {
+ str::from_utf8(self.bytes).unwrap()
+ }
+
/// To do.
pub fn serialize(&self) -> String {
- let mut string = String::with_capacity(self.size());
+ let mut string = String::with_capacity(self.len());
let mut index = self.before;
while index > 0 {
string.push(' ');
index -= 1;
}
- // To do: invalid UTF8?
- string.push_str(std::str::from_utf8(self.bytes).unwrap());
+ string.push_str(self.as_str());
index = self.after;
while index > 0 {
string.push(' ');
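
The new `Slice` helpers borrow byte ranges directly and expose them as `&str` without allocating; unlike `serialize`, `as_str` ignores virtual spaces, which is fine wherever callers do not need them. A reduced sketch carrying only the fields used here:

```rust
use std::str;

struct Slice<'a> {
    bytes: &'a [u8],
}

impl<'a> Slice<'a> {
    fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
        Slice { bytes: &bytes[start..end] }
    }

    // Borrow, do not allocate; assumes the bytes are valid UTF-8.
    fn as_str(&self) -> &str {
        str::from_utf8(self.bytes).unwrap()
    }
}

fn main() {
    let bytes = b"## Heading";
    let slice = Slice::from_indices(bytes, 3, 10);
    assert_eq!(slice.as_str(), "Heading");
}
```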
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
index f251952..99f729e 100644
--- a/tests/code_fenced.rs
+++ b/tests/code_fenced.rs
@@ -221,6 +221,12 @@ fn code_fenced() {
);
assert_eq!(
+ micromark("```a\\&b\0c"),
+ "<pre><code class=\"language-a&amp;b�c\"></code></pre>\n",
+ "should encode dangerous characters in languages"
+ );
+
+ assert_eq!(
micromark(" ```\naaa\n ```"),
"<pre><code>aaa\n ```\n</code></pre>\n",
"should not support a closing sequence w/ too much indent, regardless of opening sequence (1)"