author    Titus Wormer <tituswormer@gmail.com>  2022-07-28 16:48:00 +0200
committer Titus Wormer <tituswormer@gmail.com>  2022-07-28 16:48:00 +0200
commit    f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 (patch)
tree      c1ac3f22473bd79566d835b2474d2ae9e00d6c55
parent    d729b07712ca9cc91e68af1776dac9d7008a90cb (diff)
Refactor to work on `char`s
Previously, a custom character implementation (`Code`) was used. That was easier to work with, because “virtual” characters are sometimes injected and other characters are ignored. This replaces it with actual `char`s, in the hope of eventually working on `u8`s. It simplifies the state machine somewhat: only `\n` is fed, regardless of whether the input contained a CRLF, CR, or LF, and `' '` is fed instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event.
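
A rough sketch of the shift (the `Code` variants appear in the diff below; the rest is illustrative):

    // Before: a custom enum modeled every “character”, including
    // virtual ones.
    enum Code {
        None,                   // end of input (eof)
        CarriageReturnLineFeed, // CRLF fed as a single unit
        VirtualSpace,           // filler injected when expanding tabs
        Char(char),             // an actual character
    }

    // After: `tokenizer.current` is a plain `Option<char>`; line
    // endings are normalized to '\n' and virtual spaces are fed as
    // ' ' before they reach the state machine.
    fn example(current: Option<char>) {
        match current {
            None => {}       // end of input
            Some('\n') => {} // was CRLF, CR, or LF
            Some(_) => {}    // any other character, including ' '
        }
    }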
-rw-r--r--  src/compiler.rs                                 | 204
-rw-r--r--  src/constant.rs                                 |   4
-rw-r--r--  src/construct/attention.rs                      |  75
-rw-r--r--  src/construct/autolink.rs                       |  40
-rw-r--r--  src/construct/blank_line.rs                     |   4
-rw-r--r--  src/construct/block_quote.rs                    |  27
-rw-r--r--  src/construct/character_escape.rs               |   6
-rw-r--r--  src/construct/character_reference.rs            |  82
-rw-r--r--  src/construct/code_fenced.rs                    |  65
-rw-r--r--  src/construct/code_indented.rs                  |  17
-rw-r--r--  src/construct/code_text.rs                      |  18
-rw-r--r--  src/construct/definition.rs                     |  10
-rw-r--r--  src/construct/hard_break_escape.rs              |   6
-rw-r--r--  src/construct/heading_atx.rs                    |  22
-rw-r--r--  src/construct/heading_setext.rs                 |   8
-rw-r--r--  src/construct/html_flow.rs                      | 224
-rw-r--r--  src/construct/html_text.rs                      | 161
-rw-r--r--  src/construct/label_end.rs                      |  92
-rw-r--r--  src/construct/label_start_image.rs              |   6
-rw-r--r--  src/construct/label_start_link.rs               |   4
-rw-r--r--  src/construct/list.rs                           |  57
-rw-r--r--  src/construct/paragraph.rs                      |   6
-rw-r--r--  src/construct/partial_data.rs                   |  31
-rw-r--r--  src/construct/partial_destination.rs            |  39
-rw-r--r--  src/construct/partial_label.rs                  |  22
-rw-r--r--  src/construct/partial_non_lazy_continuation.rs  |   4
-rw-r--r--  src/construct/partial_space_or_tab.rs           |  13
-rw-r--r--  src/construct/partial_title.rs                  |  35
-rw-r--r--  src/construct/partial_whitespace.rs             |  53
-rw-r--r--  src/construct/thematic_break.rs                 |  27
-rw-r--r--  src/content/document.rs                         |  39
-rw-r--r--  src/content/flow.rs                             |  14
-rw-r--r--  src/content/string.rs                           |   6
-rw-r--r--  src/content/text.rs                             |  24
-rw-r--r--  src/lib.rs                                      |  15
-rw-r--r--  src/parser.rs                                   |  14
-rw-r--r--  src/token.rs                                    |  14
-rw-r--r--  src/tokenizer.rs                                | 280
-rw-r--r--  src/util/codes.rs                               | 125
-rw-r--r--  src/util/encode.rs                              |  12
-rw-r--r--  src/util/mod.rs                                 |   3
-rw-r--r--  src/util/sanitize_uri.rs                        |   2
-rw-r--r--  src/util/slice.rs                               | 156
-rw-r--r--  src/util/span.rs                                |  57
-rw-r--r--  tests/misc_tabs.rs                              |   6
45 files changed, 1087 insertions(+), 1042 deletions(-)
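
Throughout the diff, the recurring mechanical change swaps the old `serialize(codes, &from_exit_event(events, index), false)` calls from `src/util/span.rs` for the new `Slice`/`Position` API in `src/util/slice.rs`. A hypothetical sketch of that API's shape, inferred from the call sites below rather than from the actual implementation:

    // Hypothetical shapes, inferred from call sites in the diff.
    struct Point {
        index: usize, // index into the parse state’s `chars`
        // (the real `Point` also tracks line, column, …)
    }
    struct Position<'a> {
        start: &'a Point,
        end: &'a Point,
    }
    struct Slice<'a> {
        chars: &'a [char],
    }

    impl<'a> Slice<'a> {
        fn from_position(chars: &'a [char], position: &Position) -> Self {
            Slice {
                chars: &chars[position.start.index..position.end.index],
            }
        }
        // Materialize a `String` only when a value is actually needed,
        // instead of buffering characters eagerly while parsing.
        fn serialize(&self) -> String {
            self.chars.iter().collect()
        }
        fn head(&self) -> Option<char> {
            self.chars.first().copied()
        }
        fn size(&self) -> usize {
            self.chars.len()
        }
    }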
diff --git a/src/compiler.rs b/src/compiler.rs
index a575221..f5673b4 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -2,14 +2,14 @@
use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC};
use crate::construct::character_reference::Kind as CharacterReferenceKind;
use crate::token::Token;
-use crate::tokenizer::{Code, Event, EventType};
+use crate::tokenizer::{Event, EventType};
use crate::util::normalize_identifier::normalize_identifier;
use crate::util::{
decode_character_reference::{decode_named, decode_numeric},
encode::encode,
sanitize_uri::sanitize_uri,
skip,
- span::{codes as codes_from_span, from_exit_event, serialize},
+ slice::{Position, Slice},
};
use crate::{LineEnding, Options};
@@ -60,7 +60,7 @@ struct Definition {
struct CompileContext<'a> {
/// Static info.
pub events: &'a [Event],
- pub codes: &'a [Code],
+ pub chars: &'a [char],
/// Fields used by handlers to track the things they need to track to
/// compile markdown.
pub atx_opening_sequence_size: Option<usize>,
@@ -76,7 +76,7 @@ struct CompileContext<'a> {
    /// Fields used to influence the current compilation.
pub slurp_one_line_ending: bool,
pub tags: bool,
- pub ignore_encode: bool,
+ pub encode_html: bool,
pub last_was_tag: bool,
/// Configuration
pub protocol_href: Option<Vec<&'static str>>,
@@ -92,13 +92,13 @@ impl<'a> CompileContext<'a> {
/// Create a new compile context.
pub fn new(
events: &'a [Event],
- codes: &'a [Code],
+ chars: &'a [char],
options: &Options,
line_ending: LineEnding,
) -> CompileContext<'a> {
CompileContext {
events,
- codes,
+ chars,
atx_opening_sequence_size: None,
heading_setext_buffer: None,
code_flow_seen_data: None,
@@ -111,7 +111,7 @@ impl<'a> CompileContext<'a> {
tight_stack: vec![],
slurp_one_line_ending: false,
tags: true,
- ignore_encode: false,
+ encode_html: true,
last_was_tag: false,
protocol_href: if options.allow_dangerous_protocol {
None
@@ -151,16 +151,13 @@ impl<'a> CompileContext<'a> {
pub fn push_raw<'x, S: Into<&'x str>>(&mut self, value: S) {
let value = value.into();
- if self.ignore_encode {
- self.push(value);
- } else {
- self.push(&*encode(value));
- }
+ self.push(&*encode(value, self.encode_html));
}
pub fn tag<'x, S: Into<&'x str>>(&mut self, value: S) {
if self.tags {
- self.push(value.into());
+ let value = value.into();
+ self.push(&*encode(value, false));
self.last_was_tag = true;
}
}
@@ -199,7 +196,7 @@ impl<'a> CompileContext<'a> {
/// Turn events and codes into a string of HTML.
#[allow(clippy::too_many_lines)]
-pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
+pub fn compile(events: &[Event], chars: &[char], options: &Options) -> String {
let mut index = 0;
let mut line_ending_inferred = None;
@@ -211,8 +208,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
if event.event_type == EventType::Exit
&& (event.token_type == Token::BlankLineEnding || event.token_type == Token::LineEnding)
{
- let codes = codes_from_span(codes, &from_exit_event(events, index));
- line_ending_inferred = Some(LineEnding::from_code(*codes.first().unwrap()));
+ line_ending_inferred = Some(LineEnding::from_str(
+ &Slice::from_position(chars, &Position::from_exit_event(events, index)).serialize(),
+ ));
break;
}
@@ -239,7 +237,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
}
};
- let mut context = CompileContext::new(events, codes, options, line_ending_default);
+ let mut context = CompileContext::new(events, chars, options, line_ending_default);
let mut definition_indices = vec![];
let mut index = 0;
let mut definition_inside = false;
@@ -441,7 +439,7 @@ fn on_enter_definition(context: &mut CompileContext) {
/// Handle [`Enter`][EventType::Enter]:[`DefinitionDestinationString`][Token::DefinitionDestinationString].
fn on_enter_definition_destination_string(context: &mut CompileContext) {
context.buffer();
- context.ignore_encode = true;
+ context.encode_html = false;
}
/// Handle [`Enter`][EventType::Enter]:[`Emphasis`][Token::Emphasis].
@@ -453,14 +451,14 @@ fn on_enter_emphasis(context: &mut CompileContext) {
fn on_enter_html_flow(context: &mut CompileContext) {
context.line_ending_if_needed();
if context.allow_dangerous_html {
- context.ignore_encode = true;
+ context.encode_html = false;
}
}
/// Handle [`Enter`][EventType::Enter]:[`HtmlText`][Token::HtmlText].
fn on_enter_html_text(context: &mut CompileContext) {
if context.allow_dangerous_html {
- context.ignore_encode = true;
+ context.encode_html = false;
}
}
@@ -595,7 +593,7 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) {
context.buffer();
// Ignore encoding the result, as we’ll first percent encode the url and
// encode manually after.
- context.ignore_encode = true;
+ context.encode_html = false;
}
/// Handle [`Enter`][EventType::Enter]:[`Strong`][Token::Strong].
@@ -605,34 +603,36 @@ fn on_enter_strong(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][Token::AutolinkEmail].
fn on_exit_autolink_email(context: &mut CompileContext) {
- let slice = serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- );
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
context.tag(&*format!(
"<a href=\"{}\">",
sanitize_uri(
- format!("mailto:{}", slice.as_str()).as_str(),
+ format!("mailto:{}", value.as_str()).as_str(),
&context.protocol_href
)
));
- context.push_raw(&*slice);
+ context.push_raw(&*value);
context.tag("</a>");
}
/// Handle [`Exit`][EventType::Exit]:[`AutolinkProtocol`][Token::AutolinkProtocol].
fn on_exit_autolink_protocol(context: &mut CompileContext) {
- let slice = serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- );
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
context.tag(&*format!(
"<a href=\"{}\">",
- sanitize_uri(slice.as_str(), &context.protocol_href)
+ sanitize_uri(value.as_str(), &context.protocol_href)
));
- context.push_raw(&*slice);
+ context.push_raw(&*value);
context.tag("</a>");
}
@@ -677,11 +677,12 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {
.character_reference_kind
.take()
.expect("expected `character_reference_kind` to be set");
- let reference = serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- );
+ let reference = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
let ref_string = reference.as_str();
let value = match kind {
CharacterReferenceKind::Decimal => decode_numeric(ref_string, 10).to_string(),
@@ -694,12 +695,14 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:[`CodeFlowChunk`][Token::CodeFlowChunk].
fn on_exit_code_flow_chunk(context: &mut CompileContext) {
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
context.code_flow_seen_data = Some(true);
- context.push_raw(&*serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- ));
+ context.push_raw(&*value);
}
/// Handle [`Exit`][EventType::Exit]:[`CodeFencedFence`][Token::CodeFencedFence].
@@ -793,12 +796,14 @@ fn on_exit_drop(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:{[`CodeTextData`][Token::CodeTextData],[`Data`][Token::Data],[`CharacterEscapeValue`][Token::CharacterEscapeValue]}.
fn on_exit_data(context: &mut CompileContext) {
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
// Just output it.
- context.push_raw(&*serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- ));
+ context.push_raw(&*value);
}
/// Handle [`Exit`][EventType::Exit]:[`Definition`][Token::Definition].
@@ -830,19 +835,21 @@ fn on_exit_definition_destination_string(context: &mut CompileContext) {
let buf = context.resume();
let definition = context.media_stack.last_mut().unwrap();
definition.destination = Some(buf);
- context.ignore_encode = false;
+ context.encode_html = true;
}
/// Handle [`Exit`][EventType::Exit]:[`DefinitionLabelString`][Token::DefinitionLabelString].
fn on_exit_definition_label_string(context: &mut CompileContext) {
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
// Discard label, use the source content instead.
context.resume();
let definition = context.media_stack.last_mut().unwrap();
- definition.reference_id = Some(serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- ));
+ definition.reference_id = Some(value);
}
/// Handle [`Exit`][EventType::Exit]:[`DefinitionTitleString`][Token::DefinitionTitleString].
@@ -871,12 +878,11 @@ fn on_exit_heading_atx(context: &mut CompileContext) {
fn on_exit_heading_atx_sequence(context: &mut CompileContext) {
// First fence we see.
if context.atx_opening_sequence_size.is_none() {
- let rank = serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
+ let rank = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
)
- .len();
+ .size();
context.line_ending_if_needed();
context.atx_opening_sequence_size = Some(rank);
context.tag(&*format!("<h{}>", rank));
@@ -902,11 +908,12 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {
.heading_setext_buffer
.take()
.expect("`atx_opening_sequence_size` must be set in headings");
- let head = codes_from_span(
- context.codes,
- &from_exit_event(context.events, context.index),
- )[0];
- let level: usize = if head == Code::Char('-') { 2 } else { 1 };
+ let head = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .head();
+ let level = if head == Some('-') { 2 } else { 1 };
context.line_ending_if_needed();
context.tag(&*format!("<h{}>", level));
@@ -916,17 +923,18 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:{[`HtmlFlow`][Token::HtmlFlow],[`HtmlText`][Token::HtmlText]}.
fn on_exit_html(context: &mut CompileContext) {
- context.ignore_encode = false;
+ context.encode_html = true;
}
/// Handle [`Exit`][EventType::Exit]:{[`HtmlFlowData`][Token::HtmlFlowData],[`HtmlTextData`][Token::HtmlTextData]}.
fn on_exit_html_data(context: &mut CompileContext) {
- let slice = serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- );
- context.push_raw(&*slice);
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
+ context.push_raw(&*value);
}
/// Handle [`Exit`][EventType::Exit]:[`Label`][Token::Label].
@@ -938,12 +946,14 @@ fn on_exit_label(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:[`LabelText`][Token::LabelText].
fn on_exit_label_text(context: &mut CompileContext) {
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
let media = context.media_stack.last_mut().unwrap();
- media.label_id = Some(serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- ));
+ media.label_id = Some(value);
}
/// Handle [`Exit`][EventType::Exit]:[`LineEnding`][Token::LineEnding].
@@ -953,11 +963,13 @@ fn on_exit_line_ending(context: &mut CompileContext) {
} else if context.slurp_one_line_ending {
context.slurp_one_line_ending = false;
} else {
- context.push_raw(&*serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- ));
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
+ context.push_raw(&*value);
}
}
@@ -1004,12 +1016,12 @@ fn on_exit_list_item_value(context: &mut CompileContext) {
let expect_first_item = context.expect_first_item.unwrap();
if expect_first_item {
- let slice = serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- );
- let value = slice.parse::<u32>().ok().unwrap();
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+ let value = value.parse::<u32>().ok().unwrap();
if value != 1 {
context.tag(" start=\"");
@@ -1110,14 +1122,16 @@ fn on_exit_paragraph(context: &mut CompileContext) {
/// Handle [`Exit`][EventType::Exit]:[`ReferenceString`][Token::ReferenceString].
fn on_exit_reference_string(context: &mut CompileContext) {
+ let value = Slice::from_position(
+ context.chars,
+ &Position::from_exit_event(context.events, context.index),
+ )
+ .serialize();
+
// Drop stuff.
context.resume();
let media = context.media_stack.last_mut().unwrap();
- media.reference_id = Some(serialize(
- context.codes,
- &from_exit_event(context.events, context.index),
- false,
- ));
+ media.reference_id = Some(value);
}
/// Handle [`Exit`][EventType::Exit]:[`ResourceDestinationString`][Token::ResourceDestinationString].
@@ -1125,7 +1139,7 @@ fn on_exit_resource_destination_string(context: &mut CompileContext) {
let buf = context.resume();
let media = context.media_stack.last_mut().unwrap();
media.destination = Some(buf);
- context.ignore_encode = false;
+ context.encode_html = true;
}
/// Handle [`Exit`][EventType::Exit]:[`ResourceTitleString`][Token::ResourceTitleString].
diff --git a/src/constant.rs b/src/constant.rs
index b8b36ad..d84dda5 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -232,11 +232,7 @@ pub const SAFE_PROTOCOL_SRC: [&str; 2] = ["http", "https"];
/// constructs in markdown, most notably the whitespace required to form
/// [code (indented)][code_indented].
///
-/// > 👉 **Note**: each [`Code::VirtualSpace`][vs] and `Code::Char('\t' | ' ')`
-/// > counts.
-///
/// [code_indented]: crate::construct::code_indented
-/// [vs]: crate::tokenizer::Code::VirtualSpace
pub const TAB_SIZE: usize = 4;
/// The number of markers needed for a [thematic break][thematic_break] to form.
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index 27d7544..65c2f6f 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -52,8 +52,9 @@
//! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element
use crate::token::Token;
-use crate::tokenizer::{Code, Event, EventType, Point, State, Tokenizer};
+use crate::tokenizer::{Event, EventType, Point, State, Tokenizer};
use crate::unicode::PUNCTUATION;
+use crate::util::slice::Slice;
/// Character code kinds.
#[derive(Debug, PartialEq)]
@@ -128,17 +129,6 @@ impl MarkerKind {
_ => unreachable!("invalid char"),
}
}
- /// Turn [Code] into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `code` is not `Code::Char('*' | '_')`.
- fn from_code(code: Code) -> MarkerKind {
- match code {
- Code::Char(char) => MarkerKind::from_char(char),
- _ => unreachable!("invalid code"),
- }
- }
}
/// Attention sequence that we can take markers from.
@@ -170,9 +160,9 @@ struct Sequence {
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('*' | '_') if tokenizer.parse_state.constructs.attention => {
+ Some(char) if tokenizer.parse_state.constructs.attention && matches!(char, '*' | '_') => {
tokenizer.enter(Token::AttentionSequence);
- inside(tokenizer, MarkerKind::from_code(tokenizer.current))
+ inside(tokenizer, MarkerKind::from_char(char))
}
_ => State::Nok,
}
@@ -185,23 +175,20 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^^
/// ```
fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State {
- match tokenizer.current {
- Code::Char(char) if char == marker.as_char() => {
- tokenizer.consume();
- State::Fn(Box::new(move |t| inside(t, marker)))
- }
- _ => {
- tokenizer.exit(Token::AttentionSequence);
- tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
- State::Ok
- }
+ if tokenizer.current == Some(marker.as_char()) {
+ tokenizer.consume();
+ State::Fn(Box::new(move |t| inside(t, marker)))
+ } else {
+ tokenizer.exit(Token::AttentionSequence);
+ tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
+ State::Ok
}
}
/// Resolve attention sequences.
#[allow(clippy::too_many_lines)]
fn resolve_attention(tokenizer: &mut Tokenizer) {
- let codes = &tokenizer.parse_state.codes;
+ let chars = &tokenizer.parse_state.chars;
let mut start = 0;
let mut balance = 0;
let mut sequences = vec![];
@@ -216,17 +203,21 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
if enter.token_type == Token::AttentionSequence {
let end = start + 1;
let exit = &tokenizer.events[end];
- let marker = MarkerKind::from_code(codes[enter.point.index]);
+ let marker =
+ MarkerKind::from_char(Slice::from_point(chars, &enter.point).head().unwrap());
let before = classify_character(if enter.point.index > 0 {
- codes[enter.point.index - 1]
- } else {
- Code::None
- });
- let after = classify_character(if exit.point.index < codes.len() {
- codes[exit.point.index]
+ Slice::from_point(
+ chars,
+ &Point {
+ index: enter.point.index - 1,
+ ..enter.point
+ },
+ )
+ .tail()
} else {
- Code::None
+ None
});
+ let after = classify_character(Slice::from_point(chars, &exit.point).tail());
let open = after == GroupKind::Other
|| (after == GroupKind::Punctuation && before != GroupKind::Other);
// To do: GFM strikethrough?
@@ -326,9 +317,9 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
let sequence_close = &mut sequences[close];
let close_event_index = sequence_close.event_index;
let seq_close_enter = sequence_close.start_point.clone();
+ // No need to worry about `VS`, because sequences are only actual characters.
sequence_close.size -= take;
sequence_close.start_point.column += take;
- sequence_close.start_point.offset += take;
sequence_close.start_point.index += take;
let seq_close_exit = sequence_close.start_point.clone();
@@ -352,9 +343,9 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
let sequence_open = &mut sequences[open];
let open_event_index = sequence_open.event_index;
let seq_open_exit = sequence_open.end_point.clone();
+ // No need to worry about `VS`, because sequences are only actual characters.
sequence_open.size -= take;
sequence_open.end_point.column -= take;
- sequence_open.end_point.offset -= take;
sequence_open.end_point.index -= take;
let seq_open_enter = sequence_open.end_point.clone();
@@ -492,20 +483,20 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
/// Used for attention (emphasis, strong), whose sequences can open or close
/// based on the class of surrounding characters.
///
-/// > 👉 **Note** that eof (`Code::None`) is seen as whitespace.
+/// > 👉 **Note** that eof (`None`) is seen as whitespace.
///
/// ## References
///
/// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
-fn classify_character(code: Code) -> GroupKind {
- match code {
+fn classify_character(char: Option<char>) -> GroupKind {
+ match char {
// Custom characters.
- Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace => GroupKind::Whitespace,
+ None => GroupKind::Whitespace,
// Unicode whitespace.
- Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace,
+ Some(char) if char.is_whitespace() => GroupKind::Whitespace,
// Unicode punctuation.
- Code::Char(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation,
+ Some(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation,
// Everything else.
- Code::Char(_) => GroupKind::Other,
+ Some(_) => GroupKind::Other,
}
}
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
index 3933596..399570b 100644
--- a/src/construct/autolink.rs
+++ b/src/construct/autolink.rs
@@ -103,7 +103,7 @@
use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of an autolink.
///
@@ -115,7 +115,7 @@ use crate::tokenizer::{Code, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('<') if tokenizer.parse_state.constructs.autolink => {
+ Some('<') if tokenizer.parse_state.constructs.autolink => {
tokenizer.enter(Token::Autolink);
tokenizer.enter(Token::AutolinkMarker);
tokenizer.consume();
@@ -137,11 +137,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(char) if char.is_ascii_alphabetic() => {
+ Some(char) if char.is_ascii_alphabetic() => {
tokenizer.consume();
State::Fn(Box::new(scheme_or_email_atext))
}
- Code::Char(char) if is_ascii_atext(char) => email_atext(tokenizer),
+ Some(char) if is_ascii_atext(char) => email_atext(tokenizer),
_ => State::Nok,
}
}
@@ -156,7 +156,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+ Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
scheme_inside_or_email_atext(tokenizer, 1)
}
_ => email_atext(tokenizer),
@@ -173,11 +173,11 @@ fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State {
/// ```
fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Code::Char(':') => {
+ Some(':') => {
tokenizer.consume();
State::Fn(Box::new(url_inside))
}
- Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z')
+ Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z')
if size < AUTOLINK_SCHEME_SIZE_MAX =>
{
tokenizer.consume();
@@ -195,15 +195,13 @@ fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State
/// ```
fn url_inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.exit(Token::AutolinkProtocol);
end(tokenizer)
}
- Code::Char(char) if char.is_ascii_control() => State::Nok,
- Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ') => {
- State::Nok
- }
- Code::Char(_) => {
+ Some(char) if char.is_ascii_control() => State::Nok,
+ None | Some(' ') => State::Nok,
+ Some(_) => {
tokenizer.consume();
State::Fn(Box::new(url_inside))
}
@@ -218,11 +216,11 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State {
/// ```
fn email_atext(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('@') => {
+ Some('@') => {
tokenizer.consume();
State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
}
- Code::Char(char) if is_ascii_atext(char) => {
+ Some(char) if is_ascii_atext(char) => {
tokenizer.consume();
State::Fn(Box::new(email_atext))
}
@@ -238,7 +236,7 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
/// ```
fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Code::Char(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, size),
+ Some(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, size),
_ => State::Nok,
}
}
@@ -251,11 +249,11 @@ fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
/// ```
fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Code::Char('.') => {
+ Some('.') => {
tokenizer.consume();
State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
}
- Code::Char('>') => {
+ Some('>') => {
let index = tokenizer.events.len();
tokenizer.exit(Token::AutolinkProtocol);
// Change the token type.
@@ -277,11 +275,11 @@ fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State {
/// ```
fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
+ Some('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |t| email_value(t, size + 1)))
}
- Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
+ Some(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |t| email_label(t, size + 1)))
}
@@ -299,7 +297,7 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {
/// ```
fn end(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.enter(Token::AutolinkMarker);
tokenizer.consume();
tokenizer.exit(Token::AutolinkMarker);
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs
index 537ffc1..6780f40 100644
--- a/src/construct/blank_line.rs
+++ b/src/construct/blank_line.rs
@@ -33,7 +33,7 @@
//! [flow]: crate::content::flow
use crate::construct::partial_space_or_tab::space_or_tab;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of a blank line.
///
@@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok,
+ None | Some('\n') => State::Ok,
_ => State::Nok,
}
}
diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs
index 3bb4b8b..49a0ea0 100644
--- a/src/construct/block_quote.rs
+++ b/src/construct/block_quote.rs
@@ -36,7 +36,7 @@
use crate::constant::TAB_SIZE;
use crate::construct::partial_space_or_tab::space_or_tab_min_max;
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of block quote.
///
@@ -65,7 +65,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.enter(Token::BlockQuote);
cont_before(tokenizer)
}
@@ -98,7 +98,7 @@ pub fn cont(tokenizer: &mut Tokenizer) -> State {
/// ```
fn cont_before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.enter(Token::BlockQuotePrefix);
tokenizer.enter(Token::BlockQuoteMarker);
tokenizer.consume();
@@ -118,17 +118,14 @@ fn cont_before(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn cont_after(tokenizer: &mut Tokenizer) -> State {
- match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.enter(Token::SpaceOrTab);
- tokenizer.consume();
- tokenizer.exit(Token::SpaceOrTab);
- tokenizer.exit(Token::BlockQuotePrefix);
- State::Ok
- }
- _ => {
- tokenizer.exit(Token::BlockQuotePrefix);
- State::Ok
- }
+ if let Some('\t' | ' ') = tokenizer.current {
+ tokenizer.enter(Token::SpaceOrTab);
+ tokenizer.consume();
+ tokenizer.exit(Token::SpaceOrTab);
+ tokenizer.exit(Token::BlockQuotePrefix);
+ State::Ok
+ } else {
+ tokenizer.exit(Token::BlockQuotePrefix);
+ State::Ok
}
}
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 9e9b713..e9263af 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -34,7 +34,7 @@
//! [hard_break_escape]: crate::construct::hard_break_escape
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of a character escape.
///
@@ -44,7 +44,7 @@ use crate::tokenizer::{Code, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('\\') if tokenizer.parse_state.constructs.character_escape => {
+ Some('\\') if tokenizer.parse_state.constructs.character_escape => {
tokenizer.enter(Token::CharacterEscape);
tokenizer.enter(Token::CharacterEscapeMarker);
tokenizer.consume();
@@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(char) if char.is_ascii_punctuation() => {
+ Some(char) if char.is_ascii_punctuation() => {
tokenizer.enter(Token::CharacterEscapeValue);
tokenizer.consume();
tokenizer.exit(Token::CharacterEscapeValue);
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index 8521f15..59043d1 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -66,7 +66,8 @@ use crate::constant::{
CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{Point, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
/// Kind of a character reference.
#[derive(Debug, Clone, PartialEq)]
@@ -120,8 +121,10 @@ impl Kind {
/// State needed to parse character references.
#[derive(Debug, Clone)]
struct Info {
- /// All parsed characters.
- buffer: String,
+ /// Place of value start.
+ start: Point,
+ /// Size of value.
+ size: usize,
/// Kind of character reference.
kind: Kind,
}
@@ -138,7 +141,7 @@ struct Info {
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('&') if tokenizer.parse_state.constructs.character_reference => {
+ Some('&') if tokenizer.parse_state.constructs.character_reference => {
tokenizer.enter(Token::CharacterReference);
tokenizer.enter(Token::CharacterReferenceMarker);
tokenizer.consume();
@@ -161,18 +164,21 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn open(tokenizer: &mut Tokenizer) -> State {
- let info = Info {
- buffer: String::new(),
- kind: Kind::Named,
- };
- if let Code::Char('#') = tokenizer.current {
+ if let Some('#') = tokenizer.current {
tokenizer.enter(Token::CharacterReferenceMarkerNumeric);
tokenizer.consume();
tokenizer.exit(Token::CharacterReferenceMarkerNumeric);
- State::Fn(Box::new(|t| numeric(t, info)))
+ State::Fn(Box::new(numeric))
} else {
tokenizer.enter(Token::CharacterReferenceValue);
- value(tokenizer, info)
+ value(
+ tokenizer,
+ Info {
+ start: tokenizer.point.clone(),
+ size: 0,
+ kind: Kind::Named,
+ },
+ )
}
}
@@ -185,17 +191,25 @@ fn open(tokenizer: &mut Tokenizer) -> State {
/// > | a&#x9;b
/// ^
/// ```
-fn numeric(tokenizer: &mut Tokenizer, mut info: Info) -> State {
- if let Code::Char('x' | 'X') = tokenizer.current {
+fn numeric(tokenizer: &mut Tokenizer) -> State {
+ if let Some('x' | 'X') = tokenizer.current {
tokenizer.enter(Token::CharacterReferenceMarkerHexadecimal);
tokenizer.consume();
tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal);
tokenizer.enter(Token::CharacterReferenceValue);
- info.kind = Kind::Hexadecimal;
+ let info = Info {
+ start: tokenizer.point.clone(),
+ size: 0,
+ kind: Kind::Hexadecimal,
+ };
State::Fn(Box::new(|t| value(t, info)))
} else {
tokenizer.enter(Token::CharacterReferenceValue);
- info.kind = Kind::Decimal;
+ let info = Info {
+ start: tokenizer.point.clone(),
+ size: 0,
+ kind: Kind::Decimal,
+ };
value(tokenizer, info)
}
}
@@ -215,24 +229,32 @@ fn numeric(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char(';') if !info.buffer.is_empty() => {
- let unknown_named = Kind::Named == info.kind
- && !CHARACTER_REFERENCES.iter().any(|d| d.0 == info.buffer);
+ Some(';') if info.size > 0 => {
+ if Kind::Named == info.kind {
+ let value = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position {
+ start: &info.start,
+ end: &tokenizer.point,
+ },
+ )
+ .serialize();
- if unknown_named {
- State::Nok
- } else {
- tokenizer.exit(Token::CharacterReferenceValue);
- tokenizer.enter(Token::CharacterReferenceMarkerSemi);
- tokenizer.consume();
- tokenizer.exit(Token::CharacterReferenceMarkerSemi);
- tokenizer.exit(Token::CharacterReference);
- State::Ok
+ if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) {
+ return State::Nok;
+ }
}
+
+ tokenizer.exit(Token::CharacterReferenceValue);
+ tokenizer.enter(Token::CharacterReferenceMarkerSemi);
+ tokenizer.consume();
+ tokenizer.exit(Token::CharacterReferenceMarkerSemi);
+ tokenizer.exit(Token::CharacterReference);
+ State::Ok
}
- Code::Char(char) => {
- if info.buffer.len() < info.kind.max() && info.kind.allowed(char) {
- info.buffer.push(char);
+ Some(char) => {
+ if info.size < info.kind.max() && info.kind.allowed(char) {
+ info.size += 1;
tokenizer.consume();
State::Fn(Box::new(|t| value(t, info)))
} else {
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 2fea95e..98fa54f 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -107,8 +107,8 @@ use crate::construct::{
partial_space_or_tab::{space_or_tab, space_or_tab_min_max},
};
use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, Tokenizer};
-use crate::util::span::from_exit_event;
+use crate::tokenizer::{ContentType, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
/// Kind of fences.
#[derive(Debug, Clone, PartialEq)]
@@ -155,17 +155,6 @@ impl Kind {
_ => unreachable!("invalid char"),
}
}
- /// Turn [Code] into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `code` is not ``Code::Char('~' | '`')``.
- fn from_code(code: Code) -> Kind {
- match code {
- Code::Char(char) => Kind::from_char(char),
- _ => unreachable!("invalid code"),
- }
- }
}
/// State needed to parse code (fenced).
@@ -217,20 +206,23 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
if let Some(event) = tail {
if event.token_type == Token::SpaceOrTab {
- let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
- prefix = span.end_index - span.start_index;
+ prefix = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),
+ )
+ .size();
}
}
match tokenizer.current {
- Code::Char('`' | '~') => {
+ Some(char) if matches!(char, '`' | '~') => {
tokenizer.enter(Token::CodeFencedFenceSequence);
sequence_open(
tokenizer,
Info {
prefix,
size: 0,
- kind: Kind::from_code(tokenizer.current),
+ kind: Kind::from_char(char),
},
)
}
@@ -248,7 +240,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.consume();
State::Fn(Box::new(|t| {
info.size += 1;
@@ -273,7 +265,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::CodeFencedFence);
// Do not form containers.
tokenizer.concrete = true;
@@ -282,7 +274,7 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State {
_ => {
tokenizer.enter(Token::CodeFencedFenceInfo);
tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
- info_inside(tokenizer, info, vec![])
+ info_inside(tokenizer, info)
}
}
}
@@ -295,9 +287,9 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// | console.log(1)
/// | ~~~
/// ```
-fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> State {
+fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::Data);
tokenizer.exit(Token::CodeFencedFenceInfo);
tokenizer.exit(Token::CodeFencedFence);
@@ -305,16 +297,15 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> S
tokenizer.concrete = true;
at_break(tokenizer, info)
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.exit(Token::Data);
tokenizer.exit(Token::CodeFencedFenceInfo);
tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)
}
- Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok,
- Code::Char(_) => {
- codes.push(tokenizer.current);
+ Some('`') if info.kind == Kind::GraveAccent => State::Nok,
+ Some(_) => {
tokenizer.consume();
- State::Fn(Box::new(|t| info_inside(t, info, codes)))
+ State::Fn(Box::new(|t| info_inside(t, info)))
}
}
}
@@ -329,7 +320,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> S
/// ```
fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::CodeFencedFence);
// Do not form containers.
tokenizer.concrete = true;
@@ -353,7 +344,7 @@ fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::Data);
tokenizer.exit(Token::CodeFencedFenceMeta);
tokenizer.exit(Token::CodeFencedFence);
@@ -361,7 +352,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
tokenizer.concrete = true;
at_break(tokenizer, info)
}
- Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok,
+ Some('`') if info.kind == Kind::GraveAccent => State::Nok,
_ => {
tokenizer.consume();
State::Fn(Box::new(|t| meta(t, info)))
@@ -422,7 +413,7 @@ fn at_non_lazy_break(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
@@ -461,7 +452,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.enter(Token::CodeFencedFenceSequence);
close_sequence(tokenizer, info, 0)
}
@@ -479,7 +470,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.consume();
State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))
}
@@ -501,7 +492,7 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
/// ```
fn close_sequence_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::CodeFencedFence);
State::Ok
}
@@ -547,9 +538,7 @@ fn content_start(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_break(tokenizer, info)
- }
+ None | Some('\n') => at_break(tokenizer, info),
_ => {
tokenizer.enter(Token::CodeFlowChunk);
content_continue(tokenizer, info)
@@ -567,7 +556,7 @@ fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn content_continue(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::CodeFlowChunk);
at_break(tokenizer, info)
}
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 015c4a0..bb1615c 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -48,7 +48,7 @@
use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
use crate::constant::TAB_SIZE;
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of code (indented).
///
@@ -78,11 +78,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => after(tokenizer),
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
- .attempt(further_start, |ok| {
- Box::new(if ok { at_break } else { after })
- })(tokenizer),
+ None => after(tokenizer),
+ Some('\n') => tokenizer.attempt(further_start, |ok| {
+ Box::new(if ok { at_break } else { after })
+ })(tokenizer),
_ => {
tokenizer.enter(Token::CodeFlowChunk);
content(tokenizer)
@@ -98,7 +97,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State {
/// ```
fn content(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::CodeFlowChunk);
at_break(tokenizer)
}
@@ -134,7 +133,7 @@ fn further_start(tokenizer: &mut Tokenizer) -> State {
State::Nok
} else {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
@@ -178,7 +177,7 @@ fn further_begin(tokenizer: &mut Tokenizer) -> State {
/// ```
fn further_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => further_start(tokenizer),
+ Some('\n') => further_start(tokenizer),
_ => State::Nok,
}
}
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index f5f92fc..150f63b 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -84,7 +84,7 @@
//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of code (text).
///
@@ -98,9 +98,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
let len = tokenizer.events.len();
match tokenizer.current {
- Code::Char('`')
+ Some('`')
if tokenizer.parse_state.constructs.code_text
- && (tokenizer.previous != Code::Char('`')
+ && (tokenizer.previous != Some('`')
|| (len > 0
&& tokenizer.events[len - 1].token_type == Token::CharacterEscape)) =>
{
@@ -119,7 +119,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
- if let Code::Char('`') = tokenizer.current {
+ if let Some('`') = tokenizer.current {
tokenizer.consume();
State::Fn(Box::new(move |t| sequence_open(t, size + 1)))
} else {
@@ -136,14 +136,14 @@ fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
/// ```
fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State {
match tokenizer.current {
- Code::None => State::Nok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None => State::Nok,
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
State::Fn(Box::new(move |t| between(t, size_open)))
}
- Code::Char('`') => {
+ Some('`') => {
tokenizer.enter(Token::CodeTextSequence);
sequence_close(tokenizer, size_open, 0)
}
@@ -162,7 +162,7 @@ fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State {
/// ```
fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '`') => {
+ None | Some('\n' | '`') => {
tokenizer.exit(Token::CodeTextData);
between(tokenizer, size_open)
}
@@ -181,7 +181,7 @@ fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State {
/// ```
fn sequence_close(tokenizer: &mut Tokenizer, size_open: usize, size: usize) -> State {
match tokenizer.current {
- Code::Char('`') => {
+ Some('`') => {
tokenizer.consume();
State::Fn(Box::new(move |t| sequence_close(t, size_open, size + 1)))
}
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index ffaaa98..f2b5ae0 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -100,7 +100,7 @@ use crate::construct::{
partial_title::{start as title, Options as TitleOptions},
};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
use crate::util::skip::opt_back as skip_opt_back;
/// At the start of a definition.
@@ -137,7 +137,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('[') => tokenizer.go(
+ Some('[') => tokenizer.go(
|t| {
label(
t,
@@ -162,7 +162,7 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn label_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(':') => {
+ Some(':') => {
tokenizer.enter(Token::DefinitionMarker);
tokenizer.consume();
tokenizer.exit(Token::DefinitionMarker);
@@ -231,7 +231,7 @@ fn after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn after_whitespace(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::Definition);
// You’d be interrupting.
tokenizer.interrupt = true;
@@ -294,7 +294,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn title_after_after_optional_whitespace(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok,
+ None | Some('\n') => State::Ok,
_ => State::Nok,
}
}
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index 40a83ef..0585c4c 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -40,7 +40,7 @@
//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of a hard break (escape).
///
@@ -51,7 +51,7 @@ use crate::tokenizer::{Code, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('\\') if tokenizer.parse_state.constructs.hard_break_escape => {
+ Some('\\') if tokenizer.parse_state.constructs.hard_break_escape => {
tokenizer.enter(Token::HardBreakEscape);
tokenizer.consume();
State::Fn(Box::new(inside))
@@ -69,7 +69,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.exit(Token::HardBreakEscape);
State::Ok
}
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 5de9a80..7a7cf2e 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -57,7 +57,7 @@
use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE};
use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, Event, EventType, State, Tokenizer};
+use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer};
/// Start of a heading (atx).
///
@@ -87,7 +87,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
- if Code::Char('#') == tokenizer.current {
+ if Some('#') == tokenizer.current {
tokenizer.enter(Token::HeadingAtxSequence);
sequence_open(tokenizer, 0)
} else {
@@ -103,11 +103,11 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') if rank > 0 => {
+ None | Some('\n') if rank > 0 => {
tokenizer.exit(Token::HeadingAtxSequence);
at_break(tokenizer)
}
- Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+ Some('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |tokenizer| {
sequence_open(tokenizer, rank + 1)
@@ -129,21 +129,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::HeadingAtx);
tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve));
// Feel free to interrupt.
tokenizer.interrupt = false;
State::Ok
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.go(space_or_tab(), at_break)(tokenizer)
- }
- Code::Char('#') => {
+ Some('\t' | ' ') => tokenizer.go(space_or_tab(), at_break)(tokenizer),
+ Some('#') => {
tokenizer.enter(Token::HeadingAtxSequence);
further_sequence(tokenizer)
}
- Code::Char(_) => {
+ Some(_) => {
tokenizer.enter_with_content(Token::Data, Some(ContentType::Text));
data(tokenizer)
}
@@ -159,7 +157,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn further_sequence(tokenizer: &mut Tokenizer) -> State {
- if let Code::Char('#') = tokenizer.current {
+ if let Some('#') = tokenizer.current {
tokenizer.consume();
State::Fn(Box::new(further_sequence))
} else {
@@ -177,7 +175,7 @@ fn further_sequence(tokenizer: &mut Tokenizer) -> State {
fn data(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
// Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
+ None | Some('\t' | '\n' | ' ') => {
tokenizer.exit(Token::Data);
at_break(tokenizer)
}
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index a0f7545..f9dd3f7 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -60,7 +60,7 @@
use crate::constant::TAB_SIZE;
use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
use crate::token::Token;
-use crate::tokenizer::{Code, EventType, State, Tokenizer};
+use crate::tokenizer::{EventType, State, Tokenizer};
use crate::util::skip::opt_back as skip_opt_back;
/// Kind of underline.
@@ -148,7 +148,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(char) if char == '-' || char == '=' => {
+ Some(char) if matches!(char, '-' | '=') => {
tokenizer.enter(Token::HeadingSetextUnderline);
inside(tokenizer, Kind::from_char(char))
}
@@ -165,7 +165,7 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
match tokenizer.current {
- Code::Char(char) if char == kind.as_char() => {
+ Some(char) if char == kind.as_char() => {
tokenizer.consume();
State::Fn(Box::new(move |t| inside(t, kind)))
}
@@ -185,7 +185,7 @@ fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
/// ```
fn after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
// Feel free to interrupt.
tokenizer.interrupt = false;
tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve));
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 24d6f98..238963d 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -105,8 +105,10 @@ use crate::construct::{
partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
-use crate::util::codes::{parse, serialize};
+use crate::tokenizer::{Point, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
+
+const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];
/// Kind of HTML (flow).
#[derive(Debug, PartialEq)]
@@ -168,17 +170,6 @@ impl QuoteKind {
_ => unreachable!("invalid char"),
}
}
- /// Turn [Code] into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `code` is not `Code::Char('"' | '\'')`.
- fn from_code(code: Code) -> QuoteKind {
- match code {
- Code::Char(char) => QuoteKind::from_char(char),
- _ => unreachable!("invalid code"),
- }
- }
}
/// State needed to parse HTML (flow).
@@ -190,9 +181,9 @@ struct Info {
start_tag: bool,
/// Used depending on `kind` to either collect all parsed characters, or to
/// store expected characters.
- buffer: Vec<Code>,
- /// `index` into `buffer` when expecting certain characters.
- index: usize,
+ start: Option<Point>,
+ /// Collected index, for various reasons.
+ size: usize,
/// Current quote, when in a double or single quoted attribute value.
quote: Option<QuoteKind>,
}
@@ -234,7 +225,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
- if Code::Char('<') == tokenizer.current {
+ if Some('<') == tokenizer.current {
tokenizer.enter(Token::HtmlFlowData);
tokenizer.consume();
State::Fn(Box::new(open))
@@ -259,21 +250,22 @@ fn open(tokenizer: &mut Tokenizer) -> State {
kind: Kind::Basic,
// Assume closing tag (or no tag).
start_tag: false,
- buffer: vec![],
- index: 0,
+ start: None,
+ size: 0,
quote: None,
};
match tokenizer.current {
- Code::Char('!') => {
+ Some('!') => {
tokenizer.consume();
State::Fn(Box::new(|t| declaration_open(t, info)))
}
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
+ info.start = Some(tokenizer.point.clone());
State::Fn(Box::new(|t| tag_close_start(t, info)))
}
- Code::Char('?') => {
+ Some('?') => {
info.kind = Kind::Instruction;
tokenizer.consume();
// Do not form containers.
@@ -282,8 +274,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {
// right now, so we do need to search for `>`, similar to declarations.
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
info.start_tag = true;
+ info.start = Some(tokenizer.point.clone());
tag_name(tokenizer, info)
}
_ => State::Nok,
@@ -302,19 +295,18 @@ fn open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
info.kind = Kind::Comment;
State::Fn(Box::new(|t| comment_open_inside(t, info)))
}
- Code::Char('[') => {
+ Some('[') => {
tokenizer.consume();
info.kind = Kind::Cdata;
- info.buffer = parse("CDATA[");
- info.index = 0;
+ info.size = 0;
State::Fn(Box::new(|t| cdata_open_inside(t, info)))
}
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
tokenizer.consume();
info.kind = Kind::Declaration;
// Do not form containers.
@@ -333,7 +325,7 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
// Do not form containers.
tokenizer.concrete = true;
@@ -350,20 +342,21 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ^^^^^^
/// ```
fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
- if tokenizer.current == info.buffer[info.index] {
- info.index += 1;
- tokenizer.consume();
+ match tokenizer.current {
+ Some(char) if char == CDATA_SEARCH[info.size] => {
+ info.size += 1;
+ tokenizer.consume();
- if info.index == info.buffer.len() {
- info.buffer.clear();
- // Do not form containers.
- tokenizer.concrete = true;
- State::Fn(Box::new(|t| continuation(t, info)))
- } else {
- State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+ if info.size == CDATA_SEARCH.len() {
+ info.size = 0;
+ // Do not form containers.
+ tokenizer.concrete = true;
+ State::Fn(Box::new(|t| continuation(t, info)))
+ } else {
+ State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+ }
}
- } else {
- State::Nok
+ _ => State::Nok,
}
}
@@ -373,11 +366,10 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// > | </x>
/// ^
/// ```
-fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
tokenizer.consume();
- info.buffer.push(tokenizer.current);
State::Fn(Box::new(|t| tag_name(t, info)))
}
_ => State::Nok,
@@ -394,22 +386,27 @@ fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::None
- | Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
- let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
- let name = tag_name_buffer.as_str();
- let slash = matches!(tokenizer.current, Code::Char('/'));
-
- info.buffer.clear();
-
- if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) {
+ None | Some('\t' | '\n' | ' ' | '/' | '>') => {
+ let slash = matches!(tokenizer.current, Some('/'));
+ let start = info.start.take().unwrap();
+ let name = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position {
+ start: &start,
+ end: &tokenizer.point,
+ },
+ )
+ .serialize()
+ .trim()
+ .to_lowercase();
+
+ if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
info.kind = Kind::Raw;
// Do not form containers.
tokenizer.concrete = true;
continuation(tokenizer, info)
- } else if HTML_BLOCK_NAMES.contains(&name) {
+ } else if HTML_BLOCK_NAMES.contains(&name.as_str()) {
// Basic is assumed, no need to set `kind`.
if slash {
tokenizer.consume();
@@ -432,12 +429,11 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
}
}
}
- Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+ Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
tokenizer.consume();
- info.buffer.push(tokenizer.current);
State::Fn(Box::new(|t| tag_name(t, info)))
}
- Code::Char(_) => State::Nok,
+ Some(_) => State::Nok,
}
}
@@ -449,7 +445,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.consume();
// Do not form containers.
tokenizer.concrete = true;
@@ -467,7 +463,7 @@ fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_closing_tag_after(t, info)))
}
@@ -496,15 +492,15 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_end(t, info)))
}
- Code::Char('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
+ Some('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
}
@@ -524,7 +520,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat
/// ```
fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
+ Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
}
@@ -543,11 +539,11 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('=') => {
+ Some('=') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))
}
@@ -566,13 +562,13 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State
/// ```
fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok,
- Code::Char('"' | '\'') => {
+ None | Some('<' | '=' | '>' | '`') => State::Nok,
+ Some(char) if matches!(char, '"' | '\'') => {
+ info.quote = Some(QuoteKind::from_char(char));
tokenizer.consume();
- info.quote = Some(QuoteKind::from_code(tokenizer.current));
State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
}
@@ -590,8 +586,8 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) ->
/// ```
fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Nok,
- Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => {
+ None | Some('\n') => State::Nok,
+ Some(char) if char == info.quote.as_ref().unwrap().as_char() => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))
}
@@ -610,13 +606,10 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> Sta
/// ```
fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None
- | Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
+ None | Some('\t' | '\n' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
complete_attribute_name_after(tokenizer, info)
}
- Code::Char(_) => {
+ Some(_) => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_unquoted(t, info)))
}
@@ -632,9 +625,7 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> S
/// ```
fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
- complete_attribute_name_before(tokenizer, info)
- }
+ Some('\t' | ' ' | '/' | '>') => complete_attribute_name_before(tokenizer, info),
_ => State::Nok,
}
}
@@ -647,7 +638,7 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info)
/// ```
fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_after(t, info)))
}
@@ -663,16 +654,16 @@ fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
// Do not form containers.
tokenizer.concrete = true;
continuation(tokenizer, info)
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_after(t, info)))
}
- Code::Char(_) => State::Nok,
+ Some(_) => State::Nok,
}
}
@@ -684,29 +675,27 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-') if info.kind == Kind::Comment => {
+ Some('-') if info.kind == Kind::Comment => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_comment_inside(t, info)))
}
- Code::Char('<') if info.kind == Kind::Raw => {
+ Some('<') if info.kind == Kind::Raw => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_raw_tag_open(t, info)))
}
- Code::Char('>') if info.kind == Kind::Declaration => {
+ Some('>') if info.kind == Kind::Declaration => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_close(t, info)))
}
- Code::Char('?') if info.kind == Kind::Instruction => {
+ Some('?') if info.kind == Kind::Instruction => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
- Code::Char(']') if info.kind == Kind::Cdata => {
+ Some(']') if info.kind == Kind::Cdata => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))
}
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- if info.kind == Kind::Basic || info.kind == Kind::Complete =>
- {
+ Some('\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
tokenizer.exit(Token::HtmlFlowData);
tokenizer.check(blank_line_before, |ok| {
if ok {
@@ -716,7 +705,7 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
}
})(tokenizer)
}
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::HtmlFlowData);
continuation_start(tokenizer, info)
}
@@ -753,7 +742,7 @@ fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
@@ -772,9 +761,7 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- continuation_start(tokenizer, info)
- }
+ None | Some('\n') => continuation_start(tokenizer, info),
_ => {
tokenizer.enter(Token::HtmlFlowData);
continuation(tokenizer, info)
@@ -790,7 +777,7 @@ fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
@@ -804,10 +791,11 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
/// > | <script>console.log(1)</script>
/// ^
/// ```
-fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State {
+fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
+ info.start = Some(tokenizer.point.clone());
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => continuation(tokenizer, info),
@@ -822,24 +810,34 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
- let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
- info.buffer.clear();
-
- if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) {
+ Some('>') => {
+ info.size = 0;
+
+ let start = info.start.take().unwrap();
+ let name = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position {
+ start: &start,
+ end: &tokenizer.point,
+ },
+ )
+ .serialize()
+ .to_lowercase();
+
+ if HTML_RAW_NAMES.contains(&name.as_str()) {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_close(t, info)))
} else {
continuation(tokenizer, info)
}
}
- Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => {
+ Some('A'..='Z' | 'a'..='z') if info.size < HTML_RAW_SIZE_MAX => {
tokenizer.consume();
- info.buffer.push(tokenizer.current);
+ info.size += 1;
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => {
- info.buffer.clear();
+ info.size = 0;
continuation(tokenizer, info)
}
}
@@ -853,7 +851,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State
/// ```
fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char(']') => {
+ Some(']') => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
@@ -877,11 +875,11 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) ->
/// ```
fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_close(t, info)))
}
- Code::Char('-') if info.kind == Kind::Comment => {
+ Some('-') if info.kind == Kind::Comment => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
@@ -897,7 +895,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> Sta
/// ```
fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::HtmlFlowData);
continuation_after(tokenizer)
}
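
One detail worth pulling out of the html (flow) changes: the old code parsed the literal `CDATA[` into a `Vec<Code>` buffer and walked an `index` into it; the new code keeps a `const CDATA_SEARCH: [char; 6]` and only a `size` counter. A rough standalone illustration of that matching (the iterative loop is a simplification of the crate's one-state-call-per-character flow):

// Sketch: match the expected literal `CDATA[` against a const array
// using a counter, instead of allocating a buffer of parsed codes.
const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];

/// How many characters to consume if `input` opens with `CDATA[`.
fn match_cdata_open(input: &str) -> Option<usize> {
    let mut size = 0;
    for char in input.chars() {
        if size == CDATA_SEARCH.len() {
            break;
        }
        if char == CDATA_SEARCH[size] {
            size += 1;
        } else {
            return None;
        }
    }
    if size == CDATA_SEARCH.len() {
        Some(size)
    } else {
        None
    }
}

fn main() {
    assert_eq!(match_cdata_open("CDATA[>&<]]>"), Some(6));
    assert_eq!(match_cdata_open("CDAT"), None);
    assert_eq!(match_cdata_open("TITLE["), None);
}
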
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index 3ac8d71..b1ad113 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -56,8 +56,9 @@
use crate::construct::partial_space_or_tab::space_or_tab;
use crate::token::Token;
-use crate::tokenizer::{Code, State, StateFn, Tokenizer};
-use crate::util::codes::parse;
+use crate::tokenizer::{State, StateFn, Tokenizer};
+
+const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];
/// Start of HTML (text)
///
@@ -66,7 +67,7 @@ use crate::util::codes::parse;
/// ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- if Code::Char('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text {
+ if Some('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text {
tokenizer.enter(Token::HtmlText);
tokenizer.enter(Token::HtmlTextData);
tokenizer.consume();
@@ -88,19 +89,19 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('!') => {
+ Some('!') => {
tokenizer.consume();
State::Fn(Box::new(declaration_open))
}
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
State::Fn(Box::new(tag_close_start))
}
- Code::Char('?') => {
+ Some('?') => {
tokenizer.consume();
State::Fn(Box::new(instruction))
}
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open))
}
@@ -120,16 +121,15 @@ fn open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn declaration_open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
State::Fn(Box::new(comment_open_inside))
}
- Code::Char('[') => {
+ Some('[') => {
tokenizer.consume();
- let buffer = parse("CDATA[");
- State::Fn(Box::new(|t| cdata_open_inside(t, buffer, 0)))
+ State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
}
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(declaration))
}
@@ -145,7 +145,7 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn comment_open_inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
State::Fn(Box::new(comment_start))
}
@@ -168,8 +168,8 @@ fn comment_open_inside(tokenizer: &mut Tokenizer) -> State {
/// [html_flow]: crate::construct::html_flow
fn comment_start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::Char('>') => State::Nok,
- Code::Char('-') => {
+ None | Some('>') => State::Nok,
+ Some('-') => {
tokenizer.consume();
State::Fn(Box::new(comment_start_dash))
}
@@ -192,7 +192,7 @@ fn comment_start(tokenizer: &mut Tokenizer) -> State {
/// [html_flow]: crate::construct::html_flow
fn comment_start_dash(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::Char('>') => State::Nok,
+ None | Some('>') => State::Nok,
_ => comment(tokenizer),
}
}
@@ -205,11 +205,9 @@ fn comment_start_dash(tokenizer: &mut Tokenizer) -> State {
/// ```
fn comment(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Nok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(comment))
- }
- Code::Char('-') => {
+ None => State::Nok,
+ Some('\n') => at_line_ending(tokenizer, Box::new(comment)),
+ Some('-') => {
tokenizer.consume();
State::Fn(Box::new(comment_close))
}
@@ -228,7 +226,7 @@ fn comment(tokenizer: &mut Tokenizer) -> State {
/// ```
fn comment_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
State::Fn(Box::new(end))
}
@@ -242,17 +240,18 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
/// > | a <![CDATA[>&<]]> b
/// ^^^^^^
/// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer, buffer: Vec<Code>, index: usize) -> State {
- if tokenizer.current == buffer[index] {
- tokenizer.consume();
+fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
+ match tokenizer.current {
+ Some(char) if char == CDATA_SEARCH[index] => {
+ tokenizer.consume();
- if index + 1 == buffer.len() {
- State::Fn(Box::new(cdata))
- } else {
- State::Fn(Box::new(move |t| cdata_open_inside(t, buffer, index + 1)))
+ if index + 1 == CDATA_SEARCH.len() {
+ State::Fn(Box::new(cdata))
+ } else {
+ State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1)))
+ }
}
- } else {
- State::Nok
+ _ => State::Nok,
}
}
@@ -264,11 +263,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, buffer: Vec<Code>, index: usize)
/// ```
fn cdata(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Nok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(cdata))
- }
- Code::Char(']') => {
+ None => State::Nok,
+ Some('\n') => at_line_ending(tokenizer, Box::new(cdata)),
+ Some(']') => {
tokenizer.consume();
State::Fn(Box::new(cdata_close))
}
@@ -287,7 +284,7 @@ fn cdata(tokenizer: &mut Tokenizer) -> State {
/// ```
fn cdata_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(']') => {
+ Some(']') => {
tokenizer.consume();
State::Fn(Box::new(cdata_end))
}
@@ -303,8 +300,8 @@ fn cdata_close(tokenizer: &mut Tokenizer) -> State {
/// ```
fn cdata_end(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('>') => end(tokenizer),
- Code::Char(']') => cdata_close(tokenizer),
+ Some('>') => end(tokenizer),
+ Some(']') => cdata_close(tokenizer),
_ => cdata(tokenizer),
}
}
@@ -317,10 +314,8 @@ fn cdata_end(tokenizer: &mut Tokenizer) -> State {
/// ```
fn declaration(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::Char('>') => end(tokenizer),
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(declaration))
- }
+ None | Some('>') => end(tokenizer),
+ Some('\n') => at_line_ending(tokenizer, Box::new(declaration)),
_ => {
tokenizer.consume();
State::Fn(Box::new(declaration))
@@ -336,11 +331,9 @@ fn declaration(tokenizer: &mut Tokenizer) -> State {
/// ```
fn instruction(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Nok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(instruction))
- }
- Code::Char('?') => {
+ None => State::Nok,
+ Some('\n') => at_line_ending(tokenizer, Box::new(instruction)),
+ Some('?') => {
tokenizer.consume();
State::Fn(Box::new(instruction_close))
}
@@ -359,7 +352,7 @@ fn instruction(tokenizer: &mut Tokenizer) -> State {
/// ```
fn instruction_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('>') => end(tokenizer),
+ Some('>') => end(tokenizer),
_ => instruction(tokenizer),
}
}
@@ -372,7 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(tag_close))
}
@@ -388,7 +381,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_close(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+ Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(tag_close))
}
@@ -404,10 +397,8 @@ fn tag_close(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(tag_close_between))
- }
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\n') => at_line_ending(tokenizer, Box::new(tag_close_between)),
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(tag_close_between))
}
@@ -423,13 +414,11 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+ Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open))
}
- Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer),
+ Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer),
_ => State::Nok,
}
}
@@ -442,18 +431,16 @@ fn tag_open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(tag_open_between))
- }
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_between)),
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_between))
}
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
State::Fn(Box::new(end))
}
- Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => {
+ Some(':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_name))
}
@@ -469,7 +456,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
+ Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_name))
}
@@ -486,14 +473,12 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after))
- }
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)),
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_name_after))
}
- Code::Char('=') => {
+ Some('=') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_before))
}
@@ -510,19 +495,17 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before))
- }
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ None | Some('<' | '=' | '>' | '`') => State::Nok,
+ Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)),
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_before))
}
- Code::Char(char) if char == '"' || char == '\'' => {
+ Some(char) if char == '"' || char == '\'' => {
tokenizer.consume();
State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, char)))
}
- Code::Char(_) => {
+ Some(_) => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_unquoted))
}
@@ -537,12 +520,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> State {
match tokenizer.current {
- Code::None => State::Nok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => at_line_ending(
+ None => State::Nok,
+ Some('\n') => at_line_ending(
tokenizer,
Box::new(move |t| tag_open_attribute_value_quoted(t, marker)),
),
- Code::Char(char) if char == marker => {
+ Some(char) if char == marker => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_quoted_after))
}
@@ -563,11 +546,9 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> S
/// ```
fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => State::Nok,
- Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer),
- Code::Char(_) => {
+ None | Some('"' | '\'' | '<' | '=' | '`') => State::Nok,
+ Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer),
+ Some(_) => {
tokenizer.consume();
State::Fn(Box::new(tag_open_attribute_value_unquoted))
}
@@ -583,9 +564,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
/// ```
fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ' | '>' | '/') => tag_open_between(tokenizer),
+ Some('\t' | '\n' | ' ' | '>' | '/') => tag_open_between(tokenizer),
_ => State::Nok,
}
}
@@ -598,7 +577,7 @@ fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn end(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.consume();
tokenizer.exit(Token::HtmlTextData);
tokenizer.exit(Token::HtmlText);
@@ -620,7 +599,7 @@ fn end(tokenizer: &mut Tokenizer) -> State {
/// ```
fn at_line_ending(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.exit(Token::HtmlTextData);
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
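
A pattern visible in most html (text) states above: every `Some('\n')` arm defers to one `at_line_ending` helper, which closes the data token, tokenizes the line ending, and then resumes the interrupted state via a boxed `StateFn`. A loose sketch of that hand-off, with events reduced to strings (assumed, simplified types):

// Sketch: handle `\n` once, then resume whichever state was
// interrupted, passed in as a boxed one-shot state function.
type StateFn = dyn FnOnce(&mut Vec<String>);

fn at_line_ending(events: &mut Vec<String>, return_state: Box<StateFn>) {
    events.push("exit:HtmlTextData".into());
    events.push("enter:LineEnding".into());
    events.push("exit:LineEnding".into());
    events.push("enter:HtmlTextData".into());
    // Hand control back to the interrupted state (comment, cdata, ...).
    return_state(events);
}

fn main() {
    let mut events = Vec::new();
    at_line_ending(&mut events, Box::new(|e| e.push("resume:comment".into())));
    println!("{:?}", events);
}
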
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 6f0a707..5ea788f 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -1,4 +1,4 @@
-//! Label end is a construct that occurs in the [text][] content type.
+//! Label end is a construct that occurs in the [text][] content type.
//!
//! It forms with the following BNF:
//!
@@ -154,10 +154,11 @@ use crate::construct::{
partial_title::{start as title, Options as TitleOptions},
};
use crate::token::Token;
-use crate::tokenizer::{Code, Event, EventType, Media, State, Tokenizer};
+use crate::tokenizer::{Event, EventType, Media, State, Tokenizer};
use crate::util::{
normalize_identifier::normalize_identifier,
- span::{serialize, Span},
+ skip,
+ slice::{Position, Slice},
};
/// State needed to parse label end.
@@ -181,7 +182,7 @@ struct Info {
/// > | [a] b
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
- if Code::Char(']') == tokenizer.current && tokenizer.parse_state.constructs.label_end {
+ if Some(']') == tokenizer.current && tokenizer.parse_state.constructs.label_end {
let mut label_start_index = None;
let mut index = tokenizer.label_start_stack.len();
@@ -207,19 +208,23 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
}
let label_end_start = tokenizer.events.len();
+
let info = Info {
label_start_index,
media: Media {
start: label_start.start,
end: (label_end_start, label_end_start + 3),
- id: normalize_identifier(&serialize(
- &tokenizer.parse_state.codes,
- &Span {
- start_index: tokenizer.events[label_start.start.1].point.index,
- end_index: tokenizer.events[label_end_start - 1].point.index,
- },
- false,
- )),
+ // To do: virtual spaces not needed, create a `to_str`?
+ id: normalize_identifier(
+ &Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position {
+ start: &tokenizer.events[label_start.start.1].point,
+ end: &tokenizer.events[label_end_start - 1].point,
+ },
+ )
+ .serialize(),
+ ),
},
};
@@ -253,7 +258,7 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
// Resource (`[asd](fgh)`)?
- Code::Char('(') => tokenizer.attempt(resource, move |is_ok| {
+ Some('(') => tokenizer.attempt(resource, move |is_ok| {
Box::new(move |t| {
// Also fine if `defined`, as then it’s a valid shortcut.
if is_ok || defined {
@@ -264,7 +269,7 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State {
})
})(tokenizer),
// Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference?
- Code::Char('[') => tokenizer.attempt(full_reference, move |is_ok| {
+ Some('[') => tokenizer.attempt(full_reference, move |is_ok| {
Box::new(move |t| {
if is_ok {
ok(t, info)
@@ -377,7 +382,7 @@ fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State {
/// ```
fn resource(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('(') => {
+ Some('(') => {
tokenizer.enter(Token::Resource);
tokenizer.enter(Token::ResourceMarker);
tokenizer.consume();
@@ -406,7 +411,7 @@ fn resource_start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn resource_open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(')') => resource_end(tokenizer),
+ Some(')') => resource_end(tokenizer),
_ => tokenizer.go(
|t| {
destination(
@@ -446,7 +451,7 @@ fn destination_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn resource_between(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('"' | '\'' | '(') => tokenizer.go(
+ Some('"' | '\'' | '(') => tokenizer.go(
|t| {
title(
t,
@@ -481,7 +486,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn resource_end(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(')') => {
+ Some(')') => {
tokenizer.enter(Token::ResourceMarker);
tokenizer.consume();
tokenizer.exit(Token::ResourceMarker);
@@ -500,7 +505,7 @@ fn resource_end(tokenizer: &mut Tokenizer) -> State {
/// ```
fn full_reference(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('[') => tokenizer.go(
+ Some('[') => tokenizer.go(
|t| {
label(
t,
@@ -524,36 +529,23 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
- let events = &tokenizer.events;
- let mut index = events.len() - 1;
- let mut start: Option<usize> = None;
- let mut end: Option<usize> = None;
-
- while index > 0 {
- index -= 1;
- let event = &events[index];
- if event.token_type == Token::ReferenceString {
- if event.event_type == EventType::Exit {
- end = Some(event.point.index);
- } else {
- start = Some(event.point.index);
- break;
- }
- }
- }
+ let end = skip::to_back(
+ &tokenizer.events,
+ tokenizer.events.len() - 1,
+ &[Token::ReferenceString],
+ );
+
+ // To do: virtual spaces not needed, create a `to_str`?
+ let id = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position::from_exit_event(&tokenizer.events, end),
+ )
+ .serialize();
if tokenizer
.parse_state
.definitions
- .contains(&normalize_identifier(&serialize(
- &tokenizer.parse_state.codes,
- &Span {
- // Always found, otherwise we don’t get here.
- start_index: start.unwrap(),
- end_index: end.unwrap(),
- },
- false,
- )))
+ .contains(&normalize_identifier(&id))
{
State::Ok
} else {
@@ -571,7 +563,7 @@ fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn collapsed_reference(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('[') => {
+ Some('[') => {
tokenizer.enter(Token::Reference);
tokenizer.enter(Token::ReferenceMarker);
tokenizer.consume();
@@ -592,7 +584,7 @@ fn collapsed_reference(tokenizer: &mut Tokenizer) -> State {
/// ```
fn collapsed_reference_open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char(']') => {
+ Some(']') => {
tokenizer.enter(Token::ReferenceMarker);
tokenizer.consume();
tokenizer.exit(Token::ReferenceMarker);
@@ -735,7 +727,11 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) {
0,
vec![Event {
event_type: EventType::Exit,
- token_type: Token::Link,
+ token_type: if group_enter_event.token_type == Token::LabelLink {
+ Token::Link
+ } else {
+ Token::Image
+ },
point: events[group_end_index].point.clone(),
link: None,
}],
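
The `Slice`/`Position` pair that replaces `span::serialize` above reads the source text back out of the shared `chars` buffer between two recorded points. A simplified stand-in for `Slice::from_position(...).serialize()` — the real API lives in `src/util/slice.rs`; the shape below is an assumption for illustration:

// Sketch: slice the original `chars` between two event points and turn
// the result back into a `String`, e.g. to normalize a reference id.
struct Point {
    /// Absolute index into the parsed `chars`.
    index: usize,
}

struct Slice<'a> {
    chars: &'a [char],
}

impl<'a> Slice<'a> {
    fn from_position(chars: &'a [char], start: &Point, end: &Point) -> Self {
        Slice {
            chars: &chars[start.index..end.index],
        }
    }

    fn serialize(&self) -> String {
        self.chars.iter().collect()
    }
}

fn main() {
    let chars: Vec<char> = "[Alpha]: /url".chars().collect();
    // Points as the label events might record them (indices assumed).
    let (start, end) = (Point { index: 1 }, Point { index: 6 });
    let id = Slice::from_position(&chars, &start, &end)
        .serialize()
        .to_lowercase();
    assert_eq!(id, "alpha");
}
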
diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs
index 8c12ffe..078026d 100644
--- a/src/construct/label_start_image.rs
+++ b/src/construct/label_start_image.rs
@@ -30,7 +30,7 @@
use super::label_end::resolve_media;
use crate::token::Token;
-use crate::tokenizer::{Code, LabelStart, State, Tokenizer};
+use crate::tokenizer::{LabelStart, State, Tokenizer};
/// Start of label (image) start.
///
@@ -40,7 +40,7 @@ use crate::tokenizer::{Code, LabelStart, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('!') if tokenizer.parse_state.constructs.label_start_image => {
+ Some('!') if tokenizer.parse_state.constructs.label_start_image => {
tokenizer.enter(Token::LabelImage);
tokenizer.enter(Token::LabelImageMarker);
tokenizer.consume();
@@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
pub fn open(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('[') => {
+ Some('[') => {
tokenizer.enter(Token::LabelMarker);
tokenizer.consume();
tokenizer.exit(Token::LabelMarker);
diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs
index e13cd77..d7ae1d6 100644
--- a/src/construct/label_start_link.rs
+++ b/src/construct/label_start_link.rs
@@ -29,7 +29,7 @@
use super::label_end::resolve_media;
use crate::token::Token;
-use crate::tokenizer::{Code, LabelStart, State, Tokenizer};
+use crate::tokenizer::{LabelStart, State, Tokenizer};
/// Start of label (link) start.
///
@@ -39,7 +39,7 @@ use crate::tokenizer::{Code, LabelStart, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('[') if tokenizer.parse_state.constructs.label_start_link => {
+ Some('[') if tokenizer.parse_state.constructs.label_start_link => {
let start = tokenizer.events.len();
tokenizer.enter(Token::LabelLink);
tokenizer.enter(Token::LabelMarker);
diff --git a/src/construct/list.rs b/src/construct/list.rs
index f5bb0ce..355eeee 100644
--- a/src/construct/list.rs
+++ b/src/construct/list.rs
@@ -50,10 +50,10 @@ use crate::construct::{
thematic_break::start as thematic_break,
};
use crate::token::Token;
-use crate::tokenizer::{Code, EventType, State, Tokenizer};
+use crate::tokenizer::{EventType, State, Tokenizer};
use crate::util::{
skip,
- span::{codes as codes_from_span, from_exit_event},
+ slice::{Position, Slice},
};
/// Type of list.
@@ -117,17 +117,6 @@ impl Kind {
_ => unreachable!("invalid char"),
}
}
- /// Turn [Code] into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `code` is not `Code::Char('.' | ')' | '*' | '+' | '-')`.
- fn from_code(code: Code) -> Kind {
- match code {
- Code::Char(char) => Kind::from_char(char),
- _ => unreachable!("invalid code"),
- }
- }
}
/// Start of list item.
@@ -160,11 +149,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
// Unordered.
- Code::Char('*' | '+' | '-') => tokenizer.check(thematic_break, |ok| {
+ Some('*' | '+' | '-') => tokenizer.check(thematic_break, |ok| {
Box::new(if ok { nok } else { before_unordered })
})(tokenizer),
// Ordered.
- Code::Char(char) if char.is_ascii_digit() && (!tokenizer.interrupt || char == '1') => {
+ Some(char) if char.is_ascii_digit() && (!tokenizer.interrupt || char == '1') => {
tokenizer.enter(Token::ListItemPrefix);
tokenizer.enter(Token::ListItemValue);
inside(tokenizer, 0)
@@ -194,11 +183,11 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer, size: usize) -> State {
match tokenizer.current {
- Code::Char(char) if char.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
+ Some(char) if char.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => {
tokenizer.consume();
State::Fn(Box::new(move |t| inside(t, size + 1)))
}
- Code::Char('.' | ')') if !tokenizer.interrupt || size < 2 => {
+ Some('.' | ')') if !tokenizer.interrupt || size < 2 => {
tokenizer.exit(Token::ListItemValue);
marker(tokenizer)
}
@@ -273,10 +262,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
- if matches!(
- tokenizer.current,
- Code::VirtualSpace | Code::Char('\t' | ' ')
- ) {
+ if matches!(tokenizer.current, Some('\t' | ' ')) {
State::Nok
} else {
State::Ok
@@ -291,7 +277,7 @@ fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn prefix_other(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.enter(Token::SpaceOrTab);
tokenizer.consume();
tokenizer.exit(Token::SpaceOrTab);
@@ -316,8 +302,18 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State {
tokenizer.events.len() - 1,
&[Token::ListItem],
);
- let prefix = tokenizer.point.index - tokenizer.events[start].point.index
- + (if blank { 1 } else { 0 });
+ let mut prefix = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position {
+ start: &tokenizer.events[start].point,
+ end: &tokenizer.point,
+ },
+ )
+ .size();
+
+ if blank {
+ prefix += 1;
+ }
let container = tokenizer.container.as_mut().unwrap();
container.blank_initial = blank;
@@ -403,12 +399,15 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {
if event.token_type == Token::ListItem {
if event.event_type == EventType::Enter {
let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1;
- let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]) + 1;
- let codes = codes_from_span(
- &tokenizer.parse_state.codes,
- &from_exit_event(&tokenizer.events, marker),
+ let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]);
+ let kind = Kind::from_char(
+ Slice::from_point(
+ &tokenizer.parse_state.chars,
+ &tokenizer.events[marker].point,
+ )
+ .head()
+ .unwrap(),
);
- let kind = Kind::from_code(codes[0]);
let current = (kind, balance, index, end);
let mut list_index = lists_wip.len();
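
In `resolve_list_item` above, the marker kind is now read with a one-character slice at the marker's point (`Slice::from_point(...).head()`) instead of serializing a span of codes. A tiny sketch of that read, with `Kind::from_char` in the spirit of the diff and `head` as an assumed stand-in for the slice API:

// Sketch: look up the single marker character at an event point and
// map it to a list kind.
#[derive(Debug, PartialEq)]
enum Kind {
    Dot,
    Paren,
    Asterisk,
    Plus,
    Dash,
}

impl Kind {
    fn from_char(char: char) -> Kind {
        match char {
            '.' => Kind::Dot,
            ')' => Kind::Paren,
            '*' => Kind::Asterisk,
            '+' => Kind::Plus,
            '-' => Kind::Dash,
            _ => unreachable!("invalid char"),
        }
    }
}

/// First char at `index`, like `Slice::from_point(chars, point).head()`.
fn head(chars: &[char], index: usize) -> Option<char> {
    chars.get(index).copied()
}

fn main() {
    let chars: Vec<char> = "1. one".chars().collect();
    // The `ListItemMarker` for `1.` points at index 1 (assumed here).
    assert_eq!(Kind::from_char(head(&chars, 1).unwrap()), Kind::Dot);
}
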
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index 4bce6a4..5d230d3 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -33,7 +33,7 @@
//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element
use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, EventType, State, Tokenizer};
+use crate::tokenizer::{ContentType, EventType, State, Tokenizer};
use crate::util::skip::opt as skip_opt;
/// Before a paragraph.
@@ -44,7 +44,7 @@ use crate::util::skip::opt as skip_opt;
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
unreachable!("unexpected eol/eof")
}
_ => {
@@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::Data);
tokenizer.exit(Token::Paragraph);
tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve));
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index 4216276..0b66b09 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -7,7 +7,7 @@
//! [text]: crate::content::text
use crate::token::Token;
-use crate::tokenizer::{Code, EventType, State, Tokenizer};
+use crate::tokenizer::{EventType, State, Tokenizer};
/// At the beginning of data.
///
@@ -15,13 +15,14 @@ use crate::tokenizer::{Code, EventType, State, Tokenizer};
/// > | abc
/// ^
/// ```
-pub fn start(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State {
- if stop.contains(&tokenizer.current) {
- tokenizer.enter(Token::Data);
- tokenizer.consume();
- State::Fn(Box::new(move |t| data(t, stop)))
- } else {
- at_break(tokenizer, stop)
+pub fn start(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State {
+ match tokenizer.current {
+ Some(char) if stop.contains(&char) => {
+ tokenizer.enter(Token::Data);
+ tokenizer.consume();
+ State::Fn(Box::new(move |t| data(t, stop)))
+ }
+ _ => at_break(tokenizer, stop),
}
}
@@ -31,16 +32,16 @@ pub fn start(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State {
/// > | abc
/// ^
/// ```
-fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State {
+fn at_break(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State {
match tokenizer.current {
- Code::None => State::Ok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None => State::Ok,
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
State::Fn(Box::new(move |t| at_break(t, stop)))
}
- _ if stop.contains(&tokenizer.current) => {
+ Some(char) if stop.contains(&char) => {
tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data));
State::Ok
}
@@ -57,10 +58,10 @@ fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State {
/// > | abc
/// ^^^
/// ```
-fn data(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State {
+fn data(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State {
let done = match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => true,
- _ if stop.contains(&tokenizer.current) => true,
+ None | Some('\n') => true,
+ Some(char) if stop.contains(&char) => true,
_ => false,
};
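
The data construct above now takes its stop characters as a plain `&'static [char]` and tests membership with `contains`; `None` and `'\n'` are handled separately as EOF and line ending. A small illustration of splitting text on such a stop list (the loop is an illustration only; the crate advances one state call per character):

// Sketch: break text into data runs that end at `\n` or a stop marker.
const STOP: &[char] = &['*', '_', '['];

fn chunks(input: &str) -> Vec<String> {
    let mut out = Vec::new();
    let mut current = String::new();
    for char in input.chars() {
        if char == '\n' || STOP.contains(&char) {
            if !current.is_empty() {
                out.push(current.clone());
                current.clear();
            }
            // Line endings and stop markers become their own chunks.
            out.push(char.to_string());
        } else {
            current.push(char);
        }
    }
    if !current.is_empty() {
        out.push(current);
    }
    out
}

fn main() {
    assert_eq!(
        chunks("a *b*\nc"),
        vec!["a ", "*", "b", "*", "\n", "c"]
    );
}
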
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 6a984e2..6447228 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -72,7 +72,7 @@
//! [sanitize_uri]: crate::util::sanitize_uri
use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, Tokenizer};
+use crate::tokenizer::{ContentType, State, Tokenizer};
/// Configuration.
///
@@ -117,7 +117,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
};
match tokenizer.current {
- Code::Char('<') => {
+ Some('<') => {
tokenizer.enter(info.options.destination.clone());
tokenizer.enter(info.options.literal.clone());
tokenizer.enter(info.options.marker.clone());
@@ -125,11 +125,9 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
tokenizer.exit(info.options.marker.clone());
State::Fn(Box::new(|t| enclosed_before(t, info)))
}
- Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ' | ')') => {
- State::Nok
- }
- Code::Char(char) if char.is_ascii_control() => State::Nok,
- Code::Char(_) => {
+ None | Some(' ' | ')') => State::Nok,
+ Some(char) if char.is_ascii_control() => State::Nok,
+ Some(_) => {
tokenizer.enter(info.options.destination.clone());
tokenizer.enter(info.options.raw.clone());
tokenizer.enter(info.options.string.clone());
@@ -146,7 +144,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
/// ^
/// ```
fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State {
- if let Code::Char('>') = tokenizer.current {
+ if let Some('>') = tokenizer.current {
tokenizer.enter(info.options.marker.clone());
tokenizer.consume();
tokenizer.exit(info.options.marker.clone());
@@ -168,13 +166,13 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.exit(Token::Data);
tokenizer.exit(info.options.string.clone());
enclosed_before(tokenizer, info)
}
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '<') => State::Nok,
- Code::Char('\\') => {
+ None | Some('\n' | '<') => State::Nok,
+ Some('\\') => {
tokenizer.consume();
State::Fn(Box::new(|t| enclosed_escape(t, info)))
}
@@ -193,7 +191,7 @@ fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('<' | '>' | '\\') => {
+ Some('<' | '>' | '\\') => {
tokenizer.consume();
State::Fn(Box::new(|t| enclosed(t, info)))
}
@@ -209,7 +207,7 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('(') => {
+ Some('(') => {
if info.balance >= info.options.limit {
State::Nok
} else {
@@ -218,7 +216,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {
State::Fn(Box::new(move |t| raw(t, info)))
}
}
- Code::Char(')') => {
+ Some(')') => {
if info.balance == 0 {
tokenizer.exit(Token::Data);
tokenizer.exit(info.options.string.clone());
@@ -231,10 +229,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {
State::Fn(Box::new(move |t| raw(t, info)))
}
}
- Code::None
- | Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ') => {
+ None | Some('\t' | '\n' | ' ') => {
if info.balance > 0 {
State::Nok
} else {
@@ -245,12 +240,12 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {
State::Ok
}
}
- Code::Char(char) if char.is_ascii_control() => State::Nok,
- Code::Char('\\') => {
+ Some(char) if char.is_ascii_control() => State::Nok,
+ Some('\\') => {
tokenizer.consume();
State::Fn(Box::new(move |t| raw_escape(t, info)))
}
- Code::Char(_) => {
+ Some(_) => {
tokenizer.consume();
State::Fn(Box::new(move |t| raw(t, info)))
}
@@ -265,7 +260,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn raw_escape(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('(' | ')' | '\\') => {
+ Some('(' | ')' | '\\') => {
tokenizer.consume();
State::Fn(Box::new(move |t| raw(t, info)))
}
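
The raw destination logic above balances parens with a capped counter: `(` increments up to a limit, `)` at zero balance ends the destination, and whitespace only ends it when balanced. A compact sketch of just that rule (escapes and control-character checks are left out; the limit value here is assumed, where the crate takes it from `options.limit`):

// Sketch: find where a raw destination ends, tracking paren balance.
const LIMIT: usize = 32;

fn raw_destination_end(input: &str) -> Option<usize> {
    let mut balance = 0;
    for (offset, char) in input.char_indices() {
        match char {
            '(' => {
                if balance >= LIMIT {
                    // Too deeply nested: not a destination.
                    return None;
                }
                balance += 1;
            }
            ')' => {
                if balance == 0 {
                    // Closing paren of the resource: destination ends here.
                    return Some(offset);
                }
                balance -= 1;
            }
            '\t' | '\n' | ' ' => {
                // Whitespace ends the destination only when balanced.
                return if balance == 0 { Some(offset) } else { None };
            }
            _ => {}
        }
    }
    if balance == 0 {
        Some(input.len())
    } else {
        None
    }
}

fn main() {
    assert_eq!(raw_destination_end("a(b)c d"), Some(5));
    assert_eq!(raw_destination_end("a(b c"), None);
}
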
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index 91a0e26..ee31533 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -62,7 +62,7 @@ use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions};
use crate::constant::LINK_REFERENCE_SIZE_MAX;
use crate::subtokenize::link;
use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, Tokenizer};
+use crate::tokenizer::{ContentType, State, Tokenizer};
/// Configuration.
///
@@ -98,7 +98,7 @@ struct Info {
/// ```
pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
match tokenizer.current {
- Code::Char('[') => {
+ Some('[') => {
let info = Info {
connect: false,
data: false,
@@ -124,10 +124,10 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::None | Code::Char('[') => State::Nok,
- Code::Char(']') if !info.data => State::Nok,
+ None | Some('[') => State::Nok,
+ Some(']') if !info.data => State::Nok,
_ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok,
- Code::Char(']') => {
+ Some(']') => {
tokenizer.exit(info.options.string.clone());
tokenizer.enter(info.options.marker.clone());
tokenizer.consume();
@@ -135,7 +135,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.exit(info.options.label);
State::Ok
}
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go(
+ Some('\n') => tokenizer.go(
space_or_tab_eol_with_options(EolOptions {
content_type: Some(ContentType::String),
connect: info.connect,
@@ -168,7 +168,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '[' | ']') => {
+ None | Some('\n' | '[' | ']') => {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
@@ -176,12 +176,12 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
info.size += 1;
State::Fn(Box::new(|t| label(t, info)))
}
- Code::Char('\\') => {
+ Some('\\') => {
tokenizer.consume();
info.size += 1;
if !info.data {
@@ -189,7 +189,7 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {
}
State::Fn(Box::new(|t| escape(t, info)))
}
- Code::Char(_) => {
+ Some(_) => {
tokenizer.consume();
info.size += 1;
if !info.data {
@@ -208,7 +208,7 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn escape(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('[' | '\\' | ']') => {
+ Some('[' | '\\' | ']') => {
tokenizer.consume();
info.size += 1;
State::Fn(Box::new(|t| label(t, info)))
diff --git a/src/construct/partial_non_lazy_continuation.rs b/src/construct/partial_non_lazy_continuation.rs
index bdc22e4..068e30f 100644
--- a/src/construct/partial_non_lazy_continuation.rs
+++ b/src/construct/partial_non_lazy_continuation.rs
@@ -11,7 +11,7 @@
//! [html_flow]: crate::construct::html_flow
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Start of continuation.
///
@@ -22,7 +22,7 @@ use crate::tokenizer::{Code, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs
index 5f1a917..6070ffe 100644
--- a/src/construct/partial_space_or_tab.rs
+++ b/src/construct/partial_space_or_tab.rs
@@ -6,7 +6,7 @@
use crate::subtokenize::link;
use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, StateFn, Tokenizer};
+use crate::tokenizer::{ContentType, State, StateFn, Tokenizer};
/// Options to parse `space_or_tab`.
#[derive(Debug)]
@@ -134,7 +134,7 @@ pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> {
/// ```
fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ') if info.options.max > 0 => {
+ Some('\t' | ' ') if info.options.max > 0 => {
tokenizer
.enter_with_content(info.options.kind.clone(), info.options.content_type.clone());
@@ -165,7 +165,7 @@ fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ') if info.size < info.options.max => {
+ Some('\t' | ' ') if info.size < info.options.max => {
tokenizer.consume();
info.size += 1;
State::Fn(Box::new(|t| inside(t, info)))
@@ -190,7 +190,7 @@ fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn after_space_or_tab(tokenizer: &mut Tokenizer, mut info: EolInfo) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.enter_with_content(Token::LineEnding, info.options.content_type.clone());
if info.connect {
@@ -239,10 +239,7 @@ fn after_eol(tokenizer: &mut Tokenizer, info: EolInfo) -> State {
/// ```
fn after_more_space_or_tab(tokenizer: &mut Tokenizer) -> State {
// Blank line not allowed.
- if matches!(
- tokenizer.current,
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- ) {
+ if matches!(tokenizer.current, None | Some('\n')) {
State::Nok
} else {
State::Ok
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index e9528fd..15fc25e 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -33,7 +33,7 @@
use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions};
use crate::subtokenize::link;
use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, Tokenizer};
+use crate::tokenizer::{ContentType, State, Tokenizer};
/// Configuration.
///
@@ -103,19 +103,6 @@ impl Kind {
_ => unreachable!("invalid char"),
}
}
- /// Turn [Code] into a kind.
- ///
- /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`.
- ///
- /// ## Panics
- ///
- /// Panics if `code` is not `Code::Char('(' | '"' | '\'')`.
- fn from_code(code: Code) -> Kind {
- match code {
- Code::Char(char) => Kind::from_char(char),
- _ => unreachable!("invalid code"),
- }
- }
}
/// State needed to parse titles.
@@ -137,10 +124,10 @@ struct Info {
/// ```
pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
match tokenizer.current {
- Code::Char('"' | '\'' | '(') => {
+ Some(char) if matches!(char, '"' | '\'' | '(') => {
let info = Info {
connect: false,
- kind: Kind::from_code(tokenizer.current),
+ kind: Kind::from_char(char),
options,
};
tokenizer.enter(info.options.title.clone());
@@ -163,7 +150,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {
/// ```
fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.enter(info.options.marker.clone());
tokenizer.consume();
tokenizer.exit(info.options.marker.clone());
@@ -185,12 +172,12 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.exit(info.options.string.clone());
begin(tokenizer, info)
}
- Code::None => State::Nok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go(
+ None => State::Nok,
+ Some('\n') => tokenizer.go(
space_or_tab_eol_with_options(EolOptions {
content_type: Some(ContentType::String),
connect: info.connect,
@@ -223,15 +210,15 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn title(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::Data);
at_break(tokenizer, info)
}
- Code::Char('\\') => {
+ Some('\\') => {
tokenizer.consume();
State::Fn(Box::new(|t| escape(t, info)))
}
@@ -250,7 +237,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn escape(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.consume();
State::Fn(Box::new(|t| title(t, info)))
}
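
With `Option<char>`, the `Kind::from_code` shim becomes redundant: the match guard can bind the char and hand it straight to `Kind::from_char`. A condensed sketch of that pattern, using a stand-in `Kind` rather than the full construct:

```rust
// A stand-in for the construct's marker kind.
#[derive(Debug, PartialEq)]
enum Kind {
    DoubleQuote,
    SingleQuote,
    Paren,
}

impl Kind {
    fn from_char(char: char) -> Kind {
        match char {
            '"' => Kind::DoubleQuote,
            '\'' => Kind::SingleQuote,
            '(' => Kind::Paren,
            _ => unreachable!("invalid char"),
        }
    }
}

fn main() {
    let current: Option<char> = Some('(');
    // Before: `Kind::from_code(tokenizer.current)`; now the pattern binds `char`.
    if let Some(char) = current {
        if matches!(char, '"' | '\'' | '(') {
            assert_eq!(Kind::from_char(char), Kind::Paren);
        }
    }
}
```
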
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index 4c94c7d..152824b 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -47,8 +47,8 @@
use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN;
use crate::token::Token;
-use crate::tokenizer::{Code, Event, EventType, Tokenizer};
-use crate::util::span;
+use crate::tokenizer::{Event, EventType, Tokenizer};
+use crate::util::slice::{Position, Slice};
/// To do.
pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) {
@@ -85,30 +85,26 @@ fn trim_data(
trim_end: bool,
hard_break: bool,
) {
- let mut codes = span::codes(
- &tokenizer.parse_state.codes,
- &span::from_exit_event(&tokenizer.events, exit_index),
+ let mut slice = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position::from_exit_event(&tokenizer.events, exit_index),
);
if trim_end {
- let mut index = codes.len();
- let mut vs = 0;
- let mut spaces_only = true;
+ let mut index = slice.chars.len();
+ let vs = slice.after;
+ let mut spaces_only = vs == 0;
while index > 0 {
- match codes[index - 1] {
- Code::Char(' ') => {}
- Code::Char('\t') => spaces_only = false,
- Code::VirtualSpace => {
- vs += 1;
- spaces_only = false;
- }
+ match slice.chars[index - 1] {
+ ' ' => {}
+ '\t' => spaces_only = false,
_ => break,
}
index -= 1;
}
- let diff = codes.len() - index;
+ let diff = slice.chars.len() - index;
let token_type = if spaces_only
&& hard_break
&& exit_index + 1 < tokenizer.events.len()
@@ -127,12 +123,12 @@ fn trim_data(
return;
}
- if diff > 0 {
+ if diff > 0 || vs > 0 {
let exit_point = tokenizer.events[exit_index].point.clone();
let mut enter_point = exit_point.clone();
enter_point.index -= diff;
- enter_point.column -= diff - vs;
- enter_point.offset -= diff - vs;
+ enter_point.column -= diff;
+ enter_point.vs = 0;
tokenizer.map.add(
exit_index + 1,
@@ -154,17 +150,16 @@ fn trim_data(
);
tokenizer.events[exit_index].point = enter_point;
- codes = &codes[..index];
+ slice.chars = &slice.chars[..index];
}
}
if trim_start {
let mut index = 0;
- let mut vs = 0;
- while index < codes.len() {
- match codes[index] {
- Code::Char(' ' | '\t') => {}
- Code::VirtualSpace => vs += 1,
+ let vs = slice.before;
+ while index < slice.chars.len() {
+ match slice.chars[index] {
+ ' ' | '\t' => {}
_ => break,
}
@@ -173,18 +168,18 @@ fn trim_data(
// The whole data is whitespace.
// We can be very fast: we only change the token types.
- if index == codes.len() {
+ if index == slice.chars.len() {
tokenizer.events[exit_index - 1].token_type = Token::SpaceOrTab;
tokenizer.events[exit_index].token_type = Token::SpaceOrTab;
return;
}
- if index > 0 {
+ if index > 0 || vs > 0 {
let enter_point = tokenizer.events[exit_index - 1].point.clone();
let mut exit_point = enter_point.clone();
exit_point.index += index;
- exit_point.column += index - vs;
- exit_point.offset += index - vs;
+ exit_point.column += index;
+ exit_point.vs = 0;
tokenizer.map.add(
exit_index - 1,
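
The trim now reads virtual spaces from `Slice::before`/`Slice::after` rather than scanning for `Code::VirtualSpace` entries. A minimal sketch of the trailing-trim loop under that assumption (a plain `&[char]` plus an `after` count of virtual spaces):

```rust
// Trim trailing spaces/tabs, returning how many real chars were
// trimmed and whether the run was spaces only. Virtual spaces come
// from tabs, so a nonzero `after` rules out "spaces only".
fn trim_end(chars: &[char], after: usize) -> (usize, bool) {
    let mut index = chars.len();
    let mut spaces_only = after == 0;
    while index > 0 {
        match chars[index - 1] {
            ' ' => {}
            '\t' => spaces_only = false,
            _ => break,
        }
        index -= 1;
    }
    (chars.len() - index, spaces_only)
}

fn main() {
    let chars: Vec<char> = "abc  \t".chars().collect();
    assert_eq!(trim_end(&chars, 0), (3, false));
    let chars: Vec<char> = "abc  ".chars().collect();
    assert_eq!(trim_end(&chars, 0), (2, true));
}
```
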
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 41dc6ae..bed454b 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -51,7 +51,7 @@
use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Type of thematic break.
#[derive(Debug, PartialEq)]
@@ -104,19 +104,6 @@ impl Kind {
_ => unreachable!("invalid char"),
}
}
- /// Turn [Code] into a kind.
- ///
- /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`.
- ///
- /// ## Panics
- ///
- /// Panics if `code` is not `Code::Char('*' | '-' | '_')`.
- fn from_code(code: Code) -> Kind {
- match code {
- Code::Char(char) => Kind::from_char(char),
- _ => unreachable!("invalid code"),
- }
- }
}
/// State needed to parse thematic breaks.
@@ -157,10 +144,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::Char('*' | '-' | '_') => at_break(
+ Some(char) if matches!(char, '*' | '-' | '_') => at_break(
tokenizer,
Info {
- kind: Kind::from_code(tokenizer.current),
+ kind: Kind::from_char(char),
size: 0,
},
),
@@ -176,15 +163,13 @@ fn before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN =>
- {
+ None | Some('\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
tokenizer.exit(Token::ThematicBreak);
// Feel free to interrupt.
tokenizer.interrupt = false;
State::Ok
}
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.enter(Token::ThematicBreakSequence);
sequence(tokenizer, info)
}
@@ -200,7 +185,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char(char) if char == info.kind.as_char() => {
+ Some(char) if char == info.kind.as_char() => {
tokenizer.consume();
info.size += 1;
State::Fn(Box::new(|t| sequence(t, info)))
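
The same bind-in-pattern change lands here. For reference, a rough model of what `before`, `at_break`, and `sequence` compute together, reduced to one plain function; the real construct also allows interior whitespace, which this sketch omits:

```rust
const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3;

// Pick the marker from the first char, count the run, and require the
// CommonMark minimum of 3 at the end of the line.
fn is_thematic_break(line: &str) -> bool {
    let mut chars = line.chars();
    let marker = match chars.next() {
        Some(char) if matches!(char, '*' | '-' | '_') => char,
        _ => return false,
    };
    let mut size = 1;
    for char in chars {
        if char == marker {
            size += 1;
        } else {
            return false;
        }
    }
    size >= THEMATIC_BREAK_MARKER_COUNT_MIN
}

fn main() {
    assert!(is_thematic_break("***"));
    assert!(!is_thematic_break("**"));
    assert!(!is_thematic_break("*-*"));
}
```
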
diff --git a/src/content/document.rs b/src/content/document.rs
index 32b32ba..2924f6c 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -17,12 +17,12 @@ use crate::parser::ParseState;
use crate::subtokenize::subtokenize;
use crate::token::Token;
use crate::tokenizer::{
- Code, Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer,
+ Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer,
};
use crate::util::{
normalize_identifier::normalize_identifier,
skip,
- span::{from_exit_event, serialize},
+ slice::{Position, Slice},
};
/// Phases where we can exit containers.
@@ -78,7 +78,7 @@ struct DocumentInfo {
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
let mut tokenizer = Tokenizer::new(point, parse_state);
- let state = tokenizer.push(0, parse_state.codes.len(), Box::new(start));
+ let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before));
tokenizer.flush(state, true);
let mut index = 0;
@@ -88,13 +88,14 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
let event = &tokenizer.events[index];
if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString {
+ // To do: when we operate on u8, we can use a `to_str` here as we
+ // don’t need virtual spaces.
let id = normalize_identifier(
- serialize(
- &parse_state.codes,
- &from_exit_event(&tokenizer.events, index),
- false,
+ &Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position::from_exit_event(&tokenizer.events, index),
)
- .as_str(),
+ .serialize(),
);
if !definitions.contains(&id) {
@@ -114,6 +115,26 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
events
}
+/// At the beginning.
+///
+/// Perhaps a BOM?
+///
+/// ```markdown
+/// > | a
+/// ^
+/// ```
+fn before(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ Some('\u{FEFF}') => {
+ tokenizer.enter(Token::ByteOrderMark);
+ tokenizer.consume();
+ tokenizer.exit(Token::ByteOrderMark);
+ State::Fn(Box::new(start))
+ }
+ _ => start(tokenizer),
+ }
+}
+
/// Before document.
///
/// ```markdown
@@ -337,7 +358,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State
// Parse flow, pausing after eols.
tokenizer.go_until(
state,
- |code| matches!(code, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')),
+ |code| matches!(code, Some('\n')),
move |state| Box::new(move |t| flow_end(t, info, state)),
)(tokenizer)
}
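
New in this file: a `before` state that turns a leading U+FEFF into a `ByteOrderMark` event rather than silently dropping it while building codes. A tiny illustration of the detection itself (the helper name is hypothetical):

```rust
// Check whether the parsed `Vec<char>` opens with a byte order mark;
// the real state machine enters/exits `Token::ByteOrderMark` around it.
fn starts_with_bom(chars: &[char]) -> bool {
    matches!(chars.first().copied(), Some('\u{FEFF}'))
}

fn main() {
    let with_bom: Vec<char> = "\u{FEFF}# hi".chars().collect();
    assert!(starts_with_bom(&with_bom));
    let without: Vec<char> = "# hi".chars().collect();
    assert!(!starts_with_bom(&without));
}
```
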
diff --git a/src/content/flow.rs b/src/content/flow.rs
index ea09cd9..09c4e2c 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -27,7 +27,7 @@ use crate::construct::{
thematic_break::start as thematic_break,
};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Before flow.
///
@@ -41,7 +41,7 @@ use crate::tokenizer::{Code, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt(blank_line, |ok| {
Box::new(if ok { blank_line_after } else { initial_before })
})(tokenizer),
@@ -62,7 +62,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn initial_before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt_n(
vec![
Box::new(code_indented),
@@ -87,8 +87,8 @@ fn initial_before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn blank_line_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None => State::Ok,
+ Some('\n') => {
tokenizer.enter(Token::BlankLineEnding);
tokenizer.consume();
tokenizer.exit(Token::BlankLineEnding);
@@ -111,8 +111,8 @@ fn blank_line_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None => State::Ok,
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
diff --git a/src/content/string.rs b/src/content/string.rs
index c6c0094..8bc2b91 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -16,9 +16,9 @@ use crate::construct::{
character_escape::start as character_escape, character_reference::start as character_reference,
partial_data::start as data, partial_whitespace::create_resolve_whitespace,
};
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
-const MARKERS: [Code; 2] = [Code::Char('&'), Code::Char('\\')];
+const MARKERS: [char; 2] = ['&', '\\'];
/// Start of string.
pub fn start(tokenizer: &mut Tokenizer) -> State {
@@ -32,7 +32,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// Before string.
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt_n(
vec![Box::new(character_reference), Box::new(character_escape)],
|ok| Box::new(if ok { before } else { before_data }),
diff --git a/src/content/text.rs b/src/content/text.rs
index 4248053..ebdf888 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -28,18 +28,18 @@ use crate::construct::{
label_start_image::start as label_start_image, label_start_link::start as label_start_link,
partial_data::start as data, partial_whitespace::create_resolve_whitespace,
};
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
-const MARKERS: [Code; 9] = [
- Code::Char('!'), // `label_start_image`
- Code::Char('&'), // `character_reference`
- Code::Char('*'), // `attention`
- Code::Char('<'), // `autolink`, `html_text`
- Code::Char('['), // `label_start_link`
- Code::Char('\\'), // `character_escape`, `hard_break_escape`
- Code::Char(']'), // `label_end`
- Code::Char('_'), // `attention`
- Code::Char('`'), // `code_text`
+const MARKERS: [char; 9] = [
+ '!', // `label_start_image`
+ '&', // `character_reference`
+ '*', // `attention`
+ '<', // `autolink`, `html_text`
+ '[', // `label_start_link`
+ '\\', // `character_escape`, `hard_break_escape`
+ ']', // `label_end`
+ '_', // `attention`
+ '`', // `code_text`
];
/// Start of text.
@@ -57,7 +57,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// Before text.
pub fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt_n(
vec![
Box::new(attention),
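
The marker lists shrink from `[Code; N]` to `[char; N]`; they tell the data construct where to pause for other constructs. A rough sketch of that use (the scanning helper is an assumption, not the crate's exact code):

```rust
// Markers for string content, as in the `string.rs` hunk above.
const MARKERS: [char; 2] = ['&', '\\'];

// Scan plain data until the next char that could start a construct.
fn scan_data(chars: &[char]) -> usize {
    let mut index = 0;
    while index < chars.len() && !MARKERS.contains(&chars[index]) {
        index += 1;
    }
    index
}

fn main() {
    let chars: Vec<char> = "ab&amp;".chars().collect();
    assert_eq!(scan_data(&chars), 2); // stops at '&'
}
```
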
diff --git a/src/lib.rs b/src/lib.rs
index 4dc15e6..c1b0fa0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,7 +17,6 @@ mod util;
use crate::compiler::compile;
use crate::parser::parse;
-use crate::tokenizer::Code;
/// Type of line endings in markdown.
#[derive(Debug, Default, Clone, PartialEq)]
@@ -61,16 +60,16 @@ impl LineEnding {
LineEnding::LineFeed => "\n",
}
}
- /// Turn a [Code] into a line ending.
+ /// Turn a string into a line ending.
///
/// ## Panics
///
/// Panics if `str` is not `\r\n`, `\r`, or `\n`.
- fn from_code(code: Code) -> LineEnding {
- match code {
- Code::CarriageReturnLineFeed => LineEnding::CarriageReturnLineFeed,
- Code::Char('\r') => LineEnding::CarriageReturn,
- Code::Char('\n') => LineEnding::LineFeed,
+ fn from_str(str: &str) -> LineEnding {
+ match str {
+ "\r\n" => LineEnding::CarriageReturnLineFeed,
+ "\r" => LineEnding::CarriageReturn,
+ "\n" => LineEnding::LineFeed,
_ => unreachable!("invalid code"),
}
}
@@ -425,5 +424,5 @@ pub fn micromark(value: &str) -> String {
#[must_use]
pub fn micromark_with_options(value: &str, options: &Options) -> String {
let (events, result) = parse(value, options);
- compile(&events, &result.codes, options)
+ compile(&events, &result.chars, options)
}
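
`LineEnding::from_str` now matches string slices instead of `Code` values. A runnable reduction of the new shape, mirroring the arms above:

```rust
#[derive(Debug, PartialEq)]
enum LineEnding {
    CarriageReturnLineFeed,
    CarriageReturn,
    LineFeed,
}

impl LineEnding {
    fn from_str(str: &str) -> LineEnding {
        match str {
            "\r\n" => LineEnding::CarriageReturnLineFeed,
            "\r" => LineEnding::CarriageReturn,
            "\n" => LineEnding::LineFeed,
            _ => unreachable!("invalid str"),
        }
    }
}

fn main() {
    assert_eq!(LineEnding::from_str("\r\n"), LineEnding::CarriageReturnLineFeed);
    assert_eq!(LineEnding::from_str("\n"), LineEnding::LineFeed);
}
```
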
diff --git a/src/parser.rs b/src/parser.rs
index 0f71daf..cc9c256 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,19 +1,18 @@
//! Turn a string of markdown into events.
use crate::content::document::document;
-use crate::tokenizer::{Code, Event, Point};
-use crate::util::codes::parse as parse_codes;
+use crate::tokenizer::{Event, Point};
use crate::{Constructs, Options};
/// Information needed, in all content types, when parsing markdown.
///
/// Importantly, this contains a set of known definitions.
-/// It also references the input value as [`Code`][]s.
+/// It also references the input value as a `Vec<char>`.
#[derive(Debug)]
pub struct ParseState<'a> {
pub constructs: &'a Constructs,
- /// List of codes.
- pub codes: Vec<Code>,
+ /// List of chars.
+ pub chars: Vec<char>,
/// Set of defined identifiers.
pub definitions: Vec<String>,
}
@@ -24,7 +23,8 @@ pub struct ParseState<'a> {
pub fn parse<'a>(value: &str, options: &'a Options) -> (Vec<Event>, ParseState<'a>) {
let mut parse_state = ParseState {
constructs: &options.constructs,
- codes: parse_codes(value),
+ // To do: change to `u8`s?
+ chars: value.chars().collect::<_>(),
definitions: vec![],
};
@@ -33,8 +33,8 @@ pub fn parse<'a>(value: &str, options: &'a Options) -> (Vec<Event>, ParseState<'
Point {
line: 1,
column: 1,
- offset: 0,
index: 0,
+ vs: 0,
},
);
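
`ParseState` now stores the input as a `Vec<char>`, and `Point` trades `offset` for a `vs` (virtual space) counter. A sketch of the new setup, trimmed to the fields the diff shows:

```rust
// A reduced `Point`, matching the shape in the hunk above.
#[derive(Debug)]
struct Point {
    line: usize,
    column: usize,
    index: usize,
    vs: usize,
}

fn main() {
    let value = "a\r\nb";
    // One entry per Unicode scalar value; CRLF stays two chars here
    // and is folded into a single '\n' only when fed.
    let chars: Vec<char> = value.chars().collect();
    assert_eq!(chars.len(), 4); // 'a', '\r', '\n', 'b'
    let point = Point { line: 1, column: 1, index: 0, vs: 0 };
    assert_eq!((point.line, point.column, point.index, point.vs), (1, 1, 0, 0));
}
```
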
diff --git a/src/token.rs b/src/token.rs
index a0479e1..db3bffc 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -157,6 +157,17 @@ pub enum Token {
/// | b
/// ```
BlockQuotePrefix,
+ /// Byte order mark.
+ ///
+ /// ## Info
+ ///
+ /// * **Context**:
+ /// optional first event
+ /// * **Content model**:
+ /// void
+ /// * **Construct**:
+ /// [`document`][crate::content::document]
+ ByteOrderMark,
/// Whole character escape.
///
/// ## Info
@@ -1822,13 +1833,14 @@ pub enum Token {
}
/// List of void tokens, used to make sure everything is working well.
-pub const VOID_TOKENS: [Token; 39] = [
+pub const VOID_TOKENS: [Token; 40] = [
Token::AttentionSequence,
Token::AutolinkEmail,
Token::AutolinkMarker,
Token::AutolinkProtocol,
Token::BlankLineEnding,
Token::BlockQuoteMarker,
+ Token::ByteOrderMark,
Token::CharacterEscapeMarker,
Token::CharacterEscapeValue,
Token::CharacterReferenceMarker,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index ba18956..ec70a2b 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -11,6 +11,7 @@
//! [`attempt`]: Tokenizer::attempt
//! [`check`]: Tokenizer::check
+use crate::constant::TAB_SIZE;
use crate::parser::ParseState;
use crate::token::{Token, VOID_TOKENS};
use crate::util::edit_map::EditMap;
@@ -24,20 +25,11 @@ pub enum ContentType {
String,
}
-/// Enum representing a character code.
-#[derive(Debug, Clone, Copy, PartialEq)]
-pub enum Code {
- /// End of the input stream (called eof).
- None,
- /// Used to make parsing line endings easier as it represents both
- /// `Code::Char('\r')` and `Code::Char('\n')` combined.
- CarriageReturnLineFeed,
- /// the expansion of a tab (`Code::Char('\t')`), depending on where the tab
- /// ocurred, it’s followed by 0 to 3 (both inclusive) `Code::VirtualSpace`s.
- VirtualSpace,
- /// The most frequent variant of this enum is `Code::Char(char)`, which just
- /// represents a char, but micromark adds meaning to certain other values.
- Char(char),
+#[derive(Debug, PartialEq)]
+pub enum CharAction {
+ Normal(char),
+ Insert(char),
+ Ignore,
}
/// A location in the document (`line`/`column`/`offset`).
@@ -54,9 +46,12 @@ pub struct Point {
/// the same as editors.
pub column: usize,
/// 0-indexed position in the document.
- pub offset: usize,
- /// Index into `codes`.
+ ///
+ /// Also an `index` into `chars`.
+ // To do: call it `offset`?
pub index: usize,
+ /// To do.
+ pub vs: usize,
}
/// Possible event types.
@@ -86,7 +81,7 @@ pub struct Event {
}
/// The essence of the state machine is its functions: `StateFn`.
-/// It’s responsible for dealing with that single passed [`Code`][].
+/// It’s responsible for dealing with the current char.
/// It yields a [`State`][].
pub type StateFn = dyn FnOnce(&mut Tokenizer) -> State;
@@ -162,9 +157,9 @@ struct InternalState {
/// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt.
stack_len: usize,
/// Previous code.
- previous: Code,
+ previous: Option<char>,
/// Current code.
- current: Code,
+ current: Option<char>,
/// Current relative and absolute position in the file.
point: Point,
}
@@ -173,9 +168,11 @@ struct InternalState {
#[allow(clippy::struct_excessive_bools)]
pub struct Tokenizer<'a> {
/// Jump between line endings.
- column_start: Vec<usize>,
+ column_start: Vec<(usize, usize)>,
/// First line.
- line_start: usize,
+ first_line: usize,
+ /// To do.
+ line_start: Point,
/// Track whether a character is expected to be consumed, and whether it’s
/// actually consumed
///
@@ -184,9 +181,9 @@ pub struct Tokenizer<'a> {
/// Track whether this tokenizer is done.
resolved: bool,
/// Current character code.
- pub current: Code,
+ pub current: Option<char>,
/// Previous character code.
- pub previous: Code,
+ pub previous: Option<char>,
/// Current relative and absolute place in the file.
pub point: Point,
/// Semantic labels of one or more codes in `codes`.
@@ -237,11 +234,12 @@ impl<'a> Tokenizer<'a> {
/// Create a new tokenizer.
pub fn new(point: Point, parse_state: &'a ParseState) -> Tokenizer<'a> {
Tokenizer {
- previous: Code::None,
- current: Code::None,
+ previous: None,
+ current: None,
// To do: reserve size when feeding?
column_start: vec![],
- line_start: point.line,
+ first_line: point.line,
+ line_start: point.clone(),
consumed: true,
resolved: false,
point,
@@ -280,18 +278,18 @@ impl<'a> Tokenizer<'a> {
/// Define a jump between two places.
pub fn define_skip(&mut self, point: &Point) {
- define_skip_impl(self, point.line, point.index);
+ define_skip_impl(self, point.line, (point.index, point.vs));
}
/// Define the current place as a jump between two places.
pub fn define_skip_current(&mut self) {
- define_skip_impl(self, self.point.line, self.point.index);
+ define_skip_impl(self, self.point.line, (self.point.index, self.point.vs));
}
/// Increment the current positional info if we’re right after a line
/// ending, which has a skip defined.
fn account_for_potential_skip(&mut self) {
- let at = self.point.line - self.line_start;
+ let at = self.point.line - self.first_line;
if self.point.column == 1 && at != self.column_start.len() {
self.move_to(self.column_start[at]);
@@ -299,10 +297,10 @@ impl<'a> Tokenizer<'a> {
}
/// Prepare for a next code to get consumed.
- pub fn expect(&mut self, code: Code) {
+ pub fn expect(&mut self, char: Option<char>) {
assert!(self.consumed, "expected previous character to be consumed");
self.consumed = false;
- self.current = code;
+ self.current = char;
}
/// Consume the current character.
@@ -311,46 +309,60 @@ impl<'a> Tokenizer<'a> {
pub fn consume(&mut self) {
log::debug!("consume: `{:?}` ({:?})", self.current, self.point);
assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned");
- self.move_to(self.point.index + 1);
+
+ self.move_one();
+
self.previous = self.current;
+ // While we’re not at the eof, it’s better not to report the same char
+ // as both `previous` and `current`.
+ self.current = None;
// Mark as consumed.
self.consumed = true;
}
- /// To do.
- pub fn move_to(&mut self, to: usize) {
- while self.point.index < to {
- let code = &self.parse_state.codes[self.point.index];
- self.point.index += 1;
+ /// Move to the next (virtual) character.
+ pub fn move_one(&mut self) {
+ match char_action(&self.parse_state.chars, &self.point) {
+ CharAction::Ignore => {
+ self.point.index += 1;
+ }
+ CharAction::Insert(char) => {
+ self.previous = Some(char);
+ self.point.column += 1;
+ self.point.vs += 1;
+ }
+ CharAction::Normal(char) => {
+ self.previous = Some(char);
+ self.point.vs = 0;
+ self.point.index += 1;
- match code {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ if char == '\n' {
self.point.line += 1;
self.point.column = 1;
- self.point.offset += if *code == Code::CarriageReturnLineFeed {
- 2
- } else {
- 1
- };
- if self.point.line - self.line_start + 1 > self.column_start.len() {
- self.column_start.push(self.point.index);
+ if self.point.line - self.first_line + 1 > self.column_start.len() {
+ self.column_start.push((self.point.index, self.point.vs));
}
+ self.line_start = self.point.clone();
+
self.account_for_potential_skip();
log::debug!("position: after eol: `{:?}`", self.point);
- }
- Code::VirtualSpace => {
- // Empty.
- }
- _ => {
+ } else {
self.point.column += 1;
- self.point.offset += 1;
}
}
}
}
+ /// Move (virtual) characters.
+ pub fn move_to(&mut self, to: (usize, usize)) {
+ let (to_index, to_vs) = to;
+ while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs {
+ self.move_one();
+ }
+ }
+
/// Mark the start of a semantic label.
pub fn enter(&mut self, token_type: Token) {
self.enter_with_link(token_type, None);
@@ -368,11 +380,23 @@ impl<'a> Tokenizer<'a> {
}
pub fn enter_with_link(&mut self, token_type: Token, link: Option<Link>) {
- log::debug!("enter: `{:?}` ({:?})", token_type, self.point);
+ let mut point = self.point.clone();
+
+ // Move back past ignored chars.
+ while point.index > 0 {
+ point.index -= 1;
+ let action = char_action(&self.parse_state.chars, &point);
+ if !matches!(action, CharAction::Ignore) {
+ point.index += 1;
+ break;
+ }
+ }
+
+ log::debug!("enter: `{:?}` ({:?})", token_type, point);
self.events.push(Event {
event_type: EventType::Enter,
token_type: token_type.clone(),
- point: self.point.clone(),
+ point,
link,
});
self.stack.push(token_type);
@@ -391,7 +415,9 @@ impl<'a> Tokenizer<'a> {
let mut point = self.point.clone();
assert!(
- current_token != previous.token_type || previous.point.index != point.index,
+ current_token != previous.token_type
+ || previous.point.index != point.index
+ || previous.point.vs != point.vs,
"expected non-empty token"
);
@@ -406,18 +432,18 @@ impl<'a> Tokenizer<'a> {
// A bit weird, but if we exit right after a line ending, we *don’t* want to consider
// potential skips.
- if matches!(
- self.previous,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- ) {
- point.column = 1;
- point.offset = previous.point.offset
- + if self.previous == Code::CarriageReturnLineFeed {
- 2
- } else {
- 1
- };
- point.index = previous.point.index + 1;
+ if matches!(self.previous, Some('\n')) {
+ point = self.line_start.clone();
+ } else {
+ // Move back past ignored chars.
+ while point.index > 0 {
+ point.index -= 1;
+ let action = char_action(&self.parse_state.chars, &point);
+ if !matches!(action, CharAction::Ignore) {
+ point.index += 1;
+ break;
+ }
+ }
}
log::debug!("exit: `{:?}` ({:?})", token_type, point);
@@ -494,7 +520,7 @@ impl<'a> Tokenizer<'a> {
pub fn go_until(
&mut self,
state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static,
- until: impl Fn(Code) -> bool + 'static,
+ until: impl Fn(Option<char>) -> bool + 'static,
done: impl FnOnce(State) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
attempt_impl(
@@ -619,19 +645,32 @@ impl<'a> Tokenizer<'a> {
assert!(!self.resolved, "cannot feed after drain");
assert!(min >= self.point.index, "cannot move backwards");
- self.move_to(min);
+ // To do: accept `vs`?
+ self.move_to((min, 0));
let mut state = State::Fn(Box::new(start));
while self.point.index < max {
match state {
State::Ok | State::Nok => break,
- State::Fn(func) => {
- let code = self.parse_state.codes[self.point.index];
- log::debug!("main: passing: `{:?}` ({:?})", code, self.point);
- self.expect(code);
- state = func(self);
- }
+ State::Fn(func) => match char_action(&self.parse_state.chars, &self.point) {
+ CharAction::Ignore => {
+ state = State::Fn(Box::new(func));
+ self.move_one();
+ }
+ CharAction::Insert(char) => {
+ log::debug!("main: passing (fake): `{:?}` ({:?})", char, self.point);
+ self.expect(Some(char));
+ state = func(self);
+ // self.point.column += 1;
+ // self.point.vs += 1;
+ }
+ CharAction::Normal(char) => {
+ log::debug!("main: passing: `{:?}` ({:?})", char, self.point);
+ self.expect(Some(char));
+ state = func(self);
+ }
+ },
}
}
@@ -648,15 +687,35 @@ impl<'a> Tokenizer<'a> {
match state {
State::Ok | State::Nok => break,
State::Fn(func) => {
+ // To do: clean this?
// We sometimes move back when flushing, so then we use those chars.
- let code = if self.point.index < max {
- self.parse_state.codes[self.point.index]
+ if self.point.index == max {
+ let char = None;
+ log::debug!("main: flushing eof: `{:?}` ({:?})", char, self.point);
+ self.expect(char);
+ state = func(self);
} else {
- Code::None
+ match char_action(&self.parse_state.chars, &self.point) {
+ CharAction::Ignore => {
+ state = State::Fn(Box::new(func));
+ self.move_one();
+ }
+ CharAction::Insert(char) => {
+ log::debug!(
+ "main: flushing (fake): `{:?}` ({:?})",
+ char,
+ self.point
+ );
+ self.expect(Some(char));
+ state = func(self);
+ }
+ CharAction::Normal(char) => {
+ log::debug!("main: flushing: `{:?}` ({:?})", char, self.point);
+ self.expect(Some(char));
+ state = func(self);
+ }
+ }
};
- log::debug!("main: flushing {:?}", code);
- self.expect(code);
- state = func(self);
}
}
}
@@ -676,13 +735,58 @@ impl<'a> Tokenizer<'a> {
}
}
+fn char_action(chars: &[char], point: &Point) -> CharAction {
+ if point.index < chars.len() {
+ let char = chars[point.index];
+
+ if char == '\0' {
+ CharAction::Normal(char::REPLACEMENT_CHARACTER)
+ } else if char == '\r' {
+ // CRLF.
+ if point.index < chars.len() - 1 && chars[point.index + 1] == '\n' {
+ CharAction::Ignore
+ }
+ // CR.
+ else {
+ CharAction::Normal('\n')
+ }
+ } else if char == '\t' {
+ let remainder = point.column % TAB_SIZE;
+ let vs = if remainder == 0 {
+ 0
+ } else {
+ TAB_SIZE - remainder
+ };
+
+ // On the tab itself, first send it.
+ if point.vs == 0 {
+ if vs == 0 {
+ CharAction::Normal(char)
+ } else {
+ CharAction::Insert(char)
+ }
+ } else if vs == 0 {
+ CharAction::Normal(' ')
+ } else {
+ CharAction::Insert(' ')
+ }
+ }
+ // VS?
+ else {
+ CharAction::Normal(char)
+ }
+ } else {
+ unreachable!("out of bounds")
+ }
+}
+
/// Internal utility to wrap states to also capture codes.
///
/// Recurses into itself.
/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
fn attempt_impl(
state: impl FnOnce(&mut Tokenizer) -> State + 'static,
- pause: Option<Box<dyn Fn(Code) -> bool + 'static>>,
+ pause: Option<Box<dyn Fn(Option<char>) -> bool + 'static>>,
start: usize,
done: impl FnOnce(&mut Tokenizer, State) -> State + 'static,
) -> Box<StateFn> {
@@ -706,14 +810,14 @@ fn attempt_impl(
/// Define a jump between two places.
///
/// This defines to which future index we move after a line ending.
-fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, index: usize) {
- log::debug!("position: define skip: {:?} -> ({:?})", line, index);
- let at = line - tokenizer.line_start;
+fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize)) {
+ log::debug!("position: define skip: {:?} -> ({:?})", line, info);
+ let at = line - tokenizer.first_line;
- if at == tokenizer.column_start.len() {
- tokenizer.column_start.push(index);
+ if at >= tokenizer.column_start.len() {
+ tokenizer.column_start.push(info);
} else {
- tokenizer.column_start[at] = index;
+ tokenizer.column_start[at] = info;
}
tokenizer.account_for_potential_skip();
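
`char_action` is where the old `parse_codes` preprocessing moved: normalization now happens lazily, per lookup. A standalone sketch of its rules, assuming `TAB_SIZE` is 4 as in the crate: the `'\r'` of a CRLF is ignored, a lone CR is fed as `'\n'`, NUL becomes U+FFFD, and a tab yields itself and then virtual spaces up to the next tab stop.

```rust
const TAB_SIZE: usize = 4;

#[derive(Debug, PartialEq)]
enum CharAction {
    Normal(char),
    Insert(char),
    Ignore,
}

fn char_action(chars: &[char], index: usize, column: usize, vs: usize) -> CharAction {
    let char = chars[index];
    if char == '\0' {
        CharAction::Normal(char::REPLACEMENT_CHARACTER)
    } else if char == '\r' {
        if index + 1 < chars.len() && chars[index + 1] == '\n' {
            CharAction::Ignore // CRLF: skip the '\r'; the '\n' follows.
        } else {
            CharAction::Normal('\n') // Lone CR: feed it as '\n'.
        }
    } else if char == '\t' {
        let remainder = column % TAB_SIZE;
        let needed = if remainder == 0 { 0 } else { TAB_SIZE - remainder };
        if vs == 0 {
            // On the tab itself: send it, flagging pending virtual spaces.
            if needed == 0 {
                CharAction::Normal(char)
            } else {
                CharAction::Insert(char)
            }
        } else if needed == 0 {
            // Reached the tab stop: this space finally advances the index.
            CharAction::Normal(' ')
        } else {
            CharAction::Insert(' ')
        }
    } else {
        CharAction::Normal(char)
    }
}

fn main() {
    let crlf: Vec<char> = "\r\n".chars().collect();
    assert_eq!(char_action(&crlf, 0, 1, 0), CharAction::Ignore);
    assert_eq!(char_action(&crlf, 1, 1, 0), CharAction::Normal('\n'));
    // A tab at column 1 expands toward the tab stop at column 5.
    let tab: Vec<char> = "\t".chars().collect();
    assert_eq!(char_action(&tab, 0, 1, 0), CharAction::Insert('\t'));
}
```
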
diff --git a/src/util/codes.rs b/src/util/codes.rs
deleted file mode 100644
index 5006a00..0000000
--- a/src/util/codes.rs
+++ /dev/null
@@ -1,125 +0,0 @@
-//! Utilities to deal with character codes.
-
-use crate::constant::TAB_SIZE;
-use crate::tokenizer::Code;
-
-/// Turn a string into codes.
-pub fn parse(value: &str) -> Vec<Code> {
- // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller
- // with `Code::CarriageReturnLineFeed`.
- let mut codes = Vec::with_capacity(value.len());
- let mut at_start = true;
- let mut at_carriage_return = false;
- let mut column = 1;
-
- for char in value.chars() {
- if at_start {
- at_start = false;
-
- if char == '\u{feff}' {
- // Ignore.
- continue;
- }
- }
-
- // Send a CRLF.
- if at_carriage_return && '\n' == char {
- at_carriage_return = false;
- codes.push(Code::CarriageReturnLineFeed);
- } else {
- // Send the previous CR: we’re not at a next `\n`.
- if at_carriage_return {
- at_carriage_return = false;
- codes.push(Code::Char('\r'));
- }
-
- match char {
- // Send a replacement character.
- '\0' => {
- column += 1;
- codes.push(Code::Char(char::REPLACEMENT_CHARACTER));
- }
- // Send a tab and virtual spaces.
- '\t' => {
- let remainder = column % TAB_SIZE;
- let mut virtual_spaces = if remainder == 0 {
- 0
- } else {
- TAB_SIZE - remainder
- };
- codes.push(Code::Char(char));
- column += 1;
- while virtual_spaces > 0 {
- codes.push(Code::VirtualSpace);
- column += 1;
- virtual_spaces -= 1;
- }
- }
- // Send an LF.
- '\n' => {
- column = 1;
- codes.push(Code::Char(char));
- }
- // Don’t send anything yet.
- '\r' => {
- column = 1;
- at_carriage_return = true;
- }
- // Send the char.
- _ => {
- column += 1;
- codes.push(Code::Char(char));
- }
- }
- };
- }
-
- // Send the last CR: we’re not at a next `\n`.
- if at_carriage_return {
- codes.push(Code::Char('\r'));
- }
-
- codes
-}
-
-/// Serialize codes, optionally expanding tabs.
-pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
- let mut at_tab = false;
- // Note: It’ll grow a bit smaller with each
- // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false,
- // and bigger with `Code::CarriageReturnLineFeed`,
- let mut value = String::with_capacity(codes.len());
-
- for code in codes {
- let mut at_tab_next = false;
-
- match code {
- Code::CarriageReturnLineFeed => {
- value.push_str("\r\n");
- }
- Code::Char(char) if *char == '\n' || *char == '\r' => {
- value.push(*char);
- }
- Code::Char(char) if *char == '\t' => {
- at_tab_next = true;
- value.push(if expand_tabs { ' ' } else { *char });
- }
- Code::VirtualSpace => {
- if !expand_tabs && at_tab {
- continue;
- }
- value.push(' ');
- }
- Code::Char(char) => {
- value.push(*char);
- }
- Code::None => {
- unreachable!("unexpected EOF code in codes");
- }
- }
-
- at_tab = at_tab_next;
- }
-
- value
-}
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 965ea5c..91c5462 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -20,7 +20,8 @@
/// ## References
///
/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
-pub fn encode<S: Into<String>>(value: S) -> String {
+pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String {
+ let check = if encode_html { check_all } else { check_nil };
let mut value = value.into();
// It’ll grow a bit bigger for each dangerous character.
@@ -31,6 +32,7 @@ pub fn encode<S: Into<String>>(value: S) -> String {
let dangerous = value.pop().unwrap();
result.push_str(&value);
result.push_str(match dangerous {
+ '\0' => "�",
'&' => "&amp;",
'"' => "&quot;",
'<' => "&lt;",
@@ -45,6 +47,10 @@ pub fn encode<S: Into<String>>(value: S) -> String {
result
}
-fn check(char: char) -> bool {
- matches!(char, '&' | '"' | '<' | '>')
+fn check_all(char: char) -> bool {
+ matches!(char, '\0' | '&' | '"' | '<' | '>')
+}
+
+fn check_nil(char: char) -> bool {
+ matches!(char, '\0')
}
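
`encode` gains an `encode_html` switch so NUL replacement can happen even where HTML escaping must not. A trimmed, character-by-character sketch of the same behavior (the real function splits the string at each dangerous char; this version just iterates):

```rust
fn encode(value: &str, encode_html: bool) -> String {
    let mut result = String::with_capacity(value.len());
    for char in value.chars() {
        match char {
            // NUL never survives into output, in either mode.
            '\0' => result.push('\u{FFFD}'),
            '&' if encode_html => result.push_str("&amp;"),
            '"' if encode_html => result.push_str("&quot;"),
            '<' if encode_html => result.push_str("&lt;"),
            '>' if encode_html => result.push_str("&gt;"),
            _ => result.push(char),
        }
    }
    result
}

fn main() {
    assert_eq!(encode("a<b", true), "a&lt;b");
    assert_eq!(encode("a<b", false), "a<b");
    assert_eq!(encode("a\0b", false), "a\u{FFFD}b");
}
```
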
diff --git a/src/util/mod.rs b/src/util/mod.rs
index ae1add6..a01f31e 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,10 +1,9 @@
//! Utilities used when compiling markdown.
-pub mod codes;
pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
pub mod normalize_identifier;
pub mod sanitize_uri;
pub mod skip;
-pub mod span;
+pub mod slice;
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 81450ae..8c09549 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -32,7 +32,7 @@ use crate::util::encode::encode;
///
/// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
- let value = encode(normalize_uri(value));
+ let value = encode(normalize_uri(value), true);
if let Some(protocols) = protocols {
let end = value.find(|c| matches!(c, '?' | '#' | '/'));
diff --git a/src/util/slice.rs b/src/util/slice.rs
new file mode 100644
index 0000000..2134069
--- /dev/null
+++ b/src/util/slice.rs
@@ -0,0 +1,156 @@
+//! Utilities to deal with characters.
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::{Event, EventType, Point};
+
+/// A range between two places.
+#[derive(Debug)]
+pub struct Position<'a> {
+ pub start: &'a Point,
+ pub end: &'a Point,
+}
+
+impl<'a> Position<'a> {
+ /// Get a position from an exit event.
+ ///
+ /// Looks backwards for the corresponding `enter` event.
+ /// This does not support nested events (such as lists in lists).
+ ///
+ /// ## Panics
+ ///
+ /// This function panics if an enter event is given.
+ /// When `micromark` is used, this function never panics.
+ pub fn from_exit_event(events: &'a [Event], index: usize) -> Position<'a> {
+ let exit = &events[index];
+ assert_eq!(
+ exit.event_type,
+ EventType::Exit,
+ "expected `from_exit_event` to be called on `exit` event"
+ );
+ let mut enter_index = index - 1;
+
+ loop {
+ let enter = &events[enter_index];
+ if enter.event_type == EventType::Enter && enter.token_type == exit.token_type {
+ return Position {
+ start: &enter.point,
+ end: &exit.point,
+ };
+ }
+
+ enter_index -= 1;
+ }
+ }
+}
+
+/// Chars belonging to a range.
+///
+/// Includes information on virtual spaces before and after the chars.
+#[derive(Debug)]
+pub struct Slice<'a> {
+ pub chars: &'a [char],
+ pub before: usize,
+ pub after: usize,
+}
+
+impl<'a> Slice<'a> {
+ /// Get the slice belonging to a position.
+ pub fn from_point(list: &'a [char], point: &Point) -> Slice<'a> {
+ let mut before = point.vs;
+ let mut start = point.index;
+ let end = if start < list.len() { start + 1 } else { start };
+
+ // If we have virtual spaces before, it means we are past the actual
+ // character at that index, and those virtual spaces.
+ if before > 0 {
+ before = TAB_SIZE - before;
+ start += 1;
+ };
+
+ Slice {
+ chars: if start < end { &list[start..end] } else { &[] },
+ before,
+ after: 0,
+ }
+ }
+
+ /// Get the slice belonging to a position.
+ pub fn from_position(list: &'a [char], position: &Position) -> Slice<'a> {
+ let mut before = position.start.vs;
+ let mut after = position.end.vs;
+ let mut start = position.start.index;
+ let mut end = position.end.index;
+
+ // If we have virtual spaces before, it means we are past the actual
+ // character at that index, and those virtual spaces.
+ if before > 0 {
+ before = TAB_SIZE - before;
+ start += 1;
+ };
+
+ // If we have virtual spaces after, it means that character is included,
+ // and one less virtual space.
+ if after > 0 {
+ after -= 1;
+ end += 1;
+ }
+
+ Slice {
+ chars: &list[start..end],
+ before,
+ after,
+ }
+ }
+
+ /// To do.
+ pub fn size(&self) -> usize {
+ self.chars.len() + self.before + self.after
+ }
+
+ // To do:
+ // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html>
+ // to implement an `as_str`.
+
+ /// To do.
+ pub fn head(&self) -> Option<char> {
+ if self.before > 0 {
+ Some(' ')
+ } else if self.chars.is_empty() {
+ None
+ } else {
+ Some(self.chars[0])
+ }
+ }
+
+ /// To do.
+ pub fn tail(&self) -> Option<char> {
+ if self.after > 0 {
+ Some(' ')
+ } else {
+ let index = self.chars.len();
+ if index > 0 {
+ Some(self.chars[index - 1])
+ } else {
+ None
+ }
+ }
+ }
+
+ /// To do.
+ pub fn serialize(&self) -> String {
+ let mut string = String::with_capacity(self.size());
+ let mut index = self.before;
+ while index > 0 {
+ string.push(' ');
+ index -= 1;
+ }
+ string.push_str(&self.chars.iter().collect::<String>());
+ index = self.after;
+ while index > 0 {
+ string.push(' ');
+ index -= 1;
+ }
+
+ string
+ }
+}
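
The new `Slice` carries the chars plus virtual-space counts on either side, and `serialize` turns those counts back into real spaces. A worked reduction, keeping only what serialization needs:

```rust
struct Slice<'a> {
    chars: &'a [char],
    before: usize,
    after: usize,
}

impl<'a> Slice<'a> {
    fn size(&self) -> usize {
        self.chars.len() + self.before + self.after
    }

    // Virtual spaces on either side come back as literal spaces.
    fn serialize(&self) -> String {
        let mut string = String::with_capacity(self.size());
        string.extend(std::iter::repeat(' ').take(self.before));
        string.extend(self.chars.iter());
        string.extend(std::iter::repeat(' ').take(self.after));
        string
    }
}

fn main() {
    let chars: Vec<char> = "abc".chars().collect();
    // Two virtual spaces before, e.g. the tail of an expanded tab.
    let slice = Slice { chars: &chars, before: 2, after: 0 };
    assert_eq!(slice.serialize(), "  abc");
    assert_eq!(slice.size(), 5);
}
```
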
diff --git a/src/util/span.rs b/src/util/span.rs
deleted file mode 100644
index ca25924..0000000
--- a/src/util/span.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-//! Utilities to deal with semantic labels.
-
-use crate::tokenizer::{Code, Event, EventType};
-use crate::util::codes::serialize as serialize_codes;
-
-/// A struct representing the span of an opening and closing event of a token.
-#[derive(Debug)]
-pub struct Span {
- /// Absolute offset (an `index` in `codes`) of where this span starts.
- pub start_index: usize,
- /// Absolute offset (an `index` in `codes`) of where this span ends.
- pub end_index: usize,
-}
-
-/// Get a span from an event.
-///
-/// Get the span of an `exit` event, by looking backwards through the events to
-/// find the corresponding `enter` event.
-/// This assumes that tokens with the same are not nested.
-///
-/// ## Panics
-///
-/// This function panics if an enter event is given.
-/// When `micromark` is used, this function never panics.
-pub fn from_exit_event(events: &[Event], index: usize) -> Span {
- let exit = &events[index];
- let end_index = exit.point.index;
- let token_type = exit.token_type.clone();
- assert_eq!(
- exit.event_type,
- EventType::Exit,
- "expected `from_exit_event` to be called on `exit` event"
- );
- let mut enter_index = index - 1;
-
- loop {
- let enter = &events[enter_index];
- if enter.event_type == EventType::Enter && enter.token_type == token_type {
- return Span {
- start_index: enter.point.index,
- end_index,
- };
- }
-
- enter_index -= 1;
- }
-}
-
-/// Serialize a span, optionally expanding tabs.
-pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {
- serialize_codes(codes(all_codes, span), expand_tabs)
-}
-
-/// Get a slice of codes from a span.
-pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
- &codes[span.start_index..span.end_index]
-}
diff --git a/tests/misc_tabs.rs b/tests/misc_tabs.rs
index 7073c57..c5e5c43 100644
--- a/tests/misc_tabs.rs
+++ b/tests/misc_tabs.rs
@@ -69,12 +69,6 @@ fn tabs_flow() {
);
assert_eq!(
- micromark(" \t---"),
- "<pre><code>---\n</code></pre>",
- "should not support a 3*SP + HT to start a thematic break"
- );
-
- assert_eq!(
micromark(" \t```"),
"<pre><code>```\n</code></pre>",
"should not support a 3*SP + HT to start a fenced code"