From f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Thu, 28 Jul 2022 16:48:00 +0200
Subject: Refactor to work on `char`s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, a custom character implementation was used.
This was easier to work with, as sometimes “virtual” characters are
injected, and sometimes characters are ignored.
This replaces that with working on actual `char`s, in the hope of
eventually working on `u8`s.

This simplifies the state machine somewhat, as only `\n` is fed,
regardless of whether it was a CRLF, CR, or LF.
It also feeds `' '` instead of virtual spaces.

The BOM, if present, is now available as a `ByteOrderMark` event.
---
 src/compiler.rs                                | 204 +++++++++---------
 src/constant.rs                                |   4 -
 src/construct/attention.rs                     |  75 +++----
 src/construct/autolink.rs                      |  40 ++--
 src/construct/blank_line.rs                    |   4 +-
 src/construct/block_quote.rs                   |  27 ++-
 src/construct/character_escape.rs              |   6 +-
 src/construct/character_reference.rs           |  82 +++++---
 src/construct/code_fenced.rs                   |  65 +++---
 src/construct/code_indented.rs                 |  17 +-
 src/construct/code_text.rs                     |  18 +-
 src/construct/definition.rs                    |  10 +-
 src/construct/hard_break_escape.rs             |   6 +-
 src/construct/heading_atx.rs                   |  22 +-
 src/construct/heading_setext.rs                |   8 +-
 src/construct/html_flow.rs                     | 224 ++++++++++----------
 src/construct/html_text.rs                     | 161 +++++++-------
 src/construct/label_end.rs                     |  92 ++++----
 src/construct/label_start_image.rs             |   6 +-
 src/construct/label_start_link.rs              |   4 +-
 src/construct/list.rs                          |  57 +++--
 src/construct/paragraph.rs                     |   6 +-
 src/construct/partial_data.rs                  |  31 +--
 src/construct/partial_destination.rs           |  39 ++--
 src/construct/partial_label.rs                 |  22 +-
 src/construct/partial_non_lazy_continuation.rs |   4 +-
 src/construct/partial_space_or_tab.rs          |  13 +-
 src/construct/partial_title.rs                 |  35 +---
 src/construct/partial_whitespace.rs            |  53 +++--
 src/construct/thematic_break.rs                |  27 +--
 src/content/document.rs                        |  39 +++-
 src/content/flow.rs                            |  14 +-
 src/content/string.rs                          |   6 +-
 src/content/text.rs                            |  24 +--
 src/lib.rs                                     |  15 +-
 src/parser.rs                                  |  14 +-
 src/token.rs                                   |  14 +-
 src/tokenizer.rs                               | 280 +++++++++++++++++--------
 src/util/codes.rs                              | 125 -----------
 src/util/encode.rs                             |  12 +-
 src/util/mod.rs                                |   3 +-
 src/util/sanitize_uri.rs                       |   2 +-
 src/util/slice.rs                              | 156 ++++++++++++++
 src/util/span.rs                               |  57 -----
 tests/misc_tabs.rs                             |   6 -
 45 files changed, 1087 insertions(+), 1042 deletions(-)
 delete mode 100644 src/util/codes.rs
 create mode 100644 src/util/slice.rs
 delete mode 100644 src/util/span.rs

diff --git a/src/compiler.rs b/src/compiler.rs
index a575221..f5673b4 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -2,14 +2,14 @@
 use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC};
 use crate::construct::character_reference::Kind as CharacterReferenceKind;
 use crate::token::Token;
-use crate::tokenizer::{Code, Event, EventType};
+use crate::tokenizer::{Event, EventType};
 use crate::util::normalize_identifier::normalize_identifier;
 use crate::util::{
     decode_character_reference::{decode_named, decode_numeric},
     encode::encode,
     sanitize_uri::sanitize_uri,
     skip,
-    span::{codes as codes_from_span, from_exit_event, serialize},
+    slice::{Position, Slice},
 };
 use crate::{LineEnding, Options};
@@ -60,7 +60,7 @@ struct Definition {
 struct CompileContext<'a> {
     /// Static info.
     pub events: &'a [Event],
-    pub codes: &'a [Code],
+    pub chars: &'a [char],
     /// Fields used by handlers to track the things they need to track to
     /// compile markdown.
     pub atx_opening_sequence_size: Option<usize>,
@@ -76,7 +76,7 @@ struct CompileContext<'a> {
     /// Fields used to influance the current compilation.
     pub slurp_one_line_ending: bool,
     pub tags: bool,
-    pub ignore_encode: bool,
+    pub encode_html: bool,
     pub last_was_tag: bool,
     /// Configuration
     pub protocol_href: Option<Vec<&'static str>>,
@@ -92,13 +92,13 @@ impl<'a> CompileContext<'a> {
     /// Create a new compile context.
     pub fn new(
         events: &'a [Event],
-        codes: &'a [Code],
+        chars: &'a [char],
        options: &Options,
         line_ending: LineEnding,
     ) -> CompileContext<'a> {
         CompileContext {
             events,
-            codes,
+            chars,
             atx_opening_sequence_size: None,
             heading_setext_buffer: None,
             code_flow_seen_data: None,
@@ -111,7 +111,7 @@ impl<'a> CompileContext<'a> {
             tight_stack: vec![],
             slurp_one_line_ending: false,
             tags: true,
-            ignore_encode: false,
+            encode_html: true,
             last_was_tag: false,
             protocol_href: if options.allow_dangerous_protocol {
                 None
@@ -151,16 +151,13 @@ impl<'a> CompileContext<'a> {
 
     pub fn push_raw<'x, S: Into<&'x str>>(&mut self, value: S) {
         let value = value.into();
-        if self.ignore_encode {
-            self.push(value);
-        } else {
-            self.push(&*encode(value));
-        }
+        self.push(&*encode(value, self.encode_html));
     }
 
     pub fn tag<'x, S: Into<&'x str>>(&mut self, value: S) {
         if self.tags {
-            self.push(value.into());
+            let value = value.into();
+            self.push(&*encode(value, false));
             self.last_was_tag = true;
         }
     }
@@ -199,7 +196,7 @@ impl<'a> CompileContext<'a> {
 
 /// Turn events and codes into a string of HTML.
 #[allow(clippy::too_many_lines)]
-pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
+pub fn compile(events: &[Event], chars: &[char], options: &Options) -> String {
     let mut index = 0;
     let mut line_ending_inferred = None;
@@ -211,8 +208,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
         if event.event_type == EventType::Exit
             && (event.token_type == Token::BlankLineEnding || event.token_type == Token::LineEnding)
         {
-            let codes = codes_from_span(codes, &from_exit_event(events, index));
-            line_ending_inferred = Some(LineEnding::from_code(*codes.first().unwrap()));
+            line_ending_inferred = Some(LineEnding::from_str(
+                &Slice::from_position(chars, &Position::from_exit_event(events, index)).serialize(),
+            ));
             break;
         }
@@ -239,7 +237,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String {
         }
     };
 
-    let mut context = CompileContext::new(events, codes, options, line_ending_default);
+    let mut context = CompileContext::new(events, chars, options, line_ending_default);
     let mut definition_indices = vec![];
     let mut index = 0;
     let mut definition_inside = false;
@@ -441,7 +439,7 @@ fn on_enter_definition(context: &mut CompileContext) {
 /// Handle [`Enter`][EventType::Enter]:[`DefinitionDestinationString`][Token::DefinitionDestinationString].
 fn on_enter_definition_destination_string(context: &mut CompileContext) {
     context.buffer();
-    context.ignore_encode = true;
+    context.encode_html = false;
 }
 
 /// Handle [`Enter`][EventType::Enter]:[`Emphasis`][Token::Emphasis].
@@ -453,14 +451,14 @@ fn on_enter_emphasis(context: &mut CompileContext) {
 fn on_enter_html_flow(context: &mut CompileContext) {
     context.line_ending_if_needed();
     if context.allow_dangerous_html {
-        context.ignore_encode = true;
+        context.encode_html = false;
     }
 }
 
 /// Handle [`Enter`][EventType::Enter]:[`HtmlText`][Token::HtmlText].
 fn on_enter_html_text(context: &mut CompileContext) {
     if context.allow_dangerous_html {
-        context.ignore_encode = true;
+        context.encode_html = false;
     }
 }
 
@@ -595,7 +593,7 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) {
     context.buffer();
     // Ignore encoding the result, as we’ll first percent encode the url and
     // encode manually after.
-    context.ignore_encode = true;
+    context.encode_html = false;
 }
 
 /// Handle [`Enter`][EventType::Enter]:[`Strong`][Token::Strong].
@@ -605,34 +603,36 @@ fn on_enter_strong(context: &mut CompileContext) {
 
 /// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][Token::AutolinkEmail].
 fn on_exit_autolink_email(context: &mut CompileContext) {
-    let slice = serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    );
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     context.tag(&*format!(
         "<a href=\"{}\">",
         sanitize_uri(
-            format!("mailto:{}", slice.as_str()).as_str(),
+            format!("mailto:{}", value.as_str()).as_str(),
             &context.protocol_href
         )
     ));
-    context.push_raw(&*slice);
+    context.push_raw(&*value);
     context.tag("</a>");
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`AutolinkProtocol`][Token::AutolinkProtocol].
 fn on_exit_autolink_protocol(context: &mut CompileContext) {
-    let slice = serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    );
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     context.tag(&*format!(
         "<a href=\"{}\">",
-        sanitize_uri(slice.as_str(), &context.protocol_href)
+        sanitize_uri(value.as_str(), &context.protocol_href)
     ));
-    context.push_raw(&*slice);
+    context.push_raw(&*value);
     context.tag("</a>");
 }
@@ -677,11 +677,12 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {
         .character_reference_kind
         .take()
         .expect("expected `character_reference_kind` to be set");
-    let reference = serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    );
+    let reference = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     let ref_string = reference.as_str();
     let value = match kind {
         CharacterReferenceKind::Decimal => decode_numeric(ref_string, 10).to_string(),
@@ -694,12 +695,14 @@ fn on_exit_character_reference_value(context: &mut CompileContext) {
 
 /// Handle [`Exit`][EventType::Exit]:[`CodeFlowChunk`][Token::CodeFlowChunk].
 fn on_exit_code_flow_chunk(context: &mut CompileContext) {
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     context.code_flow_seen_data = Some(true);
-    context.push_raw(&*serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    ));
+    context.push_raw(&*value);
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`CodeFencedFence`][Token::CodeFencedFence].
@@ -793,12 +796,14 @@ fn on_exit_drop(context: &mut CompileContext) {
 
 /// Handle [`Exit`][EventType::Exit]:{[`CodeTextData`][Token::CodeTextData],[`Data`][Token::Data],[`CharacterEscapeValue`][Token::CharacterEscapeValue]}.
 fn on_exit_data(context: &mut CompileContext) {
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     // Just output it.
-    context.push_raw(&*serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    ));
+    context.push_raw(&*value);
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`Definition`][Token::Definition].
@@ -830,19 +835,21 @@ fn on_exit_definition_destination_string(context: &mut CompileContext) {
     let buf = context.resume();
     let definition = context.media_stack.last_mut().unwrap();
     definition.destination = Some(buf);
-    context.ignore_encode = false;
+    context.encode_html = true;
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`DefinitionLabelString`][Token::DefinitionLabelString].
 fn on_exit_definition_label_string(context: &mut CompileContext) {
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     // Discard label, use the source content instead.
     context.resume();
     let definition = context.media_stack.last_mut().unwrap();
-    definition.reference_id = Some(serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    ));
+    definition.reference_id = Some(value);
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`DefinitionTitleString`][Token::DefinitionTitleString].
@@ -871,12 +878,11 @@ fn on_exit_heading_atx(context: &mut CompileContext) {
 fn on_exit_heading_atx_sequence(context: &mut CompileContext) {
     // First fence we see.
     if context.atx_opening_sequence_size.is_none() {
-        let rank = serialize(
-            context.codes,
-            &from_exit_event(context.events, context.index),
-            false,
+        let rank = Slice::from_position(
+            context.chars,
+            &Position::from_exit_event(context.events, context.index),
         )
-        .len();
+        .size();
         context.line_ending_if_needed();
         context.atx_opening_sequence_size = Some(rank);
         context.tag(&*format!("<h{}>", rank));
@@ -902,11 +908,12 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {
         .heading_setext_buffer
         .take()
         .expect("`atx_opening_sequence_size` must be set in headings");
-    let head = codes_from_span(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-    )[0];
-    let level: usize = if head == Code::Char('-') { 2 } else { 1 };
+    let head = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .head();
+    let level = if head == Some('-') { 2 } else { 1 };
 
     context.line_ending_if_needed();
     context.tag(&*format!("<h{}>", level));
@@ -916,17 +923,18 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {
 
 /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlow`][Token::HtmlFlow],[`HtmlText`][Token::HtmlText]}.
 fn on_exit_html(context: &mut CompileContext) {
-    context.ignore_encode = false;
+    context.encode_html = true;
 }
 
 /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlowData`][Token::HtmlFlowData],[`HtmlTextData`][Token::HtmlTextData]}.
 fn on_exit_html_data(context: &mut CompileContext) {
-    let slice = serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    );
-    context.push_raw(&*slice);
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
+    context.push_raw(&*value);
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`Label`][Token::Label].
@@ -938,12 +946,14 @@ fn on_exit_label(context: &mut CompileContext) {
 
 /// Handle [`Exit`][EventType::Exit]:[`LabelText`][Token::LabelText].
 fn on_exit_label_text(context: &mut CompileContext) {
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     let media = context.media_stack.last_mut().unwrap();
-    media.label_id = Some(serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    ));
+    media.label_id = Some(value);
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`LineEnding`][Token::LineEnding].
@@ -953,11 +963,13 @@ fn on_exit_line_ending(context: &mut CompileContext) {
     } else if context.slurp_one_line_ending {
         context.slurp_one_line_ending = false;
     } else {
-        context.push_raw(&*serialize(
-            context.codes,
-            &from_exit_event(context.events, context.index),
-            false,
-        ));
+        let value = Slice::from_position(
+            context.chars,
+            &Position::from_exit_event(context.events, context.index),
+        )
+        .serialize();
+
+        context.push_raw(&*value);
     }
 }
@@ -1004,12 +1016,12 @@ fn on_exit_list_item_value(context: &mut CompileContext) {
     let expect_first_item = context.expect_first_item.unwrap();
 
     if expect_first_item {
-        let slice = serialize(
-            context.codes,
-            &from_exit_event(context.events, context.index),
-            false,
-        );
-        let value = slice.parse::<u32>().ok().unwrap();
+        let value = Slice::from_position(
+            context.chars,
+            &Position::from_exit_event(context.events, context.index),
+        )
+        .serialize();
+        let value = value.parse::<u32>().ok().unwrap();
 
         if value != 1 {
             context.tag(" start=\"");
@@ -1110,14 +1122,16 @@ fn on_exit_paragraph(context: &mut CompileContext) {
 
 /// Handle [`Exit`][EventType::Exit]:[`ReferenceString`][Token::ReferenceString].
 fn on_exit_reference_string(context: &mut CompileContext) {
+    let value = Slice::from_position(
+        context.chars,
+        &Position::from_exit_event(context.events, context.index),
+    )
+    .serialize();
+
     // Drop stuff.
     context.resume();
     let media = context.media_stack.last_mut().unwrap();
-    media.reference_id = Some(serialize(
-        context.codes,
-        &from_exit_event(context.events, context.index),
-        false,
-    ));
+    media.reference_id = Some(value);
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`ResourceDestinationString`][Token::ResourceDestinationString].
@@ -1125,7 +1139,7 @@ fn on_exit_resource_destination_string(context: &mut CompileContext) {
     let buf = context.resume();
     let media = context.media_stack.last_mut().unwrap();
     media.destination = Some(buf);
-    context.ignore_encode = false;
+    context.encode_html = true;
 }
 
 /// Handle [`Exit`][EventType::Exit]:[`ResourceTitleString`][Token::ResourceTitleString].
diff --git a/src/constant.rs b/src/constant.rs
index b8b36ad..d84dda5 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -232,11 +232,7 @@ pub const SAFE_PROTOCOL_SRC: [&str; 2] = ["http", "https"];
 /// constructs in markdown, most notable the whitespace required to form
 /// [code (indented)][code_indented].
 ///
-/// > 👉 **Note**: each [`Code::VirtualSpace`][vs] and `Code::Char('\t' | ' ')`
-/// > counts.
-///
 /// [code_indented]: crate::construct::code_indented
-/// [vs]: crate::tokenizer::Code::VirtualSpace
 pub const TAB_SIZE: usize = 4;
 
 /// The number of markers needed for a [thematic break][thematic_break] to form.
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index 27d7544..65c2f6f 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -52,8 +52,9 @@
 //! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element
 
 use crate::token::Token;
-use crate::tokenizer::{Code, Event, EventType, Point, State, Tokenizer};
+use crate::tokenizer::{Event, EventType, Point, State, Tokenizer};
 use crate::unicode::PUNCTUATION;
+use crate::util::slice::Slice;
 
 /// Character code kinds.
 #[derive(Debug, PartialEq)]
@@ -128,17 +129,6 @@ impl MarkerKind {
             _ => unreachable!("invalid char"),
         }
     }
-    /// Turn [Code] into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `code` is not `Code::Char('*' | '_')`.
-    fn from_code(code: Code) -> MarkerKind {
-        match code {
-            Code::Char(char) => MarkerKind::from_char(char),
-            _ => unreachable!("invalid code"),
-        }
-    }
 }
 
 /// Attentention sequence that we can take markers from.
@@ -170,9 +160,9 @@ struct Sequence {
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('*' | '_') if tokenizer.parse_state.constructs.attention => {
+        Some(char) if tokenizer.parse_state.constructs.attention && matches!(char, '*' | '_') => {
             tokenizer.enter(Token::AttentionSequence);
-            inside(tokenizer, MarkerKind::from_code(tokenizer.current))
+            inside(tokenizer, MarkerKind::from_char(char))
         }
         _ => State::Nok,
     }
 }
 
 /// In a sequence.
 ///
 /// ```markdown
 /// > | **
 ///     ^^
 /// ```
 fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State {
-    match tokenizer.current {
-        Code::Char(char) if char == marker.as_char() => {
-            tokenizer.consume();
-            State::Fn(Box::new(move |t| inside(t, marker)))
-        }
-        _ => {
-            tokenizer.exit(Token::AttentionSequence);
-            tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
-            State::Ok
-        }
+    if tokenizer.current == Some(marker.as_char()) {
+        tokenizer.consume();
+        State::Fn(Box::new(move |t| inside(t, marker)))
+    } else {
+        tokenizer.exit(Token::AttentionSequence);
+        tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention));
+        State::Ok
     }
 }
 
 /// Resolve attention sequences.
 #[allow(clippy::too_many_lines)]
 fn resolve_attention(tokenizer: &mut Tokenizer) {
-    let codes = &tokenizer.parse_state.codes;
+    let chars = &tokenizer.parse_state.chars;
     let mut start = 0;
     let mut balance = 0;
     let mut sequences = vec![];
@@ -216,17 +203,21 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
             if enter.token_type == Token::AttentionSequence {
                 let end = start + 1;
                 let exit = &tokenizer.events[end];
-                let marker = MarkerKind::from_code(codes[enter.point.index]);
+                let marker =
+                    MarkerKind::from_char(Slice::from_point(chars, &enter.point).head().unwrap());
                 let before = classify_character(if enter.point.index > 0 {
-                    codes[enter.point.index - 1]
-                } else {
-                    Code::None
-                });
-                let after = classify_character(if exit.point.index < codes.len() {
-                    codes[exit.point.index]
+                    Slice::from_point(
+                        chars,
+                        &Point {
+                            index: enter.point.index - 1,
+                            ..enter.point
+                        },
+                    )
+                    .tail()
                 } else {
-                    Code::None
+                    None
                 });
+                let after = classify_character(Slice::from_point(chars, &exit.point).tail());
                 let open = after == GroupKind::Other
                     || (after == GroupKind::Punctuation && before != GroupKind::Other);
                 // To do: GFM strikethrough?
@@ -326,9 +317,9 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
             let sequence_close = &mut sequences[close];
             let close_event_index = sequence_close.event_index;
             let seq_close_enter = sequence_close.start_point.clone();
+            // No need to worry about `VS`, because sequences are only actual characters.
             sequence_close.size -= take;
             sequence_close.start_point.column += take;
-            sequence_close.start_point.offset += take;
             sequence_close.start_point.index += take;
             let seq_close_exit = sequence_close.start_point.clone();
@@ -352,9 +343,9 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
             let sequence_open = &mut sequences[open];
             let open_event_index = sequence_open.event_index;
             let seq_open_exit = sequence_open.end_point.clone();
+            // No need to worry about `VS`, because sequences are only actual characters.
             sequence_open.size -= take;
             sequence_open.end_point.column -= take;
-            sequence_open.end_point.offset -= take;
             sequence_open.end_point.index -= take;
             let seq_open_enter = sequence_open.end_point.clone();
@@ -492,20 +483,20 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {
 /// Used for attention (emphasis, strong), whose sequences can open or close
 /// based on the class of surrounding characters.
 ///
-/// > 👉 **Note** that eof (`Code::None`) is seen as whitespace.
+/// > 👉 **Note** that eof (`None`) is seen as whitespace.
 ///
 /// ## References
 ///
 /// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js)
-fn classify_character(code: Code) -> GroupKind {
-    match code {
+fn classify_character(char: Option<char>) -> GroupKind {
+    match char {
         // Custom characters.
-        Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace => GroupKind::Whitespace,
+        None => GroupKind::Whitespace,
         // Unicode whitespace.
-        Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace,
+        Some(char) if char.is_whitespace() => GroupKind::Whitespace,
         // Unicode punctuation.
-        Code::Char(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation,
+        Some(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation,
         // Everything else.
-        Code::Char(_) => GroupKind::Other,
+        Some(_) => GroupKind::Other,
     }
 }
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
index 3933596..399570b 100644
--- a/src/construct/autolink.rs
+++ b/src/construct/autolink.rs
@@ -103,7 +103,7 @@
 
 use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of an autolink.
 ///
 /// ```markdown
 /// > | ab
 ///      ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('<') if tokenizer.parse_state.constructs.autolink => {
+        Some('<') if tokenizer.parse_state.constructs.autolink => {
             tokenizer.enter(Token::Autolink);
             tokenizer.enter(Token::AutolinkMarker);
             tokenizer.consume();
@@ -137,11 +137,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char(char) if char.is_ascii_alphabetic() => {
+        Some(char) if char.is_ascii_alphabetic() => {
             tokenizer.consume();
             State::Fn(Box::new(scheme_or_email_atext))
         }
-        Code::Char(char) if is_ascii_atext(char) => email_atext(tokenizer),
+        Some(char) if is_ascii_atext(char) => email_atext(tokenizer),
         _ => State::Nok,
     }
 }
 
 /// After the first character of the protocol or email name.
 ///
 /// ```markdown
 /// > | ab
 ///      ^
 /// ```
@@ -156,7 +156,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
 fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+        Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
             scheme_inside_or_email_atext(tokenizer, 1)
         }
         _ => email_atext(tokenizer),
     }
 }
@@ -173,11 +173,11 @@ fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        Code::Char(':') => {
+        Some(':') => {
             tokenizer.consume();
             State::Fn(Box::new(url_inside))
         }
-        Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z')
+        Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z')
             if size < AUTOLINK_SCHEME_SIZE_MAX =>
         {
             tokenizer.consume();
@@ -195,15 +195,13 @@ fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State
 /// ```
 fn url_inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.exit(Token::AutolinkProtocol);
             end(tokenizer)
         }
-        Code::Char(char) if char.is_ascii_control() => State::Nok,
-        Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ') => {
-            State::Nok
-        }
-        Code::Char(_) => {
+        Some(char) if char.is_ascii_control() => State::Nok,
+        None | Some(' ') => State::Nok,
+        Some(_) => {
             tokenizer.consume();
             State::Fn(Box::new(url_inside))
         }
@@ -218,11 +216,11 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn email_atext(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('@') => {
+        Some('@') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
         }
-        Code::Char(char) if is_ascii_atext(char) => {
+        Some(char) if is_ascii_atext(char) => {
             tokenizer.consume();
             State::Fn(Box::new(email_atext))
         }
@@ -238,7 +236,7 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        Code::Char(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, size),
+        Some(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, size),
         _ => State::Nok,
     }
 }
@@ -251,11 +249,11 @@ fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {
 /// ```
 fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        Code::Char('.') => {
+        Some('.') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))
         }
-        Code::Char('>') => {
+        Some('>') => {
             let index = tokenizer.events.len();
             tokenizer.exit(Token::AutolinkProtocol);
             // Change the token type.
@@ -277,11 +275,11 @@ fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State {
 /// ```
 fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {
     match tokenizer.current {
-        Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
+        Some('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| email_value(t, size + 1)))
         }
-        Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
+        Some(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| email_label(t, size + 1)))
         }
@@ -299,7 +297,7 @@ fn end(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn end(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.enter(Token::AutolinkMarker);
             tokenizer.consume();
             tokenizer.exit(Token::AutolinkMarker);
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs
index 537ffc1..6780f40 100644
--- a/src/construct/blank_line.rs
+++ b/src/construct/blank_line.rs
@@ -33,7 +33,7 @@
 //! [flow]: crate::content::flow
 
 use crate::construct::partial_space_or_tab::space_or_tab;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of a blank line.
 ///
@@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok,
+        None | Some('\n') => State::Ok,
         _ => State::Nok,
     }
 }
diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs
index 3bb4b8b..49a0ea0 100644
--- a/src/construct/block_quote.rs
+++ b/src/construct/block_quote.rs
@@ -36,7 +36,7 @@
 use crate::constant::TAB_SIZE;
 use crate::construct::partial_space_or_tab::space_or_tab_min_max;
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of block quote.
 ///
@@ -65,7 +65,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.enter(Token::BlockQuote);
             cont_before(tokenizer)
         }
@@ -98,7 +98,7 @@ pub fn cont(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn cont_before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.enter(Token::BlockQuotePrefix);
             tokenizer.enter(Token::BlockQuoteMarker);
             tokenizer.consume();
@@ -118,17 +118,14 @@ fn cont_before(tokenizer: &mut Tokenizer) -> State {
 ///      ^
 /// ```
 fn cont_after(tokenizer: &mut Tokenizer) -> State {
-    match tokenizer.current {
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
-            tokenizer.enter(Token::SpaceOrTab);
-            tokenizer.consume();
-            tokenizer.exit(Token::SpaceOrTab);
-            tokenizer.exit(Token::BlockQuotePrefix);
-            State::Ok
-        }
-        _ => {
-            tokenizer.exit(Token::BlockQuotePrefix);
-            State::Ok
-        }
+    if let Some('\t' | ' ') = tokenizer.current {
+        tokenizer.enter(Token::SpaceOrTab);
+        tokenizer.consume();
+        tokenizer.exit(Token::SpaceOrTab);
+        tokenizer.exit(Token::BlockQuotePrefix);
+        State::Ok
+    } else {
+        tokenizer.exit(Token::BlockQuotePrefix);
+        State::Ok
     }
 }
diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs
index 9e9b713..e9263af 100644
--- a/src/construct/character_escape.rs
+++ b/src/construct/character_escape.rs
@@ -34,7 +34,7 @@
 //! [hard_break_escape]: crate::construct::hard_break_escape
 
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of a character escape.
 ///
 /// ```markdown
 /// > | a\*b
 ///      ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('\\') if tokenizer.parse_state.constructs.character_escape => {
+        Some('\\') if tokenizer.parse_state.constructs.character_escape => {
             tokenizer.enter(Token::CharacterEscape);
             tokenizer.enter(Token::CharacterEscapeMarker);
             tokenizer.consume();
@@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char(char) if char.is_ascii_punctuation() => {
+        Some(char) if char.is_ascii_punctuation() => {
             tokenizer.enter(Token::CharacterEscapeValue);
             tokenizer.consume();
             tokenizer.exit(Token::CharacterEscapeValue);
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index 8521f15..59043d1 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -66,7 +66,8 @@ use crate::constant::{
     CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,
 };
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{Point, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
 
 /// Kind of a character reference.
 #[derive(Debug, Clone, PartialEq)]
@@ -120,8 +121,10 @@ impl Kind {
 /// State needed to parse character references.
 #[derive(Debug, Clone)]
 struct Info {
-    /// All parsed characters.
-    buffer: String,
+    /// Place of value start.
+    start: Point,
+    /// Size of value.
+    size: usize,
     /// Kind of character reference.
     kind: Kind,
 }
@@ -138,7 +141,7 @@ struct Info {
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('&') if tokenizer.parse_state.constructs.character_reference => {
+        Some('&') if tokenizer.parse_state.constructs.character_reference => {
             tokenizer.enter(Token::CharacterReference);
             tokenizer.enter(Token::CharacterReferenceMarker);
             tokenizer.consume();
@@ -161,18 +164,21 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ^
 /// ```
 fn open(tokenizer: &mut Tokenizer) -> State {
-    let info = Info {
-        buffer: String::new(),
-        kind: Kind::Named,
-    };
-    if let Code::Char('#') = tokenizer.current {
+    if let Some('#') = tokenizer.current {
         tokenizer.enter(Token::CharacterReferenceMarkerNumeric);
         tokenizer.consume();
         tokenizer.exit(Token::CharacterReferenceMarkerNumeric);
-        State::Fn(Box::new(|t| numeric(t, info)))
+        State::Fn(Box::new(numeric))
     } else {
         tokenizer.enter(Token::CharacterReferenceValue);
-        value(tokenizer, info)
+        value(
+            tokenizer,
+            Info {
+                start: tokenizer.point.clone(),
+                size: 0,
+                kind: Kind::Named,
+            },
+        )
     }
 }
 
 /// After `#`, before a hexadecimal marker or a decimal value.
 ///
 /// ```markdown
 /// > | a b
 ///      ^
 /// ```
@@ -185,17 +191,25 @@ fn open(tokenizer: &mut Tokenizer) -> State {
-fn numeric(tokenizer: &mut Tokenizer, mut info: Info) -> State {
-    if let Code::Char('x' | 'X') = tokenizer.current {
+fn numeric(tokenizer: &mut Tokenizer) -> State {
+    if let Some('x' | 'X') = tokenizer.current {
         tokenizer.enter(Token::CharacterReferenceMarkerHexadecimal);
         tokenizer.consume();
         tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal);
         tokenizer.enter(Token::CharacterReferenceValue);
-        info.kind = Kind::Hexadecimal;
+        let info = Info {
+            start: tokenizer.point.clone(),
+            size: 0,
+            kind: Kind::Hexadecimal,
+        };
         State::Fn(Box::new(|t| value(t, info)))
     } else {
         tokenizer.enter(Token::CharacterReferenceValue);
-        info.kind = Kind::Decimal;
+        let info = Info {
+            start: tokenizer.point.clone(),
+            size: 0,
+            kind: Kind::Decimal,
+        };
         value(tokenizer, info)
     }
 }
@@ -215,24 +229,32 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::Char(';') if !info.buffer.is_empty() => {
-            let unknown_named = Kind::Named == info.kind
-                && !CHARACTER_REFERENCES.iter().any(|d| d.0 == info.buffer);
+        Some(';') if info.size > 0 => {
+            if Kind::Named == info.kind {
+                let value = Slice::from_position(
+                    &tokenizer.parse_state.chars,
+                    &Position {
+                        start: &info.start,
+                        end: &tokenizer.point,
+                    },
+                )
+                .serialize();
 
-            if unknown_named {
-                State::Nok
-            } else {
-                tokenizer.exit(Token::CharacterReferenceValue);
-                tokenizer.enter(Token::CharacterReferenceMarkerSemi);
-                tokenizer.consume();
-                tokenizer.exit(Token::CharacterReferenceMarkerSemi);
-                tokenizer.exit(Token::CharacterReference);
-                State::Ok
+                if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) {
+                    return State::Nok;
+                }
             }
+
+            tokenizer.exit(Token::CharacterReferenceValue);
+            tokenizer.enter(Token::CharacterReferenceMarkerSemi);
+            tokenizer.consume();
+            tokenizer.exit(Token::CharacterReferenceMarkerSemi);
+            tokenizer.exit(Token::CharacterReference);
+            State::Ok
         }
-        Code::Char(char) => {
-            if info.buffer.len() < info.kind.max() && info.kind.allowed(char) {
-                info.buffer.push(char);
+        Some(char) => {
+            if info.size < info.kind.max() && info.kind.allowed(char) {
+                info.size += 1;
                 tokenizer.consume();
                 State::Fn(Box::new(|t| value(t, info)))
             } else {
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 2fea95e..98fa54f 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -107,8 +107,8 @@ use crate::construct::{
     partial_space_or_tab::{space_or_tab, space_or_tab_min_max},
 };
 use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, Tokenizer};
-use crate::util::span::from_exit_event;
+use crate::tokenizer::{ContentType, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
 
 /// Kind of fences.
 #[derive(Debug, Clone, PartialEq)]
@@ -155,17 +155,6 @@ impl Kind {
             _ => unreachable!("invalid char"),
         }
     }
-    /// Turn [Code] into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `code` is not ``Code::Char('~' | '`')``.
-    fn from_code(code: Code) -> Kind {
-        match code {
-            Code::Char(char) => Kind::from_char(char),
-            _ => unreachable!("invalid code"),
-        }
-    }
 }
 
 /// State needed to parse code (fenced).
@@ -217,20 +206,23 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
 
     if let Some(event) = tail {
         if event.token_type == Token::SpaceOrTab {
-            let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
-            prefix = span.end_index - span.start_index;
+            prefix = Slice::from_position(
+                &tokenizer.parse_state.chars,
+                &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),
+            )
+            .size();
         }
     }
 
     match tokenizer.current {
-        Code::Char('`' | '~') => {
+        Some(char) if matches!(char, '`' | '~') => {
             tokenizer.enter(Token::CodeFencedFenceSequence);
             sequence_open(
                 tokenizer,
                 Info {
                     prefix,
                     size: 0,
-                    kind: Kind::from_code(tokenizer.current),
+                    kind: Kind::from_char(char),
                 },
             )
         }
@@ -248,7 +240,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == info.kind.as_char() => {
+        Some(char) if char == info.kind.as_char() => {
             tokenizer.consume();
             State::Fn(Box::new(|t| {
                 info.size += 1;
@@ -273,7 +265,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFencedFence);
             // Do not form containers.
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
         _ => {
             tokenizer.enter(Token::CodeFencedFenceInfo);
             tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
-            info_inside(tokenizer, info, vec![])
+            info_inside(tokenizer, info)
         }
     }
 }
@@ -295,9 +287,9 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// | console.log(1)
 /// | ~~~
 /// ```
-fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> State {
+fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::Data);
             tokenizer.exit(Token::CodeFencedFenceInfo);
             tokenizer.exit(Token::CodeFencedFence);
@@ -305,16 +297,15 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> S
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+        Some('\t' | ' ') => {
             tokenizer.exit(Token::Data);
             tokenizer.exit(Token::CodeFencedFenceInfo);
             tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)
         }
-        Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok,
-        Code::Char(_) => {
-            codes.push(tokenizer.current);
+        Some('`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some(_) => {
             tokenizer.consume();
-            State::Fn(Box::new(|t| info_inside(t, info, codes)))
+            State::Fn(Box::new(|t| info_inside(t, info)))
         }
     }
 }
@@ -329,7 +320,7 @@ fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFencedFence);
             // Do not form containers.
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
         _ => {
             tokenizer.enter(Token::CodeFencedFenceMeta);
             tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
             meta(tokenizer, info)
         }
     }
 }
@@ -353,7 +344,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::Data);
             tokenizer.exit(Token::CodeFencedFenceMeta);
             tokenizer.exit(Token::CodeFencedFence);
@@ -361,7 +352,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
-        Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some('`') if info.kind == Kind::GraveAccent => State::Nok,
         _ => {
             tokenizer.consume();
             State::Fn(Box::new(|t| meta(t, info)))
@@ -422,7 +413,7 @@ fn at_non_lazy_break(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        Some('\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
@@ -461,7 +452,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == info.kind.as_char() => {
+        Some(char) if char == info.kind.as_char() => {
             tokenizer.enter(Token::CodeFencedFenceSequence);
             close_sequence(tokenizer, info, 0)
         }
@@ -479,7 +470,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == info.kind.as_char() => {
+        Some(char) if char == info.kind.as_char() => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))
         }
@@ -501,7 +492,7 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
 /// ```
 fn close_sequence_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFencedFence);
             State::Ok
         }
@@ -547,9 +538,7 @@ fn content_start(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            at_break(tokenizer, info)
-        }
+        None | Some('\n') => at_break(tokenizer, info),
         _ => {
             tokenizer.enter(Token::CodeFlowChunk);
             content_continue(tokenizer, info)
         }
@@ -567,7 +556,7 @@ fn content_continue(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFlowChunk);
             at_break(tokenizer, info)
         }
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 015c4a0..bb1615c 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -48,7 +48,7 @@
 use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
 use crate::constant::TAB_SIZE;
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of code (indented).
 ///
@@ -78,11 +78,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn at_break(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None => after(tokenizer),
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer
-            .attempt(further_start, |ok| {
-                Box::new(if ok { at_break } else { after })
-            })(tokenizer),
+        None => after(tokenizer),
+        Some('\n') => tokenizer.attempt(further_start, |ok| {
+            Box::new(if ok { at_break } else { after })
+        })(tokenizer),
         _ => {
             tokenizer.enter(Token::CodeFlowChunk);
             content(tokenizer)
@@ -98,7 +97,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn content(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFlowChunk);
             at_break(tokenizer)
         }
@@ -134,7 +133,7 @@ fn further_start(tokenizer: &mut Tokenizer) -> State {
         State::Nok
     } else {
         match tokenizer.current {
-            Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            Some('\n') => {
                 tokenizer.enter(Token::LineEnding);
                 tokenizer.consume();
                 tokenizer.exit(Token::LineEnding);
@@ -178,7 +177,7 @@ fn further_begin(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn further_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => further_start(tokenizer),
+        Some('\n') => further_start(tokenizer),
         _ => State::Nok,
     }
 }
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index f5f92fc..150f63b 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -84,7 +84,7 @@
 //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
 
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of code (text).
 ///
@@ -98,9 +98,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
     let len = tokenizer.events.len();
 
     match tokenizer.current {
-        Code::Char('`')
+        Some('`')
             if tokenizer.parse_state.constructs.code_text
-                && (tokenizer.previous != Code::Char('`')
+                && (tokenizer.previous != Some('`')
                     || (len > 0
                         && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) =>
         {
@@ -119,7 +119,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 ///      ^
 /// ```
 fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
-    if let Code::Char('`') = tokenizer.current {
+    if let Some('`') = tokenizer.current {
         tokenizer.consume();
         State::Fn(Box::new(move |t| sequence_open(t, size + 1)))
     } else {
@@ -136,14 +136,14 @@ fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {
 fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State {
     match tokenizer.current {
-        Code::None => State::Nok,
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None => State::Nok,
+        Some('\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
             State::Fn(Box::new(move |t| between(t, size_open)))
         }
-        Code::Char('`') => {
+        Some('`') => {
             tokenizer.enter(Token::CodeTextSequence);
             sequence_close(tokenizer, size_open, 0)
         }
@@ -162,7 +162,7 @@ fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State {
 fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '`') => {
+        None | Some('\n' | '`') => {
             tokenizer.exit(Token::CodeTextData);
             between(tokenizer, size_open)
         }
@@ -181,7 +181,7 @@ fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State {
 fn sequence_close(tokenizer: &mut Tokenizer, size_open: usize, size: usize) -> State {
     match tokenizer.current {
-        Code::Char('`') => {
+        Some('`') => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| sequence_close(t, size_open, size + 1)))
         }
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index ffaaa98..f2b5ae0 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -100,7 +100,7 @@ use crate::construct::{
     partial_title::{start as title, Options as TitleOptions},
 };
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 use crate::util::skip::opt_back as skip_opt_back;
 
 /// At the start of a definition.
@@ -137,7 +137,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('[') => tokenizer.go(
+        Some('[') => tokenizer.go(
             |t| {
                 label(
                     t,
@@ -162,7 +162,7 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn label_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char(':') => {
+        Some(':') => {
             tokenizer.enter(Token::DefinitionMarker);
             tokenizer.consume();
             tokenizer.exit(Token::DefinitionMarker);
@@ -231,7 +231,7 @@ fn after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn after_whitespace(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::Definition);
             // You’d be interrupting.
             tokenizer.interrupt = true;
             State::Ok
         }
         _ => State::Nok,
     }
 }
@@ -294,7 +294,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn title_after_after_optional_whitespace(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok,
+        None | Some('\n') => State::Ok,
         _ => State::Nok,
     }
 }
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index 40a83ef..0585c4c 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -40,7 +40,7 @@
 //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
 
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of a hard break (escape).
 ///
 /// ```markdown
 /// > | a\
 ///      ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('\\') if tokenizer.parse_state.constructs.hard_break_escape => {
+        Some('\\') if tokenizer.parse_state.constructs.hard_break_escape => {
             tokenizer.enter(Token::HardBreakEscape);
             tokenizer.consume();
             State::Fn(Box::new(inside))
@@ -69,7 +69,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn inside(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        Some('\n') => {
             tokenizer.exit(Token::HardBreakEscape);
             State::Ok
         }
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 5de9a80..7a7cf2e 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -57,7 +57,7 @@
 use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
 use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE};
 use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, Event, EventType, State, Tokenizer};
+use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer};
 
 /// Start of a heading (atx).
 ///
@@ -87,7 +87,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 ///      ^
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
-    if Code::Char('#') == tokenizer.current {
+    if Some('#') == tokenizer.current {
         tokenizer.enter(Token::HeadingAtxSequence);
         sequence_open(tokenizer, 0)
     } else {
@@ -103,11 +103,11 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') if rank > 0 => {
+        None | Some('\n') if rank > 0 => {
             tokenizer.exit(Token::HeadingAtxSequence);
             at_break(tokenizer)
         }
-        Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
+        Some('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
             tokenizer.consume();
             State::Fn(Box::new(move |tokenizer| {
                 sequence_open(tokenizer, rank + 1)
@@ -129,21 +129,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State {
 /// ```
 fn at_break(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::HeadingAtx);
             tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve));
             // Feel free to interrupt.
             tokenizer.interrupt = false;
             State::Ok
         }
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
-            tokenizer.go(space_or_tab(), at_break)(tokenizer)
-        }
-        Code::Char('#') => {
+        Some('\t' | ' ') => tokenizer.go(space_or_tab(), at_break)(tokenizer),
+        Some('#') => {
             tokenizer.enter(Token::HeadingAtxSequence);
             further_sequence(tokenizer)
         }
-        Code::Char(_) => {
+        Some(_) => {
             tokenizer.enter_with_content(Token::Data, Some(ContentType::Text));
             data(tokenizer)
         }
@@ -159,7 +157,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State {
 ///      ^
 /// ```
 fn further_sequence(tokenizer: &mut Tokenizer) -> State {
-    if let Code::Char('#') = tokenizer.current {
+    if let Some('#') = tokenizer.current {
         tokenizer.consume();
         State::Fn(Box::new(further_sequence))
     } else {
@@ -177,7 +175,7 @@ fn further_sequence(tokenizer: &mut Tokenizer) -> State {
 fn data(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
+        None | Some('\t' | '\n' | ' ') => {
             tokenizer.exit(Token::Data);
             at_break(tokenizer)
         }
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index a0f7545..f9dd3f7 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -60,7 +60,7 @@
 use crate::constant::TAB_SIZE;
 use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};
 use crate::token::Token;
-use crate::tokenizer::{Code, EventType, State, Tokenizer};
+use crate::tokenizer::{EventType, State, Tokenizer};
 use crate::util::skip::opt_back as skip_opt_back;
 
 /// Kind of underline.
@@ -148,7 +148,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == '-' || char == '=' => {
+        Some(char) if matches!(char, '-' | '=') => {
             tokenizer.enter(Token::HeadingSetextUnderline);
             inside(tokenizer, Kind::from_char(char))
         }
@@ -165,7 +165,7 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == kind.as_char() => {
+        Some(char) if char == kind.as_char() => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| inside(t, kind)))
         }
@@ -185,7 +185,7 @@ fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State {
 /// ```
 fn after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             // Feel free to interrupt.
             tokenizer.interrupt = false;
             tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve));
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 24d6f98..238963d 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -105,8 +105,10 @@ use crate::construct::{
     partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
 };
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
-use crate::util::codes::{parse, serialize};
+use crate::tokenizer::{Point, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
+
+const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];
 
 /// Kind of HTML (flow).
 #[derive(Debug, PartialEq)]
@@ -168,17 +170,6 @@ impl QuoteKind {
             _ => unreachable!("invalid char"),
         }
     }
-    /// Turn [Code] into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `code` is not `Code::Char('"' | '\'')`.
-    fn from_code(code: Code) -> QuoteKind {
-        match code {
-            Code::Char(char) => QuoteKind::from_char(char),
-            _ => unreachable!("invalid code"),
-        }
-    }
 }
 
 /// State needed to parse HTML (flow).
@@ -190,9 +181,9 @@ struct Info {
     start_tag: bool,
     /// Used depending on `kind` to either collect all parsed characters, or to
     /// store expected characters.
-    buffer: Vec<Code>,
-    /// `index` into `buffer` when expecting certain characters.
-    index: usize,
+    start: Option<Point>,
+    /// Collected index, for various reasons.
+    size: usize,
     /// Current quote, when in a double or single quoted attribute value.
     quote: Option<QuoteKind>,
 }
@@ -234,7 +225,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ^
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
-    if Code::Char('<') == tokenizer.current {
+    if Some('<') == tokenizer.current {
         tokenizer.enter(Token::HtmlFlowData);
         tokenizer.consume();
         State::Fn(Box::new(open))
@@ -259,21 +250,22 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         kind: Kind::Basic,
         // Assume closing tag (or no tag).
         start_tag: false,
-        buffer: vec![],
-        index: 0,
+        start: None,
+        size: 0,
         quote: None,
     };
 
     match tokenizer.current {
-        Code::Char('!') => {
+        Some('!') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| declaration_open(t, info)))
         }
-        Code::Char('/') => {
+        Some('/') => {
             tokenizer.consume();
+            info.start = Some(tokenizer.point.clone());
             State::Fn(Box::new(|t| tag_close_start(t, info)))
         }
-        Code::Char('?') => {
+        Some('?') => {
             info.kind = Kind::Instruction;
             tokenizer.consume();
             // Do not form containers.
             tokenizer.concrete = true;
             // While we’re in an instruction instead of a declaration, we’re on a `?`
             // right now, so we do need to search for `>`, similar to declarations.
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
-        Code::Char('A'..='Z' | 'a'..='z') => {
+        Some('A'..='Z' | 'a'..='z') => {
             info.start_tag = true;
+            info.start = Some(tokenizer.point.clone());
             tag_name(tokenizer, info)
         }
         _ => State::Nok,
     }
 }
 
 /// After `<!`, so inside a declaration, comment, or CDATA.
 ///
 /// ```markdown
 /// > | 
 ///      ^
 /// ```
 fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::Char('-') => {
+        Some('-') => {
             tokenizer.consume();
             info.kind = Kind::Comment;
             State::Fn(Box::new(|t| comment_open_inside(t, info)))
         }
-        Code::Char('[') => {
+        Some('[') => {
             tokenizer.consume();
             info.kind = Kind::Cdata;
-            info.buffer = parse("CDATA[");
-            info.index = 0;
+            info.size = 0;
             State::Fn(Box::new(|t| cdata_open_inside(t, info)))
         }
-        Code::Char('A'..='Z' | 'a'..='z') => {
+        Some('A'..='Z' | 'a'..='z') => {
             tokenizer.consume();
             info.kind = Kind::Declaration;
             // Do not form containers.
             tokenizer.concrete = true;
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
 
 /// After `<!-`, inside a comment, before another `-`.
 ///
 /// ```markdown
 /// > | 
 ///      ^
 /// ```
@@ -333,7 +325,7 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('-') => {
+        Some('-') => {
             tokenizer.consume();
             // Do not form containers.
             tokenizer.concrete = true;
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
 
 /// After `<![`, inside CDATA, expecting `CDATA[`.
 ///
 /// ```markdown
 /// > | 
 ///      ^^^^^^
 /// ```
@@ -350,20 +342,21 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
-    if tokenizer.current == info.buffer[info.index] {
-        info.index += 1;
-        tokenizer.consume();
+    match tokenizer.current {
+        Some(char) if char == CDATA_SEARCH[info.size] => {
+            info.size += 1;
+            tokenizer.consume();
 
-        if info.index == info.buffer.len() {
-            info.buffer.clear();
-            // Do not form containers.
-            tokenizer.concrete = true;
-            State::Fn(Box::new(|t| continuation(t, info)))
-        } else {
-            State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+            if info.size == CDATA_SEARCH.len() {
+                info.size = 0;
+                // Do not form containers.
+                tokenizer.concrete = true;
+                State::Fn(Box::new(|t| continuation(t, info)))
+            } else {
+                State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+            }
         }
-    } else {
-        State::Nok
+        _ => State::Nok,
     }
 }
 
@@ -373,11 +366,10 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// > |
 /// ^
 /// ```
-fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('A'..='Z' | 'a'..='z') => {
+        Some('A'..='Z' | 'a'..='z') => {
             tokenizer.consume();
-            info.buffer.push(tokenizer.current);
             State::Fn(Box::new(|t| tag_name(t, info)))
         }
         _ => State::Nok,
@@ -394,22 +386,26 @@ fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::None
-        | Code::CarriageReturnLineFeed
-        | Code::VirtualSpace
-        | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
-            let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
-            let name = tag_name_buffer.as_str();
-            let slash = matches!(tokenizer.current, Code::Char('/'));
-
-            info.buffer.clear();
-
-            if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) {
+        None | Some('\t' | '\n' | ' ' | '/' | '>') => {
+            let slash = matches!(tokenizer.current, Some('/'));
+            let start = info.start.take().unwrap();
+            let name = Slice::from_position(
+                &tokenizer.parse_state.chars,
+                &Position {
+                    start: &start,
+                    end: &tokenizer.point,
+                },
+            )
+            .serialize()
+            .trim()
+            .to_lowercase();
+
+            if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
                 info.kind = Kind::Raw;
                 // Do not form containers.
                 tokenizer.concrete = true;
                 continuation(tokenizer, info)
-            } else if HTML_BLOCK_NAMES.contains(&name) {
+            } else if HTML_BLOCK_NAMES.contains(&name.as_str()) {
                 // Basic is assumed, no need to set `kind`.
 
                 if slash {
                     tokenizer.consume();
@@ -432,12 +429,11 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
                 }
             }
         }
-        Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+        Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
             tokenizer.consume();
-            info.buffer.push(tokenizer.current);
             State::Fn(Box::new(|t| tag_name(t, info)))
         }
-        Code::Char(_) => State::Nok,
+        Some(_) => State::Nok,
    }
 }
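The `tag_name` rework above shows the general pattern of this refactor: instead of pushing every consumed character into `info.buffer`, the tokenizer remembers the `Point` where a name starts and slices the name back out of `parse_state.chars` once it ends. A minimal standalone sketch of that idea, using a plain index in place of the crate's `Point`/`Slice` types:

```rust
// Illustration only: recover a tag name from the source `char`s by
// position, rather than collecting it character by character.
fn name_between(chars: &[char], start: usize, end: usize) -> String {
    chars[start..end].iter().collect::<String>().to_lowercase()
}

fn main() {
    let chars: Vec<char> = "</Script>".chars().collect();
    // The name runs from just after `</` (index 2) up to `>` (index 8).
    assert_eq!(name_between(&chars, 2, 8), "script");
}
```

@@ -449,7 +445,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.consume();
             // Do not form containers.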
tokenizer.concrete = true; @@ -467,7 +463,7 @@ fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_closing_tag_after(t, info))) } @@ -496,15 +492,15 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_end(t, info))) } - Code::Char('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) } @@ -524,7 +520,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat /// ``` fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) } @@ -543,11 +539,11 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('=') => { + Some('=') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name_after(t, info))) } @@ -566,13 +562,13 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State /// ``` fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok, - Code::Char('"' | '\'') => { + None | Some('<' | '=' | '>' | '`') => State::Nok, + Some(char) if matches!(char, '"' | '\'') => { + info.quote = Some(QuoteKind::from_char(char)); tokenizer.consume(); - info.quote = Some(QuoteKind::from_code(tokenizer.current)); State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) } @@ -590,8 +586,8 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> /// ``` fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Nok, - Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => { + None | Some('\n') => State::Nok, + Some(char) if char == info.quote.as_ref().unwrap().as_char() => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info))) } @@ -610,13 +606,10 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> Sta /// ``` fn 
complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None - | Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => { + None | Some('\t' | '\n' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => { complete_attribute_name_after(tokenizer, info) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_unquoted(t, info))) } @@ -632,9 +625,7 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> S /// ``` fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => { - complete_attribute_name_before(tokenizer, info) - } + Some('\t' | ' ' | '/' | '>') => complete_attribute_name_before(tokenizer, info), _ => State::Nok, } } @@ -647,7 +638,7 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) /// ``` fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_after(t, info))) } @@ -663,16 +654,16 @@ fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { // Do not form containers. tokenizer.concrete = true; continuation(tokenizer, info) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_after(t, info))) } - Code::Char(_) => State::Nok, + Some(_) => State::Nok, } } @@ -684,29 +675,27 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('-') if info.kind == Kind::Comment => { + Some('-') if info.kind == Kind::Comment => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_comment_inside(t, info))) } - Code::Char('<') if info.kind == Kind::Raw => { + Some('<') if info.kind == Kind::Raw => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_raw_tag_open(t, info))) } - Code::Char('>') if info.kind == Kind::Declaration => { + Some('>') if info.kind == Kind::Declaration => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_close(t, info))) } - Code::Char('?') if info.kind == Kind::Instruction => { + Some('?') if info.kind == Kind::Instruction => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } - Code::Char(']') if info.kind == Kind::Cdata => { + Some(']') if info.kind == Kind::Cdata => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_character_data_inside(t, info))) } - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - if info.kind == Kind::Basic || info.kind == Kind::Complete => - { + Some('\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { tokenizer.exit(Token::HtmlFlowData); tokenizer.check(blank_line_before, |ok| { if ok { @@ -716,7 +705,7 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { } })(tokenizer) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::HtmlFlowData); continuation_start(tokenizer, info) } 
@@ -753,7 +742,7 @@ fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); @@ -772,9 +761,7 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - continuation_start(tokenizer, info) - } + None | Some('\n') => continuation_start(tokenizer, info), _ => { tokenizer.enter(Token::HtmlFlowData); continuation(tokenizer, info) @@ -790,7 +777,7 @@ fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -804,10 +791,11 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | /// ^ /// ``` -fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('/') => { + Some('/') => { tokenizer.consume(); + info.start = Some(tokenizer.point.clone()); State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) } _ => continuation(tokenizer, info), @@ -822,24 +810,34 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('>') => { - let tag_name_buffer = serialize(&info.buffer, false).to_lowercase(); - info.buffer.clear(); - - if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) { + Some('>') => { + info.size = 0; + + let start = info.start.take().unwrap(); + let name = Slice::from_position( + &tokenizer.parse_state.chars, + &Position { + start: &start, + end: &tokenizer.point, + }, + ) + .serialize() + .to_lowercase(); + + if HTML_RAW_NAMES.contains(&name.as_str()) { tokenizer.consume(); State::Fn(Box::new(|t| continuation_close(t, info))) } else { continuation(tokenizer, info) } } - Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => { + Some('A'..='Z' | 'a'..='z') if info.size < HTML_RAW_SIZE_MAX => { tokenizer.consume(); - info.buffer.push(tokenizer.current); + info.size += 1; State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) } _ => { - info.buffer.clear(); + info.size = 0; continuation(tokenizer, info) } } @@ -853,7 +851,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State /// ``` fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(']') => { + Some(']') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -877,11 +875,11 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> /// ``` fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_close(t, info))) 
} - Code::Char('-') if info.kind == Kind::Comment => { + Some('-') if info.kind == Kind::Comment => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -897,7 +895,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> Sta /// ``` fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::HtmlFlowData); continuation_after(tokenizer) } diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 3ac8d71..b1ad113 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -56,8 +56,9 @@ use crate::construct::partial_space_or_tab::space_or_tab; use crate::token::Token; -use crate::tokenizer::{Code, State, StateFn, Tokenizer}; -use crate::util::codes::parse; +use crate::tokenizer::{State, StateFn, Tokenizer}; + +const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '[']; /// Start of HTML (text) /// @@ -66,7 +67,7 @@ use crate::util::codes::parse; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if Code::Char('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { + if Some('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { tokenizer.enter(Token::HtmlText); tokenizer.enter(Token::HtmlTextData); tokenizer.consume(); @@ -88,19 +89,19 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('!') => { + Some('!') => { tokenizer.consume(); State::Fn(Box::new(declaration_open)) } - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(tag_close_start)) } - Code::Char('?') => { + Some('?') => { tokenizer.consume(); State::Fn(Box::new(instruction)) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } @@ -120,16 +121,15 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_open_inside)) } - Code::Char('[') => { + Some('[') => { tokenizer.consume(); - let buffer = parse("CDATA["); - State::Fn(Box::new(|t| cdata_open_inside(t, buffer, 0))) + State::Fn(Box::new(|t| cdata_open_inside(t, 0))) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(declaration)) } @@ -145,7 +145,7 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_start)) } @@ -168,8 +168,8 @@ fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('>') => State::Nok, - Code::Char('-') => { + None | Some('>') => State::Nok, + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_start_dash)) } @@ -192,7 +192,7 @@ fn comment_start(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('>') => State::Nok, + None | 
Some('>') => State::Nok,
         _ => comment(tokenizer),
     }
 }
 
@@ -205,11 +205,9 @@ fn comment_start_dash(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn comment(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None => State::Nok,
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            at_line_ending(tokenizer, Box::new(comment))
-        }
-        Code::Char('-') => {
+        None => State::Nok,
+        Some('\n') => at_line_ending(tokenizer, Box::new(comment)),
+        Some('-') => {
             tokenizer.consume();
             State::Fn(Box::new(comment_close))
         }
@@ -228,7 +226,7 @@ fn comment(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn comment_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('-') => {
+        Some('-') => {
             tokenizer.consume();
             State::Fn(Box::new(end))
         }
@@ -242,17 +240,18 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a &<]]> b
 /// ^^^^^^
 /// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer, buffer: Vec<Code>, index: usize) -> State {
-    if tokenizer.current == buffer[index] {
-        tokenizer.consume();
+fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
+    match tokenizer.current {
+        Some(char) if char == CDATA_SEARCH[index] => {
+            tokenizer.consume();
 
-        if index + 1 == buffer.len() {
-            State::Fn(Box::new(cdata))
-        } else {
-            State::Fn(Box::new(move |t| cdata_open_inside(t, buffer, index + 1)))
+            if index + 1 == CDATA_SEARCH.len() {
+                State::Fn(Box::new(cdata))
+            } else {
+                State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1)))
+            }
         }
-    } else {
-        State::Nok
+        _ => State::Nok,
     }
 }
 
@@ -264,11 +263,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
 /// ```
 fn cdata(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None => State::Nok,
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            at_line_ending(tokenizer, Box::new(cdata))
-        }
-        Code::Char(']') => {
+        None => State::Nok,
+        Some('\n') => at_line_ending(tokenizer, Box::new(cdata)),
+        Some(']') => {
             tokenizer.consume();
             State::Fn(Box::new(cdata_close))
         }
@@ -287,7 +284,7 @@ fn cdata(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn cdata_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char(']') => {
+        Some(']') => {
             tokenizer.consume();
             State::Fn(Box::new(cdata_end))
         }
@@ -303,8 +300,8 @@ fn cdata_close(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn cdata_end(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('>') => end(tokenizer),
-        Code::Char(']') => cdata_close(tokenizer),
+        Some('>') => end(tokenizer),
+        Some(']') => cdata_close(tokenizer),
         _ => cdata(tokenizer),
     }
 }
@@ -317,10 +314,8 @@ fn cdata_end(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn declaration(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::Char('>') => end(tokenizer),
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            at_line_ending(tokenizer, Box::new(declaration))
-        }
+        None | Some('>') => end(tokenizer),
+        Some('\n') => at_line_ending(tokenizer, Box::new(declaration)),
         _ => {
             tokenizer.consume();
             State::Fn(Box::new(declaration))
@@ -336,11 +331,9 @@ fn declaration(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn instruction(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None => State::Nok,
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            at_line_ending(tokenizer, Box::new(instruction))
-        }
-        Code::Char('?') => {
+        None => State::Nok,
+        Some('\n') => at_line_ending(tokenizer, Box::new(instruction)),
+ Some('?') => { tokenizer.consume(); State::Fn(Box::new(instruction_close)) } @@ -359,7 +352,7 @@ fn instruction(tokenizer: &mut Tokenizer) -> State { /// ``` fn instruction_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => end(tokenizer), + Some('>') => end(tokenizer), _ => instruction(tokenizer), } } @@ -372,7 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -388,7 +381,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -404,10 +397,8 @@ fn tag_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_close_between)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_close_between)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_close_between)) } @@ -423,13 +414,11 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer), + Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -442,18 +431,16 @@ fn tag_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_between)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_between)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_between)) } - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(end)) } - Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some(':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -469,7 +456,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some('-' | '.' 
| '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -486,14 +473,12 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name_after)) } - Code::Char('=') => { + Some('=') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } @@ -510,19 +495,17 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + None | Some('<' | '=' | '>' | '`') => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } - Code::Char(char) if char == '"' || char == '\'' => { + Some(char) if char == '"' || char == '\'' => { tokenizer.consume(); State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, char))) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_unquoted)) } @@ -537,12 +520,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => at_line_ending( + None => State::Nok, + Some('\n') => at_line_ending( tokenizer, Box::new(move |t| tag_open_attribute_value_quoted(t, marker)), ), - Code::Char(char) if char == marker => { + Some(char) if char == marker => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_quoted_after)) } @@ -563,11 +546,9 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> S /// ``` fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => State::Nok, - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer), - Code::Char(_) => { + None | Some('"' | '\'' | '<' | '=' | '`') => State::Nok, + Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), + Some(_) => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_unquoted)) } @@ -583,9 +564,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '>' | '/') => tag_open_between(tokenizer), + Some('\t' | '\n' | ' ' | '>' | '/') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -598,7 +577,7 @@ fn 
tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn end(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.consume();
             tokenizer.exit(Token::HtmlTextData);
             tokenizer.exit(Token::HtmlText);
@@ -620,7 +599,7 @@ fn end(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn at_line_ending(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        Some('\n') => {
             tokenizer.exit(Token::HtmlTextData);
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs
index 6f0a707..5ea788f 100644
--- a/src/construct/label_end.rs
+++ b/src/construct/label_end.rs
@@ -1,4 +1,4 @@
-//! Label end is a construct that occurs in the [text][] conten&t type.
+//! Label end is a construct that occurs in the [text][] content type.
 //!
 //! It forms with the following BNF:
 //!
@@ -154,10 +154,11 @@ use crate::construct::{
     partial_title::{start as title, Options as TitleOptions},
 };
 use crate::token::Token;
-use crate::tokenizer::{Code, Event, EventType, Media, State, Tokenizer};
+use crate::tokenizer::{Event, EventType, Media, State, Tokenizer};
 use crate::util::{
     normalize_identifier::normalize_identifier,
-    span::{serialize, Span},
+    skip,
+    slice::{Position, Slice},
 };
 
 /// State needed to parse label end.
@@ -181,7 +182,7 @@ struct Info {
 /// > | [a] b
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    if Code::Char(']') == tokenizer.current && tokenizer.parse_state.constructs.label_end {
+    if Some(']') == tokenizer.current && tokenizer.parse_state.constructs.label_end {
         let mut label_start_index = None;
         let mut index = tokenizer.label_start_stack.len();
 
@@ -207,19 +208,23 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
         }
 
         let label_end_start = tokenizer.events.len();
+
         let info = Info {
             label_start_index,
             media: Media {
                 start: label_start.start,
                 end: (label_end_start, label_end_start + 3),
-                id: normalize_identifier(&serialize(
-                    &tokenizer.parse_state.codes,
-                    &Span {
-                        start_index: tokenizer.events[label_start.start.1].point.index,
-                        end_index: tokenizer.events[label_end_start - 1].point.index,
-                    },
-                    false,
-                )),
+                // To do: virtual spaces not needed, create a `to_str`?
+                id: normalize_identifier(
+                    &Slice::from_position(
+                        &tokenizer.parse_state.chars,
+                        &Position {
+                            start: &tokenizer.events[label_start.start.1].point,
+                            end: &tokenizer.events[label_end_start - 1].point,
+                        },
+                    )
+                    .serialize(),
+                ),
             },
         };
 
@@ -253,7 +258,7 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
         // Resource (`[asd](fgh)`)?
-        Code::Char('(') => tokenizer.attempt(resource, move |is_ok| {
+        Some('(') => tokenizer.attempt(resource, move |is_ok| {
             Box::new(move |t| {
                 // Also fine if `defined`, as then it’s a valid shortcut.
                 if is_ok || defined {
                     ok(t, info)
                 } else {
                     nok(t, info.label_start_index)
                 }
             })
         })(tokenizer),
         // Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference?
-        Code::Char('[') => tokenizer.attempt(full_reference, move |is_ok| {
+        Some('[') => tokenizer.attempt(full_reference, move |is_ok| {
             Box::new(move |t| {
                 if is_ok {
                     ok(t, info)
@@ -377,7 +382,7 @@ fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State {
 /// ```
 fn resource(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('(') => {
+        Some('(') => {
             tokenizer.enter(Token::Resource);
             tokenizer.enter(Token::ResourceMarker);
             tokenizer.consume();
@@ -406,7 +411,7 @@ fn resource_start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn resource_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char(')') => resource_end(tokenizer),
+        Some(')') => resource_end(tokenizer),
         _ => tokenizer.go(
             |t| {
                 destination(
@@ -446,7 +451,7 @@ fn destination_after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn resource_between(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('"' | '\'' | '(') => tokenizer.go(
+        Some('"' | '\'' | '(') => tokenizer.go(
             |t| {
                 title(
                     t,
@@ -481,7 +486,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn resource_end(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char(')') => {
+        Some(')') => {
             tokenizer.enter(Token::ResourceMarker);
             tokenizer.consume();
             tokenizer.exit(Token::ResourceMarker);
@@ -500,7 +505,7 @@ fn resource_end(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn full_reference(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::Char('[') => tokenizer.go(
+        Some('[') => tokenizer.go(
             |t| {
                 label(
                     t,
@@ -524,36 +529,23 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {
 /// ^
 /// ```
 fn full_reference_after(tokenizer: &mut Tokenizer) -> State {
-    let events = &tokenizer.events;
-    let mut index = events.len() - 1;
-    let mut start: Option<usize> = None;
-    let mut end: Option<usize> = None;
-
-    while index > 0 {
-        index -= 1;
-        let event = &events[index];
-        if event.token_type == Token::ReferenceString {
-            if event.event_type == EventType::Exit {
-                end = Some(event.point.index);
-            } else {
-                start = Some(event.point.index);
-                break;
-            }
-        }
-    }
+    let end = skip::to_back(
+        &tokenizer.events,
+        tokenizer.events.len() - 1,
+        &[Token::ReferenceString],
+    );
+
+    // To do: virtual spaces not needed, create a `to_str`?
+    let id = Slice::from_position(
+        &tokenizer.parse_state.chars,
+        &Position::from_exit_event(&tokenizer.events, end),
+    )
+    .serialize();
 
     if tokenizer
         .parse_state
         .definitions
-        .contains(&normalize_identifier(&serialize(
-            &tokenizer.parse_state.codes,
-            &Span {
-                // Always found, otherwise we don’t get here.
- start_index: start.unwrap(), - end_index: end.unwrap(), - }, - false, - ))) + .contains(&normalize_identifier(&id)) { State::Ok } else { @@ -571,7 +563,7 @@ fn full_reference_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn collapsed_reference(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') => { + Some('[') => { tokenizer.enter(Token::Reference); tokenizer.enter(Token::ReferenceMarker); tokenizer.consume(); @@ -592,7 +584,7 @@ fn collapsed_reference(tokenizer: &mut Tokenizer) -> State { /// ``` fn collapsed_reference_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(']') => { + Some(']') => { tokenizer.enter(Token::ReferenceMarker); tokenizer.consume(); tokenizer.exit(Token::ReferenceMarker); @@ -735,7 +727,11 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) { 0, vec![Event { event_type: EventType::Exit, - token_type: Token::Link, + token_type: if group_enter_event.token_type == Token::LabelLink { + Token::Link + } else { + Token::Image + }, point: events[group_end_index].point.clone(), link: None, }], diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index 8c12ffe..078026d 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -30,7 +30,7 @@ use super::label_end::resolve_media; use crate::token::Token; -use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; +use crate::tokenizer::{LabelStart, State, Tokenizer}; /// Start of label (image) start. /// @@ -40,7 +40,7 @@ use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('!') if tokenizer.parse_state.constructs.label_start_image => { + Some('!') if tokenizer.parse_state.constructs.label_start_image => { tokenizer.enter(Token::LabelImage); tokenizer.enter(Token::LabelImageMarker); tokenizer.consume(); @@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') => { + Some('[') => { tokenizer.enter(Token::LabelMarker); tokenizer.consume(); tokenizer.exit(Token::LabelMarker); diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs index e13cd77..d7ae1d6 100644 --- a/src/construct/label_start_link.rs +++ b/src/construct/label_start_link.rs @@ -29,7 +29,7 @@ use super::label_end::resolve_media; use crate::token::Token; -use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; +use crate::tokenizer::{LabelStart, State, Tokenizer}; /// Start of label (link) start. 
/// @@ -39,7 +39,7 @@ use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') if tokenizer.parse_state.constructs.label_start_link => { + Some('[') if tokenizer.parse_state.constructs.label_start_link => { let start = tokenizer.events.len(); tokenizer.enter(Token::LabelLink); tokenizer.enter(Token::LabelMarker); diff --git a/src/construct/list.rs b/src/construct/list.rs index f5bb0ce..355eeee 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -50,10 +50,10 @@ use crate::construct::{ thematic_break::start as thematic_break, }; use crate::token::Token; -use crate::tokenizer::{Code, EventType, State, Tokenizer}; +use crate::tokenizer::{EventType, State, Tokenizer}; use crate::util::{ skip, - span::{codes as codes_from_span, from_exit_event}, + slice::{Position, Slice}, }; /// Type of list. @@ -117,17 +117,6 @@ impl Kind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind. - /// - /// ## Panics - /// - /// Panics if `code` is not `Code::Char('.' | ')' | '*' | '+' | '-')`. - fn from_code(code: Code) -> Kind { - match code { - Code::Char(char) => Kind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// Start of list item. @@ -160,11 +149,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // Unordered. - Code::Char('*' | '+' | '-') => tokenizer.check(thematic_break, |ok| { + Some('*' | '+' | '-') => tokenizer.check(thematic_break, |ok| { Box::new(if ok { nok } else { before_unordered }) })(tokenizer), // Ordered. - Code::Char(char) if char.is_ascii_digit() && (!tokenizer.interrupt || char == '1') => { + Some(char) if char.is_ascii_digit() && (!tokenizer.interrupt || char == '1') => { tokenizer.enter(Token::ListItemPrefix); tokenizer.enter(Token::ListItemValue); inside(tokenizer, 0) @@ -194,11 +183,11 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Code::Char(char) if char.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { + Some(char) if char.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| inside(t, size + 1))) } - Code::Char('.' | ')') if !tokenizer.interrupt || size < 2 => { + Some('.' 
| ')') if !tokenizer.interrupt || size < 2 => { tokenizer.exit(Token::ListItemValue); marker(tokenizer) } @@ -273,10 +262,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn whitespace_after(tokenizer: &mut Tokenizer) -> State { - if matches!( - tokenizer.current, - Code::VirtualSpace | Code::Char('\t' | ' ') - ) { + if matches!(tokenizer.current, Some('\t' | ' ')) { State::Nok } else { State::Ok @@ -291,7 +277,7 @@ fn whitespace_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn prefix_other(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.enter(Token::SpaceOrTab); tokenizer.consume(); tokenizer.exit(Token::SpaceOrTab); @@ -316,8 +302,18 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State { tokenizer.events.len() - 1, &[Token::ListItem], ); - let prefix = tokenizer.point.index - tokenizer.events[start].point.index - + (if blank { 1 } else { 0 }); + let mut prefix = Slice::from_position( + &tokenizer.parse_state.chars, + &Position { + start: &tokenizer.events[start].point, + end: &tokenizer.point, + }, + ) + .size(); + + if blank { + prefix += 1; + } let container = tokenizer.container.as_mut().unwrap(); container.blank_initial = blank; @@ -403,12 +399,15 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) { if event.token_type == Token::ListItem { if event.event_type == EventType::Enter { let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1; - let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]) + 1; - let codes = codes_from_span( - &tokenizer.parse_state.codes, - &from_exit_event(&tokenizer.events, marker), + let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]); + let kind = Kind::from_char( + Slice::from_point( + &tokenizer.parse_state.chars, + &tokenizer.events[marker].point, + ) + .head() + .unwrap(), ); - let kind = Kind::from_code(codes[0]); let current = (kind, balance, index, end); let mut list_index = lists_wip.len(); diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 4bce6a4..5d230d3 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -33,7 +33,7 @@ //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element use crate::token::Token; -use crate::tokenizer::{Code, ContentType, EventType, State, Tokenizer}; +use crate::tokenizer::{ContentType, EventType, State, Tokenizer}; use crate::util::skip::opt as skip_opt; /// Before a paragraph. @@ -44,7 +44,7 @@ use crate::util::skip::opt as skip_opt; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { unreachable!("unexpected eol/eof") } _ => { @@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::Paragraph); tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve)); diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 4216276..0b66b09 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -7,7 +7,7 @@ //! 
[text]: crate::content::text use crate::token::Token; -use crate::tokenizer::{Code, EventType, State, Tokenizer}; +use crate::tokenizer::{EventType, State, Tokenizer}; /// At the beginning of data. /// @@ -15,13 +15,14 @@ use crate::tokenizer::{Code, EventType, State, Tokenizer}; /// > | abc /// ^ /// ``` -pub fn start(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { - if stop.contains(&tokenizer.current) { - tokenizer.enter(Token::Data); - tokenizer.consume(); - State::Fn(Box::new(move |t| data(t, stop))) - } else { - at_break(tokenizer, stop) +pub fn start(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { + match tokenizer.current { + Some(char) if stop.contains(&char) => { + tokenizer.enter(Token::Data); + tokenizer.consume(); + State::Fn(Box::new(move |t| data(t, stop))) + } + _ => at_break(tokenizer, stop), } } @@ -31,16 +32,16 @@ pub fn start(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { /// > | abc /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { +fn at_break(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { match tokenizer.current { - Code::None => State::Ok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None => State::Ok, + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); State::Fn(Box::new(move |t| at_break(t, stop))) } - _ if stop.contains(&tokenizer.current) => { + Some(char) if stop.contains(&char) => { tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data)); State::Ok } @@ -57,10 +58,10 @@ fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { /// > | abc /// ^^^ /// ``` -fn data(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { +fn data(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { let done = match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => true, - _ if stop.contains(&tokenizer.current) => true, + None | Some('\n') => true, + Some(char) if stop.contains(&char) => true, _ => false, }; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 6a984e2..6447228 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -72,7 +72,7 @@ //! [sanitize_uri]: crate::util::sanitize_uri use crate::token::Token; -use crate::tokenizer::{Code, ContentType, State, Tokenizer}; +use crate::tokenizer::{ContentType, State, Tokenizer}; /// Configuration. 
/// @@ -117,7 +117,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { }; match tokenizer.current { - Code::Char('<') => { + Some('<') => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.literal.clone()); tokenizer.enter(info.options.marker.clone()); @@ -125,11 +125,9 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { tokenizer.exit(info.options.marker.clone()); State::Fn(Box::new(|t| enclosed_before(t, info))) } - Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ' | ')') => { - State::Nok - } - Code::Char(char) if char.is_ascii_control() => State::Nok, - Code::Char(_) => { + None | Some(' ' | ')') => State::Nok, + Some(char) if char.is_ascii_control() => State::Nok, + Some(_) => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.raw.clone()); tokenizer.enter(info.options.string.clone()); @@ -146,7 +144,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ^ /// ``` fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { - if let Code::Char('>') = tokenizer.current { + if let Some('>') = tokenizer.current { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -168,13 +166,13 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); enclosed_before(tokenizer, info) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '<') => State::Nok, - Code::Char('\\') => { + None | Some('\n' | '<') => State::Nok, + Some('\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed_escape(t, info))) } @@ -193,7 +191,7 @@ fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('<' | '>' | '\\') => { + Some('<' | '>' | '\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed(t, info))) } @@ -209,7 +207,7 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('(') => { + Some('(') => { if info.balance >= info.options.limit { State::Nok } else { @@ -218,7 +216,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Fn(Box::new(move |t| raw(t, info))) } } - Code::Char(')') => { + Some(')') => { if info.balance == 0 { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); @@ -231,10 +229,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Fn(Box::new(move |t| raw(t, info))) } } - Code::None - | Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ') => { + None | Some('\t' | '\n' | ' ') => { if info.balance > 0 { State::Nok } else { @@ -245,12 +240,12 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Ok } } - Code::Char(char) if char.is_ascii_control() => State::Nok, - Code::Char('\\') => { + Some(char) if char.is_ascii_control() => State::Nok, + Some('\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw_escape(t, info))) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); State::Fn(Box::new(move |t| raw(t, info))) } @@ -265,7 +260,7 @@ fn raw(tokenizer: &mut 
Tokenizer, mut info: Info) -> State { /// ``` fn raw_escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('(' | ')' | '\\') => { + Some('(' | ')' | '\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw(t, info))) } diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 91a0e26..ee31533 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -62,7 +62,7 @@ use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; use crate::constant::LINK_REFERENCE_SIZE_MAX; use crate::subtokenize::link; use crate::token::Token; -use crate::tokenizer::{Code, ContentType, State, Tokenizer}; +use crate::tokenizer::{ContentType, State, Tokenizer}; /// Configuration. /// @@ -98,7 +98,7 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Code::Char('[') => { + Some('[') => { let info = Info { connect: false, data: false, @@ -124,10 +124,10 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::None | Code::Char('[') => State::Nok, - Code::Char(']') if !info.data => State::Nok, + None | Some('[') => State::Nok, + Some(']') if !info.data => State::Nok, _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok, - Code::Char(']') => { + Some(']') => { tokenizer.exit(info.options.string.clone()); tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); @@ -135,7 +135,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(info.options.label); State::Ok } - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go( + Some('\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -168,7 +168,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '[' | ']') => { + None | Some('\n' | '[' | ']') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } @@ -176,12 +176,12 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| label(t, info))) } - Code::Char('\\') => { + Some('\\') => { tokenizer.consume(); info.size += 1; if !info.data { @@ -189,7 +189,7 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { } State::Fn(Box::new(|t| escape(t, info))) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); info.size += 1; if !info.data { @@ -208,7 +208,7 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('[' | '\\' | ']') => { + Some('[' | '\\' | ']') => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| label(t, info))) diff --git a/src/construct/partial_non_lazy_continuation.rs b/src/construct/partial_non_lazy_continuation.rs index bdc22e4..068e30f 100644 --- a/src/construct/partial_non_lazy_continuation.rs +++ b/src/construct/partial_non_lazy_continuation.rs @@ -11,7 +11,7 @@ //! 
[html_flow]: crate::construct::html_flow
 
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
 
 /// Start of continuation.
 ///
@@ -22,7 +22,7 @@ use crate::tokenizer::{State, Tokenizer};
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        Some('\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs
index 5f1a917..6070ffe 100644
--- a/src/construct/partial_space_or_tab.rs
+++ b/src/construct/partial_space_or_tab.rs
@@ -6,7 +6,7 @@
 
 use crate::subtokenize::link;
 use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, StateFn, Tokenizer};
+use crate::tokenizer::{ContentType, State, StateFn, Tokenizer};
 
 /// Options to parse `space_or_tab`.
 #[derive(Debug)]
@@ -134,7 +134,7 @@ pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> {
 /// ```
 fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::VirtualSpace | Code::Char('\t' | ' ') if info.options.max > 0 => {
+        Some('\t' | ' ') if info.options.max > 0 => {
             tokenizer
                 .enter_with_content(info.options.kind.clone(), info.options.content_type.clone());
 
@@ -165,7 +165,7 @@ fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::VirtualSpace | Code::Char('\t' | ' ') if info.size < info.options.max => {
+        Some('\t' | ' ') if info.size < info.options.max => {
             tokenizer.consume();
             info.size += 1;
             State::Fn(Box::new(|t| inside(t, info)))
@@ -190,7 +190,7 @@ fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn after_space_or_tab(tokenizer: &mut Tokenizer, mut info: EolInfo) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        Some('\n') => {
             tokenizer.enter_with_content(Token::LineEnding, info.options.content_type.clone());
 
             if info.connect {
@@ -239,10 +239,7 @@ fn after_eol(tokenizer: &mut Tokenizer, info: EolInfo) -> State {
 /// ```
 fn after_more_space_or_tab(tokenizer: &mut Tokenizer) -> State {
     // Blank line not allowed.
-    if matches!(
-        tokenizer.current,
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
-    ) {
+    if matches!(tokenizer.current, None | Some('\n')) {
         State::Nok
     } else {
         State::Ok
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index e9528fd..15fc25e 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -33,7 +33,7 @@
 use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions};
 use crate::subtokenize::link;
 use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, Tokenizer};
+use crate::tokenizer::{ContentType, State, Tokenizer};
 
 /// Configuration.
 ///
@@ -103,19 +103,6 @@ impl Kind {
             _ => unreachable!("invalid char"),
         }
     }
-    /// Turn [Code] into a kind.
-    ///
-    /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `code` is not `Code::Char('(' | '"' | '\'')`.
-    fn from_code(code: Code) -> Kind {
-        match code {
-            Code::Char(char) => Kind::from_char(char),
-            _ => unreachable!("invalid code"),
-        }
-    }
 }
 
 /// State needed to parse titles.
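Throughout these hunks, arms matching `Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')` collapse into a single `Some('\n')` arm, which is only sound if every CRLF, CR, and LF reaches the state machine as one `\n`. A standalone sketch of such a normalization pass (an assumed illustration, not the crate's actual feeding code):

```rust
// Illustration: fold CRLF pairs and lone CRs into `\n`, so states only
// ever see one kind of line ending.
fn normalize_eols(value: &str) -> Vec<char> {
    let mut chars = Vec::new();
    let mut iter = value.chars().peekable();
    while let Some(char) = iter.next() {
        if char == '\r' {
            // A CRLF pair collapses into a single `\n`.
            if iter.peek() == Some(&'\n') {
                iter.next();
            }
            chars.push('\n');
        } else {
            chars.push(char);
        }
    }
    chars
}

fn main() {
    assert_eq!(
        normalize_eols("a\r\nb\rc\nd"),
        vec!['a', '\n', 'b', '\n', 'c', '\n', 'd']
    );
}
```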
@@ -137,10 +124,10 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Code::Char('"' | '\'' | '(') => { + Some(char) if matches!(char, '"' | '\'' | '(') => { let info = Info { connect: false, - kind: Kind::from_code(tokenizer.current), + kind: Kind::from_char(char), options, }; tokenizer.enter(info.options.title.clone()); @@ -163,7 +150,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -185,12 +172,12 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.exit(info.options.string.clone()); begin(tokenizer, info) } - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go( + None => State::Nok, + Some('\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -223,15 +210,15 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn title(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Code::Char('\\') => { + Some('\\') => { tokenizer.consume(); State::Fn(Box::new(|t| escape(t, info))) } @@ -250,7 +237,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.consume(); State::Fn(Box::new(|t| title(t, info))) } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 4c94c7d..152824b 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -47,8 +47,8 @@ use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN; use crate::token::Token; -use crate::tokenizer::{Code, Event, EventType, Tokenizer}; -use crate::util::span; +use crate::tokenizer::{Event, EventType, Tokenizer}; +use crate::util::slice::{Position, Slice}; /// To do. 
pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) { @@ -85,30 +85,26 @@ fn trim_data( trim_end: bool, hard_break: bool, ) { - let mut codes = span::codes( - &tokenizer.parse_state.codes, - &span::from_exit_event(&tokenizer.events, exit_index), + let mut slice = Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, exit_index), ); if trim_end { - let mut index = codes.len(); - let mut vs = 0; - let mut spaces_only = true; + let mut index = slice.chars.len(); + let vs = slice.after; + let mut spaces_only = vs == 0; while index > 0 { - match codes[index - 1] { - Code::Char(' ') => {} - Code::Char('\t') => spaces_only = false, - Code::VirtualSpace => { - vs += 1; - spaces_only = false; - } + match slice.chars[index - 1] { + ' ' => {} + '\t' => spaces_only = false, _ => break, } index -= 1; } - let diff = codes.len() - index; + let diff = slice.chars.len() - index; let token_type = if spaces_only && hard_break && exit_index + 1 < tokenizer.events.len() @@ -127,12 +123,12 @@ fn trim_data( return; } - if diff > 0 { + if diff > 0 || vs > 0 { let exit_point = tokenizer.events[exit_index].point.clone(); let mut enter_point = exit_point.clone(); enter_point.index -= diff; - enter_point.column -= diff - vs; - enter_point.offset -= diff - vs; + enter_point.column -= diff; + enter_point.vs = 0; tokenizer.map.add( exit_index + 1, @@ -154,17 +150,16 @@ fn trim_data( ); tokenizer.events[exit_index].point = enter_point; - codes = &codes[..index]; + slice.chars = &slice.chars[..index]; } } if trim_start { let mut index = 0; - let mut vs = 0; - while index < codes.len() { - match codes[index] { - Code::Char(' ' | '\t') => {} - Code::VirtualSpace => vs += 1, + let vs = slice.before; + while index < slice.chars.len() { + match slice.chars[index] { + ' ' | '\t' => {} _ => break, } @@ -173,18 +168,18 @@ fn trim_data( // The whole data is whitespace. // We can be very fast: we only change the token types. - if index == codes.len() { + if index == slice.chars.len() { tokenizer.events[exit_index - 1].token_type = Token::SpaceOrTab; tokenizer.events[exit_index].token_type = Token::SpaceOrTab; return; } - if index > 0 { + if index > 0 || vs > 0 { let enter_point = tokenizer.events[exit_index - 1].point.clone(); let mut exit_point = enter_point.clone(); exit_point.index += index; - exit_point.column += index - vs; - exit_point.offset += index - vs; + exit_point.column += index; + exit_point.vs = 0; tokenizer.map.add( exit_index - 1, diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 41dc6ae..bed454b 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -51,7 +51,7 @@ use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Type of thematic break. #[derive(Debug, PartialEq)] @@ -104,19 +104,6 @@ impl Kind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind. - /// - /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. - /// - /// ## Panics - /// - /// Panics if `code` is not `Code::Char('*' | '-' | '_')`. - fn from_code(code: Code) -> Kind { - match code { - Code::Char(char) => Kind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// State needed to parse thematic breaks. 
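The trimming above now reads virtual spaces off `slice.before`/`slice.after` instead of scanning for `Code::VirtualSpace` items. A simplified standalone sketch of the trailing scan (names hypothetical, logic mirroring `trim_data`):

```rust
/// Count trailing `' '`/`'\t'` chars, tracking whether only spaces were seen.
/// `after_vs` plays the role of `slice.after`: virtual spaces from a trailing
/// partially covered tab, which also disqualify a hard break.
fn trailing_whitespace(chars: &[char], after_vs: usize) -> (usize, bool) {
    let mut index = chars.len();
    let mut spaces_only = after_vs == 0;

    while index > 0 {
        match chars[index - 1] {
            ' ' => {}
            '\t' => spaces_only = false,
            _ => break,
        }
        index -= 1;
    }

    (chars.len() - index, spaces_only)
}

fn main() {
    assert_eq!(trailing_whitespace(&['a', ' ', ' '], 0), (2, true));
    assert_eq!(trailing_whitespace(&['a', '\t'], 0), (1, false));
    assert_eq!(trailing_whitespace(&['a'], 2), (0, false));
}
```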
@@ -157,10 +144,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('*' | '-' | '_') => at_break( + Some(char) if matches!(char, '*' | '-' | '_') => at_break( tokenizer, Info { - kind: Kind::from_code(tokenizer.current), + kind: Kind::from_char(char), size: 0, }, ), @@ -176,15 +163,13 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => - { + None | Some('\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(Token::ThematicBreak); // Feel free to interrupt. tokenizer.interrupt = false; State::Ok } - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.enter(Token::ThematicBreakSequence); sequence(tokenizer, info) } @@ -200,7 +185,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| sequence(t, info))) diff --git a/src/content/document.rs b/src/content/document.rs index 32b32ba..2924f6c 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -17,12 +17,12 @@ use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::token::Token; use crate::tokenizer::{ - Code, Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, + Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, }; use crate::util::{ normalize_identifier::normalize_identifier, skip, - span::{from_exit_event, serialize}, + slice::{Position, Slice}, }; /// Phases where we can exit containers. @@ -78,7 +78,7 @@ struct DocumentInfo { pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let mut tokenizer = Tokenizer::new(point, parse_state); - let state = tokenizer.push(0, parse_state.codes.len(), Box::new(start)); + let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before)); tokenizer.flush(state, true); let mut index = 0; @@ -88,13 +88,14 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let event = &tokenizer.events[index]; if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString { + // To do: when we operate on u8, we can use a `to_str` here as we + // don’t need virtual spaces. let id = normalize_identifier( - serialize( - &parse_state.codes, - &from_exit_event(&tokenizer.events, index), - false, + &Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, index), ) - .as_str(), + .serialize(), ); if !definitions.contains(&id) { @@ -114,6 +115,26 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { events } +/// At the beginning. +/// +/// Perhaps a BOM? +/// +/// ```markdown +/// > | a +/// ^ +/// ``` +fn before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some('\u{FEFF}') => { + tokenizer.enter(Token::ByteOrderMark); + tokenizer.consume(); + tokenizer.exit(Token::ByteOrderMark); + State::Fn(Box::new(start)) + } + _ => start(tokenizer), + } +} + /// Before document. 
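The new `before` state above only needs to peek at the very first character. The same check in isolation, as a standalone sketch (not the tokenizer API):

```rust
/// A UTF-8 BOM decodes to a single U+FEFF as the first char; the document
/// content starts after it.
fn split_bom(chars: &[char]) -> (bool, &[char]) {
    match chars.first() {
        Some('\u{FEFF}') => (true, &chars[1..]),
        _ => (false, chars),
    }
}

fn main() {
    let chars: Vec<char> = "\u{FEFF}# hi".chars().collect();
    let (bom, rest) = split_bom(&chars);
    assert!(bom);
    assert_eq!(rest.first(), Some(&'#'));
}
```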
// /// ```markdown @@ -337,7 +358,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State // Parse flow, pausing after eols. tokenizer.go_until( state, - |code| matches!(code, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')), + |code| matches!(code, Some('\n')), move |state| Box::new(move |t| flow_end(t, info, state)), )(tokenizer) } diff --git a/src/content/flow.rs b/src/content/flow.rs index ea09cd9..09c4e2c 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -27,7 +27,7 @@ use crate::construct::{ thematic_break::start as thematic_break, }; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Before flow. /// @@ -41,7 +41,7 @@ use crate::tokenizer::{Code, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt(blank_line, |ok| { Box::new(if ok { blank_line_after } else { initial_before }) })(tokenizer), @@ -62,7 +62,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn initial_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt_n( vec![ Box::new(code_indented), @@ -87,8 +87,8 @@ fn initial_before(tokenizer: &mut Tokenizer) -> State { /// ``` fn blank_line_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None => State::Ok, + Some('\n') => { tokenizer.enter(Token::BlankLineEnding); tokenizer.consume(); tokenizer.exit(Token::BlankLineEnding); @@ -111,8 +111,8 @@ fn blank_line_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None => State::Ok, + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); diff --git a/src/content/string.rs b/src/content/string.rs index c6c0094..8bc2b91 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -16,9 +16,9 @@ use crate::construct::{ character_escape::start as character_escape, character_reference::start as character_reference, partial_data::start as data, partial_whitespace::create_resolve_whitespace, }; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; -const MARKERS: [Code; 2] = [Code::Char('&'), Code::Char('\\')]; +const MARKERS: [char; 2] = ['&', '\\']; /// Start of string. pub fn start(tokenizer: &mut Tokenizer) -> State { @@ -32,7 +32,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// Before string. 
fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt_n( vec![Box::new(character_reference), Box::new(character_escape)], |ok| Box::new(if ok { before } else { before_data }), diff --git a/src/content/text.rs b/src/content/text.rs index 4248053..ebdf888 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -28,18 +28,18 @@ use crate::construct::{ label_start_image::start as label_start_image, label_start_link::start as label_start_link, partial_data::start as data, partial_whitespace::create_resolve_whitespace, }; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; -const MARKERS: [Code; 9] = [ - Code::Char('!'), // `label_start_image` - Code::Char('&'), // `character_reference` - Code::Char('*'), // `attention` - Code::Char('<'), // `autolink`, `html_text` - Code::Char('['), // `label_start_link` - Code::Char('\\'), // `character_escape`, `hard_break_escape` - Code::Char(']'), // `label_end` - Code::Char('_'), // `attention` - Code::Char('`'), // `code_text` +const MARKERS: [char; 9] = [ + '!', // `label_start_image` + '&', // `character_reference` + '*', // `attention` + '<', // `autolink`, `html_text` + '[', // `label_start_link` + '\\', // `character_escape`, `hard_break_escape` + ']', // `label_end` + '_', // `attention` + '`', // `code_text` ]; /// Start of text. @@ -57,7 +57,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// Before text. pub fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Ok, + None => State::Ok, _ => tokenizer.attempt_n( vec![ Box::new(attention), diff --git a/src/lib.rs b/src/lib.rs index 4dc15e6..c1b0fa0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,7 +17,6 @@ mod util; use crate::compiler::compile; use crate::parser::parse; -use crate::tokenizer::Code; /// Type of line endings in markdown. #[derive(Debug, Default, Clone, PartialEq)] @@ -61,16 +60,16 @@ impl LineEnding { LineEnding::LineFeed => "\n", } } - /// Turn a [Code] into a line ending. + /// Turn a string into a line ending. /// /// ## Panics /// /// Panics if `code` is not `\r\n`, `\r`, or `\n`. - fn from_code(code: Code) -> LineEnding { - match code { - Code::CarriageReturnLineFeed => LineEnding::CarriageReturnLineFeed, - Code::Char('\r') => LineEnding::CarriageReturn, - Code::Char('\n') => LineEnding::LineFeed, + fn from_str(str: &str) -> LineEnding { + match str { + "\r\n" => LineEnding::CarriageReturnLineFeed, + "\r" => LineEnding::CarriageReturn, + "\n" => LineEnding::LineFeed, _ => unreachable!("invalid code"), } } @@ -425,5 +424,5 @@ pub fn micromark(value: &str) -> String { #[must_use] pub fn micromark_with_options(value: &str, options: &Options) -> String { let (events, result) = parse(value, options); - compile(&events, &result.codes, options) + compile(&events, &result.chars, options) } diff --git a/src/parser.rs b/src/parser.rs index 0f71daf..cc9c256 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,19 +1,18 @@ //! Turn a string of markdown into events. use crate::content::document::document; -use crate::tokenizer::{Code, Event, Point}; -use crate::util::codes::parse as parse_codes; +use crate::tokenizer::{Event, Point}; use crate::{Constructs, Options}; /// Information needed, in all content types, when parsing markdown. /// /// Importantly, this contains a set of known definitions. -/// It also references the input value as [`Code`][]s. +/// It also references the input value as a `Vec<char>`. 
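The `from_str` mapping in `src/lib.rs` above can be exercised in isolation; a standalone sketch mirroring the new code:

```rust
#[derive(Debug, PartialEq)]
enum LineEnding {
    CarriageReturnLineFeed,
    CarriageReturn,
    LineFeed,
}

/// Map a serialized line-ending slice to its kind; anything else is a caller bug.
fn from_str(str: &str) -> LineEnding {
    match str {
        "\r\n" => LineEnding::CarriageReturnLineFeed,
        "\r" => LineEnding::CarriageReturn,
        "\n" => LineEnding::LineFeed,
        _ => unreachable!("invalid str"),
    }
}

fn main() {
    assert_eq!(from_str("\r\n"), LineEnding::CarriageReturnLineFeed);
    assert_eq!(from_str("\n"), LineEnding::LineFeed);
}
```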
#[derive(Debug)] pub struct ParseState<'a> { pub constructs: &'a Constructs, - /// List of codes. - pub codes: Vec<Code>, + /// List of chars. + pub chars: Vec<char>, /// Set of defined identifiers. pub definitions: Vec<String>, } @@ -24,7 +23,8 @@ pub struct ParseState<'a> { pub fn parse<'a>(value: &str, options: &'a Options) -> (Vec<Event>, ParseState<'a>) { let mut parse_state = ParseState { constructs: &options.constructs, - codes: parse_codes(value), + // To do: change to `u8`s? + chars: value.chars().collect::<_>(), definitions: vec![], }; @@ -33,8 +33,8 @@ pub fn parse<'a>(value: &str, options: &'a Options) -> (Vec<Event>, ParseState<' Point { line: 1, column: 1, - offset: 0, index: 0, + vs: 0, }, ); diff --git a/src/token.rs b/src/token.rs index a0479e1..db3bffc 100644 --- a/src/token.rs +++ b/src/token.rs @@ -157,6 +157,17 @@ pub enum Token { /// | b /// ``` BlockQuotePrefix, + /// Byte order mark. + /// + /// ## Info + /// + /// * **Context**: + /// optional first event + /// * **Content model**: + /// void + /// * **Construct**: + /// [`document`][crate::content::document] + ByteOrderMark, /// Whole character escape. /// /// ## Info @@ -1822,13 +1833,14 @@ pub enum Token { } /// List of void tokens, used to make sure everything is working good. -pub const VOID_TOKENS: [Token; 39] = [ +pub const VOID_TOKENS: [Token; 40] = [ Token::AttentionSequence, Token::AutolinkEmail, Token::AutolinkMarker, Token::AutolinkProtocol, Token::BlankLineEnding, Token::BlockQuoteMarker, + Token::ByteOrderMark, Token::CharacterEscapeMarker, Token::CharacterEscapeValue, Token::CharacterReferenceMarker, diff --git a/src/tokenizer.rs b/src/tokenizer.rs index ba18956..ec70a2b 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -11,6 +11,7 @@ //! [`attempt`]: Tokenizer::attempt //! [`check`]: Tokenizer::check +use crate::constant::TAB_SIZE; use crate::parser::ParseState; use crate::token::{Token, VOID_TOKENS}; use crate::util::edit_map::EditMap; @@ -24,20 +25,11 @@ pub enum ContentType { String, } -/// Enum representing a character code. -#[derive(Debug, Clone, Copy, PartialEq)] -pub enum Code { - /// End of the input stream (called eof). - None, - /// Used to make parsing line endings easier as it represents both - /// `Code::Char('\r')` and `Code::Char('\n')` combined. - CarriageReturnLineFeed, - /// the expansion of a tab (`Code::Char('\t')`), depending on where the tab - /// ocurred, it’s followed by 0 to 3 (both inclusive) `Code::VirtualSpace`s. - VirtualSpace, - /// The most frequent variant of this enum is `Code::Char(char)`, which just - /// represents a char, but micromark adds meaning to certain other values. - Char(char), +#[derive(Debug, PartialEq)] +pub enum CharAction { + Normal(char), + Insert(char), + Ignore, } /// A location in the document (`line`/`column`/`offset`). @@ -54,9 +46,12 @@ pub struct Point { /// the same as editors. pub column: usize, /// 0-indexed position in the document. - pub offset: usize, - /// Index into `codes`. + /// + /// Also an `index` into `chars`. + // To do: call it `offset`? pub index: usize, + /// To do. + pub vs: usize, } /// Possible event types. @@ -86,7 +81,7 @@ pub struct Event } /// The essence of the state machine are functions: `StateFn`. -/// It’s responsible for dealing with that single passed [`Code`][]. +/// It’s responsible for dealing with the current char. /// It yields a [`State`][]. pub type StateFn = dyn FnOnce(&mut Tokenizer) -> State; @@ -162,9 +157,9 @@ struct InternalState { /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt. 
stack_len: usize, /// Previous code. - previous: Code, + previous: Option<char>, /// Current code. - current: Code, + current: Option<char>, /// Current relative and absolute position in the file. point: Point, } @@ -173,9 +168,11 @@ struct InternalState { #[allow(clippy::struct_excessive_bools)] pub struct Tokenizer<'a> { /// Jump between line endings. - column_start: Vec<usize>, + column_start: Vec<(usize, usize)>, // First line. - line_start: usize, + first_line: usize, + /// To do. + line_start: Point, /// Track whether a character is expected to be consumed, and whether it’s /// actually consumed /// @@ -184,9 +181,9 @@ pub struct Tokenizer<'a> { /// Track whether this tokenizer is done. resolved: bool, /// Current character code. - pub current: Code, + pub current: Option<char>, /// Previous character code. - pub previous: Code, + pub previous: Option<char>, /// Current relative and absolute place in the file. pub point: Point, /// Semantic labels of one or more codes in `codes`. @@ -237,11 +234,12 @@ impl<'a> Tokenizer<'a> { /// Create a new tokenizer. pub fn new(point: Point, parse_state: &'a ParseState) -> Tokenizer<'a> { Tokenizer { - previous: Code::None, - current: Code::None, + previous: None, + current: None, // To do: reserve size when feeding? column_start: vec![], - line_start: point.line, + first_line: point.line, + line_start: point.clone(), consumed: true, resolved: false, point, @@ -280,18 +278,18 @@ impl<'a> Tokenizer<'a> { /// Define a jump between two places. pub fn define_skip(&mut self, point: &Point) { - define_skip_impl(self, point.line, point.index); + define_skip_impl(self, point.line, (point.index, point.vs)); } /// Define the current place as a jump between two places. pub fn define_skip_current(&mut self) { - define_skip_impl(self, self.point.line, self.point.index); + define_skip_impl(self, self.point.line, (self.point.index, self.point.vs)); } /// Increment the current positional info if we’re right after a line /// ending, which has a skip defined. fn account_for_potential_skip(&mut self) { - let at = self.point.line - self.line_start; + let at = self.point.line - self.first_line; if self.point.column == 1 && at != self.column_start.len() { self.move_to(self.column_start[at]); } } /// Prepare for a next code to get consumed. - pub fn expect(&mut self, code: Code) { + pub fn expect(&mut self, char: Option<char>) { assert!(self.consumed, "expected previous character to be consumed"); self.consumed = false; - self.current = code; + self.current = char; } /// Consume the current character. @@ -311,46 +309,60 @@ impl<'a> Tokenizer<'a> { pub fn consume(&mut self) { log::debug!("consume: `{:?}` ({:?})", self.current, self.point); assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned"); - self.move_to(self.point.index + 1); + + self.move_one(); + + self.previous = self.current; + // While we’re not at the eof, it is at least better to not have the + // same code as both `previous` *and* `current`. + self.current = None; // Mark as consumed. self.consumed = true; } - /// To do. - pub fn move_to(&mut self, to: usize) { - while self.point.index < to { - let code = &self.parse_state.codes[self.point.index]; - self.point.index += 1; + /// Move to the next (virtual) character. 
+ pub fn move_one(&mut self) { + match char_action(&self.parse_state.chars, &self.point) { + CharAction::Ignore => { + self.point.index += 1; + } + CharAction::Insert(char) => { + self.previous = Some(char); + self.point.column += 1; + self.point.vs += 1; + } + CharAction::Normal(char) => { + self.previous = Some(char); + self.point.vs = 0; + self.point.index += 1; - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + if char == '\n' { self.point.line += 1; self.point.column = 1; - self.point.offset += if *code == Code::CarriageReturnLineFeed { - 2 - } else { - 1 - }; - if self.point.line - self.line_start + 1 > self.column_start.len() { - self.column_start.push(self.point.index); + if self.point.line - self.first_line + 1 > self.column_start.len() { + self.column_start.push((self.point.index, self.point.vs)); } + self.line_start = self.point.clone(); + + self.account_for_potential_skip(); log::debug!("position: after eol: `{:?}`", self.point); - } - Code::VirtualSpace => { - // Empty. - } - _ => { + } else { self.point.column += 1; - self.point.offset += 1; } } } } + /// Move (virtual) characters. + pub fn move_to(&mut self, to: (usize, usize)) { + let (to_index, to_vs) = to; + while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs { + self.move_one(); + } + } + /// Mark the start of a semantic label. pub fn enter(&mut self, token_type: Token) { self.enter_with_link(token_type, None); } @@ -368,11 +380,23 @@ impl<'a> Tokenizer<'a> { } pub fn enter_with_link(&mut self, token_type: Token, link: Option<Link>) { - log::debug!("enter: `{:?}` ({:?})", token_type, self.point); + let mut point = self.point.clone(); + + // Move back past ignored chars. + while point.index > 0 { + point.index -= 1; + let action = char_action(&self.parse_state.chars, &point); + if !matches!(action, CharAction::Ignore) { + point.index += 1; + break; + } + } + + log::debug!("enter: `{:?}` ({:?})", token_type, point); self.events.push(Event { event_type: EventType::Enter, token_type: token_type.clone(), - point: self.point.clone(), + point, link, }); self.stack.push(token_type); @@ -391,7 +415,9 @@ impl<'a> Tokenizer<'a> { let mut point = self.point.clone(); assert!( - current_token != previous.token_type || previous.point.index != point.index, + current_token != previous.token_type + || previous.point.index != point.index + || previous.point.vs != point.vs, "expected non-empty token" ); @@ -406,18 +432,18 @@ impl<'a> Tokenizer<'a> { // A bit weird, but if we exit right after a line ending, we *don’t* want to consider // potential skips. - if matches!( - self.previous, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - ) { - point.column = 1; - point.offset = previous.point.offset - + if self.previous == Code::CarriageReturnLineFeed { - 2 - } else { - 1 - }; - point.index = previous.point.index + 1; + if matches!(self.previous, Some('\n')) { + point = self.line_start.clone(); + } else { + // Move back past ignored chars. 
+ while point.index > 0 { + point.index -= 1; + let action = char_action(&self.parse_state.chars, &point); + if !matches!(action, CharAction::Ignore) { + point.index += 1; + break; + } + } } log::debug!("exit: `{:?}` ({:?})", token_type, point); @@ -494,7 +520,7 @@ impl<'a> Tokenizer<'a> { pub fn go_until( &mut self, state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static, - until: impl Fn(Code) -> bool + 'static, + until: impl Fn(Option<char>) -> bool + 'static, done: impl FnOnce(State) -> Box<StateFn> + 'static, ) -> Box<StateFn> { attempt_impl( @@ -619,19 +645,32 @@ impl<'a> Tokenizer<'a> { assert!(!self.resolved, "cannot feed after drain"); assert!(min >= self.point.index, "cannot move backwards"); - self.move_to(min); + // To do: accept `vs`? + self.move_to((min, 0)); let mut state = State::Fn(Box::new(start)); while self.point.index < max { match state { State::Ok | State::Nok => break, - State::Fn(func) => { - let code = self.parse_state.codes[self.point.index]; - log::debug!("main: passing: `{:?}` ({:?})", code, self.point); - self.expect(code); - state = func(self); - } + State::Fn(func) => match char_action(&self.parse_state.chars, &self.point) { + CharAction::Ignore => { + state = State::Fn(Box::new(func)); + self.move_one(); + } + CharAction::Insert(char) => { + log::debug!("main: passing (fake): `{:?}` ({:?})", char, self.point); + self.expect(Some(char)); + state = func(self); + // self.point.column += 1; + // self.point.vs += 1; + } + CharAction::Normal(char) => { + log::debug!("main: passing: `{:?}` ({:?})", char, self.point); + self.expect(Some(char)); + state = func(self); + } + }, } } @@ -648,15 +687,35 @@ impl<'a> Tokenizer<'a> { match state { State::Ok | State::Nok => break, State::Fn(func) => { + // To do: clean this? // We sometimes move back when flushing, so then we use those codes. - let code = if self.point.index < max { - self.parse_state.codes[self.point.index] + if self.point.index == max { + let char = None; + log::debug!("main: flushing eof: `{:?}` ({:?})", char, self.point); + self.expect(char); + state = func(self); } else { - Code::None + match char_action(&self.parse_state.chars, &self.point) { + CharAction::Ignore => { + state = State::Fn(Box::new(func)); + self.move_one(); + } + CharAction::Insert(char) => { + log::debug!( + "main: flushing (fake): `{:?}` ({:?})", + char, + self.point + ); + self.expect(Some(char)); + state = func(self); + } + CharAction::Normal(char) => { + log::debug!("main: flushing: `{:?}` ({:?})", char, self.point); + self.expect(Some(char)); + state = func(self); + } + } }; - log::debug!("main: flushing {:?}", code); - self.expect(code); - state = func(self); } } } @@ -676,13 +735,58 @@ impl<'a> Tokenizer<'a> { } } +fn char_action(chars: &[char], point: &Point) -> CharAction { + if point.index < chars.len() { + let char = chars[point.index]; + + if char == '\0' { + CharAction::Normal(char::REPLACEMENT_CHARACTER) + } else if char == '\r' { + // CRLF. + if point.index < chars.len() - 1 && chars[point.index + 1] == '\n' { + CharAction::Ignore + } + // CR. + else { + CharAction::Normal('\n') + } + } else if char == '\t' { + let remainder = point.column % TAB_SIZE; + let vs = if remainder == 0 { + 0 + } else { + TAB_SIZE - remainder + }; + + // On the tab itself, first send it. + if point.vs == 0 { + if vs == 0 { + CharAction::Normal(char) + } else { + CharAction::Insert(char) + } + } else if vs == 0 { + CharAction::Normal(' ') + } else { + CharAction::Insert(' ') + } + } + // VS? 
+ else { + CharAction::Normal(char) + } + } else { + unreachable!("out of bounds") + } +} + /// Internal utility to wrap states to also capture codes. /// /// Recurses into itself. /// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check]. fn attempt_impl( state: impl FnOnce(&mut Tokenizer) -> State + 'static, - pause: Option<Box<dyn Fn(Code) -> bool + 'static>>, + pause: Option<Box<dyn Fn(Option<char>) -> bool + 'static>>, start: usize, done: impl FnOnce(&mut Tokenizer, State) -> State + 'static, ) -> Box<StateFn> { @@ -706,14 +810,14 @@ fn attempt_impl( /// Define a jump between two places. /// /// This defines to which future index we move after a line ending. -fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, index: usize) { - log::debug!("position: define skip: {:?} -> ({:?})", line, index); - let at = line - tokenizer.line_start; +fn define_skip_impl(tokenizer: &mut Tokenizer, line: usize, info: (usize, usize)) { + log::debug!("position: define skip: {:?} -> ({:?})", line, info); + let at = line - tokenizer.first_line; - if at == tokenizer.column_start.len() { - tokenizer.column_start.push(index); + if at >= tokenizer.column_start.len() { + tokenizer.column_start.push(info); } else { - tokenizer.column_start[at] = index; + tokenizer.column_start[at] = info; } tokenizer.account_for_potential_skip(); diff --git a/src/util/codes.rs b/src/util/codes.rs deleted file mode 100644 index 5006a00..0000000 --- a/src/util/codes.rs +++ /dev/null @@ -1,125 +0,0 @@ -//! Utilities to deal with character codes. - -use crate::constant::TAB_SIZE; -use crate::tokenizer::Code; - -/// Turn a string into codes. -pub fn parse(value: &str) -> Vec<Code> { - // Note: It’ll grow a bit bigger with each `Code::VirtualSpace`, smaller - // with `Code::CarriageReturnLineFeed`. - let mut codes = Vec::with_capacity(value.len()); - let mut at_start = true; - let mut at_carriage_return = false; - let mut column = 1; - - for char in value.chars() { - if at_start { - at_start = false; - - if char == '\u{feff}' { - // Ignore. - continue; - } - } - - // Send a CRLF. - if at_carriage_return && '\n' == char { - at_carriage_return = false; - codes.push(Code::CarriageReturnLineFeed); - } else { - // Send the previous CR: we’re not at a next `\n`. - if at_carriage_return { - at_carriage_return = false; - codes.push(Code::Char('\r')); - } - - match char { - // Send a replacement character. - '\0' => { - column += 1; - codes.push(Code::Char(char::REPLACEMENT_CHARACTER)); - } - // Send a tab and virtual spaces. - '\t' => { - let remainder = column % TAB_SIZE; - let mut virtual_spaces = if remainder == 0 { - 0 - } else { - TAB_SIZE - remainder - }; - codes.push(Code::Char(char)); - column += 1; - while virtual_spaces > 0 { - codes.push(Code::VirtualSpace); - column += 1; - virtual_spaces -= 1; - } - } - // Send an LF. - '\n' => { - column = 1; - codes.push(Code::Char(char)); - } - // Don’t send anything yet. - '\r' => { - column = 1; - at_carriage_return = true; - } - // Send the char. - _ => { - column += 1; - codes.push(Code::Char(char)); - } - } - }; - } - - // Send the last CR: we’re not at a next `\n`. - if at_carriage_return { - codes.push(Code::Char('\r')); - } - - codes -} - -/// Serialize codes, optionally expanding tabs. 
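`char_action` now decides on the fly what `codes::parse` used to precompute: NUL becomes U+FFFD, the CR of a CRLF is ignored, a lone CR is normalized to `'\n'`, and a tab expands to itself plus inserted virtual spaces up to the next tab stop. The tab arithmetic in isolation (a sketch assuming the same 4-column `TAB_SIZE`):

```rust
const TAB_SIZE: usize = 4;

/// Number of virtual spaces a tab at 1-indexed `column` expands to,
/// i.e. the distance to the next multiple of `TAB_SIZE`.
fn tab_virtual_spaces(column: usize) -> usize {
    let remainder = column % TAB_SIZE;
    if remainder == 0 {
        0
    } else {
        TAB_SIZE - remainder
    }
}

fn main() {
    // A tab in column 1 occupies columns 1–4: the tab plus 3 inserted spaces.
    assert_eq!(tab_virtual_spaces(1), 3);
    // In column 4 it lands exactly on a stop: no virtual spaces follow.
    assert_eq!(tab_virtual_spaces(4), 0);
    assert_eq!(tab_virtual_spaces(5), 3);
}
```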
-pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { - let mut at_tab = false; - // Note: It’ll grow a bit smaller with each - // `Code::Char('\t') | Code::VirtualSpace` if `expand_tabs` is false, - // and bigger with `Code::CarriageReturnLineFeed`, - let mut value = String::with_capacity(codes.len()); - - for code in codes { - let mut at_tab_next = false; - - match code { - Code::CarriageReturnLineFeed => { - value.push_str("\r\n"); - } - Code::Char(char) if *char == '\n' || *char == '\r' => { - value.push(*char); - } - Code::Char(char) if *char == '\t' => { - at_tab_next = true; - value.push(if expand_tabs { ' ' } else { *char }); - } - Code::VirtualSpace => { - if !expand_tabs && at_tab { - continue; - } - value.push(' '); - } - Code::Char(char) => { - value.push(*char); - } - Code::None => { - unreachable!("unexpected EOF code in codes"); - } - } - - at_tab = at_tab_next; - } - - value -} diff --git a/src/util/encode.rs b/src/util/encode.rs index 965ea5c..91c5462 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -20,7 +20,8 @@ /// ## References /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) -pub fn encode<S: Into<String>>(value: S) -> String { +pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String { + let check = if encode_html { check_all } else { check_nil }; let mut value = value.into(); // It’ll grow a bit bigger for each dangerous character. @@ -31,6 +32,7 @@ let dangerous = value.pop().unwrap(); result.push_str(&value); result.push_str(match dangerous { + '\0' => "�", '&' => "&amp;", '"' => "&quot;", '<' => "&lt;", '>' => "&gt;", @@ -45,6 +47,10 @@ result } -fn check(char: char) -> bool { - matches!(char, '&' | '"' | '<' | '>') +fn check_all(char: char) -> bool { + matches!(char, '\0' | '&' | '"' | '<' | '>') +} + +fn check_nil(char: char) -> bool { + matches!(char, '\0') } diff --git a/src/util/mod.rs b/src/util/mod.rs index ae1add6..a01f31e 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,10 +1,9 @@ //! Utilities used when compiling markdown. -pub mod codes; pub mod decode_character_reference; pub mod edit_map; pub mod encode; pub mod normalize_identifier; pub mod sanitize_uri; pub mod skip; -pub mod span; +pub mod slice; diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 81450ae..8c09549 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -32,7 +32,7 @@ use crate::util::encode::encode; /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { - let value = encode(normalize_uri(value)); + let value = encode(normalize_uri(value), true); if let Some(protocols) = protocols { let end = value.find(|c| matches!(c, '?' | '#' | '/')); diff --git a/src/util/slice.rs b/src/util/slice.rs new file mode 100644 index 0000000..2134069 --- /dev/null +++ b/src/util/slice.rs @@ -0,0 +1,156 @@ +//! Utilities to deal with characters. + +use crate::constant::TAB_SIZE; +use crate::tokenizer::{Event, EventType, Point}; + +/// A range between two places. +#[derive(Debug)] +pub struct Position<'a> { + pub start: &'a Point, + pub end: &'a Point, +} + +impl<'a> Position<'a> { + /// Get a position from an exit event. + /// + /// Looks backwards for the corresponding `enter` event. + /// This does not support nested events (such as lists in lists). 
+ /// + /// ## Panics + /// + /// This function panics if an enter event is given. + /// When `micromark` is used, this function never panics. + pub fn from_exit_event(events: &'a [Event], index: usize) -> Position<'a> { + let exit = &events[index]; + assert_eq!( + exit.event_type, + EventType::Exit, + "expected `from_exit_event` to be called on `exit` event" + ); + let mut enter_index = index - 1; + + loop { + let enter = &events[enter_index]; + if enter.event_type == EventType::Enter && enter.token_type == exit.token_type { + return Position { + start: &enter.point, + end: &exit.point, + }; + } + + enter_index -= 1; + } + } +} + +/// Chars belonging to a range. +/// +/// Includes information on virtual spaces before and after the chars. +#[derive(Debug)] +pub struct Slice<'a> { + pub chars: &'a [char], + pub before: usize, + pub after: usize, +} + +impl<'a> Slice<'a> { + /// Get the slice belonging to a point. + pub fn from_point(list: &'a [char], point: &Point) -> Slice<'a> { + let mut before = point.vs; + let mut start = point.index; + let end = if start < list.len() { start + 1 } else { start }; + + // If we have virtual spaces before, it means we are past the actual + // character at that index, and those virtual spaces. + if before > 0 { + before = TAB_SIZE - before; + start += 1; + }; + + Slice { + chars: if start < end { &list[start..end] } else { &[] }, + before, + after: 0, + } + } + + /// Get the slice belonging to a position. + pub fn from_position(list: &'a [char], position: &Position) -> Slice<'a> { + let mut before = position.start.vs; + let mut after = position.end.vs; + let mut start = position.start.index; + let mut end = position.end.index; + + // If we have virtual spaces before, it means we are past the actual + // character at that index, and those virtual spaces. + if before > 0 { + before = TAB_SIZE - before; + start += 1; + }; + + // If we have virtual spaces after, it means that character is included, + // and one less virtual space. + if after > 0 { + after -= 1; + end += 1; + } + + Slice { + chars: &list[start..end], + before, + after, + } + } + + /// To do. + pub fn size(&self) -> usize { + self.chars.len() + self.before + self.after + } + + // To do: + // When we have u8s, we could use: + // <https://doc.rust-lang.org/std/str/fn.from_utf8.html> + // to implement an `as_str`. + + /// To do. + pub fn head(&self) -> Option<char> { + if self.before > 0 { + Some(' ') + } else if self.chars.is_empty() { + None + } else { + Some(self.chars[0]) + } + } + + /// To do. + pub fn tail(&self) -> Option<char> { + if self.after > 0 { + Some(' ') + } else { + let index = self.chars.len(); + if index > 0 { + Some(self.chars[index - 1]) + } else { + None + } + } + } + + /// To do. + pub fn serialize(&self) -> String { + let mut string = String::with_capacity(self.size()); + let mut index = self.before; + while index > 0 { + string.push(' '); + index -= 1; + } + string.push_str(&self.chars.iter().collect::<String>()); + index = self.after; + while index > 0 { + string.push(' '); + index -= 1; + } + + string + } +} diff --git a/src/util/span.rs b/src/util/span.rs deleted file mode 100644 index ca25924..0000000 --- a/src/util/span.rs +++ /dev/null @@ -1,57 +0,0 @@ -//! Utilities to deal with semantic labels. - -use crate::tokenizer::{Code, Event, EventType}; -use crate::util::codes::serialize as serialize_codes; - -/// A struct representing the span of an opening and closing event of a token. -#[derive(Debug)] -pub struct Span { - /// Absolute offset (an `index` in `codes`) of where this span starts. 
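The `before`/`after` fields are what keep a `Slice` faithful when a position starts or ends inside an expanded tab: the unreachable part of the tab is rendered as spaces. A hypothetical mini-version of `serialize` over plain fields:

```rust
/// Serialize chars, padding with `before`/`after` spaces for the virtual
/// parts of partially covered tabs (mirrors `Slice::serialize`).
fn serialize(chars: &[char], before: usize, after: usize) -> String {
    let mut string = String::with_capacity(chars.len() + before + after);
    string.extend(std::iter::repeat(' ').take(before));
    string.extend(chars.iter());
    string.extend(std::iter::repeat(' ').take(after));
    string
}

fn main() {
    // Two virtual spaces hang off a tab before `a`; one trails after `b`.
    assert_eq!(serialize(&['a', 'b'], 2, 1), "  ab ");
}
```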
- pub start_index: usize, - /// Absolute offset (an `index` in `codes`) of where this span ends. - pub end_index: usize, -} - -/// Get a span from an event. -/// -/// Get the span of an `exit` event, by looking backwards through the events to -/// find the corresponding `enter` event. -/// This assumes that tokens with the same type are not nested. -/// -/// ## Panics -/// -/// This function panics if an enter event is given. -/// When `micromark` is used, this function never panics. -pub fn from_exit_event(events: &[Event], index: usize) -> Span { - let exit = &events[index]; - let end_index = exit.point.index; - let token_type = exit.token_type.clone(); - assert_eq!( - exit.event_type, - EventType::Exit, - "expected `from_exit_event` to be called on `exit` event" - ); - let mut enter_index = index - 1; - - loop { - let enter = &events[enter_index]; - if enter.event_type == EventType::Enter && enter.token_type == token_type { - return Span { - start_index: enter.point.index, - end_index, - }; - } - - enter_index -= 1; - } -} - -/// Serialize a span, optionally expanding tabs. -pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String { - serialize_codes(codes(all_codes, span), expand_tabs) -} - -/// Get a slice of codes from a span. -pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] { - &codes[span.start_index..span.end_index] -} diff --git a/tests/misc_tabs.rs b/tests/misc_tabs.rs index 7073c57..c5e5c43 100644 --- a/tests/misc_tabs.rs +++ b/tests/misc_tabs.rs @@ -68,12 +68,6 @@ fn tabs_flow() { "should not support a 3*SP + HT to start a thematic break" ); - assert_eq!( - micromark("   \t---"), - "<pre><code>---\n</code></pre>", - "should not support a 3*SP + HT to start a thematic break" - ); - assert_eq!( micromark("   \t```"), "<pre><code>```\n</code></pre>", "should not support a 3*SP + HT to start a thematic break"