author    | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200
commit    | 0eeff9148e327183e532752f46421a75506dd7a6 (patch)
tree      | 4f0aed04f90aa759ce96a2e87aa719e7fa95c450
parent    | 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff)
download  | markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.gz
          | markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.bz2
          | markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.zip
Refactor to improve states
* Remove custom kind wrappers, use plain bytes instead (see the sketch after the diffstat)
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
36 files changed, 941 insertions, 1230 deletions
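
The recurring pattern in this commit is replacing small `Kind`-style enums (attention's `MarkerKind`, html_flow's `QuoteKind`, the fence and setext-underline kinds) with the marker byte itself. Below is a minimal, self-contained sketch of the before/after, simplified from `src/construct/attention.rs`; `Sequence` and `continues` are illustrative stand-ins, not the real tokenizer types.

```rust
// Before: a dedicated kind enum wrapping the marker byte, converted
// back and forth with `as_byte`/`from_byte` at every use site.
#[derive(Debug, Clone, Copy, PartialEq)]
enum MarkerKind {
    Asterisk,   // `*`
    Underscore, // `_`
}

impl MarkerKind {
    fn as_byte(self) -> u8 {
        match self {
            MarkerKind::Asterisk => b'*',
            MarkerKind::Underscore => b'_',
        }
    }
}

// After: store the marker byte (`u8`) directly and match on byte
// patterns, so the wrapper and its conversions disappear.
struct Sequence {
    marker: u8, // `b'*'` or `b'_'`
}

/// Whether `current` continues the run that `sequence` opened.
fn continues(sequence: &Sequence, current: Option<u8>) -> bool {
    matches!(current, Some(byte) if byte == sequence.marker)
}

fn main() {
    let old = MarkerKind::Asterisk;
    let new = Sequence { marker: b'*' };
    assert_eq!(old.as_byte(), new.marker);
    assert_eq!(MarkerKind::Underscore.as_byte(), b'_');
    assert!(continues(&new, Some(b'*')));
    assert!(!continues(&new, Some(b'_')));
}
```

Matching directly on `u8` lets guards such as `Some(b'*' | b'_') if byte == marker` express the whole check in one place, which is exactly what the diff below does throughout the constructs.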
@@ -130,7 +130,7 @@ async fn punctuation() { /// > It is generate from the latest Unicode data. /// /// Rust does not contain an `is_punctuation` method on `char`, while it does -/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation). +/// support [`is_ascii_alphanumeric`](char::is_ascii_alphanumeric). /// /// `CommonMark` handles attention (emphasis, strong) markers based on what /// comes before or after them. diff --git a/src/compiler.rs b/src/compiler.rs index de76142..e0ab1e9 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -1,6 +1,5 @@ //! Turn events into a string of HTML. use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}; -use crate::construct::character_reference::Kind as CharacterReferenceKind; use crate::token::Token; use crate::tokenizer::{Event, EventType}; use crate::util::normalize_identifier::normalize_identifier; @@ -68,14 +67,14 @@ struct CompileContext<'a> { pub code_flow_seen_data: Option<bool>, pub code_fenced_fences_count: Option<usize>, pub code_text_inside: bool, - pub character_reference_kind: Option<CharacterReferenceKind>, + pub character_reference_marker: Option<u8>, pub expect_first_item: Option<bool>, pub media_stack: Vec<Media>, pub definitions: Vec<(String, Definition)>, pub tight_stack: Vec<bool>, /// Fields used to influance the current compilation. pub slurp_one_line_ending: bool, - pub tags: bool, + pub in_image_alt: bool, pub encode_html: bool, pub last_was_tag: bool, /// Configuration @@ -104,13 +103,13 @@ impl<'a> CompileContext<'a> { code_flow_seen_data: None, code_fenced_fences_count: None, code_text_inside: false, - character_reference_kind: None, + character_reference_marker: None, expect_first_item: None, media_stack: vec![], definitions: vec![], tight_stack: vec![], slurp_one_line_ending: false, - tags: true, + in_image_alt: false, encode_html: true, last_was_tag: false, protocol_href: if options.allow_dangerous_protocol { @@ -140,8 +139,7 @@ impl<'a> CompileContext<'a> { self.buffers.pop().expect("Cannot resume w/o buffer") } - pub fn push<'x, S: Into<&'x str>>(&mut self, value: S) { - let value = value.into(); + pub fn push(&mut self, value: &str) { self.buffers .last_mut() .expect("Cannot push w/o buffer") @@ -149,17 +147,8 @@ impl<'a> CompileContext<'a> { self.last_was_tag = false; } - pub fn push_raw<'x, S: Into<&'x str>>(&mut self, value: S) { - let value = value.into(); - self.push(&*encode(value, self.encode_html)); - } - - pub fn tag<'x, S: Into<&'x str>>(&mut self, value: S) { - if self.tags { - let value = value.into(); - self.push(&*encode(value, false)); - self.last_was_tag = true; - } + pub fn push_raw(&mut self, value: &str) { + self.push(&encode(value, self.encode_html)); } /// Get the current buffer. @@ -172,7 +161,7 @@ impl<'a> CompileContext<'a> { /// Add a line ending. pub fn line_ending(&mut self) { let eol = self.line_ending_default.as_str().to_string(); - self.push(&*eol); + self.push(&eol); } /// Add a line ending if needed (as in, there’s no eol/eof already). 
@@ -210,7 +199,7 @@ pub fn compile(events: &[Event], bytes: &[u8], options: &Options) -> String { && (event.token_type == Token::BlankLineEnding || event.token_type == Token::LineEnding) { line_ending_inferred = Some(LineEnding::from_str( - &Slice::from_position(bytes, &Position::from_exit_event(events, index)).serialize(), + Slice::from_position(bytes, &Position::from_exit_event(events, index)).as_str(), )); break; } @@ -398,14 +387,16 @@ fn on_enter_buffer(context: &mut CompileContext) { fn on_enter_block_quote(context: &mut CompileContext) { context.tight_stack.push(false); context.line_ending_if_needed(); - context.tag("<blockquote>"); + context.push("<blockquote>"); + context.last_was_tag = true; } /// Handle [`Enter`][EventType::Enter]:[`CodeIndented`][Token::CodeIndented]. fn on_enter_code_indented(context: &mut CompileContext) { context.code_flow_seen_data = Some(false); context.line_ending_if_needed(); - context.tag("<pre><code>"); + context.push("<pre><code>"); + context.last_was_tag = true; } /// Handle [`Enter`][EventType::Enter]:[`CodeFenced`][Token::CodeFenced]. @@ -413,14 +404,18 @@ fn on_enter_code_fenced(context: &mut CompileContext) { context.code_flow_seen_data = Some(false); context.line_ending_if_needed(); // Note that no `>` is used, which is added later. - context.tag("<pre><code"); + context.push("<pre><code"); + context.last_was_tag = true; context.code_fenced_fences_count = Some(0); } /// Handle [`Enter`][EventType::Enter]:[`CodeText`][Token::CodeText]. fn on_enter_code_text(context: &mut CompileContext) { context.code_text_inside = true; - context.tag("<code>"); + if !context.in_image_alt { + context.push("<code>"); + context.last_was_tag = true; + } context.buffer(); } @@ -445,7 +440,10 @@ fn on_enter_definition_destination_string(context: &mut CompileContext) { /// Handle [`Enter`][EventType::Enter]:[`Emphasis`][Token::Emphasis]. fn on_enter_emphasis(context: &mut CompileContext) { - context.tag("<em>"); + if !context.in_image_alt { + context.push("<em>"); + context.last_was_tag = true; + } } /// Handle [`Enter`][EventType::Enter]:[`HtmlFlow`][Token::HtmlFlow]. @@ -473,7 +471,7 @@ fn on_enter_image(context: &mut CompileContext) { destination: None, title: None, }); - context.tags = false; // Disallow tags. + context.in_image_alt = true; // Disallow tags. } /// Handle [`Enter`][EventType::Enter]:[`Link`][Token::Link]. @@ -546,14 +544,12 @@ fn on_enter_list(context: &mut CompileContext) { context.tight_stack.push(!loose); context.line_ending_if_needed(); // Note: no `>`. - context.tag(&*format!( - "<{}", - if *token_type == Token::ListOrdered { - "ol" - } else { - "ul" - } - )); + context.push(if *token_type == Token::ListOrdered { + "<ol" + } else { + "<ul" + }); + context.last_was_tag = true; context.expect_first_item = Some(true); } @@ -562,11 +558,14 @@ fn on_enter_list_item_marker(context: &mut CompileContext) { let expect_first_item = context.expect_first_item.take().unwrap(); if expect_first_item { - context.tag(">"); + context.push(">"); + context.last_was_tag = true; } context.line_ending_if_needed(); - context.tag("<li>"); + + context.push("<li>"); + context.last_was_tag = true; context.expect_first_item = Some(false); // “Hack” to prevent a line ending from showing up if the item is empty. 
context.last_was_tag = false; @@ -578,15 +577,15 @@ fn on_enter_paragraph(context: &mut CompileContext) { if !tight { context.line_ending_if_needed(); - context.tag("<p>"); + context.push("<p>"); + context.last_was_tag = true; } } /// Handle [`Enter`][EventType::Enter]:[`Resource`][Token::Resource]. fn on_enter_resource(context: &mut CompileContext) { context.buffer(); // We can have line endings in the resource, ignore them. - let media = context.media_stack.last_mut().unwrap(); - media.destination = Some("".to_string()); + context.media_stack.last_mut().unwrap().destination = Some("".to_string()); } /// Handle [`Enter`][EventType::Enter]:[`ResourceDestinationString`][Token::ResourceDestinationString]. @@ -599,47 +598,67 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) { /// Handle [`Enter`][EventType::Enter]:[`Strong`][Token::Strong]. fn on_enter_strong(context: &mut CompileContext) { - context.tag("<strong>"); + if !context.in_image_alt { + context.push("<strong>"); + context.last_was_tag = true; + } } /// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][Token::AutolinkEmail]. fn on_exit_autolink_email(context: &mut CompileContext) { - let value = Slice::from_position( + let slice = Slice::from_position( context.bytes, &Position::from_exit_event(context.events, context.index), - ) - .serialize(); + ); + let value = slice.as_str(); - context.tag(&*format!( - "<a href=\"{}\">", - sanitize_uri( - format!("mailto:{}", value.as_str()).as_str(), - &context.protocol_href - ) - )); - context.push_raw(&*value); - context.tag("</a>"); + if !context.in_image_alt { + context.push("<a href=\""); + context.push(&sanitize_uri( + &format!("mailto:{}", value), + &context.protocol_href, + )); + context.push("\">"); + context.last_was_tag = true; + } + + context.push_raw(value); + + if !context.in_image_alt { + context.push("</a>"); + context.last_was_tag = true; + } } /// Handle [`Exit`][EventType::Exit]:[`AutolinkProtocol`][Token::AutolinkProtocol]. fn on_exit_autolink_protocol(context: &mut CompileContext) { - let value = Slice::from_position( + let slice = Slice::from_position( context.bytes, &Position::from_exit_event(context.events, context.index), - ) - .serialize(); + ); + let value = slice.as_str(); - context.tag(&*format!( - "<a href=\"{}\">", - sanitize_uri(value.as_str(), &context.protocol_href) - )); - context.push_raw(&*value); - context.tag("</a>"); + if !context.in_image_alt { + context.push("<a href=\""); + context.push(&sanitize_uri(value, &context.protocol_href)); + context.push("\">"); + context.last_was_tag = true; + } + + context.push_raw(value); + + if !context.in_image_alt { + context.push("</a>"); + context.last_was_tag = true; + } } /// Handle [`Exit`][EventType::Exit]:{[`HardBreakEscape`][Token::HardBreakEscape],[`HardBreakTrailing`][Token::HardBreakTrailing]}. fn on_exit_break(context: &mut CompileContext) { - context.tag("<br />"); + if !context.in_image_alt { + context.push("<br />"); + context.last_was_tag = true; + } } /// Handle [`Exit`][EventType::Exit]:[`BlankLineEnding`][Token::BlankLineEnding]. @@ -654,56 +673,58 @@ fn on_exit_block_quote(context: &mut CompileContext) { context.tight_stack.pop(); context.line_ending_if_needed(); context.slurp_one_line_ending = false; - context.tag("</blockquote>"); + context.push("</blockquote>"); + context.last_was_tag = true; } /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarker`][Token::CharacterReferenceMarker]. 
fn on_exit_character_reference_marker(context: &mut CompileContext) { - context.character_reference_kind = Some(CharacterReferenceKind::Named); + context.character_reference_marker = Some(b'&'); } /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarkerHexadecimal`][Token::CharacterReferenceMarkerHexadecimal]. fn on_exit_character_reference_marker_hexadecimal(context: &mut CompileContext) { - context.character_reference_kind = Some(CharacterReferenceKind::Hexadecimal); + context.character_reference_marker = Some(b'x'); } /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarkerNumeric`][Token::CharacterReferenceMarkerNumeric]. fn on_exit_character_reference_marker_numeric(context: &mut CompileContext) { - context.character_reference_kind = Some(CharacterReferenceKind::Decimal); + context.character_reference_marker = Some(b'#'); } /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceValue`][Token::CharacterReferenceValue]. fn on_exit_character_reference_value(context: &mut CompileContext) { - let kind = context - .character_reference_kind + let marker = context + .character_reference_marker .take() .expect("expected `character_reference_kind` to be set"); - let reference = Slice::from_position( + let slice = Slice::from_position( context.bytes, &Position::from_exit_event(context.events, context.index), - ) - .serialize(); + ); + let value = slice.as_str(); - let ref_string = reference.as_str(); - let value = match kind { - CharacterReferenceKind::Decimal => decode_numeric(ref_string, 10).to_string(), - CharacterReferenceKind::Hexadecimal => decode_numeric(ref_string, 16).to_string(), - CharacterReferenceKind::Named => decode_named(ref_string), + let value = match marker { + b'#' => decode_numeric(value, 10), + b'x' => decode_numeric(value, 16), + b'&' => decode_named(value), + _ => panic!("impossible"), }; - context.push_raw(&*value); + context.push_raw(&value); } /// Handle [`Exit`][EventType::Exit]:[`CodeFlowChunk`][Token::CodeFlowChunk]. fn on_exit_code_flow_chunk(context: &mut CompileContext) { - let value = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - context.code_flow_seen_data = Some(true); - context.push_raw(&*value); + context.push_raw( + &Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + // Must serialize to get virtual spaces. + .serialize(), + ); } /// Handle [`Exit`][EventType::Exit]:[`CodeFencedFence`][Token::CodeFencedFence]. @@ -715,7 +736,8 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) { }; if count == 0 { - context.tag(">"); + context.push(">"); + context.last_was_tag = true; context.slurp_one_line_ending = true; } @@ -725,7 +747,10 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`CodeFencedFenceInfo`][Token::CodeFencedFenceInfo]. fn on_exit_code_fenced_fence_info(context: &mut CompileContext) { let value = context.resume(); - context.tag(&*format!(" class=\"language-{}\"", value)); + context.push(" class=\"language-"); + context.push(&value); + context.push("\""); + context.last_was_tag = true; } /// Handle [`Exit`][EventType::Exit]:{[`CodeFenced`][Token::CodeFenced],[`CodeIndented`][Token::CodeIndented]}. 
@@ -752,7 +777,8 @@ fn on_exit_code_flow(context: &mut CompileContext) { context.line_ending_if_needed(); } - context.tag("</code></pre>"); + context.push("</code></pre>"); + context.last_was_tag = true; if let Some(count) = context.code_fenced_fences_count.take() { if count < 2 { @@ -781,12 +807,16 @@ fn on_exit_code_text(context: &mut CompileContext) { } context.code_text_inside = false; - context.push(&*if trim { + context.push(&if trim { result[1..(result.len() - 1)].to_string() } else { result }); - context.tag("</code>"); + + if !context.in_image_alt { + context.push("</code>"); + context.last_was_tag = true; + } } /// Handle [`Exit`][EventType::Exit]:*. @@ -798,72 +828,63 @@ fn on_exit_drop(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:{[`CodeTextData`][Token::CodeTextData],[`Data`][Token::Data],[`CharacterEscapeValue`][Token::CharacterEscapeValue]}. fn on_exit_data(context: &mut CompileContext) { - let value = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - - // Just output it. - context.push_raw(&*value); + context.push_raw( + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + ); } /// Handle [`Exit`][EventType::Exit]:[`Definition`][Token::Definition]. fn on_exit_definition(context: &mut CompileContext) { - let definition = context.media_stack.pop().unwrap(); - let reference_id = normalize_identifier(&definition.reference_id.unwrap()); - let destination = definition.destination; - let title = definition.title; - context.resume(); - - let mut index = 0; - - while index < context.definitions.len() { - if context.definitions[index].0 == reference_id { - return; - } - - index += 1; - } - - context - .definitions - .push((reference_id, Definition { destination, title })); + let media = context.media_stack.pop().unwrap(); + let id = normalize_identifier(&media.reference_id.unwrap()); + + context.definitions.push(( + id, + Definition { + destination: media.destination, + title: media.title, + }, + )); } /// Handle [`Exit`][EventType::Exit]:[`DefinitionDestinationString`][Token::DefinitionDestinationString]. fn on_exit_definition_destination_string(context: &mut CompileContext) { let buf = context.resume(); - let definition = context.media_stack.last_mut().unwrap(); - definition.destination = Some(buf); + context.media_stack.last_mut().unwrap().destination = Some(buf); context.encode_html = true; } /// Handle [`Exit`][EventType::Exit]:[`DefinitionLabelString`][Token::DefinitionLabelString]. fn on_exit_definition_label_string(context: &mut CompileContext) { - let value = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - // Discard label, use the source content instead. context.resume(); - let definition = context.media_stack.last_mut().unwrap(); - definition.reference_id = Some(value); + context.media_stack.last_mut().unwrap().reference_id = Some( + // To do: lifetimes, reference bytes? + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .serialize(), + ); } /// Handle [`Exit`][EventType::Exit]:[`DefinitionTitleString`][Token::DefinitionTitleString]. 
fn on_exit_definition_title_string(context: &mut CompileContext) { let buf = context.resume(); - let definition = context.media_stack.last_mut().unwrap(); - definition.title = Some(buf); + context.media_stack.last_mut().unwrap().title = Some(buf); } /// Handle [`Exit`][EventType::Exit]:[`Strong`][Token::Emphasis]. fn on_exit_emphasis(context: &mut CompileContext) { - context.tag("</em>"); + if !context.in_image_alt { + context.push("</em>"); + context.last_was_tag = true; + } } /// Handle [`Exit`][EventType::Exit]:[`HeadingAtx`][Token::HeadingAtx]. @@ -873,7 +894,10 @@ fn on_exit_heading_atx(context: &mut CompileContext) { .take() .expect("`atx_opening_sequence_size` must be set in headings"); - context.tag(&*format!("</h{}>", rank)); + context.push("</h"); + context.push(&rank.to_string()); + context.push(">"); + context.last_was_tag = true; } /// Handle [`Exit`][EventType::Exit]:[`HeadingAtxSequence`][Token::HeadingAtxSequence]. @@ -884,17 +908,20 @@ fn on_exit_heading_atx_sequence(context: &mut CompileContext) { context.bytes, &Position::from_exit_event(context.events, context.index), ) - .size(); + .len(); context.line_ending_if_needed(); context.atx_opening_sequence_size = Some(rank); - context.tag(&*format!("<h{}>", rank)); + context.push("<h"); + context.push(&rank.to_string()); + context.push(">"); + context.last_was_tag = true; } } /// Handle [`Exit`][EventType::Exit]:[`HeadingAtxText`][Token::HeadingAtxText]. fn on_exit_heading_atx_text(context: &mut CompileContext) { let value = context.resume(); - context.push(&*value); + context.push(&value); } /// Handle [`Exit`][EventType::Exit]:[`HeadingSetextText`][Token::HeadingSetextText]. @@ -915,12 +942,18 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) { &Position::from_exit_event(context.events, context.index), ) .head(); - let level = if head == Some(b'-') { 2 } else { 1 }; + let rank = if head == Some(b'-') { "2" } else { "1" }; context.line_ending_if_needed(); - context.tag(&*format!("<h{}>", level)); - context.push(&*text); - context.tag(&*format!("</h{}>", level)); + context.push("<h"); + context.push(rank); + context.push(">"); + context.last_was_tag = true; + context.push(&text); + context.push("</h"); + context.push(rank); + context.push(">"); + context.last_was_tag = true; } /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlow`][Token::HtmlFlow],[`HtmlText`][Token::HtmlText]}. @@ -930,32 +963,31 @@ fn on_exit_html(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlowData`][Token::HtmlFlowData],[`HtmlTextData`][Token::HtmlTextData]}. fn on_exit_html_data(context: &mut CompileContext) { - let value = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - - context.push_raw(&*value); + context.push_raw( + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + ); } /// Handle [`Exit`][EventType::Exit]:[`Label`][Token::Label]. fn on_exit_label(context: &mut CompileContext) { let buf = context.resume(); - let media = context.media_stack.last_mut().unwrap(); - media.label = Some(buf); + context.media_stack.last_mut().unwrap().label = Some(buf); } /// Handle [`Exit`][EventType::Exit]:[`LabelText`][Token::LabelText]. 
fn on_exit_label_text(context: &mut CompileContext) { - let value = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - - let media = context.media_stack.last_mut().unwrap(); - media.label_id = Some(value); + context.media_stack.last_mut().unwrap().label_id = Some( + // To do: lifetimes, reference bytes? + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .serialize(), + ); } /// Handle [`Exit`][EventType::Exit]:[`LineEnding`][Token::LineEnding]. @@ -965,26 +997,28 @@ fn on_exit_line_ending(context: &mut CompileContext) { } else if context.slurp_one_line_ending { context.slurp_one_line_ending = false; } else { - let value = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - - context.push_raw(&*value); + context.push_raw( + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .as_str(), + ); } } /// Handle [`Exit`][EventType::Exit]:{[`ListOrdered`][Token::ListOrdered],[`ListUnordered`][Token::ListUnordered]}. fn on_exit_list(context: &mut CompileContext) { - let tag_name = if context.events[context.index].token_type == Token::ListOrdered { - "ol" - } else { - "ul" - }; context.tight_stack.pop(); context.line_ending(); - context.tag(&*format!("</{}>", tag_name)); + context.push( + if context.events[context.index].token_type == Token::ListOrdered { + "</ol>" + } else { + "</ul>" + }, + ); + context.last_was_tag = true; } /// Handle [`Exit`][EventType::Exit]:[`ListItem`][Token::ListItem]. @@ -1010,7 +1044,8 @@ fn on_exit_list_item(context: &mut CompileContext) { context.line_ending_if_needed(); } - context.tag("</li>"); + context.push("</li>"); + context.last_was_tag = true; } /// Handle [`Exit`][EventType::Exit]:[`ListItemValue`][Token::ListItemValue]. 
@@ -1018,17 +1053,17 @@ fn on_exit_list_item_value(context: &mut CompileContext) { let expect_first_item = context.expect_first_item.unwrap(); if expect_first_item { - let value = Slice::from_position( + let slice = Slice::from_position( context.bytes, &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - let value = value.parse::<u32>().ok().unwrap(); + ); + let value = slice.as_str().parse::<u32>().ok().unwrap(); if value != 1 { - context.tag(" start=\""); - context.tag(&*value.to_string()); - context.tag("\""); + context.push(" start=\""); + context.push(&value.to_string()); + context.push("\""); + context.last_was_tag = true; } } } @@ -1048,68 +1083,98 @@ fn on_exit_media(context: &mut CompileContext) { index += 1; } - context.tags = !is_in_image; + context.in_image_alt = is_in_image; let media = context.media_stack.pop().unwrap(); + let label = media.label.unwrap(); + let in_image_alt = context.in_image_alt; let id = media .reference_id .or(media.label_id) .map(|id| normalize_identifier(&id)); - let label = media.label.unwrap(); - let mut definition = None; - if let Some(id) = id { - let mut index = 0; + let definition_index = if media.destination.is_none() { + id.and_then(|id| { + let mut index = 0; - while index < context.definitions.len() { - if context.definitions[index].0 == id { - definition = Some(&context.definitions[index].1); - break; - } + while index < context.definitions.len() { + if context.definitions[index].0 == id { + return Some(index); + } - index += 1; - } - } + index += 1; + } - let destination = if media.destination.is_some() { - &media.destination + None + }) } else { - &definition.unwrap().destination - }; - let title = if media.destination.is_some() { - &media.title - } else { - &definition.unwrap().title + None }; - let destination = if let Some(destination) = destination { - destination - } else { - "" - }; + if !in_image_alt { + if media.image { + context.push("<img src=\""); + } else { + context.push("<a href=\""); + }; - let title = if let Some(title) = title { - format!(" title=\"{}\"", title) - } else { - "".to_string() - }; + let destination = if let Some(index) = definition_index { + context.definitions[index].1.destination.as_ref() + } else { + media.destination.as_ref() + }; + + if let Some(destination) = destination { + context.push(&sanitize_uri( + destination, + if media.image { + &context.protocol_src + } else { + &context.protocol_href + }, + )); + } + + if media.image { + context.push("\" alt=\""); + }; + } if media.image { - context.tag(&*format!( - "<img src=\"{}\" alt=\"", - sanitize_uri(destination, &context.protocol_src), - )); - context.push(&*label); - context.tag(&*format!("\"{} />", title)); - } else { - context.tag(&*format!( - "<a href=\"{}\"{}>", - sanitize_uri(destination, &context.protocol_href), - title, - )); - context.push(&*label); - context.tag("</a>"); - }; + context.push(&label); + } + + if !in_image_alt { + context.push("\""); + + let title = if let Some(index) = definition_index { + context.definitions[index].1.title.clone() + } else { + media.title + }; + + if let Some(title) = title { + context.push(" title=\""); + context.push(&title); + context.push("\""); + }; + + if media.image { + context.push(" /"); + } + + context.push(">"); + context.last_was_tag = true; + } + + if !media.image { + context.push(&label); + + if !in_image_alt { + context.push("</a>"); + context.last_was_tag = true; + } + } } /// Handle [`Exit`][EventType::Exit]:[`Paragraph`][Token::Paragraph]. 
@@ -1119,46 +1184,49 @@ fn on_exit_paragraph(context: &mut CompileContext) { if *tight { context.slurp_one_line_ending = true; } else { - context.tag("</p>"); + context.push("</p>"); + context.last_was_tag = true; } } /// Handle [`Exit`][EventType::Exit]:[`ReferenceString`][Token::ReferenceString]. fn on_exit_reference_string(context: &mut CompileContext) { - let value = Slice::from_position( - context.bytes, - &Position::from_exit_event(context.events, context.index), - ) - .serialize(); - // Drop stuff. context.resume(); - let media = context.media_stack.last_mut().unwrap(); - media.reference_id = Some(value); + // To do: lifetimes, reference bytes. + context.media_stack.last_mut().unwrap().reference_id = Some( + Slice::from_position( + context.bytes, + &Position::from_exit_event(context.events, context.index), + ) + .serialize(), + ); } /// Handle [`Exit`][EventType::Exit]:[`ResourceDestinationString`][Token::ResourceDestinationString]. fn on_exit_resource_destination_string(context: &mut CompileContext) { let buf = context.resume(); - let media = context.media_stack.last_mut().unwrap(); - media.destination = Some(buf); + context.media_stack.last_mut().unwrap().destination = Some(buf); context.encode_html = true; } /// Handle [`Exit`][EventType::Exit]:[`ResourceTitleString`][Token::ResourceTitleString]. fn on_exit_resource_title_string(context: &mut CompileContext) { let buf = context.resume(); - let media = context.media_stack.last_mut().unwrap(); - media.title = Some(buf); + context.media_stack.last_mut().unwrap().title = Some(buf); } /// Handle [`Exit`][EventType::Exit]:[`Strong`][Token::Strong]. fn on_exit_strong(context: &mut CompileContext) { - context.tag("</strong>"); + if !context.in_image_alt { + context.push("</strong>"); + context.last_was_tag = true; + } } /// Handle [`Exit`][EventType::Exit]:[`ThematicBreak`][Token::ThematicBreak]. fn on_exit_thematic_break(context: &mut CompileContext) { context.line_ending_if_needed(); - context.tag("<hr />"); + context.push("<hr />"); + context.last_was_tag = true; } diff --git a/src/constant.rs b/src/constant.rs index d84dda5..6ef851c 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -165,6 +165,15 @@ pub const HTML_BLOCK_NAMES: [&str; 61] = [ "ul", ]; +/// Magic string of CDATA (after `<![`). +/// +/// Used in the **cdata** production of [HTML (flow)][html_flow] and +/// [HTML (text)][html_text]. +/// +/// [html_flow]: crate::construct::html_flow +/// [html_text]: crate::construct::html_text +pub const HTML_CDATA_PREFIX: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; + /// List of HTML tag names that form the **raw** production of /// [HTML (flow)][html_flow]. /// diff --git a/src/construct/attention.rs b/src/construct/attention.rs index b042645..583fde2 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -88,54 +88,11 @@ enum GroupKind { Other, } -/// Type of sequence. -#[derive(Debug, PartialEq)] -enum MarkerKind { - /// In a run with asterisks. - /// - /// ## Example - /// - /// ```markdown - /// *a* - /// ``` - Asterisk, - /// In a run with underscores. - /// - /// ## Example - /// - /// ```markdown - /// _a_ - /// ``` - Underscore, -} - -impl MarkerKind { - /// Turn the kind into a byte ([u8]). - fn as_byte(&self) -> u8 { - match self { - MarkerKind::Asterisk => b'*', - MarkerKind::Underscore => b'_', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// ## Panics - /// - /// Panics if `byte` is not `*` or `_`. 
- fn from_byte(byte: u8) -> MarkerKind { - match byte { - b'*' => MarkerKind::Asterisk, - b'_' => MarkerKind::Underscore, - _ => unreachable!("invalid byte"), - } - } -} - /// Attentention sequence that we can take markers from. #[derive(Debug)] struct Sequence { - /// Marker used in this sequence. - marker: MarkerKind, + /// Marker as a byte (`u8`) used in this sequence. + marker: u8, /// The depth in events where this sequence resides. balance: usize, /// The index into events where this sequence’s `Enter` currently resides. @@ -160,9 +117,9 @@ struct Sequence { /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if tokenizer.parse_state.constructs.attention && matches!(byte, b'*' | b'_') => { + Some(b'*' | b'_') if tokenizer.parse_state.constructs.attention => { tokenizer.enter(Token::AttentionSequence); - inside(tokenizer, MarkerKind::from_byte(byte)) + inside(tokenizer, tokenizer.current.unwrap()) } _ => State::Nok, } @@ -174,14 +131,17 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// > | ** /// ^^ /// ``` -fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State { - if tokenizer.current == Some(marker.as_byte()) { - tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, marker))) - } else { - tokenizer.exit(Token::AttentionSequence); - tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); - State::Ok +fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State { + match tokenizer.current { + Some(b'*' | b'_') if tokenizer.current.unwrap() == marker => { + tokenizer.consume(); + State::Fn(Box::new(move |t| inside(t, marker))) + } + _ => { + tokenizer.exit(Token::AttentionSequence); + tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); + State::Ok + } } } @@ -219,16 +179,10 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]); let char_after = string_after.chars().next(); - let marker = MarkerKind::from_byte( - Slice::from_point(tokenizer.parse_state.bytes, &enter.point) - .head() - .unwrap(), - ); - let before = classify_character(if enter.point.index > 0 { - char_before - } else { - None - }); + let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) + .head() + .unwrap(); + let before = classify_character(char_before); let after = classify_character(char_after); let open = after == GroupKind::Other || (after == GroupKind::Punctuation && before != GroupKind::Other); @@ -245,12 +199,12 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { start_point: enter.point.clone(), end_point: exit.point.clone(), size: exit.point.index - enter.point.index, - open: if marker == MarkerKind::Asterisk { + open: if marker == b'*' { open } else { open && (before != GroupKind::Other || !close) }, - close: if marker == MarkerKind::Asterisk { + close: if marker == b'*' { close } else { close && (after != GroupKind::Other || !open) diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index b843af8..c0514ae 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -137,12 +137,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if byte.is_ascii_alphabetic() => { + // ASCII alphabetic. 
+ Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(scheme_or_email_atext)) } - Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer), - _ => State::Nok, + _ => email_atext(tokenizer), } } @@ -199,8 +199,8 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::AutolinkProtocol); end(tokenizer) } - Some(byte) if byte.is_ascii_control() => State::Nok, - None | Some(b' ') => State::Nok, + // ASCII control or space. + None | Some(b'\0'..=0x1F | b' ' | 0x7F) => State::Nok, Some(_) => { tokenizer.consume(); State::Fn(Box::new(url_inside)) @@ -220,7 +220,26 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) } - Some(byte) if is_ascii_atext(byte) => { + // ASCII atext. + // + // atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or + // a byte in the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 + // APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), + // U+002D DASH (`-`), U+002F SLASH (`/`), U+003D EQUALS TO (`=`), + // U+003F QUESTION MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE + // ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE + // (`~`). + // + // See: + // **\[RFC5322]**: + // [Internet Message Format](https://tools.ietf.org/html/rfc5322). + // P. Resnick. + // IETF. + // + // [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric + Some( + b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~', + ) => { tokenizer.consume(); State::Fn(Box::new(email_atext)) } @@ -236,7 +255,8 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { /// ``` fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size), + // ASCII alphanumeric. + Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size), _ => State::Nok, } } @@ -279,7 +299,8 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State { tokenizer.consume(); State::Fn(Box::new(move |t| email_value(t, size + 1))) } - Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { + // ASCII alphanumeric. + Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| email_label(t, size + 1))) } @@ -307,23 +328,3 @@ fn end(tokenizer: &mut Tokenizer) -> State { _ => unreachable!("expected `>`"), } } - -/// Check whether the character code represents an ASCII atext. -/// -/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in -/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`), -/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F -/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E -/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE -/// (`{`) to U+007E TILDE (`~`). -/// -/// See: -/// **\[RFC5322]**: -/// [Internet Message Format](https://tools.ietf.org/html/rfc5322). -/// P. Resnick. -/// IETF. -/// -/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric -fn is_ascii_atext(byte: u8) -> bool { - matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' 
| b'A'..=b'Z' | b'^'..=b'~') -} diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index 02e8b62..4419d7a 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -63,7 +63,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if byte.is_ascii_punctuation() => { + // ASCII punctuation. + Some(b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~') => { tokenizer.enter(Token::CharacterEscapeValue); tokenizer.consume(); tokenizer.exit(Token::CharacterEscapeValue); diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 90763c1..cd489a4 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -66,67 +66,18 @@ use crate::constant::{ CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, }; use crate::token::Token; -use crate::tokenizer::{Point, State, Tokenizer}; -use crate::util::slice::{Position, Slice}; - -/// Kind of a character reference. -#[derive(Debug, Clone, PartialEq)] -pub enum Kind { - /// Numeric decimal character reference. - /// - /// ```markdown - /// > | a	b - /// ^^^^^ - /// ``` - Decimal, - /// Numeric hexadecimal character reference. - /// - /// ```markdown - /// > | a{b - /// ^^^^^^ - /// ``` - Hexadecimal, - /// Named character reference. - /// - /// ```markdown - /// > | a&b - /// ^^^^^ - /// ``` - Named, -} - -impl Kind { - /// Get the maximum size of characters allowed in the value of a character - /// reference. - fn max(&self) -> usize { - match self { - Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, - Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, - Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX, - } - } - - /// Check if a byte ([`u8`]) is allowed. - fn allowed(&self, byte: u8) -> bool { - let check = match self { - Kind::Hexadecimal => u8::is_ascii_hexdigit, - Kind::Decimal => u8::is_ascii_digit, - Kind::Named => u8::is_ascii_alphanumeric, - }; - - check(&byte) - } -} +use crate::tokenizer::{State, Tokenizer}; +use crate::util::slice::Slice; /// State needed to parse character references. #[derive(Debug, Clone)] struct Info { - /// Place of value start. - start: Point, - /// Size of value. - size: usize, - /// Kind of character reference. - kind: Kind, + /// Index of where value starts. + start: usize, + /// Marker of character reference. + marker: u8, + /// Maximum number of characters in the value for this kind. + max: usize, } /// Start of a character reference. 
@@ -174,9 +125,9 @@ fn open(tokenizer: &mut Tokenizer) -> State { value( tokenizer, Info { - start: tokenizer.point.clone(), - size: 0, - kind: Kind::Named, + start: tokenizer.point.index, + marker: b'&', + max: CHARACTER_REFERENCE_NAMED_SIZE_MAX, }, ) } @@ -198,17 +149,17 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal); tokenizer.enter(Token::CharacterReferenceValue); let info = Info { - start: tokenizer.point.clone(), - size: 0, - kind: Kind::Hexadecimal, + start: tokenizer.point.index, + marker: b'x', + max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, }; State::Fn(Box::new(|t| value(t, info))) } else { tokenizer.enter(Token::CharacterReferenceValue); let info = Info { - start: tokenizer.point.clone(), - size: 0, - kind: Kind::Decimal, + start: tokenizer.point.index, + marker: b'#', + max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, }; value(tokenizer, info) } @@ -227,21 +178,22 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { /// > | a	b /// ^ /// ``` -fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn value(tokenizer: &mut Tokenizer, info: Info) -> State { + let size = tokenizer.point.index - info.start; + match tokenizer.current { - Some(b';') if info.size > 0 => { - if Kind::Named == info.kind { - // To do: fix slice. - let value = Slice::from_position( + Some(b';') if size > 0 => { + // Named. + if info.marker == b'&' { + // Guaranteed to be valid ASCII bytes. + let slice = Slice::from_indices( tokenizer.parse_state.bytes, - &Position { - start: &info.start, - end: &tokenizer.point, - }, - ) - .serialize(); + info.start, + tokenizer.point.index, + ); + let name = slice.as_str(); - if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) { + if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) { return State::Nok; } } @@ -253,14 +205,22 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::CharacterReference); State::Ok } - Some(byte) => { - if info.size < info.kind.max() && info.kind.allowed(byte) { - info.size += 1; - tokenizer.consume(); - State::Fn(Box::new(|t| value(t, info))) - } else { - State::Nok - } + // ASCII digit, for named, decimal, and hexadecimal references. + Some(b'0'..=b'9') if size < info.max => { + tokenizer.consume(); + State::Fn(Box::new(|t| value(t, info))) + } + // ASCII hex letters, for named and hexadecimal references. + Some(b'A'..=b'F' | b'a'..=b'f') + if matches!(info.marker, b'&' | b'x') && size < info.max => + { + tokenizer.consume(); + State::Fn(Box::new(|t| value(t, info))) + } + // Non-hex ASCII alphabeticals, for named references. + Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => { + tokenizer.consume(); + State::Fn(Box::new(|t| value(t, info))) } _ => State::Nok, } diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 21e9259..c4c3e86 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -110,53 +110,6 @@ use crate::token::Token; use crate::tokenizer::{ContentType, State, Tokenizer}; use crate::util::slice::{Position, Slice}; -/// Kind of fences. -#[derive(Debug, Clone, PartialEq)] -pub enum Kind { - /// Grave accent (tick) code. - /// - /// ## Example - /// - /// ````markdown - /// ```rust - /// println!("I <3 🦀"); - /// ``` - /// ```` - GraveAccent, - /// Tilde code. - /// - /// ## Example - /// - /// ```markdown - /// ~~~rust - /// println!("I <3 🦀"); - /// ~~~ - /// ``` - Tilde, -} - -impl Kind { - /// Turn the kind into a byte ([u8]). 
- fn as_byte(&self) -> u8 { - match self { - Kind::GraveAccent => b'`', - Kind::Tilde => b'~', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// ## Panics - /// - /// Panics if `byte` is not `~` or `` ` ``. - fn from_byte(byte: u8) -> Kind { - match byte { - b'`' => Kind::GraveAccent, - b'~' => Kind::Tilde, - _ => unreachable!("invalid byte"), - } - } -} - /// State needed to parse code (fenced). #[derive(Debug, Clone)] struct Info { @@ -165,8 +118,8 @@ struct Info { /// Number of tabs or spaces of indentation before the opening fence /// sequence. prefix: usize, - /// Kind of fences. - kind: Kind, + /// Marker of fences (`u8`). + marker: u8, } /// Start of fenced code. @@ -178,15 +131,20 @@ struct Info { /// | ~~~ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; if tokenizer.parse_state.constructs.code_fenced { tokenizer.enter(Token::CodeFenced); tokenizer.enter(Token::CodeFencedFence); - tokenizer.go(space_or_tab_min_max(0, max), before_sequence_open)(tokenizer) + tokenizer.go( + space_or_tab_min_max( + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ), + before_sequence_open, + )(tokenizer) } else { State::Nok } @@ -210,23 +168,22 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { tokenizer.parse_state.bytes, &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1), ) - .size(); + .len(); } } - match tokenizer.current { - Some(byte) if matches!(byte, b'`' | b'~') => { - tokenizer.enter(Token::CodeFencedFenceSequence); - sequence_open( - tokenizer, - Info { - prefix, - size: 0, - kind: Kind::from_byte(byte), - }, - ) - } - _ => State::Nok, + if let Some(b'`' | b'~') = tokenizer.current { + tokenizer.enter(Token::CodeFencedFenceSequence); + sequence_open( + tokenizer, + Info { + prefix, + size: 0, + marker: tokenizer.current.unwrap(), + }, + ) + } else { + State::Nok } } @@ -240,7 +197,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => { tokenizer.consume(); State::Fn(Box::new(|t| { info.size += 1; @@ -302,7 +259,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.exit(Token::CodeFencedFenceInfo); tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer) } - Some(b'`') if info.kind == Kind::GraveAccent => State::Nok, + Some(b'`') if info.marker == b'`' => State::Nok, Some(_) => { tokenizer.consume(); State::Fn(Box::new(|t| info_inside(t, info))) @@ -352,7 +309,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.concrete = true; at_break(tokenizer, info) } - Some(b'`') if info.kind == Kind::GraveAccent => State::Nok, + Some(b'`') if info.marker == b'`' => State::Nok, _ => { tokenizer.consume(); State::Fn(Box::new(|t| meta(t, info))) @@ -432,14 +389,18 @@ fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^ /// ``` fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - tokenizer.enter(Token::CodeFencedFence); - tokenizer.go(space_or_tab_min_max(0, max), |t| close_before(t, info))(tokenizer) + tokenizer.go( + space_or_tab_min_max( + 0, + if 
tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ), + |t| close_before(t, info), + )(tokenizer) } /// In a closing fence, after optional whitespace, before sequence. @@ -452,7 +413,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => { tokenizer.enter(Token::CodeFencedFenceSequence); close_sequence(tokenizer, info, 0) } @@ -470,7 +431,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => { tokenizer.consume(); State::Fn(Box::new(move |t| close_sequence(t, info, size + 1))) } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 4a3a9f6..81a3080 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -62,11 +62,11 @@ use crate::tokenizer::{State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { // Do not interrupt paragraphs. - if tokenizer.interrupt || !tokenizer.parse_state.constructs.code_indented { - State::Nok - } else { + if !tokenizer.interrupt && tokenizer.parse_state.constructs.code_indented { tokenizer.enter(Token::CodeIndented); tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer) + } else { + State::Nok } } @@ -129,29 +129,26 @@ fn after(tokenizer: &mut Tokenizer) -> State { /// | bbb /// ``` fn further_start(tokenizer: &mut Tokenizer) -> State { - if tokenizer.lazy { - State::Nok - } else { - match tokenizer.current { - Some(b'\n') => { - tokenizer.enter(Token::LineEnding); - tokenizer.consume(); - tokenizer.exit(Token::LineEnding); - State::Fn(Box::new(further_start)) - } - _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { - Box::new(if ok { further_end } else { further_begin }) - })(tokenizer), + match tokenizer.current { + Some(b'\n') if !tokenizer.lazy => { + tokenizer.enter(Token::LineEnding); + tokenizer.consume(); + tokenizer.exit(Token::LineEnding); + State::Fn(Box::new(further_start)) } + _ if !tokenizer.lazy => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { + Box::new(if ok { further_end } else { further_begin }) + })(tokenizer), + _ => State::Nok, } } -/// After a proper indent. +/// At an eol, which is followed by an indented line. 
/// /// ```markdown -/// | aaa -/// > | bbb -/// ^ +/// > | aaa +/// ^ +/// | bbb /// ``` fn further_end(_tokenizer: &mut Tokenizer) -> State { State::Ok diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index b36a208..d70fbc2 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -95,14 +95,13 @@ use crate::tokenizer::{State, Tokenizer}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let len = tokenizer.events.len(); - match tokenizer.current { Some(b'`') if tokenizer.parse_state.constructs.code_text && (tokenizer.previous != Some(b'`') - || (len > 0 - && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) => + || (!tokenizer.events.is_empty() + && tokenizer.events[tokenizer.events.len() - 1].token_type + == Token::CharacterEscape)) => { tokenizer.enter(Token::CodeText); tokenizer.enter(Token::CodeTextSequence); diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 14755c9..bd7df82 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -110,17 +110,18 @@ use crate::util::skip::opt_back as skip_opt_back; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let definition_before = !tokenizer.events.is_empty() - && tokenizer.events[skip_opt_back( - &tokenizer.events, - tokenizer.events.len() - 1, - &[Token::LineEnding, Token::SpaceOrTab], - )] - .token_type - == Token::Definition; - // Do not interrupt paragraphs (but do follow definitions). - if (!tokenizer.interrupt || definition_before) && tokenizer.parse_state.constructs.definition { + let possible = !tokenizer.interrupt + || (!tokenizer.events.is_empty() + && tokenizer.events[skip_opt_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Token::LineEnding, Token::SpaceOrTab], + )] + .token_type + == Token::Definition); + + if possible && tokenizer.parse_state.constructs.definition { tokenizer.enter(Token::Definition); // Note: arbitrary whitespace allowed even if code (indented) is on. 
tokenizer.attempt_opt(space_or_tab(), before)(tokenizer) diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index cdbc192..d09bf54 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -54,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => { tokenizer.enter(Token::HardBreakEscape); tokenizer.consume(); - State::Fn(Box::new(inside)) + State::Fn(Box::new(after)) } _ => State::Nok, } @@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// | b /// ``` -fn inside(tokenizer: &mut Tokenizer) -> State { +fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { Some(b'\n') => { tokenizer.exit(Token::HardBreakEscape); diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 9a73b77..aa388ee 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -66,15 +66,19 @@ use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer}; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - if tokenizer.parse_state.constructs.heading_atx { tokenizer.enter(Token::HeadingAtx); - tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) + tokenizer.go( + space_or_tab_min_max( + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ), + before, + )(tokenizer) } else { State::Nok } @@ -101,19 +105,19 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | ## aa /// ^ /// ``` -fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State { +fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - None | Some(b'\n') if rank > 0 => { + None | Some(b'\n') if size > 0 => { tokenizer.exit(Token::HeadingAtxSequence); at_break(tokenizer) } - Some(b'#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + Some(b'#') if size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |tokenizer| { - sequence_open(tokenizer, rank + 1) + sequence_open(tokenizer, size + 1) })) } - _ if rank > 0 => { + _ if size > 0 => { tokenizer.exit(Token::HeadingAtxSequence); tokenizer.go(space_or_tab(), at_break)(tokenizer) } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 2a4adbf..98d7843 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -63,52 +63,6 @@ use crate::token::Token; use crate::tokenizer::{EventType, State, Tokenizer}; use crate::util::skip::opt_back as skip_opt_back; -/// Kind of underline. -#[derive(Debug, Clone, PartialEq)] -pub enum Kind { - /// Dash (rank 2) heading. - /// - /// ## Example - /// - /// ```markdown - /// alpha - /// ----- - /// ``` - Dash, - - /// Equals to (rank 1) heading. - /// - /// ## Example - /// - /// ```markdown - /// alpha - /// ===== - /// ``` - EqualsTo, -} - -impl Kind { - /// Turn the kind into a byte ([u8]). - fn as_byte(&self) -> u8 { - match self { - Kind::Dash => b'-', - Kind::EqualsTo => b'=', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// ## Panics - /// - /// Panics if `byte` is not `-` or `=`. - fn from_byte(byte: u8) -> Kind { - match byte { - b'-' => Kind::Dash, - b'=' => Kind::EqualsTo, - _ => unreachable!("invalid byte"), - } - } -} - /// At a line ending, presumably an underline. 
/// /// ```markdown @@ -117,23 +71,29 @@ impl Kind { /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - let paragraph_before = !tokenizer.events.is_empty() - && tokenizer.events[skip_opt_back( - &tokenizer.events, - tokenizer.events.len() - 1, - &[Token::LineEnding, Token::SpaceOrTab], - )] - .token_type - == Token::Paragraph; - - // Require a paragraph before and do not allow on a lazy line. - if paragraph_before && !tokenizer.lazy && tokenizer.parse_state.constructs.heading_setext { - tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) + if tokenizer.parse_state.constructs.heading_setext + && !tokenizer.lazy + // Require a paragraph before. + && (!tokenizer.events.is_empty() + && tokenizer.events[skip_opt_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Token::LineEnding, Token::SpaceOrTab], + )] + .token_type + == Token::Paragraph) + { + tokenizer.go( + space_or_tab_min_max( + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ), + before, + )(tokenizer) } else { State::Nok } @@ -148,9 +108,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if matches!(byte, b'-' | b'=') => { + Some(b'-' | b'=') => { tokenizer.enter(Token::HeadingSetextUnderline); - inside(tokenizer, Kind::from_byte(byte)) + inside(tokenizer, tokenizer.current.unwrap()) } _ => State::Nok, } @@ -163,11 +123,11 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// > | == /// ^ /// ``` -fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State { +fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State { match tokenizer.current { - Some(byte) if byte == kind.as_byte() => { + Some(b'-' | b'=') if tokenizer.current.unwrap() == marker => { tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, kind))) + State::Fn(Box::new(move |t| inside(t, marker))) } _ => { tokenizer.exit(Token::HeadingSetextUnderline); diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 5860c5d..064da35 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -98,17 +98,17 @@ //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing -use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE}; +use crate::constant::{ + HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE, +}; use crate::construct::{ blank_line::start as blank_line, partial_non_lazy_continuation::start as partial_non_lazy_continuation, partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions}, }; use crate::token::Token; -use crate::tokenizer::{Point, State, Tokenizer}; -use crate::util::slice::{Position, Slice}; - -const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; +use crate::tokenizer::{State, Tokenizer}; +use crate::util::slice::Slice; /// Kind of HTML (flow). #[derive(Debug, PartialEq)] @@ -129,49 +129,6 @@ enum Kind { Complete, } -/// Type of quote, if we’re in a quoted attribute, in complete (condition 7). -#[derive(Debug, PartialEq)] -enum QuoteKind { - /// In a double quoted (`"`) attribute value. - /// - /// ## Example - /// - /// ```markdown - /// <a b="c" /> - /// ``` - Double, - /// In a single quoted (`'`) attribute value. 
- /// - /// ## Example - /// - /// ```markdown - /// <a b='c' /> - /// ``` - Single, -} - -impl QuoteKind { - /// Turn the kind into a byte ([u8]). - fn as_byte(&self) -> u8 { - match self { - QuoteKind::Double => b'"', - QuoteKind::Single => b'\'', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// ## Panics - /// - /// Panics if `byte` is not `"` or `'`. - fn from_byte(byte: u8) -> QuoteKind { - match byte { - b'"' => QuoteKind::Double, - b'\'' => QuoteKind::Single, - _ => unreachable!("invalid byte"), - } - } -} - /// State needed to parse HTML (flow). #[derive(Debug)] struct Info { @@ -179,12 +136,10 @@ struct Info { kind: Kind, /// Whether this is a start tag (`<` not followed by `/`). start_tag: bool, - /// Used depending on `kind` to collect all parsed bytes. - start: Option<Point>, - /// Collected index, for various reasons. - size: usize, + /// Start index of a tag name or cdata prefix. + start: usize, /// Current quote, when in a double or single quoted attribute value. - quote: Option<QuoteKind>, + quote: u8, } /// Start of HTML (flow), before optional whitespace. @@ -194,19 +149,17 @@ struct Info { /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - if tokenizer.parse_state.constructs.html_flow { tokenizer.enter(Token::HtmlFlow); tokenizer.go( space_or_tab_with_options(SpaceOrTabOptions { kind: Token::HtmlFlowData, min: 0, - max, + max: if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, connect: false, content_type: None, }), @@ -249,9 +202,8 @@ fn open(tokenizer: &mut Tokenizer) -> State { kind: Kind::Basic, // Assume closing tag (or no tag). start_tag: false, - start: None, - size: 0, - quote: None, + start: 0, + quote: 0, }; match tokenizer.current { @@ -261,7 +213,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { } Some(b'/') => { tokenizer.consume(); - info.start = Some(tokenizer.point.clone()); + info.start = tokenizer.point.index; State::Fn(Box::new(|t| tag_close_start(t, info))) } Some(b'?') => { @@ -273,9 +225,10 @@ fn open(tokenizer: &mut Tokenizer) -> State { // right now, so we do need to search for `>`, similar to declarations. State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } + // ASCII alphabetical. 
Some(b'A'..=b'Z' | b'a'..=b'z') => { info.start_tag = true; - info.start = Some(tokenizer.point.clone()); + info.start = tokenizer.point.index; tag_name(tokenizer, info) } _ => State::Nok, @@ -299,12 +252,6 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { info.kind = Kind::Comment; State::Fn(Box::new(|t| comment_open_inside(t, info))) } - Some(b'[') => { - tokenizer.consume(); - info.kind = Kind::Cdata; - info.size = 0; - State::Fn(Box::new(|t| cdata_open_inside(t, info))) - } Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); info.kind = Kind::Declaration; @@ -312,6 +259,12 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.concrete = true; State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } + Some(b'[') => { + tokenizer.consume(); + info.kind = Kind::Cdata; + info.start = tokenizer.point.index; + State::Fn(Box::new(|t| cdata_open_inside(t, info))) + } _ => State::Nok, } } @@ -342,12 +295,11 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(byte) if byte == CDATA_SEARCH[info.size] => { - info.size += 1; + Some(byte) if byte == HTML_CDATA_PREFIX[tokenizer.point.index - info.start] => { tokenizer.consume(); - if info.size == CDATA_SEARCH.len() { - info.size = 0; + if tokenizer.point.index - info.start == HTML_CDATA_PREFIX.len() { + info.start = 0; // Do not form containers. tokenizer.concrete = true; State::Fn(Box::new(|t| continuation(t, info))) @@ -367,6 +319,7 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { + // ASCII alphabetical. Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| tag_name(t, info))) @@ -387,17 +340,18 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => { let slash = matches!(tokenizer.current, Some(b'/')); - let start = info.start.take().unwrap(); - let name = Slice::from_position( + // Guaranteed to be valid ASCII bytes. + let slice = Slice::from_indices( tokenizer.parse_state.bytes, - &Position { - start: &start, - end: &tokenizer.point, - }, - ) - .serialize() - .trim() - .to_lowercase(); + info.start, + tokenizer.point.index, + ); + let name = slice + .as_str() + // The line ending case might result in a `\r` that is already accounted for. + .trim() + .to_ascii_lowercase(); + info.start = 0; if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) { info.kind = Kind::Raw; @@ -427,6 +381,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { } } } + // ASCII alphanumerical and `-`. Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| tag_name(t, info))) @@ -490,18 +445,19 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { + Some(b'\t' | b' ') => { + tokenizer.consume(); + State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) + } Some(b'/') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_end(t, info))) } + // ASCII alphanumerical and `:` and `_`. 
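`cdata_open_inside` now derives its position in the shared prefix from `tokenizer.point.index - info.start` instead of keeping a separate `size` field. The same arithmetic in isolation (hypothetical helper; only `HTML_CDATA_PREFIX`, i.e. `b"CDATA["`, is taken from the diff):

```rust
const HTML_CDATA_PREFIX: &[u8] = b"CDATA[";

/// Check the prefix byte-by-byte, computing the offset into the prefix from
/// the current index and the remembered start index.
fn matches_cdata_prefix(bytes: &[u8], start: usize) -> bool {
    let mut index = start;
    while index - start < HTML_CDATA_PREFIX.len() {
        match bytes.get(index) {
            Some(&byte) if byte == HTML_CDATA_PREFIX[index - start] => index += 1,
            _ => return false,
        }
    }
    true
}

fn main() {
    // `start` points just past `<![`.
    assert!(matches_cdata_prefix(b"<![CDATA[x]]>", 3));
    assert!(!matches_cdata_prefix(b"<![CDAT>", 3));
}
```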
Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) } - Some(b'\t' | b' ') => { - tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) - } _ => complete_end(tokenizer, info), } } @@ -518,6 +474,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat /// ``` fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { + // ASCII alphanumerical and `-`, `.`, `:`, and `_`. Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) @@ -537,14 +494,14 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(b'=') => { - tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) - } Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name_after(t, info))) } + Some(b'=') => { + tokenizer.consume(); + State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) + } _ => complete_attribute_name_before(tokenizer, info), } } @@ -561,15 +518,15 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, - Some(byte) if matches!(byte, b'"' | b'\'') => { - info.quote = Some(QuoteKind::from_byte(byte)); - tokenizer.consume(); - State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) - } Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) } + Some(b'"' | b'\'') => { + info.quote = tokenizer.current.unwrap(); + tokenizer.consume(); + State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) + } _ => complete_attribute_value_unquoted(tokenizer, info), } } @@ -585,7 +542,7 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { None | Some(b'\n') => State::Nok, - Some(byte) if byte == info.quote.as_ref().unwrap().as_byte() => { + Some(b'"' | b'\'') if tokenizer.current.unwrap() == info.quote => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info))) } @@ -673,6 +630,21 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { + Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { + tokenizer.exit(Token::HtmlFlowData); + tokenizer.check(blank_line_before, |ok| { + if ok { + Box::new(continuation_after) + } else { + Box::new(move |t| continuation_start(t, info)) + } + })(tokenizer) + } + // Note: important that this is after the basic/complete case. 
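Quoted attribute values get the same marker-byte treatment: the opening quote is stored as a plain `u8` and the scan stops only at its twin. A hedged sketch of that rule outside the state machine (this mirrors `html_flow`, where a line ending inside a quoted value is `Nok`; `html_text` instead continues across line endings):

```rust
/// Return the value between matching quotes, or `None` when unterminated.
fn quoted_value(bytes: &[u8]) -> Option<&[u8]> {
    let marker = *bytes.first()?;
    if !matches!(marker, b'"' | b'\'') {
        return None;
    }
    let mut index = 1;
    while let Some(&byte) = bytes.get(index) {
        if byte == marker {
            return Some(&bytes[1..index]);
        }
        if byte == b'\n' {
            return None;
        }
        index += 1;
    }
    None
}

fn main() {
    assert_eq!(quoted_value(br#""c d""#), Some(&b"c d"[..]));
    assert_eq!(quoted_value(b"'c\" d'"), Some(&b"c\" d"[..]));
    assert_eq!(quoted_value(b"\"open"), None);
}
```

The other quote kind is plain data inside the value, which is why storing the byte beats a two-variant enum here.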
+ None | Some(b'\n') => { + tokenizer.exit(Token::HtmlFlowData); + continuation_start(tokenizer, info) + } Some(b'-') if info.kind == Kind::Comment => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_comment_inside(t, info))) @@ -693,20 +665,6 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.consume(); State::Fn(Box::new(|t| continuation_character_data_inside(t, info))) } - Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { - tokenizer.exit(Token::HtmlFlowData); - tokenizer.check(blank_line_before, |ok| { - if ok { - Box::new(continuation_after) - } else { - Box::new(move |t| continuation_start(t, info)) - } - })(tokenizer) - } - None | Some(b'\n') => { - tokenizer.exit(Token::HtmlFlowData); - continuation_start(tokenizer, info) - } _ => { tokenizer.consume(); State::Fn(Box::new(|t| continuation(t, info))) @@ -793,7 +751,7 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State match tokenizer.current { Some(b'/') => { tokenizer.consume(); - info.start = Some(tokenizer.point.clone()); + info.start = tokenizer.point.index; State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) } _ => continuation(tokenizer, info), @@ -809,18 +767,15 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { Some(b'>') => { - info.size = 0; - - let start = info.start.take().unwrap(); - let name = Slice::from_position( + // Guaranteed to be valid ASCII bytes. + let slice = Slice::from_indices( tokenizer.parse_state.bytes, - &Position { - start: &start, - end: &tokenizer.point, - }, - ) - .serialize() - .to_lowercase(); + info.start, + tokenizer.point.index, + ); + let name = slice.as_str().to_ascii_lowercase(); + + info.start = 0; if HTML_RAW_NAMES.contains(&name.as_str()) { tokenizer.consume(); @@ -829,13 +784,14 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State continuation(tokenizer, info) } } - Some(b'A'..=b'Z' | b'a'..=b'z') if info.size < HTML_RAW_SIZE_MAX => { + Some(b'A'..=b'Z' | b'a'..=b'z') + if tokenizer.point.index - info.start < HTML_RAW_SIZE_MAX => + { tokenizer.consume(); - info.size += 1; State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) } _ => { - info.size = 0; + info.start = 0; continuation(tokenizer, info) } } diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index f10a476..51beda5 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -54,12 +54,11 @@ //! [html_flow]: crate::construct::html_flow //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +use crate::constant::HTML_CDATA_PREFIX; use crate::construct::partial_space_or_tab::space_or_tab; use crate::token::Token; use crate::tokenizer::{State, StateFn, Tokenizer}; -const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; - /// Start of HTML (text) /// /// ```markdown @@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(instruction)) } + // ASCII alphabetical. Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) @@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(comment_open_inside)) } - Some(b'[') => { - tokenizer.consume(); - State::Fn(Box::new(|t| cdata_open_inside(t, 0))) - } + // ASCII alphabetical. 
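The raw end-tag check (`</script>` and friends) now measures the candidate name with the same start-index trick, bounded by `HTML_RAW_SIZE_MAX`. A sketch with assumed constant values (the real list and limit live in `crate::constant` and are not shown in this diff):

```rust
// Assumed values: CommonMark's raw names; `textarea` (8 bytes) is longest.
const HTML_RAW_NAMES: [&str; 4] = ["pre", "script", "style", "textarea"];
const HTML_RAW_SIZE_MAX: usize = 8;

/// Does `bytes` start a raw-ending tag such as `</SCRIPT>`?
fn is_raw_end_tag(bytes: &[u8]) -> bool {
    if !bytes.starts_with(b"</") {
        return false;
    }
    let start = 2;
    let mut index = start;
    while index - start < HTML_RAW_SIZE_MAX
        && bytes.get(index).map_or(false, |byte| byte.is_ascii_alphabetic())
    {
        index += 1;
    }
    // The collected range is ASCII by construction, so UTF-8 can't fail.
    let name = std::str::from_utf8(&bytes[start..index])
        .unwrap()
        .to_ascii_lowercase();
    bytes.get(index) == Some(&b'>') && HTML_RAW_NAMES.contains(&name.as_str())
}

fn main() {
    assert!(is_raw_end_tag(b"</SCRIPT>"));
    assert!(!is_raw_end_tag(b"</div>"));
    assert!(!is_raw_end_tag(b"</script x>"));
}
```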
Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(declaration)) } + Some(b'[') => { + tokenizer.consume(); + State::Fn(Box::new(|t| cdata_open_inside(t, 0))) + } _ => State::Nok, } } @@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State { /// > | a <![CDATA[>&<]]> b /// ^^^^^^ /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State { - match tokenizer.current { - Some(byte) if byte == CDATA_SEARCH[index] => { - tokenizer.consume(); +fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State { + if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) { + tokenizer.consume(); - if index + 1 == CDATA_SEARCH.len() { - State::Fn(Box::new(cdata)) - } else { - State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1))) - } + if size + 1 == HTML_CDATA_PREFIX.len() { + State::Fn(Box::new(cdata)) + } else { + State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1))) } - _ => State::Nok, + } else { + State::Nok } } @@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphabetical. Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) @@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphanumerical and `-`. Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) @@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphanumerical and `-`. Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) @@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(end)) } + // ASCII alphabetical and `:` and `_`. Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) @@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphabetical and `-`, `.`, `:`, and `_`. Some(b'-' | b'.' 
| b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) @@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } - Some(byte) if byte == b'"' || byte == b'\'' => { + Some(b'"' | b'\'') => { + let marker = tokenizer.current.unwrap(); tokenizer.consume(); - State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte))) + State::Fn(Box::new(move |t| { + tag_open_attribute_value_quoted(t, marker) + })) } Some(_) => { tokenizer.consume(); @@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta tokenizer, Box::new(move |t| tag_open_attribute_value_quoted(t, marker)), ), - Some(byte) if byte == marker => { + Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_quoted_after)) } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 6399f81..a1ec8d9 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -214,16 +214,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { media: Media { start: label_start.start, end: (label_end_start, label_end_start + 3), - // To do: virtual spaces not needed, create a `to_str`? id: normalize_identifier( - &Slice::from_position( + // We don’t care about virtual spaces, so `indices` and `as_str` are fine. + Slice::from_indices( tokenizer.parse_state.bytes, - &Position { - start: &tokenizer.events[label_start.start.1].point, - end: &tokenizer.events[label_end_start - 1].point, - }, + tokenizer.events[label_start.start.1].point.index, + tokenizer.events[label_end_start - 1].point.index, ) - .serialize(), + .as_str(), ), }, }; @@ -366,11 +364,11 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ^ /// ``` fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State { - let label_start = tokenizer + tokenizer .label_start_stack .get_mut(label_start_index) - .unwrap(); - label_start.balanced = true; + .unwrap() + .balanced = true; State::Nok } @@ -529,23 +527,24 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn full_reference_after(tokenizer: &mut Tokenizer) -> State { - let end = skip::to_back( - &tokenizer.events, - tokenizer.events.len() - 1, - &[Token::ReferenceString], - ); - - // To do: virtual spaces not needed, create a `to_str`? - let id = Slice::from_position( - tokenizer.parse_state.bytes, - &Position::from_exit_event(&tokenizer.events, end), - ) - .serialize(); - if tokenizer .parse_state .definitions - .contains(&normalize_identifier(&id)) + // We don’t care about virtual spaces, so `as_str` is fine. 
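The id for a reference is now produced straight from a byte range of the source: `Slice::from_indices(...).as_str()` feeds `normalize_identifier` with no intermediate `String`. A sketch of that flow with a stand-in normalizer (assumption: the real helper also handles characters whose lowercase form uppercases differently, which is skipped here):

```rust
use std::str;

/// Stand-in for `normalize_identifier`: collapse and trim whitespace runs,
/// then lowercase.
fn normalize_identifier(value: &str) -> String {
    value
        .split_ascii_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_lowercase()
}

/// Build a label id from a byte range between two event points.
fn id_between(bytes: &[u8], start: usize, end: usize) -> String {
    normalize_identifier(str::from_utf8(&bytes[start..end]).unwrap())
}

fn main() {
    let bytes = b"[Foo   Bar]: /url";
    assert_eq!(id_between(bytes, 1, 10), "foo bar");
}
```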
+ .contains(&normalize_identifier( + Slice::from_position( + tokenizer.parse_state.bytes, + &Position::from_exit_event( + &tokenizer.events, + skip::to_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Token::ReferenceString], + ), + ), + ) + .as_str(), + )) { State::Ok } else { diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index d30b8dd..4a3508e 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -64,9 +64,8 @@ pub fn open(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); tokenizer.exit(Token::LabelMarker); tokenizer.exit(Token::LabelImage); - let end = tokenizer.events.len() - 1; tokenizer.label_start_stack.push(LabelStart { - start: (end - 5, end), + start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1), balanced: false, inactive: false, }); diff --git a/src/construct/list.rs b/src/construct/list.rs index 9b59130..d5a9899 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -56,69 +56,6 @@ use crate::util::{ slice::{Position, Slice}, }; -/// Type of list. -#[derive(Debug, PartialEq)] -enum Kind { - /// In a dot (`.`) list item. - /// - /// ## Example - /// - /// ```markdown - /// 1. a - /// ``` - Dot, - /// In a paren (`)`) list item. - /// - /// ## Example - /// - /// ```markdown - /// 1) a - /// ``` - Paren, - /// In an asterisk (`*`) list item. - /// - /// ## Example - /// - /// ```markdown - /// * a - /// ``` - Asterisk, - /// In a plus (`+`) list item. - /// - /// ## Example - /// - /// ```markdown - /// + a - /// ``` - Plus, - /// In a dash (`-`) list item. - /// - /// ## Example - /// - /// ```markdown - /// - a - /// ``` - Dash, -} - -impl Kind { - /// Turn a byte ([u8]) into a kind. - /// - /// ## Panics - /// - /// Panics if `byte` is not `.`, `)`, `*`, `+`, or `-`. - fn from_byte(byte: u8) -> Kind { - match byte { - b'.' => Kind::Dot, - b')' => Kind::Paren, - b'*' => Kind::Asterisk, - b'+' => Kind::Plus, - b'-' => Kind::Dash, - _ => unreachable!("invalid byte"), - } - } -} - /// Start of list item. /// /// ```markdown @@ -126,15 +63,19 @@ impl Kind { /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - if tokenizer.parse_state.constructs.list { tokenizer.enter(Token::ListItem); - tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) + tokenizer.go( + space_or_tab_min_max( + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ), + before, + )(tokenizer) } else { State::Nok } @@ -149,15 +90,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // Unordered. - Some(b'*' | b'+' | b'-') => tokenizer.check(thematic_break, |ok| { + Some(b'*' | b'-') => tokenizer.check(thematic_break, |ok| { Box::new(if ok { nok } else { before_unordered }) })(tokenizer), + Some(b'+') => before_unordered(tokenizer), // Ordered. - Some(byte) if byte.is_ascii_digit() && (!tokenizer.interrupt || byte == b'1') => { - tokenizer.enter(Token::ListItemPrefix); - tokenizer.enter(Token::ListItemValue); - inside(tokenizer, 0) - } + Some(b'0'..=b'9') if !tokenizer.interrupt => before_ordered(tokenizer), + Some(b'1') => before_ordered(tokenizer), _ => State::Nok, } } @@ -175,6 +114,18 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State { marker(tokenizer) } +/// Start of an ordered list item. 
+/// +/// ```markdown +/// > | * a +/// ^ +/// ``` +fn before_ordered(tokenizer: &mut Tokenizer) -> State { + tokenizer.enter(Token::ListItemPrefix); + tokenizer.enter(Token::ListItemValue); + inside(tokenizer, 0) +} + /// In an ordered list item value. /// /// ```markdown @@ -183,14 +134,14 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some(byte) if byte.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { - tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, size + 1))) - } Some(b'.' | b')') if !tokenizer.interrupt || size < 2 => { tokenizer.exit(Token::ListItemValue); marker(tokenizer) } + Some(b'0'..=b'9') if size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { + tokenizer.consume(); + State::Fn(Box::new(move |t| inside(t, size + 1))) + } _ => State::Nok, } } @@ -262,7 +213,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn whitespace_after(tokenizer: &mut Tokenizer) -> State { - if matches!(tokenizer.current, Some(b'\t' | b' ')) { + if let Some(b'\t' | b' ') = tokenizer.current { State::Nok } else { State::Ok @@ -309,7 +260,7 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State { end: &tokenizer.point, }, ) - .size(); + .len(); if blank { prefix += 1; @@ -389,8 +340,8 @@ fn nok(_tokenizer: &mut Tokenizer) -> State { pub fn resolve_list_item(tokenizer: &mut Tokenizer) { let mut index = 0; let mut balance = 0; - let mut lists_wip: Vec<(Kind, usize, usize, usize)> = vec![]; - let mut lists: Vec<(Kind, usize, usize, usize)> = vec![]; + let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; + let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; // Merge list items. while index < tokenizer.events.len() { @@ -400,12 +351,14 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) { if event.event_type == EventType::Enter { let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1; let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]); - let kind = Kind::from_byte( - Slice::from_point(tokenizer.parse_state.bytes, &tokenizer.events[marker].point) - .head() - .unwrap(), - ); - let current = (kind, balance, index, end); + // Guaranteed to be a valid ASCII byte. + let marker = Slice::from_index( + tokenizer.parse_state.bytes, + tokenizer.events[marker].point.index, + ) + .head() + .unwrap(); + let current = (marker, balance, index, end); let mut list_index = lists_wip.len(); let mut matched = false; @@ -475,7 +428,7 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) { let mut list_start = tokenizer.events[list_item.2].clone(); let mut list_end = tokenizer.events[list_item.3].clone(); let token_type = match list_item.0 { - Kind::Paren | Kind::Dot => Token::ListOrdered, + b'.' | b')' => Token::ListOrdered, _ => Token::ListUnordered, }; list_start.token_type = token_type.clone(); diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 146dc40..ec5669c 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -81,10 +81,9 @@ fn inside(tokenizer: &mut Tokenizer) -> State { /// Merge “`Paragraph`”s, which currently span a single line, into actual /// `Paragraph`s that span multiple lines. 
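The reordered `inside` above keeps the CommonMark limit on ordered list values: at most nine digits before the `.` or `)` marker. The same check as a plain function, assuming `LIST_ITEM_VALUE_SIZE_MAX` is 10 (it is defined in `crate::constant`, outside this diff):

```rust
const LIST_ITEM_VALUE_SIZE_MAX: usize = 10;

/// Return `(digit count, marker)` for an ordered list prefix such as `123.`.
fn ordered_prefix(bytes: &[u8]) -> Option<(usize, u8)> {
    let mut size = 0;
    // Mirrors the `size + 1 < LIST_ITEM_VALUE_SIZE_MAX` guard: nine digits max.
    while size + 1 < LIST_ITEM_VALUE_SIZE_MAX
        && bytes.get(size).map_or(false, u8::is_ascii_digit)
    {
        size += 1;
    }
    match bytes.get(size).copied() {
        Some(marker) if size > 0 && matches!(marker, b'.' | b')') => Some((size, marker)),
        _ => None,
    }
}

fn main() {
    assert_eq!(ordered_prefix(b"1. a"), Some((1, b'.')));
    assert_eq!(ordered_prefix(b"123456789) a"), Some((9, b')')));
    assert_eq!(ordered_prefix(b"1234567890. a"), None); // Ten digits: too long.
}
```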
pub fn resolve(tokenizer: &mut Tokenizer) { - let len = tokenizer.events.len(); let mut index = 0; - while index < len { + while index < tokenizer.events.len() { let event = &tokenizer.events[index]; if event.event_type == EventType::Enter && event.token_type == Token::Paragraph { diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs index be8d6c8..155a1a3 100644 --- a/src/construct/partial_bom.rs +++ b/src/construct/partial_bom.rs @@ -10,13 +10,12 @@ use crate::tokenizer::{State, Tokenizer}; /// ^^^^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Some(0xEF) => { - tokenizer.enter(Token::ByteOrderMark); - tokenizer.consume(); - State::Fn(Box::new(cont)) - } - _ => State::Nok, + if tokenizer.current == Some(0xEF) { + tokenizer.enter(Token::ByteOrderMark); + tokenizer.consume(); + State::Fn(Box::new(cont)) + } else { + State::Nok } } @@ -27,12 +26,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^^^^ /// ``` fn cont(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Some(0xBB) => { - tokenizer.consume(); - State::Fn(Box::new(end)) - } - _ => State::Nok, + if tokenizer.current == Some(0xBB) { + tokenizer.consume(); + State::Fn(Box::new(end)) + } else { + State::Nok } } @@ -43,12 +41,11 @@ fn cont(tokenizer: &mut Tokenizer) -> State { /// ^^^^ /// ``` fn end(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Some(0xBF) => { - tokenizer.consume(); - tokenizer.exit(Token::ByteOrderMark); - State::Ok - } - _ => State::Nok, + if tokenizer.current == Some(0xBF) { + tokenizer.consume(); + tokenizer.exit(Token::ByteOrderMark); + State::Ok + } else { + State::Nok } } diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 0a3721c..809aa27 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -125,8 +125,8 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { tokenizer.exit(info.options.marker.clone()); State::Fn(Box::new(|t| enclosed_before(t, info))) } - None | Some(b' ' | b')') => State::Nok, - Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok, + // ASCII control, space, closing paren, but *not* `\0`. 
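The BOM construct above checks the fixed sequence `EF BB BF` one byte per state. Outside a streaming tokenizer the whole test is a prefix match; this is only a restatement, not how the crate does it:

```rust
/// The UTF-8 encoding of U+FEFF, the byte order mark.
fn starts_with_bom(bytes: &[u8]) -> bool {
    bytes.starts_with(&[0xEF, 0xBB, 0xBF])
}

fn main() {
    assert!(starts_with_bom("\u{FEFF}hi".as_bytes()));
    assert!(!starts_with_bom(b"hi"));
}
```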
+ None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok, Some(_) => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.raw.clone()); @@ -166,12 +166,12 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { + None | Some(b'\n' | b'<') => State::Nok, Some(b'>') => { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); enclosed_before(tokenizer, info) } - None | Some(b'\n' | b'<') => State::Nok, Some(b'\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed_escape(t, info))) @@ -207,40 +207,25 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(b'(') => { - if info.balance >= info.options.limit { - State::Nok - } else { - tokenizer.consume(); - info.balance += 1; - State::Fn(Box::new(move |t| raw(t, info))) - } + None | Some(b'\t' | b'\n' | b' ' | b')') if info.balance == 0 => { + tokenizer.exit(Token::Data); + tokenizer.exit(info.options.string.clone()); + tokenizer.exit(info.options.raw.clone()); + tokenizer.exit(info.options.destination); + State::Ok } - Some(b')') => { - if info.balance == 0 { - tokenizer.exit(Token::Data); - tokenizer.exit(info.options.string.clone()); - tokenizer.exit(info.options.raw.clone()); - tokenizer.exit(info.options.destination); - State::Ok - } else { - tokenizer.consume(); - info.balance -= 1; - State::Fn(Box::new(move |t| raw(t, info))) - } + Some(b'(') if info.balance < info.options.limit => { + tokenizer.consume(); + info.balance += 1; + State::Fn(Box::new(move |t| raw(t, info))) } - None | Some(b'\t' | b'\n' | b' ') => { - if info.balance > 0 { - State::Nok - } else { - tokenizer.exit(Token::Data); - tokenizer.exit(info.options.string.clone()); - tokenizer.exit(info.options.raw.clone()); - tokenizer.exit(info.options.destination); - State::Ok - } + // ASCII control (but *not* `\0`) and space and `(`. 
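The new range pattern folds the old two-arm check (`b' ' | b')'`, plus "any ASCII control except NUL") into one `matches!`. That equivalence can be checked exhaustively, since ASCII controls are exactly `0x00..=0x1F` and `0x7F`:

```rust
/// The destination-start rejection set, as the new pattern writes it.
fn ends_destination(byte: u8) -> bool {
    matches!(byte, 0x01..=0x1F | b' ' | b')' | 0x7F)
}

fn main() {
    for byte in 0u8..=0xFF {
        let old = byte == b' '
            || byte == b')'
            || (byte != b'\0' && byte.is_ascii_control());
        assert_eq!(ends_destination(byte), old);
    }
}
```

`\0` is deliberately left out: it is handled later by `encode`, which maps it to U+FFFD rather than terminating the destination.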
+ None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => State::Nok, + Some(b')') => { + tokenizer.consume(); + info.balance -= 1; + State::Fn(Box::new(move |t| raw(t, info))) } - Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok, Some(b'\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw_escape(t, info))) diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 7e40a2d..6fdb70d 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -123,39 +123,43 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ^ /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { - match tokenizer.current { - None | Some(b'[') => State::Nok, - Some(b']') if !info.data => State::Nok, - _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok, - Some(b']') => { - tokenizer.exit(info.options.string.clone()); - tokenizer.enter(info.options.marker.clone()); - tokenizer.consume(); - tokenizer.exit(info.options.marker.clone()); - tokenizer.exit(info.options.label); - State::Ok - } - Some(b'\n') => tokenizer.go( - space_or_tab_eol_with_options(EolOptions { - content_type: Some(ContentType::String), - connect: info.connect, - }), - |t| { - info.connect = true; - at_break(t, info) - }, - )(tokenizer), - _ => { - tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - - if info.connect { - let index = tokenizer.events.len() - 1; - link(&mut tokenizer.events, index); - } else { - info.connect = true; + if info.size > LINK_REFERENCE_SIZE_MAX + || matches!(tokenizer.current, None | Some(b'[')) + || (matches!(tokenizer.current, Some(b']')) && !info.data) + { + State::Nok + } else { + match tokenizer.current { + Some(b'\n') => tokenizer.go( + space_or_tab_eol_with_options(EolOptions { + content_type: Some(ContentType::String), + connect: info.connect, + }), + |t| { + info.connect = true; + at_break(t, info) + }, + )(tokenizer), + Some(b']') => { + tokenizer.exit(info.options.string.clone()); + tokenizer.enter(info.options.marker.clone()); + tokenizer.consume(); + tokenizer.exit(info.options.marker.clone()); + tokenizer.exit(info.options.label); + State::Ok } + _ => { + tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); + + if info.connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } else { + info.connect = true; + } - label(tokenizer, info) + label(tokenizer, info) + } } } } @@ -172,30 +176,19 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - _ if info.size > LINK_REFERENCE_SIZE_MAX => { - tokenizer.exit(Token::Data); - at_break(tokenizer, info) - } - Some(b'\t' | b' ') => { - tokenizer.consume(); - info.size += 1; - State::Fn(Box::new(|t| label(t, info))) - } - Some(b'\\') => { - tokenizer.consume(); - info.size += 1; - if !info.data { - info.data = true; - } - State::Fn(Box::new(|t| escape(t, info))) - } - Some(_) => { - tokenizer.consume(); - info.size += 1; - if !info.data { - info.data = true; + Some(byte) => { + if info.size > LINK_REFERENCE_SIZE_MAX { + tokenizer.exit(Token::Data); + at_break(tokenizer, info) + } else { + let func = if matches!(byte, b'\\') { escape } else { label }; + tokenizer.consume(); + info.size += 1; + if !info.data && !matches!(byte, b'\t' | b' ') { + info.data = true; + } + State::Fn(Box::new(move |t| func(t, info))) } - State::Fn(Box::new(|t| label(t, info))) } } } diff --git a/src/construct/partial_title.rs 
b/src/construct/partial_title.rs index 80861af..9cf2f14 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -48,70 +48,13 @@ pub struct Options { pub string: Token, } -/// Type of title. -#[derive(Debug, PartialEq)] -enum Kind { - /// In a parenthesized (`(` and `)`) title. - /// - /// ## Example - /// - /// ```markdown - /// (a) - /// ``` - Paren, - /// In a double quoted (`"`) title. - /// - /// ## Example - /// - /// ```markdown - /// "a" - /// ``` - Double, - /// In a single quoted (`'`) title. - /// - /// ## Example - /// - /// ```markdown - /// 'a' - /// ``` - Single, -} - -impl Kind { - /// Turn the kind into a byte ([u8]). - /// - /// > 👉 **Note**: a closing paren is used for `Kind::Paren`. - fn as_byte(&self) -> u8 { - match self { - Kind::Paren => b')', - Kind::Double => b'"', - Kind::Single => b'\'', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. - /// - /// ## Panics - /// - /// Panics if `byte` is not `(`, `"`, or `'`. - fn from_byte(byte: u8) -> Kind { - match byte { - b'(' => Kind::Paren, - b'"' => Kind::Double, - b'\'' => Kind::Single, - _ => unreachable!("invalid byte"), - } - } -} - /// State needed to parse titles. #[derive(Debug)] struct Info { /// Whether we’ve seen data. connect: bool, - /// Kind of title. - kind: Kind, + /// Closing marker. + marker: u8, /// Configuration. options: Options, } @@ -124,10 +67,11 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Some(byte) if matches!(byte, b'"' | b'\'' | b'(') => { + Some(b'"' | b'\'' | b'(') => { + let marker = tokenizer.current.unwrap(); let info = Info { connect: false, - kind: Kind::from_byte(byte), + marker: if marker == b'(' { b')' } else { marker }, options, }; tokenizer.enter(info.options.title.clone()); @@ -150,7 +94,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -172,10 +116,6 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { - tokenizer.exit(info.options.string.clone()); - begin(tokenizer, info) - } None => State::Nok, Some(b'\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { @@ -187,7 +127,11 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { at_break(t, info) }, )(tokenizer), - _ => { + Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { + tokenizer.exit(info.options.string.clone()); + begin(tokenizer, info) + } + Some(_) => { tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); if info.connect { @@ -210,21 +154,18 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn title(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + None | Some(b'\n') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - None | Some(b'\n') => { + Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { tokenizer.exit(Token::Data); 
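The title refactor stores the *closing* marker up front, mapping an opening `(` to `)` so that every later comparison is a single byte equality, while `"` and `'` close themselves. A sketch, ignoring escapes and multi-line titles:

```rust
/// Map an opening title byte to the marker that must close it.
fn closing_marker(opening: u8) -> Option<u8> {
    match opening {
        b'"' | b'\'' => Some(opening),
        b'(' => Some(b')'),
        _ => None,
    }
}

/// Return the text of a complete single-line title.
fn title(bytes: &[u8]) -> Option<&[u8]> {
    let marker = closing_marker(*bytes.first()?)?;
    let end = bytes[1..].iter().position(|&byte| byte == marker)? + 1;
    Some(&bytes[1..end])
}

fn main() {
    assert_eq!(title(b"(alpha)"), Some(&b"alpha"[..]));
    assert_eq!(title(b"\"bravo\""), Some(&b"bravo"[..]));
    assert_eq!(title(b"(open"), None);
}
```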
at_break(tokenizer, info) } - Some(b'\\') => { + Some(byte) => { + let func = if matches!(byte, b'\\') { escape } else { title }; tokenizer.consume(); - State::Fn(Box::new(|t| escape(t, info))) - } - _ => { - tokenizer.consume(); - State::Fn(Box::new(|t| title(t, info))) + State::Fn(Box::new(move |t| func(t, info))) } } } @@ -237,7 +178,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'"' | b'\'' | b')') => { tokenizer.consume(); State::Fn(Box::new(|t| title(t, info))) } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 13815cb..4f872ba 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -92,8 +92,7 @@ fn trim_data( if trim_end { let mut index = slice.bytes.len(); - let vs = slice.after; - let mut spaces_only = vs == 0; + let mut spaces_only = slice.after == 0; while index > 0 { match slice.bytes[index - 1] { b' ' => {} @@ -105,10 +104,10 @@ fn trim_data( } let diff = slice.bytes.len() - index; - let token_type = if spaces_only - && hard_break - && exit_index + 1 < tokenizer.events.len() + let token_type = if hard_break + && spaces_only && diff >= HARD_BREAK_PREFIX_SIZE_MIN + && exit_index + 1 < tokenizer.events.len() { Token::HardBreakTrailing } else { @@ -123,7 +122,7 @@ fn trim_data( return; } - if diff > 0 || vs > 0 { + if diff > 0 || slice.after > 0 { let exit_point = tokenizer.events[exit_index].point.clone(); let mut enter_point = exit_point.clone(); enter_point.index -= diff; @@ -156,14 +155,11 @@ fn trim_data( if trim_start { let mut index = 0; - let vs = slice.before; while index < slice.bytes.len() { match slice.bytes[index] { - b' ' | b'\t' => {} + b' ' | b'\t' => index += 1, _ => break, } - - index += 1; } // The whole data is whitespace. @@ -174,7 +170,7 @@ fn trim_data( return; } - if index > 0 || vs > 0 { + if index > 0 || slice.before > 0 { let enter_point = tokenizer.events[exit_index - 1].point.clone(); let mut exit_point = enter_point.clone(); exit_point.index += index; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 4fc4dc4..785d132 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -53,64 +53,11 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::token::Token; use crate::tokenizer::{State, Tokenizer}; -/// Type of thematic break. -#[derive(Debug, PartialEq)] -enum Kind { - /// In a thematic break using asterisks (`*`). - /// - /// ## Example - /// - /// ```markdown - /// *** - /// ``` - Asterisk, - /// In a thematic break using dashes (`-`). - /// - /// ## Example - /// - /// ```markdown - /// --- - /// ``` - Dash, - /// In a thematic break using underscores (`_`). - /// - /// ## Example - /// - /// ```markdown - /// ___ - /// ``` - Underscore, -} - -impl Kind { - /// Turn the kind into a byte ([u8]). - fn as_byte(&self) -> u8 { - match self { - Kind::Asterisk => b'*', - Kind::Dash => b'-', - Kind::Underscore => b'_', - } - } - /// Turn a byte ([u8]) into a kind. - /// - /// ## Panics - /// - /// Panics if `byte` is not `*`, `-`, or `_`. - fn from_byte(byte: u8) -> Kind { - match byte { - b'*' => Kind::Asterisk, - b'-' => Kind::Dash, - b'_' => Kind::Underscore, - _ => unreachable!("invalid byte"), - } - } -} - /// State needed to parse thematic breaks. #[derive(Debug)] struct Info { - /// Kind of marker. 
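The reshuffled conditions in `trim_data` decide whether trailing whitespace is a hard break. A sketch of that classification under stated assumptions (trailing tabs are trimmed but disqualify the break, and `HARD_BREAK_PREFIX_SIZE_MIN` is 2; both inferred from the hunk rather than shown in full):

```rust
const HARD_BREAK_PREFIX_SIZE_MIN: usize = 2;

/// Two or more trailing spaces (spaces only) mark a hard break.
fn is_hard_break_trailing(line: &[u8]) -> bool {
    let mut index = line.len();
    let mut spaces_only = true;
    while index > 0 {
        match line[index - 1] {
            b' ' => {}
            b'\t' => spaces_only = false,
            _ => break,
        }
        index -= 1;
    }
    let diff = line.len() - index;
    spaces_only && diff >= HARD_BREAK_PREFIX_SIZE_MIN
}

fn main() {
    assert!(is_hard_break_trailing(b"alpha  "));
    assert!(!is_hard_break_trailing(b"alpha "));
    assert!(!is_hard_break_trailing(b"alpha \t"));
}
```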
- kind: Kind, + /// Marker. + marker: u8, /// Number of markers. size: usize, } @@ -122,15 +69,19 @@ struct Info { /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - let max = if tokenizer.parse_state.constructs.code_indented { - TAB_SIZE - 1 - } else { - usize::MAX - }; - if tokenizer.parse_state.constructs.thematic_break { tokenizer.enter(Token::ThematicBreak); - tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) + tokenizer.go( + space_or_tab_min_max( + 0, + if tokenizer.parse_state.constructs.code_indented { + TAB_SIZE - 1 + } else { + usize::MAX + }, + ), + before, + )(tokenizer) } else { State::Nok } @@ -144,10 +95,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(byte) if matches!(byte, b'*' | b'-' | b'_') => at_break( + Some(b'*' | b'-' | b'_') => at_break( tokenizer, Info { - kind: Kind::from_byte(byte), + marker: tokenizer.current.unwrap(), size: 0, }, ), @@ -163,13 +114,13 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some(b'\n' | b'\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { + None | Some(b'\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(Token::ThematicBreak); // Feel free to interrupt. tokenizer.interrupt = false; State::Ok } - Some(byte) if byte == info.kind.as_byte() => { + Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => { tokenizer.enter(Token::ThematicBreakSequence); sequence(tokenizer, info) } @@ -185,7 +136,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(byte) if byte == info.kind.as_byte() => { + Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| sequence(t, info))) diff --git a/src/content/document.rs b/src/content/document.rs index 828431d..76d510a 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -89,14 +89,13 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> { let event = &tokenizer.events[index]; if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString { - // To do: when we operate on u8, we can use a `to_str` here as we - // don‘t need virtual spaces. + // Note: we don‘t care about virtual spaces, so `as_str` is fine. let id = normalize_identifier( - &Slice::from_position( + Slice::from_position( tokenizer.parse_state.bytes, &Position::from_exit_event(&tokenizer.events, index), ) - .serialize(), + .as_str(), ); if !definitions.contains(&id) { @@ -423,6 +423,6 @@ pub fn micromark(value: &str) -> String { /// ``` #[must_use] pub fn micromark_with_options(value: &str, options: &Options) -> String { - let (events, result) = parse(value, options); - compile(&events, result.bytes, options) + let (events, bytes) = parse(value, options); + compile(&events, bytes, options) } diff --git a/src/parser.rs b/src/parser.rs index 613b206..23afb37 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -20,7 +20,7 @@ pub struct ParseState<'a> { /// Turn a string of markdown into events. /// /// Passes the codes back so the compiler can access the source. 
-pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, ParseState<'a>) { +pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, &'a [u8]) { let mut parse_state = ParseState { constructs: &options.constructs, bytes: value.as_bytes(), @@ -37,6 +37,5 @@ pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, ParseStat }, ); - // To do: return bytes only? - (events, parse_state) + (events, parse_state.bytes) } diff --git a/src/unicode.rs b/src/unicode.rs index a8445f9..764d4c7 100644 --- a/src/unicode.rs +++ b/src/unicode.rs @@ -6,7 +6,7 @@ /// > It is generate from the latest Unicode data. /// /// Rust does not contain an `is_punctuation` method on `char`, while it does -/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation). +/// support [`is_ascii_alphanumeric`](char::is_ascii_alphanumeric). /// /// `CommonMark` handles attention (emphasis, strong) markers based on what /// comes before or after them. diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs index 5277f90..f8fd18f 100644 --- a/src/util/decode_character_reference.rs +++ b/src/util/decode_character_reference.rs @@ -57,9 +57,9 @@ pub fn decode_named(value: &str) -> String { /// ```rust ignore /// use micromark::util::decode_character_reference::decode_numeric; /// -/// assert_eq!(decode_numeric("123", 10), '{'); -/// assert_eq!(decode_numeric("9", 16), '\t'); -/// assert_eq!(decode_numeric("0", 10), '�'); // Not allowed. +/// assert_eq!(decode_numeric("123", 10), "{"); +/// assert_eq!(decode_numeric("9", 16), "\t"); +/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed. /// ``` /// /// ## Panics @@ -74,27 +74,19 @@ pub fn decode_named(value: &str) -> String { /// /// * [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference) /// * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references) -pub fn decode_numeric(value: &str, radix: u32) -> char { - let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int"); - - if - // C0 except for HT, LF, FF, CR, space - code < 0x09 || - code == 0x0B || - (code > 0x0D && code < 0x20) || - // Control character (DEL) of the basic block and C1 controls. - (code > 0x7E && code < 0xA0) || - // Lone high surrogates and low surrogates. - (code > 0xd7ff && code < 0xe000) || - // Noncharacters. - (code > 0xfdcf && code < 0xfdf0) || - ((code & 0xffff) == 0xffff) || - ((code & 0xffff) == 0xfffe) || - // Out of range - code > 0x0010_ffff - { - char::REPLACEMENT_CHARACTER - } else { - char::from_u32(code).expect("expected valid `code`") +pub fn decode_numeric(value: &str, radix: u32) -> String { + if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) { + if !matches!(char, + // C0 except for HT, LF, FF, CR, space + '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' | + // Control character (DEL) of c0, and C1 controls. + '\u{7F}'..='\u{9F}' + // Lone surrogates, noncharacters, and out of range are handled by + // Rust. 
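The rewritten `decode_numeric` above leans on `char::from_u32` for what the old hand-rolled ranges covered for surrogates and out-of-range values, keeping explicit filters only for the control blocks. The standard-library behavior it relies on, checked directly:

```rust
fn main() {
    assert_eq!(char::from_u32(0xD800), None); // Lone surrogate: rejected by Rust.
    assert_eq!(char::from_u32(0x0011_0000), None); // Beyond U+10FFFF: rejected.
    assert_eq!(char::from_u32(0x7B), Some('{')); // So `&#123;` decodes to `{`.
    assert_eq!(u32::from_str_radix("9", 16), Ok(9)); // And `&#x9;` is a tab.
}
```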
+ ) { + return char.to_string(); + } } + + char::REPLACEMENT_CHARACTER.to_string() } diff --git a/src/util/encode.rs b/src/util/encode.rs index 91c5462..d37a2de 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -20,37 +20,33 @@ /// ## References /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) -pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String { - let check = if encode_html { check_all } else { check_nil }; - let mut value = value.into(); - +pub fn encode(value: &str, encode_html: bool) -> String { // It’ll grow a bit bigger for each dangerous character. let mut result = String::with_capacity(value.len()); + let bytes = value.as_bytes(); + let mut index = 0; + let mut start = 0; - while let Some(indice) = value.find(check) { - let after = value.split_off(indice + 1); - let dangerous = value.pop().unwrap(); - result.push_str(&value); - result.push_str(match dangerous { - '\0' => "�", - '&' => "&amp;", - '"' => "&quot;", - '<' => "&lt;", - '>' => "&gt;", - _ => unreachable!("xxx"), - }); - value = after; - } + while index < bytes.len() { + let byte = bytes[index]; + if matches!(byte, b'\0') || (encode_html && matches!(byte, b'&' | b'"' | b'<' | b'>')) { + result.push_str(&value[start..index]); + result.push_str(match byte { + b'\0' => "�", + b'&' => "&amp;", + b'"' => "&quot;", + b'<' => "&lt;", + b'>' => "&gt;", + _ => panic!("impossible"), + }); - result.push_str(&value); + start = index + 1; + } - result -} + index += 1; + } -fn check_all(char: char) -> bool { - matches!(char, '\0' | '&' | '"' | '<' | '>') -} + result.push_str(&value[start..]); -fn check_nil(char: char) -> bool { - matches!(char, '\0') + result } diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index 42a2bb0..f5b12d0 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -34,25 +34,34 @@ pub fn normalize_identifier(value: &str) -> String { // Note: it’ll grow a bit smaller for consecutive whitespace. let mut result = String::with_capacity(value.len()); - let mut at_start = true; - let mut at_whitespace = true; + let bytes = value.as_bytes(); + let mut in_whitespace = true; + let mut index = 0; + let mut start = 0; - // Collapse markdown whitespace and trim it. - for char in value.chars() { - match char { - '\t' | '\n' | '\r' | ' ' => { - at_whitespace = true; + while index < bytes.len() { + if matches!(bytes[index], b'\t' | b'\n' | b'\r' | b' ') { + // First whitespace we see after non-whitespace. + if !in_whitespace { + result.push_str(&value[start..index]); + in_whitespace = true; } - _ => { - if at_whitespace && !at_start { - result.push(' '); - } - - result.push(char); - at_start = false; - at_whitespace = false; + } + // First non-whitespace we see after whitespace.
+ else if in_whitespace { + if start != 0 { + result.push(' '); } + + start = index; + in_whitespace = false; } + + index += 1; + } + + if !in_whitespace { + result.push_str(&value[start..]); } // Some characters are considered “uppercase”, but if their lowercase diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 8c09549..051e1e1 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -32,7 +32,7 @@ use crate::util::encode::encode; /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String { - let value = encode(normalize_uri(value), true); + let value = encode(&*normalize_uri(value), true); if let Some(protocols) = protocols { let end = value.find(|c| matches!(c, '?' | '#' | '/')); diff --git a/src/util/slice.rs b/src/util/slice.rs index cd3641e..d899dac 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -2,6 +2,7 @@ use crate::constant::TAB_SIZE; use crate::tokenizer::{Event, EventType, Point}; +use std::str; /// A range between two places. #[derive(Debug)] @@ -78,6 +79,15 @@ impl<'a> Slice<'a> { } } + /// To do. + pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> { + Slice { + bytes: &bytes[index..=index], + before: 0, + after: 0, + } + } + /// Get the slice belonging to a position. pub fn from_position(bytes: &'a [u8], position: &Position) -> Slice<'a> { let mut before = position.start.vs; @@ -107,14 +117,18 @@ } /// To do. - // To do: rename to `len`? - pub fn size(&self) -> usize { - self.bytes.len() + self.before + self.after + pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> { + Slice { + bytes: &bytes[start..end], + before: 0, + after: 0, + } } - // To do: - // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html> - // to implement an `as_str`. + /// To do. + pub fn len(&self) -> usize { + self.bytes.len() + self.before + self.after + } /// To do. pub fn head(&self) -> Option<u8> { @@ -127,16 +141,20 @@ } } + // To do: + pub fn as_str(&self) -> &str { + str::from_utf8(self.bytes).unwrap() + } + /// To do. pub fn serialize(&self) -> String { - let mut string = String::with_capacity(self.size()); + let mut string = String::with_capacity(self.len()); let mut index = self.before; while index > 0 { string.push(' '); index -= 1; } - // To do: invalid UTF8? - string.push_str(std::str::from_utf8(self.bytes).unwrap()); + string.push_str(self.as_str()); index = self.after; while index > 0 { string.push(' '); diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs index f251952..99f729e 100644 --- a/tests/code_fenced.rs +++ b/tests/code_fenced.rs @@ -221,6 +221,12 @@ fn code_fenced() { ); assert_eq!( + micromark("```a\\&b\0c"), + "<pre><code class=\"language-a&amp;b�c\"></code></pre>\n", + "should encode dangerous characters in languages" + ); + + assert_eq!( micromark(" ```\naaa\n ```"), "<pre><code>aaa\n ```\n</code></pre>\n", "should not support a closing sequence w/ too much indent, regardless of opening sequence (1)"
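For reference, the `Slice` additions in the hunk above reduce to a small self-contained shape. This toy version (the real type also has `from_point`, `head`, and tab-expansion bookkeeping) shows why `as_str` is cheap while `serialize` still exists: only the latter materializes virtual spaces, the partially consumed tabs counted in `before` and `after`:

```rust
use std::str;

/// A window of the source bytes plus virtual spaces on either side.
struct Slice<'a> {
    bytes: &'a [u8],
    before: usize,
    after: usize,
}

impl<'a> Slice<'a> {
    /// Plain byte range, no virtual spaces, like `from_indices` above.
    fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
        Slice { bytes: &bytes[start..end], before: 0, after: 0 }
    }

    /// Total length, virtual spaces included.
    fn len(&self) -> usize {
        self.bytes.len() + self.before + self.after
    }

    /// Borrowed view; correct only where virtual spaces don't matter.
    fn as_str(&self) -> &'a str {
        str::from_utf8(self.bytes).unwrap()
    }

    /// Owned copy with virtual spaces turned into real ones.
    fn serialize(&self) -> String {
        let mut string = String::with_capacity(self.len());
        string.push_str(&" ".repeat(self.before));
        string.push_str(self.as_str());
        string.push_str(&" ".repeat(self.after));
        string
    }
}

fn main() {
    let plain = Slice::from_indices(b"## alpha", 3, 8);
    assert_eq!(plain.as_str(), "alpha");

    let with_vs = Slice { bytes: b"alpha", before: 2, after: 0 };
    assert_eq!(with_vs.len(), 7);
    assert_eq!(with_vs.serialize(), "  alpha");
}
```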