Diffstat
34 files changed, 934 insertions, 1229 deletions
| diff --git a/src/compiler.rs b/src/compiler.rs index de76142..e0ab1e9 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -1,6 +1,5 @@  //! Turn events into a string of HTML.  use crate::constant::{SAFE_PROTOCOL_HREF, SAFE_PROTOCOL_SRC}; -use crate::construct::character_reference::Kind as CharacterReferenceKind;  use crate::token::Token;  use crate::tokenizer::{Event, EventType};  use crate::util::normalize_identifier::normalize_identifier; @@ -68,14 +67,14 @@ struct CompileContext<'a> {      pub code_flow_seen_data: Option<bool>,      pub code_fenced_fences_count: Option<usize>,      pub code_text_inside: bool, -    pub character_reference_kind: Option<CharacterReferenceKind>, +    pub character_reference_marker: Option<u8>,      pub expect_first_item: Option<bool>,      pub media_stack: Vec<Media>,      pub definitions: Vec<(String, Definition)>,      pub tight_stack: Vec<bool>,      /// Fields used to influance the current compilation.      pub slurp_one_line_ending: bool, -    pub tags: bool, +    pub in_image_alt: bool,      pub encode_html: bool,      pub last_was_tag: bool,      /// Configuration @@ -104,13 +103,13 @@ impl<'a> CompileContext<'a> {              code_flow_seen_data: None,              code_fenced_fences_count: None,              code_text_inside: false, -            character_reference_kind: None, +            character_reference_marker: None,              expect_first_item: None,              media_stack: vec![],              definitions: vec![],              tight_stack: vec![],              slurp_one_line_ending: false, -            tags: true, +            in_image_alt: false,              encode_html: true,              last_was_tag: false,              protocol_href: if options.allow_dangerous_protocol { @@ -140,8 +139,7 @@ impl<'a> CompileContext<'a> {          self.buffers.pop().expect("Cannot resume w/o buffer")      } -    pub fn push<'x, S: Into<&'x str>>(&mut self, value: S) { -        let value = value.into(); +    pub fn push(&mut self, value: &str) {          self.buffers              .last_mut()              .expect("Cannot push w/o buffer") @@ -149,17 +147,8 @@ impl<'a> CompileContext<'a> {          self.last_was_tag = false;      } -    pub fn push_raw<'x, S: Into<&'x str>>(&mut self, value: S) { -        let value = value.into(); -        self.push(&*encode(value, self.encode_html)); -    } - -    pub fn tag<'x, S: Into<&'x str>>(&mut self, value: S) { -        if self.tags { -            let value = value.into(); -            self.push(&*encode(value, false)); -            self.last_was_tag = true; -        } +    pub fn push_raw(&mut self, value: &str) { +        self.push(&encode(value, self.encode_html));      }      /// Get the current buffer. @@ -172,7 +161,7 @@ impl<'a> CompileContext<'a> {      /// Add a line ending.      pub fn line_ending(&mut self) {          let eol = self.line_ending_default.as_str().to_string(); -        self.push(&*eol); +        self.push(&eol);      }      /// Add a line ending if needed (as in, there’s no eol/eof already). 
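The core of the `src/compiler.rs` change above: the generic `tag()`/`push_raw()` helpers give way to a plain `push(&str)`, with call sites appending the literal themselves, setting `last_was_tag` explicitly, and gating inline tags on the new `in_image_alt` flag. A minimal sketch of the resulting pattern, with simplified types rather than the crate's full `CompileContext`:

```rust
struct CompileContext {
    buffers: Vec<String>,
    last_was_tag: bool,
    in_image_alt: bool,
}

impl CompileContext {
    // `push` is now monomorphic: it takes `&str` directly instead of
    // `S: Into<&'x str>`, and always marks the output as non-tag.
    fn push(&mut self, value: &str) {
        self.buffers
            .last_mut()
            .expect("Cannot push w/o buffer")
            .push_str(value);
        self.last_was_tag = false;
    }
}

fn on_enter_emphasis(context: &mut CompileContext) {
    // With `tag()` gone, handlers push the literal and flip the flag;
    // inline tags are suppressed inside an image `alt`.
    if !context.in_image_alt {
        context.push("<em>");
        context.last_was_tag = true;
    }
}

fn main() {
    let mut context = CompileContext {
        buffers: vec![String::new()],
        last_was_tag: false,
        in_image_alt: false,
    };
    on_enter_emphasis(&mut context);
    assert_eq!(context.buffers[0], "<em>");
    assert!(context.last_was_tag);
}
```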
@@ -210,7 +199,7 @@ pub fn compile(events: &[Event], bytes: &[u8], options: &Options) -> String {              && (event.token_type == Token::BlankLineEnding || event.token_type == Token::LineEnding)          {              line_ending_inferred = Some(LineEnding::from_str( -                &Slice::from_position(bytes, &Position::from_exit_event(events, index)).serialize(), +                Slice::from_position(bytes, &Position::from_exit_event(events, index)).as_str(),              ));              break;          } @@ -398,14 +387,16 @@ fn on_enter_buffer(context: &mut CompileContext) {  fn on_enter_block_quote(context: &mut CompileContext) {      context.tight_stack.push(false);      context.line_ending_if_needed(); -    context.tag("<blockquote>"); +    context.push("<blockquote>"); +    context.last_was_tag = true;  }  /// Handle [`Enter`][EventType::Enter]:[`CodeIndented`][Token::CodeIndented].  fn on_enter_code_indented(context: &mut CompileContext) {      context.code_flow_seen_data = Some(false);      context.line_ending_if_needed(); -    context.tag("<pre><code>"); +    context.push("<pre><code>"); +    context.last_was_tag = true;  }  /// Handle [`Enter`][EventType::Enter]:[`CodeFenced`][Token::CodeFenced]. @@ -413,14 +404,18 @@ fn on_enter_code_fenced(context: &mut CompileContext) {      context.code_flow_seen_data = Some(false);      context.line_ending_if_needed();      // Note that no `>` is used, which is added later. -    context.tag("<pre><code"); +    context.push("<pre><code"); +    context.last_was_tag = true;      context.code_fenced_fences_count = Some(0);  }  /// Handle [`Enter`][EventType::Enter]:[`CodeText`][Token::CodeText].  fn on_enter_code_text(context: &mut CompileContext) {      context.code_text_inside = true; -    context.tag("<code>"); +    if !context.in_image_alt { +        context.push("<code>"); +        context.last_was_tag = true; +    }      context.buffer();  } @@ -445,7 +440,10 @@ fn on_enter_definition_destination_string(context: &mut CompileContext) {  /// Handle [`Enter`][EventType::Enter]:[`Emphasis`][Token::Emphasis].  fn on_enter_emphasis(context: &mut CompileContext) { -    context.tag("<em>"); +    if !context.in_image_alt { +        context.push("<em>"); +        context.last_was_tag = true; +    }  }  /// Handle [`Enter`][EventType::Enter]:[`HtmlFlow`][Token::HtmlFlow]. @@ -473,7 +471,7 @@ fn on_enter_image(context: &mut CompileContext) {          destination: None,          title: None,      }); -    context.tags = false; // Disallow tags. +    context.in_image_alt = true; // Disallow tags.  }  /// Handle [`Enter`][EventType::Enter]:[`Link`][Token::Link]. @@ -546,14 +544,12 @@ fn on_enter_list(context: &mut CompileContext) {      context.tight_stack.push(!loose);      context.line_ending_if_needed();      // Note: no `>`. 
-    context.tag(&*format!( -        "<{}", -        if *token_type == Token::ListOrdered { -            "ol" -        } else { -            "ul" -        } -    )); +    context.push(if *token_type == Token::ListOrdered { +        "<ol" +    } else { +        "<ul" +    }); +    context.last_was_tag = true;      context.expect_first_item = Some(true);  } @@ -562,11 +558,14 @@ fn on_enter_list_item_marker(context: &mut CompileContext) {      let expect_first_item = context.expect_first_item.take().unwrap();      if expect_first_item { -        context.tag(">"); +        context.push(">"); +        context.last_was_tag = true;      }      context.line_ending_if_needed(); -    context.tag("<li>"); + +    context.push("<li>"); +    context.last_was_tag = true;      context.expect_first_item = Some(false);      // “Hack” to prevent a line ending from showing up if the item is empty.      context.last_was_tag = false; @@ -578,15 +577,15 @@ fn on_enter_paragraph(context: &mut CompileContext) {      if !tight {          context.line_ending_if_needed(); -        context.tag("<p>"); +        context.push("<p>"); +        context.last_was_tag = true;      }  }  /// Handle [`Enter`][EventType::Enter]:[`Resource`][Token::Resource].  fn on_enter_resource(context: &mut CompileContext) {      context.buffer(); // We can have line endings in the resource, ignore them. -    let media = context.media_stack.last_mut().unwrap(); -    media.destination = Some("".to_string()); +    context.media_stack.last_mut().unwrap().destination = Some("".to_string());  }  /// Handle [`Enter`][EventType::Enter]:[`ResourceDestinationString`][Token::ResourceDestinationString]. @@ -599,47 +598,67 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) {  /// Handle [`Enter`][EventType::Enter]:[`Strong`][Token::Strong].  fn on_enter_strong(context: &mut CompileContext) { -    context.tag("<strong>"); +    if !context.in_image_alt { +        context.push("<strong>"); +        context.last_was_tag = true; +    }  }  /// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][Token::AutolinkEmail].  fn on_exit_autolink_email(context: &mut CompileContext) { -    let value = Slice::from_position( +    let slice = Slice::from_position(          context.bytes,          &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); +    ); +    let value = slice.as_str(); -    context.tag(&*format!( -        "<a href=\"{}\">", -        sanitize_uri( -            format!("mailto:{}", value.as_str()).as_str(), -            &context.protocol_href -        ) -    )); -    context.push_raw(&*value); -    context.tag("</a>"); +    if !context.in_image_alt { +        context.push("<a href=\""); +        context.push(&sanitize_uri( +            &format!("mailto:{}", value), +            &context.protocol_href, +        )); +        context.push("\">"); +        context.last_was_tag = true; +    } + +    context.push_raw(value); + +    if !context.in_image_alt { +        context.push("</a>"); +        context.last_was_tag = true; +    }  }  /// Handle [`Exit`][EventType::Exit]:[`AutolinkProtocol`][Token::AutolinkProtocol].  
fn on_exit_autolink_protocol(context: &mut CompileContext) { -    let value = Slice::from_position( +    let slice = Slice::from_position(          context.bytes,          &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); +    ); +    let value = slice.as_str(); -    context.tag(&*format!( -        "<a href=\"{}\">", -        sanitize_uri(value.as_str(), &context.protocol_href) -    )); -    context.push_raw(&*value); -    context.tag("</a>"); +    if !context.in_image_alt { +        context.push("<a href=\""); +        context.push(&sanitize_uri(value, &context.protocol_href)); +        context.push("\">"); +        context.last_was_tag = true; +    } + +    context.push_raw(value); + +    if !context.in_image_alt { +        context.push("</a>"); +        context.last_was_tag = true; +    }  }  /// Handle [`Exit`][EventType::Exit]:{[`HardBreakEscape`][Token::HardBreakEscape],[`HardBreakTrailing`][Token::HardBreakTrailing]}.  fn on_exit_break(context: &mut CompileContext) { -    context.tag("<br />"); +    if !context.in_image_alt { +        context.push("<br />"); +        context.last_was_tag = true; +    }  }  /// Handle [`Exit`][EventType::Exit]:[`BlankLineEnding`][Token::BlankLineEnding]. @@ -654,56 +673,58 @@ fn on_exit_block_quote(context: &mut CompileContext) {      context.tight_stack.pop();      context.line_ending_if_needed();      context.slurp_one_line_ending = false; -    context.tag("</blockquote>"); +    context.push("</blockquote>"); +    context.last_was_tag = true;  }  /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarker`][Token::CharacterReferenceMarker].  fn on_exit_character_reference_marker(context: &mut CompileContext) { -    context.character_reference_kind = Some(CharacterReferenceKind::Named); +    context.character_reference_marker = Some(b'&');  }  /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarkerHexadecimal`][Token::CharacterReferenceMarkerHexadecimal].  fn on_exit_character_reference_marker_hexadecimal(context: &mut CompileContext) { -    context.character_reference_kind = Some(CharacterReferenceKind::Hexadecimal); +    context.character_reference_marker = Some(b'x');  }  /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceMarkerNumeric`][Token::CharacterReferenceMarkerNumeric].  fn on_exit_character_reference_marker_numeric(context: &mut CompileContext) { -    context.character_reference_kind = Some(CharacterReferenceKind::Decimal); +    context.character_reference_marker = Some(b'#');  }  /// Handle [`Exit`][EventType::Exit]:[`CharacterReferenceValue`][Token::CharacterReferenceValue].  
fn on_exit_character_reference_value(context: &mut CompileContext) { -    let kind = context -        .character_reference_kind +    let marker = context +        .character_reference_marker          .take()          .expect("expected `character_reference_kind` to be set"); -    let reference = Slice::from_position( +    let slice = Slice::from_position(          context.bytes,          &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); +    ); +    let value = slice.as_str(); -    let ref_string = reference.as_str(); -    let value = match kind { -        CharacterReferenceKind::Decimal => decode_numeric(ref_string, 10).to_string(), -        CharacterReferenceKind::Hexadecimal => decode_numeric(ref_string, 16).to_string(), -        CharacterReferenceKind::Named => decode_named(ref_string), +    let value = match marker { +        b'#' => decode_numeric(value, 10), +        b'x' => decode_numeric(value, 16), +        b'&' => decode_named(value), +        _ => panic!("impossible"),      }; -    context.push_raw(&*value); +    context.push_raw(&value);  }  /// Handle [`Exit`][EventType::Exit]:[`CodeFlowChunk`][Token::CodeFlowChunk].  fn on_exit_code_flow_chunk(context: &mut CompileContext) { -    let value = Slice::from_position( -        context.bytes, -        &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); -      context.code_flow_seen_data = Some(true); -    context.push_raw(&*value); +    context.push_raw( +        &Slice::from_position( +            context.bytes, +            &Position::from_exit_event(context.events, context.index), +        ) +        // Must serialize to get virtual spaces. +        .serialize(), +    );  }  /// Handle [`Exit`][EventType::Exit]:[`CodeFencedFence`][Token::CodeFencedFence]. @@ -715,7 +736,8 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) {      };      if count == 0 { -        context.tag(">"); +        context.push(">"); +        context.last_was_tag = true;          context.slurp_one_line_ending = true;      } @@ -725,7 +747,10 @@ fn on_exit_code_fenced_fence(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:[`CodeFencedFenceInfo`][Token::CodeFencedFenceInfo].  fn on_exit_code_fenced_fence_info(context: &mut CompileContext) {      let value = context.resume(); -    context.tag(&*format!(" class=\"language-{}\"", value)); +    context.push(" class=\"language-"); +    context.push(&value); +    context.push("\""); +    context.last_was_tag = true;  }  /// Handle [`Exit`][EventType::Exit]:{[`CodeFenced`][Token::CodeFenced],[`CodeIndented`][Token::CodeIndented]}. @@ -752,7 +777,8 @@ fn on_exit_code_flow(context: &mut CompileContext) {          context.line_ending_if_needed();      } -    context.tag("</code></pre>"); +    context.push("</code></pre>"); +    context.last_was_tag = true;      if let Some(count) = context.code_fenced_fences_count.take() {          if count < 2 { @@ -781,12 +807,16 @@ fn on_exit_code_text(context: &mut CompileContext) {      }      context.code_text_inside = false; -    context.push(&*if trim { +    context.push(&if trim {          result[1..(result.len() - 1)].to_string()      } else {          result      }); -    context.tag("</code>"); + +    if !context.in_image_alt { +        context.push("</code>"); +        context.last_was_tag = true; +    }  }  /// Handle [`Exit`][EventType::Exit]:*. 
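In the value handler above, the marker byte recorded by the three `CharacterReferenceMarker*` exits now drives decoding directly, replacing the old `Kind` enum. A self-contained sketch of that dispatch; `decode_numeric` and `decode_named` here are minimal hypothetical stand-ins for the crate's decoding utilities, which are assumed to return the decoded `String`:

```rust
fn decode(marker: u8, value: &str) -> String {
    match marker {
        b'#' => decode_numeric(value, 10),
        b'x' => decode_numeric(value, 16),
        b'&' => decode_named(value),
        _ => unreachable!("marker is only ever `&`, `#`, or `x`"),
    }
}

// Hypothetical stand-in: maps invalid code points to U+FFFD.
fn decode_numeric(value: &str, radix: u32) -> String {
    char::from_u32(u32::from_str_radix(value, radix).unwrap_or(0xFFFD))
        .unwrap_or('\u{FFFD}')
        .to_string()
}

// Hypothetical stand-in: the real utility consults the full named
// character reference table.
fn decode_named(value: &str) -> String {
    match value {
        "amp" => "&".to_string(),
        "lt" => "<".to_string(),
        _ => String::new(),
    }
}

fn main() {
    assert_eq!(decode(b'#', "38"), "&"); // `&#38;`
    assert_eq!(decode(b'x', "26"), "&"); // `&#x26;`
    assert_eq!(decode(b'&', "amp"), "&"); // `&amp;`
}
```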
@@ -798,72 +828,63 @@ fn on_exit_drop(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:{[`CodeTextData`][Token::CodeTextData],[`Data`][Token::Data],[`CharacterEscapeValue`][Token::CharacterEscapeValue]}.  fn on_exit_data(context: &mut CompileContext) { -    let value = Slice::from_position( -        context.bytes, -        &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); - -    // Just output it. -    context.push_raw(&*value); +    context.push_raw( +        Slice::from_position( +            context.bytes, +            &Position::from_exit_event(context.events, context.index), +        ) +        .as_str(), +    );  }  /// Handle [`Exit`][EventType::Exit]:[`Definition`][Token::Definition].  fn on_exit_definition(context: &mut CompileContext) { -    let definition = context.media_stack.pop().unwrap(); -    let reference_id = normalize_identifier(&definition.reference_id.unwrap()); -    let destination = definition.destination; -    let title = definition.title; -      context.resume(); - -    let mut index = 0; - -    while index < context.definitions.len() { -        if context.definitions[index].0 == reference_id { -            return; -        } - -        index += 1; -    } - -    context -        .definitions -        .push((reference_id, Definition { destination, title })); +    let media = context.media_stack.pop().unwrap(); +    let id = normalize_identifier(&media.reference_id.unwrap()); + +    context.definitions.push(( +        id, +        Definition { +            destination: media.destination, +            title: media.title, +        }, +    ));  }  /// Handle [`Exit`][EventType::Exit]:[`DefinitionDestinationString`][Token::DefinitionDestinationString].  fn on_exit_definition_destination_string(context: &mut CompileContext) {      let buf = context.resume(); -    let definition = context.media_stack.last_mut().unwrap(); -    definition.destination = Some(buf); +    context.media_stack.last_mut().unwrap().destination = Some(buf);      context.encode_html = true;  }  /// Handle [`Exit`][EventType::Exit]:[`DefinitionLabelString`][Token::DefinitionLabelString].  fn on_exit_definition_label_string(context: &mut CompileContext) { -    let value = Slice::from_position( -        context.bytes, -        &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); -      // Discard label, use the source content instead.      context.resume(); -    let definition = context.media_stack.last_mut().unwrap(); -    definition.reference_id = Some(value); +    context.media_stack.last_mut().unwrap().reference_id = Some( +        // To do: lifetimes, reference bytes? +        Slice::from_position( +            context.bytes, +            &Position::from_exit_event(context.events, context.index), +        ) +        .serialize(), +    );  }  /// Handle [`Exit`][EventType::Exit]:[`DefinitionTitleString`][Token::DefinitionTitleString].  fn on_exit_definition_title_string(context: &mut CompileContext) {      let buf = context.resume(); -    let definition = context.media_stack.last_mut().unwrap(); -    definition.title = Some(buf); +    context.media_stack.last_mut().unwrap().title = Some(buf);  }  /// Handle [`Exit`][EventType::Exit]:[`Strong`][Token::Emphasis].  
fn on_exit_emphasis(context: &mut CompileContext) { -    context.tag("</em>"); +    if !context.in_image_alt { +        context.push("</em>"); +        context.last_was_tag = true; +    }  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingAtx`][Token::HeadingAtx]. @@ -873,7 +894,10 @@ fn on_exit_heading_atx(context: &mut CompileContext) {          .take()          .expect("`atx_opening_sequence_size` must be set in headings"); -    context.tag(&*format!("</h{}>", rank)); +    context.push("</h"); +    context.push(&rank.to_string()); +    context.push(">"); +    context.last_was_tag = true;  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingAtxSequence`][Token::HeadingAtxSequence]. @@ -884,17 +908,20 @@ fn on_exit_heading_atx_sequence(context: &mut CompileContext) {              context.bytes,              &Position::from_exit_event(context.events, context.index),          ) -        .size(); +        .len();          context.line_ending_if_needed();          context.atx_opening_sequence_size = Some(rank); -        context.tag(&*format!("<h{}>", rank)); +        context.push("<h"); +        context.push(&rank.to_string()); +        context.push(">"); +        context.last_was_tag = true;      }  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingAtxText`][Token::HeadingAtxText].  fn on_exit_heading_atx_text(context: &mut CompileContext) {      let value = context.resume(); -    context.push(&*value); +    context.push(&value);  }  /// Handle [`Exit`][EventType::Exit]:[`HeadingSetextText`][Token::HeadingSetextText]. @@ -915,12 +942,18 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {          &Position::from_exit_event(context.events, context.index),      )      .head(); -    let level = if head == Some(b'-') { 2 } else { 1 }; +    let rank = if head == Some(b'-') { "2" } else { "1" };      context.line_ending_if_needed(); -    context.tag(&*format!("<h{}>", level)); -    context.push(&*text); -    context.tag(&*format!("</h{}>", level)); +    context.push("<h"); +    context.push(rank); +    context.push(">"); +    context.last_was_tag = true; +    context.push(&text); +    context.push("</h"); +    context.push(rank); +    context.push(">"); +    context.last_was_tag = true;  }  /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlow`][Token::HtmlFlow],[`HtmlText`][Token::HtmlText]}. @@ -930,32 +963,31 @@ fn on_exit_html(context: &mut CompileContext) {  /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlowData`][Token::HtmlFlowData],[`HtmlTextData`][Token::HtmlTextData]}.  fn on_exit_html_data(context: &mut CompileContext) { -    let value = Slice::from_position( -        context.bytes, -        &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); - -    context.push_raw(&*value); +    context.push_raw( +        Slice::from_position( +            context.bytes, +            &Position::from_exit_event(context.events, context.index), +        ) +        .as_str(), +    );  }  /// Handle [`Exit`][EventType::Exit]:[`Label`][Token::Label].  fn on_exit_label(context: &mut CompileContext) {      let buf = context.resume(); -    let media = context.media_stack.last_mut().unwrap(); -    media.label = Some(buf); +    context.media_stack.last_mut().unwrap().label = Some(buf);  }  /// Handle [`Exit`][EventType::Exit]:[`LabelText`][Token::LabelText].  
fn on_exit_label_text(context: &mut CompileContext) { -    let value = Slice::from_position( -        context.bytes, -        &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); - -    let media = context.media_stack.last_mut().unwrap(); -    media.label_id = Some(value); +    context.media_stack.last_mut().unwrap().label_id = Some( +        // To do: lifetimes, reference bytes? +        Slice::from_position( +            context.bytes, +            &Position::from_exit_event(context.events, context.index), +        ) +        .serialize(), +    );  }  /// Handle [`Exit`][EventType::Exit]:[`LineEnding`][Token::LineEnding]. @@ -965,26 +997,28 @@ fn on_exit_line_ending(context: &mut CompileContext) {      } else if context.slurp_one_line_ending {          context.slurp_one_line_ending = false;      } else { -        let value = Slice::from_position( -            context.bytes, -            &Position::from_exit_event(context.events, context.index), -        ) -        .serialize(); - -        context.push_raw(&*value); +        context.push_raw( +            Slice::from_position( +                context.bytes, +                &Position::from_exit_event(context.events, context.index), +            ) +            .as_str(), +        );      }  }  /// Handle [`Exit`][EventType::Exit]:{[`ListOrdered`][Token::ListOrdered],[`ListUnordered`][Token::ListUnordered]}.  fn on_exit_list(context: &mut CompileContext) { -    let tag_name = if context.events[context.index].token_type == Token::ListOrdered { -        "ol" -    } else { -        "ul" -    };      context.tight_stack.pop();      context.line_ending(); -    context.tag(&*format!("</{}>", tag_name)); +    context.push( +        if context.events[context.index].token_type == Token::ListOrdered { +            "</ol>" +        } else { +            "</ul>" +        }, +    ); +    context.last_was_tag = true;  }  /// Handle [`Exit`][EventType::Exit]:[`ListItem`][Token::ListItem]. @@ -1010,7 +1044,8 @@ fn on_exit_list_item(context: &mut CompileContext) {          context.line_ending_if_needed();      } -    context.tag("</li>"); +    context.push("</li>"); +    context.last_was_tag = true;  }  /// Handle [`Exit`][EventType::Exit]:[`ListItemValue`][Token::ListItemValue]. 
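Two hunks below, `on_exit_media` is reworked around `in_image_alt`, so that inside an image label only the text survives into `alt`. The observable effect through the crate's public API, assuming the `micromark(&str) -> String` entry point the crate exposed at this point in its history:

```rust
use micromark::micromark;

fn main() {
    // Inline constructs inside an image label contribute only their
    // text to `alt`; the same label in a link keeps its tags.
    println!("{}", micromark("![a *b* c](/url)"));
    // => <p><img src="/url" alt="a b c" /></p>
    println!("{}", micromark("[a *b* c](/url)"));
    // => <p><a href="/url">a <em>b</em> c</a></p>
}
```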
@@ -1018,17 +1053,17 @@ fn on_exit_list_item_value(context: &mut CompileContext) {      let expect_first_item = context.expect_first_item.unwrap();      if expect_first_item { -        let value = Slice::from_position( +        let slice = Slice::from_position(              context.bytes,              &Position::from_exit_event(context.events, context.index), -        ) -        .serialize(); -        let value = value.parse::<u32>().ok().unwrap(); +        ); +        let value = slice.as_str().parse::<u32>().ok().unwrap();          if value != 1 { -            context.tag(" start=\""); -            context.tag(&*value.to_string()); -            context.tag("\""); +            context.push(" start=\""); +            context.push(&value.to_string()); +            context.push("\""); +            context.last_was_tag = true;          }      }  } @@ -1048,68 +1083,98 @@ fn on_exit_media(context: &mut CompileContext) {          index += 1;      } -    context.tags = !is_in_image; +    context.in_image_alt = is_in_image;      let media = context.media_stack.pop().unwrap(); +    let label = media.label.unwrap(); +    let in_image_alt = context.in_image_alt;      let id = media          .reference_id          .or(media.label_id)          .map(|id| normalize_identifier(&id)); -    let label = media.label.unwrap(); -    let mut definition = None; -    if let Some(id) = id { -        let mut index = 0; +    let definition_index = if media.destination.is_none() { +        id.and_then(|id| { +            let mut index = 0; -        while index < context.definitions.len() { -            if context.definitions[index].0 == id { -                definition = Some(&context.definitions[index].1); -                break; -            } +            while index < context.definitions.len() { +                if context.definitions[index].0 == id { +                    return Some(index); +                } -            index += 1; -        } -    } +                index += 1; +            } -    let destination = if media.destination.is_some() { -        &media.destination +            None +        })      } else { -        &definition.unwrap().destination -    }; -    let title = if media.destination.is_some() { -        &media.title -    } else { -        &definition.unwrap().title +        None      }; -    let destination = if let Some(destination) = destination { -        destination -    } else { -        "" -    }; +    if !in_image_alt { +        if media.image { +            context.push("<img src=\""); +        } else { +            context.push("<a href=\""); +        }; -    let title = if let Some(title) = title { -        format!(" title=\"{}\"", title) -    } else { -        "".to_string() -    }; +        let destination = if let Some(index) = definition_index { +            context.definitions[index].1.destination.as_ref() +        } else { +            media.destination.as_ref() +        }; + +        if let Some(destination) = destination { +            context.push(&sanitize_uri( +                destination, +                if media.image { +                    &context.protocol_src +                } else { +                    &context.protocol_href +                }, +            )); +        } + +        if media.image { +            context.push("\" alt=\""); +        }; +    }      if media.image { -        context.tag(&*format!( -            "<img src=\"{}\" alt=\"", -            sanitize_uri(destination, &context.protocol_src), -        )); -        context.push(&*label); -    
    context.tag(&*format!("\"{} />", title)); -    } else { -        context.tag(&*format!( -            "<a href=\"{}\"{}>", -            sanitize_uri(destination, &context.protocol_href), -            title, -        )); -        context.push(&*label); -        context.tag("</a>"); -    }; +        context.push(&label); +    } + +    if !in_image_alt { +        context.push("\""); + +        let title = if let Some(index) = definition_index { +            context.definitions[index].1.title.clone() +        } else { +            media.title +        }; + +        if let Some(title) = title { +            context.push(" title=\""); +            context.push(&title); +            context.push("\""); +        }; + +        if media.image { +            context.push(" /"); +        } + +        context.push(">"); +        context.last_was_tag = true; +    } + +    if !media.image { +        context.push(&label); + +        if !in_image_alt { +            context.push("</a>"); +            context.last_was_tag = true; +        } +    }  }  /// Handle [`Exit`][EventType::Exit]:[`Paragraph`][Token::Paragraph]. @@ -1119,46 +1184,49 @@ fn on_exit_paragraph(context: &mut CompileContext) {      if *tight {          context.slurp_one_line_ending = true;      } else { -        context.tag("</p>"); +        context.push("</p>"); +        context.last_was_tag = true;      }  }  /// Handle [`Exit`][EventType::Exit]:[`ReferenceString`][Token::ReferenceString].  fn on_exit_reference_string(context: &mut CompileContext) { -    let value = Slice::from_position( -        context.bytes, -        &Position::from_exit_event(context.events, context.index), -    ) -    .serialize(); -      // Drop stuff.      context.resume(); -    let media = context.media_stack.last_mut().unwrap(); -    media.reference_id = Some(value); +    // To do: lifetimes, reference bytes. +    context.media_stack.last_mut().unwrap().reference_id = Some( +        Slice::from_position( +            context.bytes, +            &Position::from_exit_event(context.events, context.index), +        ) +        .serialize(), +    );  }  /// Handle [`Exit`][EventType::Exit]:[`ResourceDestinationString`][Token::ResourceDestinationString].  fn on_exit_resource_destination_string(context: &mut CompileContext) {      let buf = context.resume(); -    let media = context.media_stack.last_mut().unwrap(); -    media.destination = Some(buf); +    context.media_stack.last_mut().unwrap().destination = Some(buf);      context.encode_html = true;  }  /// Handle [`Exit`][EventType::Exit]:[`ResourceTitleString`][Token::ResourceTitleString].  fn on_exit_resource_title_string(context: &mut CompileContext) {      let buf = context.resume(); -    let media = context.media_stack.last_mut().unwrap(); -    media.title = Some(buf); +    context.media_stack.last_mut().unwrap().title = Some(buf);  }  /// Handle [`Exit`][EventType::Exit]:[`Strong`][Token::Strong].  fn on_exit_strong(context: &mut CompileContext) { -    context.tag("</strong>"); +    if !context.in_image_alt { +        context.push("</strong>"); +        context.last_was_tag = true; +    }  }  /// Handle [`Exit`][EventType::Exit]:[`ThematicBreak`][Token::ThematicBreak].  
fn on_exit_thematic_break(context: &mut CompileContext) {      context.line_ending_if_needed(); -    context.tag("<hr />"); +    context.push("<hr />"); +    context.last_was_tag = true;  } diff --git a/src/constant.rs b/src/constant.rs index d84dda5..6ef851c 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -165,6 +165,15 @@ pub const HTML_BLOCK_NAMES: [&str; 61] = [      "ul",  ]; +/// Magic string of CDATA (after `<![`). +/// +/// Used in the **cdata** production of [HTML (flow)][html_flow] and +/// [HTML (text)][html_text]. +/// +/// [html_flow]: crate::construct::html_flow +/// [html_text]: crate::construct::html_text +pub const HTML_CDATA_PREFIX: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; +  /// List of HTML tag names that form the **raw** production of  /// [HTML (flow)][html_flow].  /// diff --git a/src/construct/attention.rs b/src/construct/attention.rs index b042645..583fde2 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -88,54 +88,11 @@ enum GroupKind {      Other,  } -/// Type of sequence. -#[derive(Debug, PartialEq)] -enum MarkerKind { -    /// In a run with asterisks. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// *a* -    /// ``` -    Asterisk, -    /// In a run with underscores. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// _a_ -    /// ``` -    Underscore, -} - -impl MarkerKind { -    /// Turn the kind into a byte ([u8]). -    fn as_byte(&self) -> u8 { -        match self { -            MarkerKind::Asterisk => b'*', -            MarkerKind::Underscore => b'_', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `*` or `_`. -    fn from_byte(byte: u8) -> MarkerKind { -        match byte { -            b'*' => MarkerKind::Asterisk, -            b'_' => MarkerKind::Underscore, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// Attentention sequence that we can take markers from.  #[derive(Debug)]  struct Sequence { -    /// Marker used in this sequence. -    marker: MarkerKind, +    /// Marker as a byte (`u8`) used in this sequence. +    marker: u8,      /// The depth in events where this sequence resides.      balance: usize,      /// The index into events where this sequence’s `Enter` currently resides. 
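The `MarkerKind` removal above is one instance of a pattern this commit applies repeatedly (the fence and setext-underline `Kind` enums below get the same treatment): store the opening byte itself and guard on it in the state function. A self-contained sketch of the guard style, with a simplified `State` in place of the tokenizer machinery:

```rust
#[derive(Debug, PartialEq)]
enum State {
    Continue,
    Done,
}

fn inside(current: Option<u8>, marker: u8) -> State {
    match current {
        // Accept either marker byte, but only the one that opened the run,
        // mirroring the new `attention::inside` guard.
        Some(b'*' | b'_') if current == Some(marker) => State::Continue,
        _ => State::Done,
    }
}

fn main() {
    assert_eq!(inside(Some(b'*'), b'*'), State::Continue);
    assert_eq!(inside(Some(b'_'), b'*'), State::Done);
}
```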
@@ -160,9 +117,9 @@ struct Sequence {  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { -        Some(byte) if tokenizer.parse_state.constructs.attention && matches!(byte, b'*' | b'_') => { +        Some(b'*' | b'_') if tokenizer.parse_state.constructs.attention => {              tokenizer.enter(Token::AttentionSequence); -            inside(tokenizer, MarkerKind::from_byte(byte)) +            inside(tokenizer, tokenizer.current.unwrap())          }          _ => State::Nok,      } @@ -174,14 +131,17 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  /// > | **  ///     ^^  /// ``` -fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State { -    if tokenizer.current == Some(marker.as_byte()) { -        tokenizer.consume(); -        State::Fn(Box::new(move |t| inside(t, marker))) -    } else { -        tokenizer.exit(Token::AttentionSequence); -        tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); -        State::Ok +fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State { +    match tokenizer.current { +        Some(b'*' | b'_') if tokenizer.current.unwrap() == marker => { +            tokenizer.consume(); +            State::Fn(Box::new(move |t| inside(t, marker))) +        } +        _ => { +            tokenizer.exit(Token::AttentionSequence); +            tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); +            State::Ok +        }      }  } @@ -219,16 +179,10 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {                      String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]);                  let char_after = string_after.chars().next(); -                let marker = MarkerKind::from_byte( -                    Slice::from_point(tokenizer.parse_state.bytes, &enter.point) -                        .head() -                        .unwrap(), -                ); -                let before = classify_character(if enter.point.index > 0 { -                    char_before -                } else { -                    None -                }); +                let marker = Slice::from_point(tokenizer.parse_state.bytes, &enter.point) +                    .head() +                    .unwrap(); +                let before = classify_character(char_before);                  let after = classify_character(char_after);                  let open = after == GroupKind::Other                      || (after == GroupKind::Punctuation && before != GroupKind::Other); @@ -245,12 +199,12 @@ fn resolve_attention(tokenizer: &mut Tokenizer) {                      start_point: enter.point.clone(),                      end_point: exit.point.clone(),                      size: exit.point.index - enter.point.index, -                    open: if marker == MarkerKind::Asterisk { +                    open: if marker == b'*' {                          open                      } else {                          open && (before != GroupKind::Other || !close)                      }, -                    close: if marker == MarkerKind::Asterisk { +                    close: if marker == b'*' {                          close                      } else {                          close && (after != GroupKind::Other || !open) diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index b843af8..c0514ae 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -137,12 +137,12 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  /// 
```  fn open(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { -        Some(byte) if byte.is_ascii_alphabetic() => { +        // ASCII alphabetic. +        Some(b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(scheme_or_email_atext))          } -        Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer), -        _ => State::Nok, +        _ => email_atext(tokenizer),      }  } @@ -199,8 +199,8 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State {              tokenizer.exit(Token::AutolinkProtocol);              end(tokenizer)          } -        Some(byte) if byte.is_ascii_control() => State::Nok, -        None | Some(b' ') => State::Nok, +        // ASCII control or space. +        None | Some(b'\0'..=0x1F | b' ' | 0x7F) => State::Nok,          Some(_) => {              tokenizer.consume();              State::Fn(Box::new(url_inside)) @@ -220,7 +220,26 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {              tokenizer.consume();              State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0)))          } -        Some(byte) if is_ascii_atext(byte) => { +        // ASCII atext. +        // +        // atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or +        // a byte in the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 +        // APOSTROPHE (`'`), U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), +        // U+002D DASH (`-`), U+002F SLASH (`/`), U+003D EQUALS TO (`=`), +        // U+003F QUESTION MARK (`?`), U+005E CARET (`^`) to U+0060 GRAVE +        // ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE (`{`) to U+007E TILDE +        // (`~`). +        // +        // See: +        // **\[RFC5322]**: +        // [Internet Message Format](https://tools.ietf.org/html/rfc5322). +        // P. Resnick. +        // IETF. +        // +        // [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric +        Some( +            b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~', +        ) => {              tokenizer.consume();              State::Fn(Box::new(email_atext))          } @@ -236,7 +255,8 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State {  /// ```  fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State {      match tokenizer.current { -        Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size), +        // ASCII alphanumeric. +        Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => email_value(tokenizer, size),          _ => State::Nok,      }  } @@ -279,7 +299,8 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State {              tokenizer.consume();              State::Fn(Box::new(move |t| email_value(t, size + 1)))          } -        Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { +        // ASCII alphanumeric. +        Some(b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_DOMAIN_SIZE_MAX => {              tokenizer.consume();              State::Fn(Box::new(move |t| email_label(t, size + 1)))          } @@ -307,23 +328,3 @@ fn end(tokenizer: &mut Tokenizer) -> State {          _ => unreachable!("expected `>`"),      }  } - -/// Check whether the character code represents an ASCII atext. 
-/// -/// atext is an ASCII alphanumeric (see [`is_ascii_alphanumeric`][]), or a character in -/// the inclusive ranges U+0023 NUMBER SIGN (`#`) to U+0027 APOSTROPHE (`'`), -/// U+002A ASTERISK (`*`), U+002B PLUS SIGN (`+`), U+002D DASH (`-`), U+002F -/// SLASH (`/`), U+003D EQUALS TO (`=`), U+003F QUESTION MARK (`?`), U+005E -/// CARET (`^`) to U+0060 GRAVE ACCENT (`` ` ``), or U+007B LEFT CURLY BRACE -/// (`{`) to U+007E TILDE (`~`). -/// -/// See: -/// **\[RFC5322]**: -/// [Internet Message Format](https://tools.ietf.org/html/rfc5322). -/// P. Resnick. -/// IETF. -/// -/// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric -fn is_ascii_atext(byte: u8) -> bool { -    matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' | b'A'..=b'Z' | b'^'..=b'~') -} diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index 02e8b62..4419d7a 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -63,7 +63,8 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  /// ```  fn inside(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { -        Some(byte) if byte.is_ascii_punctuation() => { +        // ASCII punctuation. +        Some(b'!'..=b'/' | b':'..=b'@' | b'['..=b'`' | b'{'..=b'~') => {              tokenizer.enter(Token::CharacterEscapeValue);              tokenizer.consume();              tokenizer.exit(Token::CharacterEscapeValue); diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 90763c1..cd489a4 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -66,67 +66,18 @@ use crate::constant::{      CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX,  };  use crate::token::Token; -use crate::tokenizer::{Point, State, Tokenizer}; -use crate::util::slice::{Position, Slice}; - -/// Kind of a character reference. -#[derive(Debug, Clone, PartialEq)] -pub enum Kind { -    /// Numeric decimal character reference. -    /// -    /// ```markdown -    /// > | a	b -    ///      ^^^^^ -    /// ``` -    Decimal, -    /// Numeric hexadecimal character reference. -    /// -    /// ```markdown -    /// > | a{b -    ///      ^^^^^^ -    /// ``` -    Hexadecimal, -    /// Named character reference. -    /// -    /// ```markdown -    /// > | a&b -    ///      ^^^^^ -    /// ``` -    Named, -} - -impl Kind { -    /// Get the maximum size of characters allowed in the value of a character -    /// reference. -    fn max(&self) -> usize { -        match self { -            Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, -            Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, -            Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX, -        } -    } - -    /// Check if a byte ([`u8`]) is allowed. -    fn allowed(&self, byte: u8) -> bool { -        let check = match self { -            Kind::Hexadecimal => u8::is_ascii_hexdigit, -            Kind::Decimal => u8::is_ascii_digit, -            Kind::Named => u8::is_ascii_alphanumeric, -        }; - -        check(&byte) -    } -} +use crate::tokenizer::{State, Tokenizer}; +use crate::util::slice::Slice;  /// State needed to parse character references.  #[derive(Debug, Clone)]  struct Info { -    /// Place of value start. -    start: Point, -    /// Size of value. -    size: usize, -    /// Kind of character reference. -    kind: Kind, +    /// Index of where value starts. +    start: usize, +    /// Marker of character reference. 
+    marker: u8, +    /// Maximum number of characters in the value for this kind. +    max: usize,  }  /// Start of a character reference. @@ -174,9 +125,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {          value(              tokenizer,              Info { -                start: tokenizer.point.clone(), -                size: 0, -                kind: Kind::Named, +                start: tokenizer.point.index, +                marker: b'&', +                max: CHARACTER_REFERENCE_NAMED_SIZE_MAX,              },          )      } @@ -198,17 +149,17 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {          tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal);          tokenizer.enter(Token::CharacterReferenceValue);          let info = Info { -            start: tokenizer.point.clone(), -            size: 0, -            kind: Kind::Hexadecimal, +            start: tokenizer.point.index, +            marker: b'x', +            max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX,          };          State::Fn(Box::new(|t| value(t, info)))      } else {          tokenizer.enter(Token::CharacterReferenceValue);          let info = Info { -            start: tokenizer.point.clone(), -            size: 0, -            kind: Kind::Decimal, +            start: tokenizer.point.index, +            marker: b'#', +            max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX,          };          value(tokenizer, info)      } @@ -227,21 +178,22 @@ fn numeric(tokenizer: &mut Tokenizer) -> State {  /// > | a	b  ///         ^  /// ``` -fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn value(tokenizer: &mut Tokenizer, info: Info) -> State { +    let size = tokenizer.point.index - info.start; +      match tokenizer.current { -        Some(b';') if info.size > 0 => { -            if Kind::Named == info.kind { -                // To do: fix slice. -                let value = Slice::from_position( +        Some(b';') if size > 0 => { +            // Named. +            if info.marker == b'&' { +                // Guaranteed to be valid ASCII bytes. +                let slice = Slice::from_indices(                      tokenizer.parse_state.bytes, -                    &Position { -                        start: &info.start, -                        end: &tokenizer.point, -                    }, -                ) -                .serialize(); +                    info.start, +                    tokenizer.point.index, +                ); +                let name = slice.as_str(); -                if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) { +                if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) {                      return State::Nok;                  }              } @@ -253,14 +205,22 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State {              tokenizer.exit(Token::CharacterReference);              State::Ok          } -        Some(byte) => { -            if info.size < info.kind.max() && info.kind.allowed(byte) { -                info.size += 1; -                tokenizer.consume(); -                State::Fn(Box::new(|t| value(t, info))) -            } else { -                State::Nok -            } +        // ASCII digit, for named, decimal, and hexadecimal references. +        Some(b'0'..=b'9') if size < info.max => { +            tokenizer.consume(); +            State::Fn(Box::new(|t| value(t, info))) +        } +        // ASCII hex letters, for named and hexadecimal references. 
+        Some(b'A'..=b'F' | b'a'..=b'f') +            if matches!(info.marker, b'&' | b'x') && size < info.max => +        { +            tokenizer.consume(); +            State::Fn(Box::new(|t| value(t, info))) +        } +        // Non-hex ASCII alphabeticals, for named references. +        Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => { +            tokenizer.consume(); +            State::Fn(Box::new(|t| value(t, info)))          }          _ => State::Nok,      } diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 21e9259..c4c3e86 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -110,53 +110,6 @@ use crate::token::Token;  use crate::tokenizer::{ContentType, State, Tokenizer};  use crate::util::slice::{Position, Slice}; -/// Kind of fences. -#[derive(Debug, Clone, PartialEq)] -pub enum Kind { -    /// Grave accent (tick) code. -    /// -    /// ## Example -    /// -    /// ````markdown -    /// ```rust -    /// println!("I <3 🦀"); -    /// ``` -    /// ```` -    GraveAccent, -    /// Tilde code. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// ~~~rust -    /// println!("I <3 🦀"); -    /// ~~~ -    /// ``` -    Tilde, -} - -impl Kind { -    /// Turn the kind into a byte ([u8]). -    fn as_byte(&self) -> u8 { -        match self { -            Kind::GraveAccent => b'`', -            Kind::Tilde => b'~', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `~` or `` ` ``. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'`' => Kind::GraveAccent, -            b'~' => Kind::Tilde, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// State needed to parse code (fenced).  #[derive(Debug, Clone)]  struct Info { @@ -165,8 +118,8 @@ struct Info {      /// Number of tabs or spaces of indentation before the opening fence      /// sequence.      prefix: usize, -    /// Kind of fences. -    kind: Kind, +    /// Marker of fences (`u8`). +    marker: u8,  }  /// Start of fenced code. 
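The reworked `value` state above replaces `Kind::allowed`/`Kind::max` with explicit byte-range arms. A standalone predicate capturing the same three classes (a hypothetical helper, not in the crate; the real state additionally enforces `size < info.max` per arm):

```rust
fn value_byte_allowed(marker: u8, byte: u8) -> bool {
    match byte {
        // Digits are valid for named, decimal, and hexadecimal forms.
        b'0'..=b'9' => true,
        // Hex letters are valid for named and hexadecimal forms.
        b'A'..=b'F' | b'a'..=b'f' => matches!(marker, b'&' | b'x'),
        // Remaining ASCII letters occur only in named references.
        b'G'..=b'Z' | b'g'..=b'z' => marker == b'&',
        _ => false,
    }
}

fn main() {
    assert!(value_byte_allowed(b'x', b'f'));
    assert!(!value_byte_allowed(b'#', b'f'));
    assert!(value_byte_allowed(b'&', b'q'));
}
```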
@@ -178,15 +131,20 @@ struct Info {  ///   | ~~~  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    };      if tokenizer.parse_state.constructs.code_fenced {          tokenizer.enter(Token::CodeFenced);          tokenizer.enter(Token::CodeFencedFence); -        tokenizer.go(space_or_tab_min_max(0, max), before_sequence_open)(tokenizer) +        tokenizer.go( +            space_or_tab_min_max( +                0, +                if tokenizer.parse_state.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                }, +            ), +            before_sequence_open, +        )(tokenizer)      } else {          State::Nok      } @@ -210,23 +168,22 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {                  tokenizer.parse_state.bytes,                  &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),              ) -            .size(); +            .len();          }      } -    match tokenizer.current { -        Some(byte) if matches!(byte, b'`' | b'~') => { -            tokenizer.enter(Token::CodeFencedFenceSequence); -            sequence_open( -                tokenizer, -                Info { -                    prefix, -                    size: 0, -                    kind: Kind::from_byte(byte), -                }, -            ) -        } -        _ => State::Nok, +    if let Some(b'`' | b'~') = tokenizer.current { +        tokenizer.enter(Token::CodeFencedFenceSequence); +        sequence_open( +            tokenizer, +            Info { +                prefix, +                size: 0, +                marker: tokenizer.current.unwrap(), +            }, +        ) +    } else { +        State::Nok      }  } @@ -240,7 +197,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {  /// ```  fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {              tokenizer.consume();              State::Fn(Box::new(|t| {                  info.size += 1; @@ -302,7 +259,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {              tokenizer.exit(Token::CodeFencedFenceInfo);              tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)          } -        Some(b'`') if info.kind == Kind::GraveAccent => State::Nok, +        Some(b'`') if info.marker == b'`' => State::Nok,          Some(_) => {              tokenizer.consume();              State::Fn(Box::new(|t| info_inside(t, info))) @@ -352,7 +309,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {              tokenizer.concrete = true;              at_break(tokenizer, info)          } -        Some(b'`') if info.kind == Kind::GraveAccent => State::Nok, +        Some(b'`') if info.marker == b'`' => State::Nok,          _ => {              tokenizer.consume();              State::Fn(Box::new(|t| meta(t, info))) @@ -432,14 +389,18 @@ fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {  ///     ^  /// ```  fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    }; -      
tokenizer.enter(Token::CodeFencedFence); -    tokenizer.go(space_or_tab_min_max(0, max), |t| close_before(t, info))(tokenizer) +    tokenizer.go( +        space_or_tab_min_max( +            0, +            if tokenizer.parse_state.constructs.code_indented { +                TAB_SIZE - 1 +            } else { +                usize::MAX +            }, +        ), +        |t| close_before(t, info), +    )(tokenizer)  }  /// In a closing fence, after optional whitespace, before sequence. @@ -452,7 +413,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {              tokenizer.enter(Token::CodeFencedFenceSequence);              close_sequence(tokenizer, info, 0)          } @@ -470,7 +431,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'`' | b'~') if tokenizer.current.unwrap() == info.marker => {              tokenizer.consume();              State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))          } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 4a3a9f6..81a3080 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -62,11 +62,11 @@ use crate::tokenizer::{State, Tokenizer};  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State {      // Do not interrupt paragraphs. -    if tokenizer.interrupt || !tokenizer.parse_state.constructs.code_indented { -        State::Nok -    } else { +    if !tokenizer.interrupt && tokenizer.parse_state.constructs.code_indented {          tokenizer.enter(Token::CodeIndented);          tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer) +    } else { +        State::Nok      }  } @@ -129,29 +129,26 @@ fn after(tokenizer: &mut Tokenizer) -> State {  ///   |     bbb  /// ```  fn further_start(tokenizer: &mut Tokenizer) -> State { -    if tokenizer.lazy { -        State::Nok -    } else { -        match tokenizer.current { -            Some(b'\n') => { -                tokenizer.enter(Token::LineEnding); -                tokenizer.consume(); -                tokenizer.exit(Token::LineEnding); -                State::Fn(Box::new(further_start)) -            } -            _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { -                Box::new(if ok { further_end } else { further_begin }) -            })(tokenizer), +    match tokenizer.current { +        Some(b'\n') if !tokenizer.lazy => { +            tokenizer.enter(Token::LineEnding); +            tokenizer.consume(); +            tokenizer.exit(Token::LineEnding); +            State::Fn(Box::new(further_start))          } +        _ if !tokenizer.lazy => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| { +            Box::new(if ok { further_end } else { further_begin }) +        })(tokenizer), +        _ => State::Nok,      }  } -/// After a proper indent. +/// At an eol, which is followed by an indented line.  
///  /// ```markdown -///   |     aaa -/// > |     bbb -///         ^ +/// >  |     aaa +///             ^ +///    |     bbb  /// ```  fn further_end(_tokenizer: &mut Tokenizer) -> State {      State::Ok diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index b36a208..d70fbc2 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -95,14 +95,13 @@ use crate::tokenizer::{State, Tokenizer};  ///      ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let len = tokenizer.events.len(); -      match tokenizer.current {          Some(b'`')              if tokenizer.parse_state.constructs.code_text                  && (tokenizer.previous != Some(b'`') -                    || (len > 0 -                        && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) => +                    || (!tokenizer.events.is_empty() +                        && tokenizer.events[tokenizer.events.len() - 1].token_type +                            == Token::CharacterEscape)) =>          {              tokenizer.enter(Token::CodeText);              tokenizer.enter(Token::CodeTextSequence); diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 14755c9..bd7df82 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -110,17 +110,18 @@ use crate::util::skip::opt_back as skip_opt_back;  ///     ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let definition_before = !tokenizer.events.is_empty() -        && tokenizer.events[skip_opt_back( -            &tokenizer.events, -            tokenizer.events.len() - 1, -            &[Token::LineEnding, Token::SpaceOrTab], -        )] -        .token_type -            == Token::Definition; -      // Do not interrupt paragraphs (but do follow definitions). -    if (!tokenizer.interrupt || definition_before) && tokenizer.parse_state.constructs.definition { +    let possible = !tokenizer.interrupt +        || (!tokenizer.events.is_empty() +            && tokenizer.events[skip_opt_back( +                &tokenizer.events, +                tokenizer.events.len() - 1, +                &[Token::LineEnding, Token::SpaceOrTab], +            )] +            .token_type +                == Token::Definition); + +    if possible && tokenizer.parse_state.constructs.definition {          tokenizer.enter(Token::Definition);          // Note: arbitrary whitespace allowed even if code (indented) is on.          
tokenizer.attempt_opt(space_or_tab(), before)(tokenizer) diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index cdbc192..d09bf54 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -54,7 +54,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {          Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => {              tokenizer.enter(Token::HardBreakEscape);              tokenizer.consume(); -            State::Fn(Box::new(inside)) +            State::Fn(Box::new(after))          }          _ => State::Nok,      } @@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  ///       ^  ///   | b  /// ``` -fn inside(tokenizer: &mut Tokenizer) -> State { +fn after(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current {          Some(b'\n') => {              tokenizer.exit(Token::HardBreakEscape); diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 9a73b77..aa388ee 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -66,15 +66,19 @@ use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer};  ///     ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    }; -      if tokenizer.parse_state.constructs.heading_atx {          tokenizer.enter(Token::HeadingAtx); -        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) +        tokenizer.go( +            space_or_tab_min_max( +                0, +                if tokenizer.parse_state.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                }, +            ), +            before, +        )(tokenizer)      } else {          State::Nok      } @@ -101,19 +105,19 @@ fn before(tokenizer: &mut Tokenizer) -> State {  /// > | ## aa  ///     ^  /// ``` -fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State { +fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State {      match tokenizer.current { -        None | Some(b'\n') if rank > 0 => { +        None | Some(b'\n') if size > 0 => {              tokenizer.exit(Token::HeadingAtxSequence);              at_break(tokenizer)          } -        Some(b'#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { +        Some(b'#') if size < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {              tokenizer.consume();              State::Fn(Box::new(move |tokenizer| { -                sequence_open(tokenizer, rank + 1) +                sequence_open(tokenizer, size + 1)              }))          } -        _ if rank > 0 => { +        _ if size > 0 => {              tokenizer.exit(Token::HeadingAtxSequence);              tokenizer.go(space_or_tab(), at_break)(tokenizer)          } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 2a4adbf..98d7843 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -63,52 +63,6 @@ use crate::token::Token;  use crate::tokenizer::{EventType, State, Tokenizer};  use crate::util::skip::opt_back as skip_opt_back; -/// Kind of underline. -#[derive(Debug, Clone, PartialEq)] -pub enum Kind { -    /// Dash (rank 2) heading. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// alpha -    /// ----- -    /// ``` -    Dash, - -    /// Equals to (rank 1) heading. 
-    /// -    /// ## Example -    /// -    /// ```markdown -    /// alpha -    /// ===== -    /// ``` -    EqualsTo, -} - -impl Kind { -    /// Turn the kind into a byte ([u8]). -    fn as_byte(&self) -> u8 { -        match self { -            Kind::Dash => b'-', -            Kind::EqualsTo => b'=', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `-` or `=`. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'-' => Kind::Dash, -            b'=' => Kind::EqualsTo, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// At a line ending, presumably an underline.  ///  /// ```markdown @@ -117,23 +71,29 @@ impl Kind {  ///     ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    }; -    let paragraph_before = !tokenizer.events.is_empty() -        && tokenizer.events[skip_opt_back( -            &tokenizer.events, -            tokenizer.events.len() - 1, -            &[Token::LineEnding, Token::SpaceOrTab], -        )] -        .token_type -            == Token::Paragraph; - -    // Require a paragraph before and do not allow on a lazy line. -    if paragraph_before && !tokenizer.lazy && tokenizer.parse_state.constructs.heading_setext { -        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) +    if tokenizer.parse_state.constructs.heading_setext +        && !tokenizer.lazy +        // Require a paragraph before. +        && (!tokenizer.events.is_empty() +            && tokenizer.events[skip_opt_back( +                &tokenizer.events, +                tokenizer.events.len() - 1, +                &[Token::LineEnding, Token::SpaceOrTab], +            )] +            .token_type +                == Token::Paragraph) +    { +        tokenizer.go( +            space_or_tab_min_max( +                0, +                if tokenizer.parse_state.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                }, +            ), +            before, +        )(tokenizer)      } else {          State::Nok      } @@ -148,9 +108,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  /// ```  fn before(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { -        Some(byte) if matches!(byte, b'-' | b'=') => { +        Some(b'-' | b'=') => {              tokenizer.enter(Token::HeadingSetextUnderline); -            inside(tokenizer, Kind::from_byte(byte)) +            inside(tokenizer, tokenizer.current.unwrap())          }          _ => State::Nok,      } @@ -163,11 +123,11 @@ fn before(tokenizer: &mut Tokenizer) -> State {  /// > | ==  ///     ^  /// ``` -fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State { +fn inside(tokenizer: &mut Tokenizer, marker: u8) -> State {      match tokenizer.current { -        Some(byte) if byte == kind.as_byte() => { +        Some(b'-' | b'=') if tokenizer.current.unwrap() == marker => {              tokenizer.consume(); -            State::Fn(Box::new(move |t| inside(t, kind))) +            State::Fn(Box::new(move |t| inside(t, marker)))          }          _ => {              tokenizer.exit(Token::HeadingSetextUnderline); diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 5860c5d..064da35 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -98,17 +98,17 @@  //! 
[html_block_names]: crate::constant::HTML_BLOCK_NAMES  //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing -use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE}; +use crate::constant::{ +    HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE, +};  use crate::construct::{      blank_line::start as blank_line,      partial_non_lazy_continuation::start as partial_non_lazy_continuation,      partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},  };  use crate::token::Token; -use crate::tokenizer::{Point, State, Tokenizer}; -use crate::util::slice::{Position, Slice}; - -const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; +use crate::tokenizer::{State, Tokenizer}; +use crate::util::slice::Slice;  /// Kind of HTML (flow).  #[derive(Debug, PartialEq)] @@ -129,49 +129,6 @@ enum Kind {      Complete,  } -/// Type of quote, if we’re in a quoted attribute, in complete (condition 7). -#[derive(Debug, PartialEq)] -enum QuoteKind { -    /// In a double quoted (`"`) attribute value. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// <a b="c" /> -    /// ``` -    Double, -    /// In a single quoted (`'`) attribute value. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// <a b='c' /> -    /// ``` -    Single, -} - -impl QuoteKind { -    /// Turn the kind into a byte ([u8]). -    fn as_byte(&self) -> u8 { -        match self { -            QuoteKind::Double => b'"', -            QuoteKind::Single => b'\'', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `"` or `'`. -    fn from_byte(byte: u8) -> QuoteKind { -        match byte { -            b'"' => QuoteKind::Double, -            b'\'' => QuoteKind::Single, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// State needed to parse HTML (flow).  #[derive(Debug)]  struct Info { @@ -179,12 +136,10 @@ struct Info {      kind: Kind,      /// Whether this is a start tag (`<` not followed by `/`).      start_tag: bool, -    /// Used depending on `kind` to collect all parsed bytes. -    start: Option<Point>, -    /// Collected index, for various reasons. -    size: usize, +    /// Start index of a tag name or cdata prefix. +    start: usize,      /// Current quote, when in a double or single quoted attribute value. -    quote: Option<QuoteKind>, +    quote: u8,  }  /// Start of HTML (flow), before optional whitespace. @@ -194,19 +149,17 @@ struct Info {  ///     ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    }; -      if tokenizer.parse_state.constructs.html_flow {          tokenizer.enter(Token::HtmlFlow);          tokenizer.go(              space_or_tab_with_options(SpaceOrTabOptions {                  kind: Token::HtmlFlowData,                  min: 0, -                max, +                max: if tokenizer.parse_state.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                },                  connect: false,                  content_type: None,              }), @@ -249,9 +202,8 @@ fn open(tokenizer: &mut Tokenizer) -> State {          kind: Kind::Basic,          // Assume closing tag (or no tag).          
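Replacing `start: Option<Point>` plus a separate `size` counter with a single `start: usize` byte index works because everything needed later, the length as well as the name itself, can be derived from two indices into the source bytes. A hedged sketch of slicing a tag name back out under that scheme (the function and sample input are illustrative, not the crate's API):

```rust
/// Borrow the tag name between two byte indices; the bytes matched by
/// the tokenizer are ASCII, so UTF-8 validation cannot fail here.
fn tag_name(bytes: &[u8], start: usize, end: usize) -> String {
    std::str::from_utf8(&bytes[start..end])
        .expect("ASCII tag names are valid UTF-8")
        .trim()
        .to_ascii_lowercase()
}

fn main() {
    let bytes = b"<Script src=x>";
    // `start` is recorded at the first alphabetical byte; `end` is the
    // current position once the name ends.
    assert_eq!(tag_name(bytes, 1, 7), "script");
}
```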
start_tag: false, -        start: None, -        size: 0, -        quote: None, +        start: 0, +        quote: 0,      };      match tokenizer.current { @@ -261,7 +213,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {          }          Some(b'/') => {              tokenizer.consume(); -            info.start = Some(tokenizer.point.clone()); +            info.start = tokenizer.point.index;              State::Fn(Box::new(|t| tag_close_start(t, info)))          }          Some(b'?') => { @@ -273,9 +225,10 @@ fn open(tokenizer: &mut Tokenizer) -> State {              // right now, so we do need to search for `>`, similar to declarations.              State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))          } +        // ASCII alphabetical.          Some(b'A'..=b'Z' | b'a'..=b'z') => {              info.start_tag = true; -            info.start = Some(tokenizer.point.clone()); +            info.start = tokenizer.point.index;              tag_name(tokenizer, info)          }          _ => State::Nok, @@ -299,12 +252,6 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {              info.kind = Kind::Comment;              State::Fn(Box::new(|t| comment_open_inside(t, info)))          } -        Some(b'[') => { -            tokenizer.consume(); -            info.kind = Kind::Cdata; -            info.size = 0; -            State::Fn(Box::new(|t| cdata_open_inside(t, info))) -        }          Some(b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              info.kind = Kind::Declaration; @@ -312,6 +259,12 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {              tokenizer.concrete = true;              State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))          } +        Some(b'[') => { +            tokenizer.consume(); +            info.kind = Kind::Cdata; +            info.start = tokenizer.point.index; +            State::Fn(Box::new(|t| cdata_open_inside(t, info))) +        }          _ => State::Nok,      }  } @@ -342,12 +295,11 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == CDATA_SEARCH[info.size] => { -            info.size += 1; +        Some(byte) if byte == HTML_CDATA_PREFIX[tokenizer.point.index - info.start] => {              tokenizer.consume(); -            if info.size == CDATA_SEARCH.len() { -                info.size = 0; +            if tokenizer.point.index - info.start == HTML_CDATA_PREFIX.len() { +                info.start = 0;                  // Do not form containers.                  tokenizer.concrete = true;                  State::Fn(Box::new(|t| continuation(t, info))) @@ -367,6 +319,7 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {  /// ```  fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { +        // ASCII alphabetical.          
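The new `cdata_open_inside` drops its counter the same way: the offset into `HTML_CDATA_PREFIX` is recomputed as `tokenizer.point.index - info.start`. A small standalone sketch of matching a fixed prefix by offset rather than by a stored counter (the real constant lives in `crate::constant`):

```rust
const HTML_CDATA_PREFIX: [u8; 6] = *b"CDATA[";

/// Check whether `bytes` continues with the CDATA prefix at `start`,
/// comparing each byte against the prefix byte at `index - start`.
fn matches_prefix(bytes: &[u8], start: usize) -> bool {
    let mut index = start;
    while index - start < HTML_CDATA_PREFIX.len() {
        if bytes.get(index) != Some(&HTML_CDATA_PREFIX[index - start]) {
            return false;
        }
        index += 1;
    }
    true
}

fn main() {
    assert!(matches_prefix(b"<![CDATA[x]]>", 3));
    assert!(!matches_prefix(b"<![CDAT", 3));
}
```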
Some(b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(|t| tag_name(t, info))) @@ -387,17 +340,18 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current {          None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => {              let slash = matches!(tokenizer.current, Some(b'/')); -            let start = info.start.take().unwrap(); -            let name = Slice::from_position( +            // Guaranteed to be valid ASCII bytes. +            let slice = Slice::from_indices(                  tokenizer.parse_state.bytes, -                &Position { -                    start: &start, -                    end: &tokenizer.point, -                }, -            ) -            .serialize() -            .trim() -            .to_lowercase(); +                info.start, +                tokenizer.point.index, +            ); +            let name = slice +                .as_str() +                // The line ending case might result in a `\r` that is already accounted for. +                .trim() +                .to_ascii_lowercase(); +            info.start = 0;              if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {                  info.kind = Kind::Raw; @@ -427,6 +381,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {                  }              }          } +        // ASCII alphanumerical and `-`.          Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(|t| tag_name(t, info))) @@ -490,18 +445,19 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { +        Some(b'\t' | b' ') => { +            tokenizer.consume(); +            State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) +        }          Some(b'/') => {              tokenizer.consume();              State::Fn(Box::new(|t| complete_end(t, info)))          } +        // ASCII alphanumerical and `:` and `_`.          Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(|t| complete_attribute_name(t, info)))          } -        Some(b'\t' | b' ') => { -            tokenizer.consume(); -            State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) -        }          _ => complete_end(tokenizer, info),      }  } @@ -518,6 +474,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat  /// ```  fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { +        // ASCII alphanumerical and `-`, `.`, `:`, and `_`.          Some(b'-' | b'.' 
| b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(|t| complete_attribute_name(t, info))) @@ -537,14 +494,14 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(b'=') => { -            tokenizer.consume(); -            State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) -        }          Some(b'\t' | b' ') => {              tokenizer.consume();              State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))          } +        Some(b'=') => { +            tokenizer.consume(); +            State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) +        }          _ => complete_attribute_name_before(tokenizer, info),      }  } @@ -561,15 +518,15 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State  fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current {          None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, -        Some(byte) if matches!(byte, b'"' | b'\'') => { -            info.quote = Some(QuoteKind::from_byte(byte)); -            tokenizer.consume(); -            State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) -        }          Some(b'\t' | b' ') => {              tokenizer.consume();              State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))          } +        Some(b'"' | b'\'') => { +            info.quote = tokenizer.current.unwrap(); +            tokenizer.consume(); +            State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) +        }          _ => complete_attribute_value_unquoted(tokenizer, info),      }  } @@ -585,7 +542,7 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) ->  fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current {          None | Some(b'\n') => State::Nok, -        Some(byte) if byte == info.quote.as_ref().unwrap().as_byte() => { +        Some(b'"' | b'\'') if tokenizer.current.unwrap() == info.quote => {              tokenizer.consume();              State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))          } @@ -673,6 +630,21 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { +        Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { +            tokenizer.exit(Token::HtmlFlowData); +            tokenizer.check(blank_line_before, |ok| { +                if ok { +                    Box::new(continuation_after) +                } else { +                    Box::new(move |t| continuation_start(t, info)) +                } +            })(tokenizer) +        } +        // Note: important that this is after the basic/complete case. 
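As the note in the diff says, arm order is load-bearing here: Rust tries `match` arms top down, so the guarded `Some(b'\n')` arm for the basic/complete kinds must come before the catch-all `None | Some(b'\n')` arm that covers every other kind. A tiny sketch of the hazard, with a boolean standing in for the `Kind` check:

```rust
/// `basic` stands in for `info.kind == Kind::Basic || info.kind == Kind::Complete`.
fn classify(byte: Option<u8>, basic: bool) -> &'static str {
    match byte {
        // Guarded arm first: only taken when the guard holds.
        Some(b'\n') if basic => "check for a blank line before continuing",
        // The same pattern without the guard catches the remaining kinds.
        None | Some(b'\n') => "continue across the line ending",
        _ => "consume and stay in `continuation`",
    }
}

fn main() {
    assert_eq!(classify(Some(b'\n'), true), "check for a blank line before continuing");
    assert_eq!(classify(Some(b'\n'), false), "continue across the line ending");
}
```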
+        None | Some(b'\n') => { +            tokenizer.exit(Token::HtmlFlowData); +            continuation_start(tokenizer, info) +        }          Some(b'-') if info.kind == Kind::Comment => {              tokenizer.consume();              State::Fn(Box::new(|t| continuation_comment_inside(t, info))) @@ -693,20 +665,6 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {              tokenizer.consume();              State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))          } -        Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { -            tokenizer.exit(Token::HtmlFlowData); -            tokenizer.check(blank_line_before, |ok| { -                if ok { -                    Box::new(continuation_after) -                } else { -                    Box::new(move |t| continuation_start(t, info)) -                } -            })(tokenizer) -        } -        None | Some(b'\n') => { -            tokenizer.exit(Token::HtmlFlowData); -            continuation_start(tokenizer, info) -        }          _ => {              tokenizer.consume();              State::Fn(Box::new(|t| continuation(t, info))) @@ -793,7 +751,7 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State      match tokenizer.current {          Some(b'/') => {              tokenizer.consume(); -            info.start = Some(tokenizer.point.clone()); +            info.start = tokenizer.point.index;              State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))          }          _ => continuation(tokenizer, info), @@ -809,18 +767,15 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State  fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current {          Some(b'>') => { -            info.size = 0; - -            let start = info.start.take().unwrap(); -            let name = Slice::from_position( +            // Guaranteed to be valid ASCII bytes. +            let slice = Slice::from_indices(                  tokenizer.parse_state.bytes, -                &Position { -                    start: &start, -                    end: &tokenizer.point, -                }, -            ) -            .serialize() -            .to_lowercase(); +                info.start, +                tokenizer.point.index, +            ); +            let name = slice.as_str().to_ascii_lowercase(); + +            info.start = 0;              if HTML_RAW_NAMES.contains(&name.as_str()) {                  tokenizer.consume(); @@ -829,13 +784,14 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State                  continuation(tokenizer, info)              }          } -        Some(b'A'..=b'Z' | b'a'..=b'z') if info.size < HTML_RAW_SIZE_MAX => { +        Some(b'A'..=b'Z' | b'a'..=b'z') +            if tokenizer.point.index - info.start < HTML_RAW_SIZE_MAX => +        {              tokenizer.consume(); -            info.size += 1;              State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))          }          _ => { -            info.size = 0; +            info.start = 0;              continuation(tokenizer, info)          }      } diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index f10a476..51beda5 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -54,12 +54,11 @@  //! [html_flow]: crate::construct::html_flow  //! 
[html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +use crate::constant::HTML_CDATA_PREFIX;  use crate::construct::partial_space_or_tab::space_or_tab;  use crate::token::Token;  use crate::tokenizer::{State, StateFn, Tokenizer}; -const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; -  /// Start of HTML (text)  ///  /// ```markdown @@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {              tokenizer.consume();              State::Fn(Box::new(instruction))          } +        // ASCII alphabetical.          Some(b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(tag_open)) @@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {              tokenizer.consume();              State::Fn(Box::new(comment_open_inside))          } -        Some(b'[') => { -            tokenizer.consume(); -            State::Fn(Box::new(|t| cdata_open_inside(t, 0))) -        } +        // ASCII alphabetical.          Some(b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(declaration))          } +        Some(b'[') => { +            tokenizer.consume(); +            State::Fn(Box::new(|t| cdata_open_inside(t, 0))) +        }          _ => State::Nok,      }  } @@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {  /// > | a <![CDATA[>&<]]> b  ///          ^^^^^^  /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State { -    match tokenizer.current { -        Some(byte) if byte == CDATA_SEARCH[index] => { -            tokenizer.consume(); +fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State { +    if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) { +        tokenizer.consume(); -            if index + 1 == CDATA_SEARCH.len() { -                State::Fn(Box::new(cdata)) -            } else { -                State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1))) -            } +        if size + 1 == HTML_CDATA_PREFIX.len() { +            State::Fn(Box::new(cdata)) +        } else { +            State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1)))          } -        _ => State::Nok, +    } else { +        State::Nok      }  } @@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {  /// ```  fn tag_close_start(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { +        // ASCII alphabetical.          Some(b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(tag_close)) @@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {  /// ```  fn tag_close(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { +        // ASCII alphanumerical and `-`.          Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(tag_close)) @@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {  /// ```  fn tag_open(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { +        // ASCII alphanumerical and `-`.          Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(tag_open)) @@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {              tokenizer.consume();              State::Fn(Box::new(end))          } +        // ASCII alphabetical and `:` and `_`. 
         Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(tag_open_attribute_name)) @@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {  /// ```  fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current { +        // ASCII alphabetical and `-`, `.`, `:`, and `_`.          Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {              tokenizer.consume();              State::Fn(Box::new(tag_open_attribute_name)) @@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {              tokenizer.consume();              State::Fn(Box::new(tag_open_attribute_value_before))          } -        Some(byte) if byte == b'"' || byte == b'\'' => { +        Some(b'"' | b'\'') => { +            let marker = tokenizer.current.unwrap();              tokenizer.consume(); -            State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte))) +            State::Fn(Box::new(move |t| { +                tag_open_attribute_value_quoted(t, marker) +            }))          }          Some(_) => {              tokenizer.consume(); @@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta              tokenizer,              Box::new(move |t| tag_open_attribute_value_quoted(t, marker)),          ), -        Some(byte) if byte == marker => { +        Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => {              tokenizer.consume();              State::Fn(Box::new(tag_open_attribute_value_quoted_after))          } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 6399f81..a1ec8d9 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -214,16 +214,14 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {                  media: Media {                      start: label_start.start,                      end: (label_end_start, label_end_start + 3), -                    // To do: virtual spaces not needed, create a `to_str`?                      id: normalize_identifier( -                        &Slice::from_position( +                        // We don’t care about virtual spaces, so `indices` and `as_str` are fine. 
+                        Slice::from_indices(                              tokenizer.parse_state.bytes, -                            &Position { -                                start: &tokenizer.events[label_start.start.1].point, -                                end: &tokenizer.events[label_end_start - 1].point, -                            }, +                            tokenizer.events[label_start.start.1].point.index, +                            tokenizer.events[label_end_start - 1].point.index,                          ) -                        .serialize(), +                        .as_str(),                      ),                  },              }; @@ -366,11 +364,11 @@ fn ok(tokenizer: &mut Tokenizer, mut info: Info) -> State {  ///        ^  /// ```  fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State { -    let label_start = tokenizer +    tokenizer          .label_start_stack          .get_mut(label_start_index) -        .unwrap(); -    label_start.balanced = true; +        .unwrap() +        .balanced = true;      State::Nok  } @@ -529,23 +527,24 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State {  ///          ^  /// ```  fn full_reference_after(tokenizer: &mut Tokenizer) -> State { -    let end = skip::to_back( -        &tokenizer.events, -        tokenizer.events.len() - 1, -        &[Token::ReferenceString], -    ); - -    // To do: virtual spaces not needed, create a `to_str`? -    let id = Slice::from_position( -        tokenizer.parse_state.bytes, -        &Position::from_exit_event(&tokenizer.events, end), -    ) -    .serialize(); -      if tokenizer          .parse_state          .definitions -        .contains(&normalize_identifier(&id)) +        // We don’t care about virtual spaces, so `as_str` is fine. +        .contains(&normalize_identifier( +            Slice::from_position( +                tokenizer.parse_state.bytes, +                &Position::from_exit_event( +                    &tokenizer.events, +                    skip::to_back( +                        &tokenizer.events, +                        tokenizer.events.len() - 1, +                        &[Token::ReferenceString], +                    ), +                ), +            ) +            .as_str(), +        ))      {          State::Ok      } else { diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index d30b8dd..4a3508e 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -64,9 +64,8 @@ pub fn open(tokenizer: &mut Tokenizer) -> State {              tokenizer.consume();              tokenizer.exit(Token::LabelMarker);              tokenizer.exit(Token::LabelImage); -            let end = tokenizer.events.len() - 1;              tokenizer.label_start_stack.push(LabelStart { -                start: (end - 5, end), +                start: (tokenizer.events.len() - 6, tokenizer.events.len() - 1),                  balanced: false,                  inactive: false,              }); diff --git a/src/construct/list.rs b/src/construct/list.rs index 9b59130..d5a9899 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -56,69 +56,6 @@ use crate::util::{      slice::{Position, Slice},  }; -/// Type of list. -#[derive(Debug, PartialEq)] -enum Kind { -    /// In a dot (`.`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// 1. a -    /// ``` -    Dot, -    /// In a paren (`)`) list item. 
-    /// -    /// ## Example -    /// -    /// ```markdown -    /// 1) a -    /// ``` -    Paren, -    /// In an asterisk (`*`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// * a -    /// ``` -    Asterisk, -    /// In a plus (`+`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// + a -    /// ``` -    Plus, -    /// In a dash (`-`) list item. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// - a -    /// ``` -    Dash, -} - -impl Kind { -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `.`, `)`, `*`, `+`, or `-`. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'.' => Kind::Dot, -            b')' => Kind::Paren, -            b'*' => Kind::Asterisk, -            b'+' => Kind::Plus, -            b'-' => Kind::Dash, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// Start of list item.  ///  /// ```markdown @@ -126,15 +63,19 @@ impl Kind {  ///     ^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    let max = if tokenizer.parse_state.constructs.code_indented { -        TAB_SIZE - 1 -    } else { -        usize::MAX -    }; -      if tokenizer.parse_state.constructs.list {          tokenizer.enter(Token::ListItem); -        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer) +        tokenizer.go( +            space_or_tab_min_max( +                0, +                if tokenizer.parse_state.constructs.code_indented { +                    TAB_SIZE - 1 +                } else { +                    usize::MAX +                }, +            ), +            before, +        )(tokenizer)      } else {          State::Nok      } @@ -149,15 +90,13 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  fn before(tokenizer: &mut Tokenizer) -> State {      match tokenizer.current {          // Unordered. -        Some(b'*' | b'+' | b'-') => tokenizer.check(thematic_break, |ok| { +        Some(b'*' | b'-') => tokenizer.check(thematic_break, |ok| {              Box::new(if ok { nok } else { before_unordered })          })(tokenizer), +        Some(b'+') => before_unordered(tokenizer),          // Ordered. -        Some(byte) if byte.is_ascii_digit() && (!tokenizer.interrupt || byte == b'1') => { -            tokenizer.enter(Token::ListItemPrefix); -            tokenizer.enter(Token::ListItemValue); -            inside(tokenizer, 0) -        } +        Some(b'0'..=b'9') if !tokenizer.interrupt => before_ordered(tokenizer), +        Some(b'1') => before_ordered(tokenizer),          _ => State::Nok,      }  } @@ -175,6 +114,18 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {      marker(tokenizer)  } +/// Start of an ordered list item. +/// +/// ```markdown +/// > | 1. a +///     ^ +/// ``` +fn before_ordered(tokenizer: &mut Tokenizer) -> State { +    tokenizer.enter(Token::ListItemPrefix); +    tokenizer.enter(Token::ListItemValue); +    inside(tokenizer, 0) +} +  /// In an ordered list item value.  ///  /// ```markdown @@ -183,14 +134,14 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State {  /// ```  fn inside(tokenizer: &mut Tokenizer, size: usize) -> State {      match tokenizer.current { -        Some(byte) if byte.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { -            tokenizer.consume(); -            State::Fn(Box::new(move |t| inside(t, size + 1))) -        }          Some(b'.' 
| b')') if !tokenizer.interrupt || size < 2 => {              tokenizer.exit(Token::ListItemValue);              marker(tokenizer)          } +        Some(b'0'..=b'9') if size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { +            tokenizer.consume(); +            State::Fn(Box::new(move |t| inside(t, size + 1))) +        }          _ => State::Nok,      }  } @@ -262,7 +213,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State {  ///      ^  /// ```  fn whitespace_after(tokenizer: &mut Tokenizer) -> State { -    if matches!(tokenizer.current, Some(b'\t' | b' ')) { +    if let Some(b'\t' | b' ') = tokenizer.current {          State::Nok      } else {          State::Ok @@ -309,7 +260,7 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State {                  end: &tokenizer.point,              },          ) -        .size(); +        .len();          if blank {              prefix += 1; @@ -389,8 +340,8 @@ fn nok(_tokenizer: &mut Tokenizer) -> State {  pub fn resolve_list_item(tokenizer: &mut Tokenizer) {      let mut index = 0;      let mut balance = 0; -    let mut lists_wip: Vec<(Kind, usize, usize, usize)> = vec![]; -    let mut lists: Vec<(Kind, usize, usize, usize)> = vec![]; +    let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; +    let mut lists: Vec<(u8, usize, usize, usize)> = vec![];      // Merge list items.      while index < tokenizer.events.len() { @@ -400,12 +351,14 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {              if event.event_type == EventType::Enter {                  let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1;                  let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]); -                let kind = Kind::from_byte( -                    Slice::from_point(tokenizer.parse_state.bytes, &tokenizer.events[marker].point) -                        .head() -                        .unwrap(), -                ); -                let current = (kind, balance, index, end); +                // Guaranteed to be a valid ASCII byte. +                let marker = Slice::from_index( +                    tokenizer.parse_state.bytes, +                    tokenizer.events[marker].point.index, +                ) +                .head() +                .unwrap(); +                let current = (marker, balance, index, end);                  let mut list_index = lists_wip.len();                  let mut matched = false; @@ -475,7 +428,7 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) {          let mut list_start = tokenizer.events[list_item.2].clone();          let mut list_end = tokenizer.events[list_item.3].clone();          let token_type = match list_item.0 { -            Kind::Paren | Kind::Dot => Token::ListOrdered, +            b'.' | b')' => Token::ListOrdered,              _ => Token::ListUnordered,          };          list_start.token_type = token_type.clone(); diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 146dc40..ec5669c 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -81,10 +81,9 @@ fn inside(tokenizer: &mut Tokenizer) -> State {  /// Merge “`Paragraph`”s, which currently span a single line, into actual  /// `Paragraph`s that span multiple lines.  
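The list changes repeat the pattern from `heading_setext`, `partial_title`, and `thematic_break`: a raw marker byte (`u8`) replaces a dedicated `Kind` enum, and the byte alone is enough to classify the construct later (the title parser even stores the *closing* marker up front, mapping `(` to `)`). A sketch of the dispatch the list resolver now performs on its `(u8, usize, usize, usize)` tuples, with strings standing in for `Token` variants:

```rust
/// Pick the list token type from the stored marker byte.
fn list_token(marker: u8) -> &'static str {
    match marker {
        b'.' | b')' => "ListOrdered",
        b'*' | b'+' | b'-' => "ListUnordered",
        // The marker was read off a matched list item, so nothing else
        // can reach this point.
        _ => unreachable!("invalid marker"),
    }
}

fn main() {
    assert_eq!(list_token(b')'), "ListOrdered");
    assert_eq!(list_token(b'-'), "ListUnordered");
}
```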
pub fn resolve(tokenizer: &mut Tokenizer) { -    let len = tokenizer.events.len();      let mut index = 0; -    while index < len { +    while index < tokenizer.events.len() {          let event = &tokenizer.events[index];          if event.event_type == EventType::Enter && event.token_type == Token::Paragraph { diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs index be8d6c8..155a1a3 100644 --- a/src/construct/partial_bom.rs +++ b/src/construct/partial_bom.rs @@ -10,13 +10,12 @@ use crate::tokenizer::{State, Tokenizer};  ///     ^^^^  /// ```  pub fn start(tokenizer: &mut Tokenizer) -> State { -    match tokenizer.current { -        Some(0xEF) => { -            tokenizer.enter(Token::ByteOrderMark); -            tokenizer.consume(); -            State::Fn(Box::new(cont)) -        } -        _ => State::Nok, +    if tokenizer.current == Some(0xEF) { +        tokenizer.enter(Token::ByteOrderMark); +        tokenizer.consume(); +        State::Fn(Box::new(cont)) +    } else { +        State::Nok      }  } @@ -27,12 +26,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {  ///          ^^^^  /// ```  fn cont(tokenizer: &mut Tokenizer) -> State { -    match tokenizer.current { -        Some(0xBB) => { -            tokenizer.consume(); -            State::Fn(Box::new(end)) -        } -        _ => State::Nok, +    if tokenizer.current == Some(0xBB) { +        tokenizer.consume(); +        State::Fn(Box::new(end)) +    } else { +        State::Nok      }  } @@ -43,12 +41,11 @@ fn cont(tokenizer: &mut Tokenizer) -> State {  ///               ^^^^  /// ```  fn end(tokenizer: &mut Tokenizer) -> State { -    match tokenizer.current { -        Some(0xBF) => { -            tokenizer.consume(); -            tokenizer.exit(Token::ByteOrderMark); -            State::Ok -        } -        _ => State::Nok, +    if tokenizer.current == Some(0xBF) { +        tokenizer.consume(); +        tokenizer.exit(Token::ByteOrderMark); +        State::Ok +    } else { +        State::Nok      }  } diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 0a3721c..809aa27 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -125,8 +125,8 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {              tokenizer.exit(info.options.marker.clone());              State::Fn(Box::new(|t| enclosed_before(t, info)))          } -        None | Some(b' ' | b')') => State::Nok, -        Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok, +        // ASCII control, space, closing paren, but *not* `\0`. 
+        None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok,          Some(_) => {              tokenizer.enter(info.options.destination.clone());              tokenizer.enter(info.options.raw.clone()); @@ -166,12 +166,12 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { +        None | Some(b'\n' | b'<') => State::Nok,          Some(b'>') => {              tokenizer.exit(Token::Data);              tokenizer.exit(info.options.string.clone());              enclosed_before(tokenizer, info)          } -        None | Some(b'\n' | b'<') => State::Nok,          Some(b'\\') => {              tokenizer.consume();              State::Fn(Box::new(|t| enclosed_escape(t, info))) @@ -207,40 +207,25 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current { -        Some(b'(') => { -            if info.balance >= info.options.limit { -                State::Nok -            } else { -                tokenizer.consume(); -                info.balance += 1; -                State::Fn(Box::new(move |t| raw(t, info))) -            } +        None | Some(b'\t' | b'\n' | b' ' | b')') if info.balance == 0 => { +            tokenizer.exit(Token::Data); +            tokenizer.exit(info.options.string.clone()); +            tokenizer.exit(info.options.raw.clone()); +            tokenizer.exit(info.options.destination); +            State::Ok          } -        Some(b')') => { -            if info.balance == 0 { -                tokenizer.exit(Token::Data); -                tokenizer.exit(info.options.string.clone()); -                tokenizer.exit(info.options.raw.clone()); -                tokenizer.exit(info.options.destination); -                State::Ok -            } else { -                tokenizer.consume(); -                info.balance -= 1; -                State::Fn(Box::new(move |t| raw(t, info))) -            } +        Some(b'(') if info.balance < info.options.limit => { +            tokenizer.consume(); +            info.balance += 1; +            State::Fn(Box::new(move |t| raw(t, info)))          } -        None | Some(b'\t' | b'\n' | b' ') => { -            if info.balance > 0 { -                State::Nok -            } else { -                tokenizer.exit(Token::Data); -                tokenizer.exit(info.options.string.clone()); -                tokenizer.exit(info.options.raw.clone()); -                tokenizer.exit(info.options.destination); -                State::Ok -            } +        // ASCII control (but *not* `\0`) and space and `(`. 
+        None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F) => State::Nok, +        Some(b')') => { +            tokenizer.consume(); +            info.balance -= 1; +            State::Fn(Box::new(move |t| raw(t, info)))          } -        Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok,          Some(b'\\') => {              tokenizer.consume();              State::Fn(Box::new(move |t| raw_escape(t, info))) diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 7e40a2d..6fdb70d 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -123,39 +123,43 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {  ///      ^  /// ```  fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { -    match tokenizer.current { -        None | Some(b'[') => State::Nok, -        Some(b']') if !info.data => State::Nok, -        _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok, -        Some(b']') => { -            tokenizer.exit(info.options.string.clone()); -            tokenizer.enter(info.options.marker.clone()); -            tokenizer.consume(); -            tokenizer.exit(info.options.marker.clone()); -            tokenizer.exit(info.options.label); -            State::Ok -        } -        Some(b'\n') => tokenizer.go( -            space_or_tab_eol_with_options(EolOptions { -                content_type: Some(ContentType::String), -                connect: info.connect, -            }), -            |t| { -                info.connect = true; -                at_break(t, info) -            }, -        )(tokenizer), -        _ => { -            tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - -            if info.connect { -                let index = tokenizer.events.len() - 1; -                link(&mut tokenizer.events, index); -            } else { -                info.connect = true; +    if info.size > LINK_REFERENCE_SIZE_MAX +        || matches!(tokenizer.current, None | Some(b'[')) +        || (matches!(tokenizer.current, Some(b']')) && !info.data) +    { +        State::Nok +    } else { +        match tokenizer.current { +            Some(b'\n') => tokenizer.go( +                space_or_tab_eol_with_options(EolOptions { +                    content_type: Some(ContentType::String), +                    connect: info.connect, +                }), +                |t| { +                    info.connect = true; +                    at_break(t, info) +                }, +            )(tokenizer), +            Some(b']') => { +                tokenizer.exit(info.options.string.clone()); +                tokenizer.enter(info.options.marker.clone()); +                tokenizer.consume(); +                tokenizer.exit(info.options.marker.clone()); +                tokenizer.exit(info.options.label); +                State::Ok              } +            _ => { +                tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); + +                if info.connect { +                    let index = tokenizer.events.len() - 1; +                    link(&mut tokenizer.events, index); +                } else { +                    info.connect = true; +                } -            label(tokenizer, info) +                label(tokenizer, info) +            }          }      }  } @@ -172,30 +176,19 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State {              tokenizer.exit(Token::Data);              at_break(tokenizer, info)          } -    
    _ if info.size > LINK_REFERENCE_SIZE_MAX => { -            tokenizer.exit(Token::Data); -            at_break(tokenizer, info) -        } -        Some(b'\t' | b' ') => { -            tokenizer.consume(); -            info.size += 1; -            State::Fn(Box::new(|t| label(t, info))) -        } -        Some(b'\\') => { -            tokenizer.consume(); -            info.size += 1; -            if !info.data { -                info.data = true; -            } -            State::Fn(Box::new(|t| escape(t, info))) -        } -        Some(_) => { -            tokenizer.consume(); -            info.size += 1; -            if !info.data { -                info.data = true; +        Some(byte) => { +            if info.size > LINK_REFERENCE_SIZE_MAX { +                tokenizer.exit(Token::Data); +                at_break(tokenizer, info) +            } else { +                let func = if matches!(byte, b'\\') { escape } else { label }; +                tokenizer.consume(); +                info.size += 1; +                if !info.data && !matches!(byte, b'\t' | b' ') { +                    info.data = true; +                } +                State::Fn(Box::new(move |t| func(t, info)))              } -            State::Fn(Box::new(|t| label(t, info)))          }      }  } diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 80861af..9cf2f14 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -48,70 +48,13 @@ pub struct Options {      pub string: Token,  } -/// Type of title. -#[derive(Debug, PartialEq)] -enum Kind { -    /// In a parenthesized (`(` and `)`) title. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// (a) -    /// ``` -    Paren, -    /// In a double quoted (`"`) title. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// "a" -    /// ``` -    Double, -    /// In a single quoted (`'`) title. -    /// -    /// ## Example -    /// -    /// ```markdown -    /// 'a' -    /// ``` -    Single, -} - -impl Kind { -    /// Turn the kind into a byte ([u8]). -    /// -    /// > 👉 **Note**: a closing paren is used for `Kind::Paren`. -    fn as_byte(&self) -> u8 { -        match self { -            Kind::Paren => b')', -            Kind::Double => b'"', -            Kind::Single => b'\'', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `(`, `"`, or `'`. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'(' => Kind::Paren, -            b'"' => Kind::Double, -            b'\'' => Kind::Single, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// State needed to parse titles.  #[derive(Debug)]  struct Info {      /// Whether we’ve seen data.      connect: bool, -    /// Kind of title. -    kind: Kind, +    /// Closing marker. +    marker: u8,      /// Configuration.      
options: Options,  } @@ -124,10 +67,11 @@ struct Info {  /// ```  pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {      match tokenizer.current { -        Some(byte) if matches!(byte, b'"' | b'\'' | b'(') => { +        Some(b'"' | b'\'' | b'(') => { +            let marker = tokenizer.current.unwrap();              let info = Info {                  connect: false, -                kind: Kind::from_byte(byte), +                marker: if marker == b'(' { b')' } else { marker },                  options,              };              tokenizer.enter(info.options.title.clone()); @@ -150,7 +94,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State {  /// ```  fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {              tokenizer.enter(info.options.marker.clone());              tokenizer.consume();              tokenizer.exit(info.options.marker.clone()); @@ -172,10 +116,6 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { -            tokenizer.exit(info.options.string.clone()); -            begin(tokenizer, info) -        }          None => State::Nok,          Some(b'\n') => tokenizer.go(              space_or_tab_eol_with_options(EolOptions { @@ -187,7 +127,11 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {                  at_break(t, info)              },          )(tokenizer), -        _ => { +        Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => { +            tokenizer.exit(info.options.string.clone()); +            begin(tokenizer, info) +        } +        Some(_) => {              tokenizer.enter_with_content(Token::Data, Some(ContentType::String));              if info.connect { @@ -210,21 +154,18 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State {  /// ```  fn title(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        None | Some(b'\n') => {              tokenizer.exit(Token::Data);              at_break(tokenizer, info)          } -        None | Some(b'\n') => { +        Some(b'"' | b'\'' | b')') if tokenizer.current.unwrap() == info.marker => {              tokenizer.exit(Token::Data);              at_break(tokenizer, info)          } -        Some(b'\\') => { +        Some(byte) => { +            let func = if matches!(byte, b'\\') { escape } else { title };              tokenizer.consume(); -            State::Fn(Box::new(|t| escape(t, info))) -        } -        _ => { -            tokenizer.consume(); -            State::Fn(Box::new(|t| title(t, info))) +            State::Fn(Box::new(move |t| func(t, info)))          }      }  } @@ -237,7 +178,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State {  /// ```  fn escape(tokenizer: &mut Tokenizer, info: Info) -> State {      match tokenizer.current { -        Some(byte) if byte == info.kind.as_byte() => { +        Some(b'"' | b'\'' | b')') => {              tokenizer.consume();              State::Fn(Box::new(|t| title(t, info)))          } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 13815cb..4f872ba 100644 --- 
a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -92,8 +92,7 @@ fn trim_data(      if trim_end {          let mut index = slice.bytes.len(); -        let vs = slice.after; -        let mut spaces_only = vs == 0; +        let mut spaces_only = slice.after == 0;          while index > 0 {              match slice.bytes[index - 1] {                  b' ' => {} @@ -105,10 +104,10 @@ fn trim_data(          }          let diff = slice.bytes.len() - index; -        let token_type = if spaces_only -            && hard_break -            && exit_index + 1 < tokenizer.events.len() +        let token_type = if hard_break +            && spaces_only              && diff >= HARD_BREAK_PREFIX_SIZE_MIN +            && exit_index + 1 < tokenizer.events.len()          {              Token::HardBreakTrailing          } else { @@ -123,7 +122,7 @@ fn trim_data(              return;          } -        if diff > 0 || vs > 0 { +        if diff > 0 || slice.after > 0 {              let exit_point = tokenizer.events[exit_index].point.clone();              let mut enter_point = exit_point.clone();              enter_point.index -= diff; @@ -156,14 +155,11 @@ fn trim_data(      if trim_start {          let mut index = 0; -        let vs = slice.before;          while index < slice.bytes.len() {              match slice.bytes[index] { -                b' ' | b'\t' => {} +                b' ' | b'\t' => index += 1,                  _ => break,              } - -            index += 1;          }          // The whole data is whitespace. @@ -174,7 +170,7 @@ fn trim_data(              return;          } -        if index > 0 || vs > 0 { +        if index > 0 || slice.before > 0 {              let enter_point = tokenizer.events[exit_index - 1].point.clone();              let mut exit_point = enter_point.clone();              exit_point.index += index; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 4fc4dc4..785d132 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -53,64 +53,11 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN};  use crate::token::Token;  use crate::tokenizer::{State, Tokenizer}; -/// Type of thematic break. -#[derive(Debug, PartialEq)] -enum Kind { -    /// In a thematic break using asterisks (`*`). -    /// -    /// ## Example -    /// -    /// ```markdown -    /// *** -    /// ``` -    Asterisk, -    /// In a thematic break using dashes (`-`). -    /// -    /// ## Example -    /// -    /// ```markdown -    /// --- -    /// ``` -    Dash, -    /// In a thematic break using underscores (`_`). -    /// -    /// ## Example -    /// -    /// ```markdown -    /// ___ -    /// ``` -    Underscore, -} - -impl Kind { -    /// Turn the kind into a byte ([u8]). -    fn as_byte(&self) -> u8 { -        match self { -            Kind::Asterisk => b'*', -            Kind::Dash => b'-', -            Kind::Underscore => b'_', -        } -    } -    /// Turn a byte ([u8]) into a kind. -    /// -    /// ## Panics -    /// -    /// Panics if `byte` is not `*`, `-`, or `_`. -    fn from_byte(byte: u8) -> Kind { -        match byte { -            b'*' => Kind::Asterisk, -            b'-' => Kind::Dash, -            b'_' => Kind::Underscore, -            _ => unreachable!("invalid byte"), -        } -    } -} -  /// State needed to parse thematic breaks.  #[derive(Debug)]  struct Info { -    /// Kind of marker. -    kind: Kind, +    /// Marker. +    marker: u8,      /// Number of markers.      
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 4fc4dc4..785d132 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -53,64 +53,11 @@ use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN};
 use crate::token::Token;
 use crate::tokenizer::{State, Tokenizer};
 
-/// Type of thematic break.
-#[derive(Debug, PartialEq)]
-enum Kind {
-    /// In a thematic break using asterisks (`*`).
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// ***
-    /// ```
-    Asterisk,
-    /// In a thematic break using dashes (`-`).
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// ---
-    /// ```
-    Dash,
-    /// In a thematic break using underscores (`_`).
-    ///
-    /// ## Example
-    ///
-    /// ```markdown
-    /// ___
-    /// ```
-    Underscore,
-}
-
-impl Kind {
-    /// Turn the kind into a byte ([u8]).
-    fn as_byte(&self) -> u8 {
-        match self {
-            Kind::Asterisk => b'*',
-            Kind::Dash => b'-',
-            Kind::Underscore => b'_',
-        }
-    }
-    /// Turn a byte ([u8]) into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `byte` is not `*`, `-`, or `_`.
-    fn from_byte(byte: u8) -> Kind {
-        match byte {
-            b'*' => Kind::Asterisk,
-            b'-' => Kind::Dash,
-            b'_' => Kind::Underscore,
-            _ => unreachable!("invalid byte"),
-        }
-    }
-}
-
 /// State needed to parse thematic breaks.
 #[derive(Debug)]
 struct Info {
-    /// Kind of marker.
-    kind: Kind,
+    /// Marker.
+    marker: u8,
     /// Number of markers.
     size: usize,
 }
@@ -122,15 +69,19 @@ struct Info {
 ///     ^
 /// ```
 pub fn start(tokenizer: &mut Tokenizer) -> State {
-    let max = if tokenizer.parse_state.constructs.code_indented {
-        TAB_SIZE - 1
-    } else {
-        usize::MAX
-    };
-
     if tokenizer.parse_state.constructs.thematic_break {
         tokenizer.enter(Token::ThematicBreak);
-        tokenizer.go(space_or_tab_min_max(0, max), before)(tokenizer)
+        tokenizer.go(
+            space_or_tab_min_max(
+                0,
+                if tokenizer.parse_state.constructs.code_indented {
+                    TAB_SIZE - 1
+                } else {
+                    usize::MAX
+                },
+            ),
+            before,
+        )(tokenizer)
     } else {
         State::Nok
     }
@@ -144,10 +95,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Some(byte) if matches!(byte, b'*' | b'-' | b'_') => at_break(
+        Some(b'*' | b'-' | b'_') => at_break(
            tokenizer,
            Info {
-                kind: Kind::from_byte(byte),
+                marker: tokenizer.current.unwrap(),
                 size: 0,
             },
         ),
@@ -163,13 +114,13 @@ fn before(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        None | Some(b'\n' | b'\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
+        None | Some(b'\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
             tokenizer.exit(Token::ThematicBreak);
             // Feel free to interrupt.
             tokenizer.interrupt = false;
             State::Ok
         }
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.enter(Token::ThematicBreakSequence);
             sequence(tokenizer, info)
         }
@@ -185,7 +136,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Some(byte) if byte == info.kind.as_byte() => {
+        Some(b'*' | b'-' | b'_') if tokenizer.current.unwrap() == info.marker => {
             tokenizer.consume();
             info.size += 1;
             State::Fn(Box::new(|t| sequence(t, info)))
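Thematic breaks get the same marker-byte treatment: the first `*`, `-`, or `_` is remembered and every further marker must match it, with at least `THEMATIC_BREAK_MARKER_COUNT_MIN` (three) markers before the line ends. A self-contained sketch of the acceptance rule, leaving out the indentation and inner-whitespace handling the construct also performs:

    const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3;

    /// Whether `line` is a thematic break, simplified to markers only.
    fn is_thematic_break(line: &str) -> bool {
        let bytes = line.as_bytes();
        match bytes.first() {
            Some(b'*' | b'-' | b'_') if bytes.len() >= THEMATIC_BREAK_MARKER_COUNT_MIN => {
                bytes.iter().all(|&byte| byte == bytes[0])
            }
            _ => false,
        }
    }

    fn main() {
        assert!(is_thematic_break("***"));
        assert!(is_thematic_break("-----"));
        assert!(!is_thematic_break("**"));
        assert!(!is_thematic_break("*-*"));
    }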
diff --git a/src/content/document.rs b/src/content/document.rs
index 828431d..76d510a 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -89,14 +89,13 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
         let event = &tokenizer.events[index];
 
         if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString {
-            // To do: when we operate on u8, we can use a `to_str` here as we
-            // don’t need virtual spaces.
+            // Note: we don’t care about virtual spaces, so `as_str` is fine.
             let id = normalize_identifier(
-                &Slice::from_position(
+                Slice::from_position(
                     tokenizer.parse_state.bytes,
                     &Position::from_exit_event(&tokenizer.events, index),
                 )
-                .serialize(),
+                .as_str(),
             );
 
             if !definitions.contains(&id) {
@@ -423,6 +423,6 @@ pub fn micromark(value: &str) -> String {
 /// ```
 #[must_use]
 pub fn micromark_with_options(value: &str, options: &Options) -> String {
-    let (events, result) = parse(value, options);
-    compile(&events, result.bytes, options)
+    let (events, bytes) = parse(value, options);
+    compile(&events, bytes, options)
 }
diff --git a/src/parser.rs b/src/parser.rs
index 613b206..23afb37 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -20,7 +20,7 @@ pub struct ParseState<'a> {
 /// Turn a string of markdown into events.
 ///
 /// Passes the codes back so the compiler can access the source.
-pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, ParseState<'a>) {
+pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, &'a [u8]) {
     let mut parse_state = ParseState {
         constructs: &options.constructs,
         bytes: value.as_bytes(),
@@ -37,6 +37,5 @@ pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, ParseStat
         },
     );
 
-    // To do: return bytes only?
-    (events, parse_state)
+    (events, parse_state.bytes)
 }
diff --git a/src/unicode.rs b/src/unicode.rs
index a8445f9..764d4c7 100644
--- a/src/unicode.rs
+++ b/src/unicode.rs
@@ -6,7 +6,7 @@
 /// > It is generated from the latest Unicode data.
 ///
 /// Rust does not contain an `is_punctuation` method on `char`, while it does
-/// support [`is_ascii_alphanumeric`](char::is_ascii_punctuation).
+/// support [`is_ascii_alphanumeric`](char::is_ascii_alphanumeric).
 ///
 /// `CommonMark` handles attention (emphasis, strong) markers based on what
 /// comes before or after them.
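Returning `&'a [u8]` from `parse` works because events carry positions into the source rather than copies of it; `compile` resolves those positions against the same bytes. A minimal sketch of the borrowing pattern with stand-in types (not the crate's `Event`/`Position`):

    /// Stand-in for an event: a half-open byte range into the source.
    struct Span {
        start: usize,
        end: usize,
    }

    /// “Parse”: return spans plus the bytes they index into.
    fn parse(value: &str) -> (Vec<Span>, &[u8]) {
        (
            vec![Span { start: 0, end: 5 }, Span { start: 6, end: 11 }],
            value.as_bytes(),
        )
    }

    /// “Compile”: resolve each span against the source bytes.
    fn compile(spans: &[Span], bytes: &[u8]) -> String {
        spans
            .iter()
            .map(|span| std::str::from_utf8(&bytes[span.start..span.end]).unwrap())
            .collect::<Vec<_>>()
            .join(" + ")
    }

    fn main() {
        let (spans, bytes) = parse("alpha bravo");
        assert_eq!(compile(&spans, bytes), "alpha + bravo");
    }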
diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs
index 5277f90..f8fd18f 100644
--- a/src/util/decode_character_reference.rs
+++ b/src/util/decode_character_reference.rs
@@ -57,9 +57,9 @@ pub fn decode_named(value: &str) -> String {
 /// ```rust ignore
 /// use micromark::util::decode_character_reference::decode_numeric;
 ///
-/// assert_eq!(decode_numeric("123", 10), '{');
-/// assert_eq!(decode_numeric("9", 16), '\t');
-/// assert_eq!(decode_numeric("0", 10), '�'); // Not allowed.
+/// assert_eq!(decode_numeric("123", 10), "{");
+/// assert_eq!(decode_numeric("9", 16), "\t");
+/// assert_eq!(decode_numeric("0", 10), "�"); // Not allowed.
 /// ```
 ///
 /// ## Panics
@@ -74,27 +74,19 @@ pub fn decode_named(value: &str) -> String {
 ///
 /// *   [`micromark-util-decode-numeric-character-reference` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-decode-numeric-character-reference)
 /// *   [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.30/#entity-and-numeric-character-references)
-pub fn decode_numeric(value: &str, radix: u32) -> char {
-    let code = u32::from_str_radix(value, radix).expect("expected `value` to be an int");
-
-    if
-    // C0 except for HT, LF, FF, CR, space
-    code < 0x09 ||
-    code == 0x0B ||
-    (code > 0x0D && code < 0x20) ||
-    // Control character (DEL) of the basic block and C1 controls.
-    (code > 0x7E && code < 0xA0) ||
-    // Lone high surrogates and low surrogates.
-    (code > 0xd7ff && code < 0xe000) ||
-    // Noncharacters.
-    (code > 0xfdcf && code < 0xfdf0) ||
-    ((code & 0xffff) == 0xffff) ||
-    ((code & 0xffff) == 0xfffe) ||
-    // Out of range
-    code > 0x0010_ffff
-    {
-        char::REPLACEMENT_CHARACTER
-    } else {
-        char::from_u32(code).expect("expected valid `code`")
+pub fn decode_numeric(value: &str, radix: u32) -> String {
+    if let Some(char) = char::from_u32(u32::from_str_radix(value, radix).unwrap()) {
+        if !matches!(char,
+            // C0 except for HT, LF, FF, CR, space
+            '\0'..='\u{08}' | '\u{0B}' | '\u{0E}'..='\u{1F}' |
+            // Control character (DEL) of c0, and C1 controls.
+            '\u{7F}'..='\u{9F}'
+            // Lone surrogates and out of range are handled by Rust;
+            // noncharacters are let through.
+        ) {
+            return char.to_string();
+        }
     }
+
+    char::REPLACEMENT_CHARACTER.to_string()
 }
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 91c5462..d37a2de 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -20,37 +20,33 @@
 /// ## References
 ///
 /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
-pub fn encode<S: Into<String>>(value: S, encode_html: bool) -> String {
-    let check = if encode_html { check_all } else { check_nil };
-    let mut value = value.into();
-
+pub fn encode(value: &str, encode_html: bool) -> String {
     // It’ll grow a bit bigger for each dangerous character.
     let mut result = String::with_capacity(value.len());
+    let bytes = value.as_bytes();
+    let mut index = 0;
+    let mut start = 0;
 
-    while let Some(indice) = value.find(check) {
-        let after = value.split_off(indice + 1);
-        let dangerous = value.pop().unwrap();
-        result.push_str(&value);
-        result.push_str(match dangerous {
-            '\0' => "�",
-            '&' => "&amp;",
-            '"' => "&quot;",
-            '<' => "&lt;",
-            '>' => "&gt;",
-            _ => unreachable!("xxx"),
-        });
-        value = after;
-    }
+    while index < bytes.len() {
+        let byte = bytes[index];
+        if matches!(byte, b'\0') || (encode_html && matches!(byte, b'&' | b'"' | b'<' | b'>')) {
+            result.push_str(&value[start..index]);
+            result.push_str(match byte {
+                b'\0' => "�",
+                b'&' => "&amp;",
+                b'"' => "&quot;",
+                b'<' => "&lt;",
+                b'>' => "&gt;",
+                _ => panic!("impossible"),
+            });
 
-    result.push_str(&value);
+            start = index + 1;
+        }
 
-    result
-}
+        index += 1;
+    }
 
-fn check_all(char: char) -> bool {
-    matches!(char, '\0' | '&' | '"' | '<' | '>')
-}
+    result.push_str(&value[start..]);
 
-fn check_nil(char: char) -> bool {
-    matches!(char, '\0')
+    result
 }
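The rewritten `decode_numeric` leans on `char::from_u32`, which already returns `None` for lone surrogates and for values above U+10FFFF, so only the control ranges need an explicit match; noncharacters such as U+FFFF are valid `char`s and now pass through. The standard-library behavior it relies on:

    fn main() {
        assert_eq!(char::from_u32(0x41), Some('A'));
        assert_eq!(char::from_u32(0xD800), None); // lone surrogate
        assert_eq!(char::from_u32(0x0011_0000), None); // beyond U+10FFFF
        assert_eq!(char::from_u32(0xFFFF), Some('\u{FFFF}')); // noncharacter, still a `char`
    }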
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index 42a2bb0..f5b12d0 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -34,25 +34,34 @@
 pub fn normalize_identifier(value: &str) -> String {
     // Note: it’ll grow a bit smaller for consecutive whitespace.
     let mut result = String::with_capacity(value.len());
-    let mut at_start = true;
-    let mut at_whitespace = true;
+    let bytes = value.as_bytes();
+    let mut in_whitespace = true;
+    let mut index = 0;
+    let mut start = 0;
 
-    // Collapse markdown whitespace and trim it.
-    for char in value.chars() {
-        match char {
-            '\t' | '\n' | '\r' | ' ' => {
-                at_whitespace = true;
+    while index < bytes.len() {
+        if matches!(bytes[index], b'\t' | b'\n' | b'\r' | b' ') {
+            // First whitespace we see after non-whitespace.
+            if !in_whitespace {
+                result.push_str(&value[start..index]);
+                in_whitespace = true;
             }
-            _ => {
-                if at_whitespace && !at_start {
-                    result.push(' ');
-                }
-
-                result.push(char);
-                at_start = false;
-                at_whitespace = false;
+        }
+        // First non-whitespace we see after whitespace.
+        else if in_whitespace {
+            if !result.is_empty() {
+                result.push(' ');
             }
+
+            start = index;
+            in_whitespace = false;
         }
+
+        index += 1;
+    }
+
+    if !in_whitespace {
+        result.push_str(&value[start..]);
     }
 
     // Some characters are considered “uppercase”, but if their lowercase
diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs
index 8c09549..051e1e1 100644
--- a/src/util/sanitize_uri.rs
+++ b/src/util/sanitize_uri.rs
@@ -32,7 +32,7 @@ use crate::util::encode::encode;
 ///
 /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri)
 pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {
-    let value = encode(normalize_uri(value), true);
+    let value = encode(&*normalize_uri(value), true);
 
     if let Some(protocols) = protocols {
         let end = value.find(|c| matches!(c, '?' | '#' | '/'));
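`normalize_identifier` trims and collapses runs of markdown whitespace to single spaces before the case-folding step that is cut off above; the `!result.is_empty()` guard is what keeps exactly one space between words. A self-contained sketch of just the collapsing, using an iterator split instead of the manual index loop:

    /// Collapse and trim markdown whitespace, as the loop above does.
    fn collapse_whitespace(value: &str) -> String {
        value
            .split(|c: char| matches!(c, '\t' | '\n' | '\r' | ' '))
            .filter(|part| !part.is_empty())
            .collect::<Vec<_>>()
            .join(" ")
    }

    fn main() {
        assert_eq!(collapse_whitespace("  foo\t\nbar  "), "foo bar");
        assert_eq!(collapse_whitespace("a  b"), "a b");
    }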
diff --git a/src/util/slice.rs b/src/util/slice.rs
index cd3641e..d899dac 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -2,6 +2,7 @@
 
 use crate::constant::TAB_SIZE;
 use crate::tokenizer::{Event, EventType, Point};
+use std::str;
 
 /// A range between two places.
 #[derive(Debug)]
@@ -78,6 +79,15 @@ impl<'a> Slice<'a> {
         }
     }
 
+    /// To do.
+    pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> {
+        Slice {
+            bytes: &bytes[index..=index],
+            before: 0,
+            after: 0,
+        }
+    }
+
     /// Get the slice belonging to a position.
     pub fn from_position(bytes: &'a [u8], position: &Position) -> Slice<'a> {
         let mut before = position.start.vs;
@@ -107,14 +117,18 @@ impl<'a> Slice<'a> {
     }
 
     /// To do.
-    // To do: rename to `len`?
-    pub fn size(&self) -> usize {
-        self.bytes.len() + self.before + self.after
+    pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
+        Slice {
+            bytes: &bytes[start..end],
+            before: 0,
+            after: 0,
+        }
     }
 
-    // To do:
-    // When we have u8s, we could use: <https://doc.rust-lang.org/std/str/fn.from_utf8.html>
-    // to implement an `as_str`.
+    /// To do.
+    pub fn len(&self) -> usize {
+        self.bytes.len() + self.before + self.after
+    }
 
     /// To do.
     pub fn head(&self) -> Option<u8> {
@@ -127,16 +141,20 @@ impl<'a> Slice<'a> {
         }
     }
 
+    // To do:
+    pub fn as_str(&self) -> &str {
+        str::from_utf8(self.bytes).unwrap()
+    }
+
     /// To do.
     pub fn serialize(&self) -> String {
-        let mut string = String::with_capacity(self.size());
+        let mut string = String::with_capacity(self.len());
         let mut index = self.before;
 
         while index > 0 {
             string.push(' ');
             index -= 1;
         }
 
-        // To do: invalid UTF8?
-        string.push_str(std::str::from_utf8(self.bytes).unwrap());
+        string.push_str(self.as_str());
         index = self.after;
 
         while index > 0 {
             string.push(' ');
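The new `as_str` borrows and is only correct when a slice carries no virtual spaces (`before`/`after` of 0), as is the case for definition labels; `serialize` still allocates and writes the virtual spaces out as real ones. A sketch of the distinction with a stand-in struct mirroring the fields above:

    use std::str;

    /// Stand-in for the crate's `Slice`: bytes plus surrounding virtual spaces.
    struct Slice<'a> {
        bytes: &'a [u8],
        before: usize,
        after: usize,
    }

    impl<'a> Slice<'a> {
        /// Borrow: fine when `before` and `after` are 0.
        fn as_str(&self) -> &str {
            str::from_utf8(self.bytes).unwrap()
        }

        /// Allocate: materialize the virtual spaces.
        fn serialize(&self) -> String {
            let mut string = String::with_capacity(self.bytes.len() + self.before + self.after);
            string.push_str(&" ".repeat(self.before));
            string.push_str(self.as_str());
            string.push_str(&" ".repeat(self.after));
            string
        }
    }

    fn main() {
        let slice = Slice { bytes: b"alpha", before: 2, after: 0 };
        assert_eq!(slice.as_str(), "alpha");
        assert_eq!(slice.serialize(), "  alpha");
    }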
