From 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 29 Jul 2022 10:49:07 +0200 Subject: Refactor to work on bytes (`u8`) --- src/compiler.rs | 45 +++++---- src/construct/attention.rs | 66 ++++++++----- src/construct/autolink.rs | 40 ++++---- src/construct/blank_line.rs | 2 +- src/construct/block_quote.rs | 6 +- src/construct/character_escape.rs | 4 +- src/construct/character_reference.rs | 27 ++--- src/construct/code_fenced.rs | 56 +++++------ src/construct/code_indented.rs | 8 +- src/construct/code_text.rs | 14 +-- src/construct/definition.rs | 8 +- src/construct/hard_break_escape.rs | 4 +- src/construct/heading_atx.rs | 16 +-- src/construct/heading_setext.rs | 30 +++--- src/construct/html_flow.rs | 131 ++++++++++++------------- src/construct/html_text.rs | 108 ++++++++++---------- src/construct/label_end.rs | 24 ++--- src/construct/label_start_image.rs | 4 +- src/construct/label_start_link.rs | 2 +- src/construct/list.rs | 45 ++++----- src/construct/mod.rs | 1 + src/construct/paragraph.rs | 4 +- src/construct/partial_bom.rs | 54 ++++++++++ src/construct/partial_data.rs | 16 +-- src/construct/partial_destination.rs | 28 +++--- src/construct/partial_label.rs | 22 ++--- src/construct/partial_non_lazy_continuation.rs | 2 +- src/construct/partial_space_or_tab.rs | 12 +-- src/construct/partial_title.rs | 44 ++++----- src/construct/partial_whitespace.rs | 22 ++--- src/construct/thematic_break.rs | 36 +++---- src/content/document.rs | 17 +--- src/content/flow.rs | 4 +- src/content/string.rs | 2 +- src/content/text.rs | 20 ++-- src/lib.rs | 2 +- src/parser.rs | 8 +- src/tokenizer.rs | 126 ++++++++++++------------ src/util/slice.rs | 44 ++++----- 39 files changed, 576 insertions(+), 528 deletions(-) create mode 100644 src/construct/partial_bom.rs diff --git a/src/compiler.rs b/src/compiler.rs index f5673b4..de76142 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -60,7 +60,7 @@ struct Definition { struct CompileContext<'a> { /// Static info. pub events: &'a [Event], - pub chars: &'a [char], + pub bytes: &'a [u8], /// Fields used by handlers to track the things they need to track to /// compile markdown. pub atx_opening_sequence_size: Option<usize>, @@ -92,13 +92,13 @@ impl<'a> CompileContext<'a> { /// Create a new compile context. pub fn new( events: &'a [Event], - chars: &'a [char], + bytes: &'a [u8], options: &Options, line_ending: LineEnding, ) -> CompileContext<'a> { CompileContext { events, - chars, + bytes, atx_opening_sequence_size: None, heading_setext_buffer: None, code_flow_seen_data: None, @@ -177,6 +177,7 @@ impl<'a> CompileContext<'a> { /// Add a line ending if needed (as in, there’s no eol/eof already). pub fn line_ending_if_needed(&mut self) { + // To do: fix to use bytes. let last_char = self.buf_tail().chars().last(); let mut add = true; @@ -196,7 +197,7 @@ impl<'a> CompileContext<'a> { /// Turn events and codes into a string of HTML.
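The change above is the heart of the refactor: the compiler now indexes the original source as `&[u8]` and only materializes strings at the edges. A minimal sketch of that pattern, assuming event positions always fall on UTF-8 character boundaries (the helper below is illustrative, not the crate's actual `Slice` API):

```rust
/// Illustrative stand-in for the byte-backed slicing used above: take a
/// window of the source bytes and turn it back into owned text.
fn serialize(bytes: &[u8], start: usize, end: usize) -> String {
    // Token boundaries sit on UTF-8 character boundaries, so the lossy
    // conversion never actually has to replace anything.
    String::from_utf8_lossy(&bytes[start..end]).into_owned()
}
```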
#[allow(clippy::too_many_lines)] -pub fn compile(events: &[Event], chars: &[char], options: &Options) -> String { +pub fn compile(events: &[Event], bytes: &[u8], options: &Options) -> String { let mut index = 0; let mut line_ending_inferred = None; @@ -209,7 +210,7 @@ pub fn compile(events: &[Event], chars: &[char], options: &Options) -> String { && (event.token_type == Token::BlankLineEnding || event.token_type == Token::LineEnding) { line_ending_inferred = Some(LineEnding::from_str( - &Slice::from_position(chars, &Position::from_exit_event(events, index)).serialize(), + &Slice::from_position(bytes, &Position::from_exit_event(events, index)).serialize(), )); break; } @@ -237,7 +238,7 @@ pub fn compile(events: &[Event], chars: &[char], options: &Options) -> String { } }; - let mut context = CompileContext::new(events, chars, options, line_ending_default); + let mut context = CompileContext::new(events, bytes, options, line_ending_default); let mut definition_indices = vec![]; let mut index = 0; let mut definition_inside = false; @@ -604,7 +605,7 @@ fn on_enter_strong(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`AutolinkEmail`][Token::AutolinkEmail]. fn on_exit_autolink_email(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -623,7 +624,7 @@ fn on_exit_autolink_email(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`AutolinkProtocol`][Token::AutolinkProtocol]. fn on_exit_autolink_protocol(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -678,7 +679,7 @@ fn on_exit_character_reference_value(context: &mut CompileContext) { .take() .expect("expected `character_reference_kind` to be set"); let reference = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -696,7 +697,7 @@ fn on_exit_character_reference_value(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`CodeFlowChunk`][Token::CodeFlowChunk]. fn on_exit_code_flow_chunk(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -765,6 +766,7 @@ fn on_exit_code_flow(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`CodeText`][Token::CodeText]. fn on_exit_code_text(context: &mut CompileContext) { let result = context.resume(); + // To do: use bytes. let mut chars = result.chars(); let mut trim = false; @@ -797,7 +799,7 @@ fn on_exit_drop(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:{[`CodeTextData`][Token::CodeTextData],[`Data`][Token::Data],[`CharacterEscapeValue`][Token::CharacterEscapeValue]}. fn on_exit_data(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -841,7 +843,7 @@ fn on_exit_definition_destination_string(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`DefinitionLabelString`][Token::DefinitionLabelString]. 
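The `To do: use bytes` note above marks one remaining `char`-based loop, in `on_exit_code_text`. The rule it implements (strip a single space from both ends of a code span unless the content is only spaces) is straightforward over bytes; a hedged sketch of what that could look like, as a hypothetical helper rather than part of this patch:

```rust
/// Possible byte-based version of the code-span trim: remove one
/// leading and one trailing space when both exist and the content is
/// not exclusively spaces.
fn trim_code_text(value: &str) -> &str {
    let bytes = value.as_bytes();
    if bytes.len() > 2
        && bytes[0] == b' '
        && bytes[bytes.len() - 1] == b' '
        && bytes.iter().any(|&byte| byte != b' ')
    {
        // Both edge bytes are ASCII spaces, so these slice boundaries
        // are guaranteed to be valid character boundaries.
        &value[1..value.len() - 1]
    } else {
        value
    }
}
```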
fn on_exit_definition_label_string(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -879,7 +881,7 @@ fn on_exit_heading_atx_sequence(context: &mut CompileContext) { // First fence we see. if context.atx_opening_sequence_size.is_none() { let rank = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .size(); @@ -909,11 +911,11 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) { .take() .expect("`atx_opening_sequence_size` must be set in headings"); let head = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .head(); - let level = if head == Some('-') { 2 } else { 1 }; + let level = if head == Some(b'-') { 2 } else { 1 }; context.line_ending_if_needed(); context.tag(&*format!("<h{}>", level)); @@ -929,7 +931,7 @@ fn on_exit_html(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:{[`HtmlFlowData`][Token::HtmlFlowData],[`HtmlTextData`][Token::HtmlTextData]}. fn on_exit_html_data(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -947,7 +949,7 @@ fn on_exit_label(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`LabelText`][Token::LabelText]. fn on_exit_label_text(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -964,7 +966,7 @@ fn on_exit_line_ending(context: &mut CompileContext) { context.slurp_one_line_ending = false; } else { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -1017,7 +1019,7 @@ fn on_exit_list_item_value(context: &mut CompileContext) { if expect_first_item { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); @@ -1037,7 +1039,8 @@ fn on_exit_media(context: &mut CompileContext) { let mut index = 0; // Skip current. - while index < (context.media_stack.len() - 1) { + let end = context.media_stack.len() - 1; + while index < end { if context.media_stack[index].image { is_in_image = true; break; @@ -1123,7 +1126,7 @@ fn on_exit_paragraph(context: &mut CompileContext) { /// Handle [`Exit`][EventType::Exit]:[`ReferenceString`][Token::ReferenceString]. fn on_exit_reference_string(context: &mut CompileContext) { let value = Slice::from_position( - context.chars, + context.bytes, &Position::from_exit_event(context.events, context.index), ) .serialize(); diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 65c2f6f..b042645 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -110,23 +110,23 @@ enum MarkerKind { } impl MarkerKind { - /// Turn the kind into a [char]. - fn as_char(&self) -> char { + /// Turn the kind into a byte ([u8]). + fn as_byte(&self) -> u8 { match self { - MarkerKind::Asterisk => '*', - MarkerKind::Underscore => '_', + MarkerKind::Asterisk => b'*', + MarkerKind::Underscore => b'_', } } - /// Turn [char] into a kind. + /// Turn a byte ([u8]) into a kind. /// /// ## Panics /// - /// Panics if `char` is not `*` or `_`.
- fn from_char(char: char) -> MarkerKind { - match char { - '*' => MarkerKind::Asterisk, - '_' => MarkerKind::Underscore, - _ => unreachable!("invalid char"), + /// Panics if `byte` is not `*` or `_`. + fn from_byte(byte: u8) -> MarkerKind { + match byte { + b'*' => MarkerKind::Asterisk, + b'_' => MarkerKind::Underscore, + _ => unreachable!("invalid byte"), } } } @@ -160,9 +160,9 @@ struct Sequence { /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(char) if tokenizer.parse_state.constructs.attention && matches!(char, '*' | '_') => { + Some(byte) if tokenizer.parse_state.constructs.attention && matches!(byte, b'*' | b'_') => { tokenizer.enter(Token::AttentionSequence); - inside(tokenizer, MarkerKind::from_char(char)) + inside(tokenizer, MarkerKind::from_byte(byte)) } _ => State::Nok, } } @@ -175,7 +175,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^^ /// ``` fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State { - if tokenizer.current == Some(marker.as_char()) { + if tokenizer.current == Some(marker.as_byte()) { tokenizer.consume(); State::Fn(Box::new(move |t| inside(t, marker))) } else { @@ -188,7 +188,6 @@ fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State { /// Resolve attention sequences. #[allow(clippy::too_many_lines)] fn resolve_attention(tokenizer: &mut Tokenizer) { - let chars = &tokenizer.parse_state.chars; let mut start = 0; let mut balance = 0; let mut sequences = vec![]; @@ -203,21 +202,34 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { if enter.token_type == Token::AttentionSequence { let end = start + 1; let exit = &tokenizer.events[end]; - let marker = - MarkerKind::from_char(Slice::from_point(chars, &enter.point).head().unwrap()); + + let before_end = enter.point.index; + let before_start = if before_end < 4 { 0 } else { before_end - 4 }; + let string_before = + String::from_utf8_lossy(&tokenizer.parse_state.bytes[before_start..before_end]); + let char_before = string_before.chars().last(); + + let after_start = exit.point.index; + let after_end = if after_start + 4 > tokenizer.parse_state.bytes.len() { + tokenizer.parse_state.bytes.len() + } else { + after_start + 4 + }; + let string_after = + String::from_utf8_lossy(&tokenizer.parse_state.bytes[after_start..after_end]); + let char_after = string_after.chars().next(); + + let marker = MarkerKind::from_byte( + Slice::from_point(tokenizer.parse_state.bytes, &enter.point) + .head() + .unwrap(), + ); let before = classify_character(if enter.point.index > 0 { - Slice::from_point( - chars, - &Point { - index: enter.point.index - 1, - ..enter.point - }, - ) - .tail() + char_before } else { None }); - let after = classify_character(Slice::from_point(chars, &exit.point).tail()); + let after = classify_character(char_after); let open = after == GroupKind::Other || (after == GroupKind::Punctuation && before != GroupKind::Other); // To do: GFM strikethrough? @@ -490,7 +502,7 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { /// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) fn classify_character(char: Option<char>) -> GroupKind { match char { - // Custom characters. + // EOF. None => GroupKind::Whitespace, // Unicode whitespace.
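The four-byte windows above are sufficient because UTF-8 encodes any scalar value in at most four bytes, so the character adjacent to an attention marker always fits in the window even if the far edge cuts into the middle of another character. A small sketch of the same idea (hypothetical helpers, not part of the patch):

```rust
/// Last character before `index`, decoded from at most four bytes.
fn char_before(bytes: &[u8], index: usize) -> Option<char> {
    let start = index.saturating_sub(4);
    // A sequence cut at `start` only garbles the *first* character in
    // the window; the one adjacent to `index` is always intact.
    String::from_utf8_lossy(&bytes[start..index]).chars().last()
}

/// First character at `index`, decoded from at most four bytes.
fn char_after(bytes: &[u8], index: usize) -> Option<char> {
    let end = (index + 4).min(bytes.len());
    String::from_utf8_lossy(&bytes[index..end]).chars().next()
}
```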
Some(char) if char.is_whitespace() => GroupKind::Whitespace, diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 399570b..b843af8 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -115,7 +115,7 @@ use crate::tokenizer::{State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('<') if tokenizer.parse_state.constructs.autolink => { + Some(b'<') if tokenizer.parse_state.constructs.autolink => { tokenizer.enter(Token::Autolink); tokenizer.enter(Token::AutolinkMarker); tokenizer.consume(); @@ -137,16 +137,16 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(char) if char.is_ascii_alphabetic() => { + Some(byte) if byte.is_ascii_alphabetic() => { tokenizer.consume(); State::Fn(Box::new(scheme_or_email_atext)) } - Some(char) if is_ascii_atext(char) => email_atext(tokenizer), + Some(byte) if is_ascii_atext(byte) => email_atext(tokenizer), _ => State::Nok, } } -/// After the first character of the protocol or email name. +/// After the first byte of the protocol or email name. /// /// ```markdown /// > | ab @@ -156,7 +156,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some(b'+' | b'-' | b'.' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { scheme_inside_or_email_atext(tokenizer, 1) } _ => email_atext(tokenizer), @@ -173,11 +173,11 @@ fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { /// ``` fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some(':') => { + Some(b':') => { tokenizer.consume(); State::Fn(Box::new(url_inside)) } - Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') + Some(b'+' | b'-' | b'.' 
| b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') if size < AUTOLINK_SCHEME_SIZE_MAX => { tokenizer.consume(); @@ -195,12 +195,12 @@ fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State /// ``` fn url_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.exit(Token::AutolinkProtocol); end(tokenizer) } - Some(char) if char.is_ascii_control() => State::Nok, - None | Some(' ') => State::Nok, + Some(byte) if byte.is_ascii_control() => State::Nok, + None | Some(b' ') => State::Nok, Some(_) => { tokenizer.consume(); State::Fn(Box::new(url_inside)) @@ -216,11 +216,11 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State { /// ``` fn email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('@') => { + Some(b'@') => { tokenizer.consume(); State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) } - Some(char) if is_ascii_atext(char) => { + Some(byte) if is_ascii_atext(byte) => { tokenizer.consume(); State::Fn(Box::new(email_atext)) } @@ -236,7 +236,7 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { /// ``` fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, size), + Some(byte) if byte.is_ascii_alphanumeric() => email_value(tokenizer, size), _ => State::Nok, } } @@ -249,11 +249,11 @@ fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { /// ``` fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some('.') => { + Some(b'.') => { tokenizer.consume(); State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) } - Some('>') => { + Some(b'>') => { let index = tokenizer.events.len(); tokenizer.exit(Token::AutolinkProtocol); // Change the token type. @@ -275,11 +275,11 @@ fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State { /// ``` fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => { + Some(b'-') if size < AUTOLINK_DOMAIN_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| email_value(t, size + 1))) } - Some(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { + Some(byte) if byte.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| email_label(t, size + 1))) } @@ -297,7 +297,7 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State { /// ``` fn end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.enter(Token::AutolinkMarker); tokenizer.consume(); tokenizer.exit(Token::AutolinkMarker); @@ -324,6 +324,6 @@ fn end(tokenizer: &mut Tokenizer) -> State { /// IETF. /// /// [`is_ascii_alphanumeric`]: char::is_ascii_alphanumeric -fn is_ascii_atext(x: char) -> bool { - matches!(x, '#'..='\'' | '*' | '+' | '-'..='9' | '=' | '?' | 'A'..='Z' | '^'..='~') +fn is_ascii_atext(byte: u8) -> bool { + matches!(byte, b'#'..=b'\'' | b'*' | b'+' | b'-'..=b'9' | b'=' | b'?' 
| b'A'..=b'Z' | b'^'..=b'~') } diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 6780f40..f397a48 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => State::Ok, + None | Some(b'\n') => State::Ok, _ => State::Nok, } } diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs index 49a0ea0..7e4753d 100644 --- a/src/construct/block_quote.rs +++ b/src/construct/block_quote.rs @@ -65,7 +65,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.enter(Token::BlockQuote); cont_before(tokenizer) } @@ -98,7 +98,7 @@ pub fn cont(tokenizer: &mut Tokenizer) -> State { /// ``` fn cont_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.enter(Token::BlockQuotePrefix); tokenizer.enter(Token::BlockQuoteMarker); tokenizer.consume(); @@ -118,7 +118,7 @@ fn cont_before(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn cont_after(tokenizer: &mut Tokenizer) -> State { - if let Some('\t' | ' ') = tokenizer.current { + if let Some(b'\t' | b' ') = tokenizer.current { tokenizer.enter(Token::SpaceOrTab); tokenizer.consume(); tokenizer.exit(Token::SpaceOrTab); diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index e9263af..02e8b62 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -44,7 +44,7 @@ use crate::tokenizer::{State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\\') if tokenizer.parse_state.constructs.character_escape => { + Some(b'\\') if tokenizer.parse_state.constructs.character_escape => { tokenizer.enter(Token::CharacterEscape); tokenizer.enter(Token::CharacterEscapeMarker); tokenizer.consume(); @@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(char) if char.is_ascii_punctuation() => { + Some(byte) if byte.is_ascii_punctuation() => { tokenizer.enter(Token::CharacterEscapeValue); tokenizer.consume(); tokenizer.exit(Token::CharacterEscapeValue); diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 59043d1..90763c1 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -106,15 +106,15 @@ impl Kind { } } - /// Check if a char is allowed. - fn allowed(&self, char: char) -> bool { + /// Check if a byte ([`u8`]) is allowed. 
+ fn allowed(&self, byte: u8) -> bool { let check = match self { - Kind::Hexadecimal => char::is_ascii_hexdigit, - Kind::Decimal => char::is_ascii_digit, - Kind::Named => char::is_ascii_alphanumeric, + Kind::Hexadecimal => u8::is_ascii_hexdigit, + Kind::Decimal => u8::is_ascii_digit, + Kind::Named => u8::is_ascii_alphanumeric, }; - check(&char) + check(&byte) } } @@ -141,7 +141,7 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('&') if tokenizer.parse_state.constructs.character_reference => { + Some(b'&') if tokenizer.parse_state.constructs.character_reference => { tokenizer.enter(Token::CharacterReference); tokenizer.enter(Token::CharacterReferenceMarker); tokenizer.consume(); @@ -164,7 +164,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn open(tokenizer: &mut Tokenizer) -> State { - if let Some('#') = tokenizer.current { + if let Some(b'#') = tokenizer.current { tokenizer.enter(Token::CharacterReferenceMarkerNumeric); tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarkerNumeric); @@ -192,7 +192,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn numeric(tokenizer: &mut Tokenizer) -> State { - if let Some('x' | 'X') = tokenizer.current { + if let Some(b'x' | b'X') = tokenizer.current { tokenizer.enter(Token::CharacterReferenceMarkerHexadecimal); tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal); @@ -229,10 +229,11 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { /// ``` fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(';') if info.size > 0 => { + Some(b';') if info.size > 0 => { if Kind::Named == info.kind { + // To do: fix slice. let value = Slice::from_position( - &tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position { start: &info.start, end: &tokenizer.point, @@ -252,8 +253,8 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::CharacterReference); State::Ok } - Some(char) => { - if info.size < info.kind.max() && info.kind.allowed(char) { + Some(byte) => { + if info.size < info.kind.max() && info.kind.allowed(byte) { info.size += 1; tokenizer.consume(); State::Fn(Box::new(|t| value(t, info))) diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 98fa54f..21e9259 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -136,23 +136,23 @@ pub enum Kind { } impl Kind { - /// Turn the kind into a [char]. - fn as_char(&self) -> char { + /// Turn the kind into a byte ([u8]). + fn as_byte(&self) -> u8 { match self { - Kind::GraveAccent => '`', - Kind::Tilde => '~', + Kind::GraveAccent => b'`', + Kind::Tilde => b'~', } } - /// Turn a [char] into a kind. + /// Turn a byte ([u8]) into a kind. /// /// ## Panics /// - /// Panics if `char` is not `~` or `` ` ``. - fn from_char(char: char) -> Kind { - match char { - '`' => Kind::GraveAccent, - '~' => Kind::Tilde, - _ => unreachable!("invalid char"), + /// Panics if `byte` is not `~` or `` ` ``. 
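The `allowed` check above leans on `u8`'s ASCII predicates being plain functions that can be selected at runtime and applied afterwards. A compact standalone sketch of that dispatch (illustrative, not the crate's code):

```rust
/// Standalone sketch of the predicate-table dispatch used by
/// `Kind::allowed` above: pick an ASCII check, then apply it.
fn allowed(hexadecimal: bool, byte: u8) -> bool {
    // Method paths like `u8::is_ascii_hexdigit` coerce to plain
    // function pointers taking `&u8`.
    let check: fn(&u8) -> bool = if hexadecimal {
        u8::is_ascii_hexdigit
    } else {
        u8::is_ascii_digit
    };
    check(&byte)
}
```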
+ fn from_byte(byte: u8) -> Kind { + match byte { + b'`' => Kind::GraveAccent, + b'~' => Kind::Tilde, + _ => unreachable!("invalid byte"), } } } @@ -207,7 +207,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { if let Some(event) = tail { if event.token_type == Token::SpaceOrTab { prefix = Slice::from_position( - &tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1), ) .size(); @@ -215,14 +215,14 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { } match tokenizer.current { - Some(char) if matches!(char, '`' | '~') => { + Some(byte) if matches!(byte, b'`' | b'~') => { tokenizer.enter(Token::CodeFencedFenceSequence); sequence_open( tokenizer, Info { prefix, size: 0, - kind: Kind::from_char(char), + kind: Kind::from_byte(byte), }, ) } @@ -240,7 +240,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(char) if char == info.kind.as_char() => { + Some(byte) if byte == info.kind.as_byte() => { tokenizer.consume(); State::Fn(Box::new(|t| { info.size += 1; @@ -265,7 +265,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. tokenizer.concrete = true; @@ -289,7 +289,7 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceInfo); tokenizer.exit(Token::CodeFencedFence); @@ -297,12 +297,12 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.concrete = true; at_break(tokenizer, info) } - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceInfo); tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer) } - Some('`') if info.kind == Kind::GraveAccent => State::Nok, + Some(b'`') if info.kind == Kind::GraveAccent => State::Nok, Some(_) => { tokenizer.consume(); State::Fn(Box::new(|t| info_inside(t, info))) @@ -320,7 +320,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. 
tokenizer.concrete = true; @@ -344,7 +344,7 @@ fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceMeta); tokenizer.exit(Token::CodeFencedFence); @@ -352,7 +352,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.concrete = true; at_break(tokenizer, info) } - Some('`') if info.kind == Kind::GraveAccent => State::Nok, + Some(b'`') if info.kind == Kind::GraveAccent => State::Nok, _ => { tokenizer.consume(); State::Fn(Box::new(|t| meta(t, info))) @@ -413,7 +413,7 @@ fn at_non_lazy_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('\n') => { + Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); @@ -452,7 +452,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(char) if char == info.kind.as_char() => { + Some(byte) if byte == info.kind.as_byte() => { tokenizer.enter(Token::CodeFencedFenceSequence); close_sequence(tokenizer, info, 0) } @@ -470,7 +470,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State { match tokenizer.current { - Some(char) if char == info.kind.as_char() => { + Some(byte) if byte == info.kind.as_byte() => { tokenizer.consume(); State::Fn(Box::new(move |t| close_sequence(t, info, size + 1))) } @@ -492,7 +492,7 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State { /// ``` fn close_sequence_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::CodeFencedFence); State::Ok } @@ -538,7 +538,7 @@ fn content_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => at_break(tokenizer, info), + None | Some(b'\n') => at_break(tokenizer, info), _ => { tokenizer.enter(Token::CodeFlowChunk); content_continue(tokenizer, info) @@ -556,7 +556,7 @@ fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn content_continue(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::CodeFlowChunk); at_break(tokenizer, info) } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index bb1615c..4a3a9f6 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -79,7 +79,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => after(tokenizer), - Some('\n') => tokenizer.attempt(further_start, |ok| { + Some(b'\n') => tokenizer.attempt(further_start, |ok| { Box::new(if ok { at_break } else { after }) })(tokenizer), _ => { @@ -97,7 +97,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// ``` fn content(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::CodeFlowChunk); at_break(tokenizer) } @@ -133,7 +133,7 @@ 
fn further_start(tokenizer: &mut Tokenizer) -> State { State::Nok } else { match tokenizer.current { - Some('\n') => { + Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); @@ -177,7 +177,7 @@ fn further_begin(tokenizer: &mut Tokenizer) -> State { /// ``` fn further_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\n') => further_start(tokenizer), + Some(b'\n') => further_start(tokenizer), _ => State::Nok, } } diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index 150f63b..b36a208 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -98,9 +98,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { let len = tokenizer.events.len(); match tokenizer.current { - Some('`') + Some(b'`') if tokenizer.parse_state.constructs.code_text - && (tokenizer.previous != Some('`') + && (tokenizer.previous != Some(b'`') || (len > 0 && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) => { @@ -119,7 +119,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { - if let Some('`') = tokenizer.current { + if let Some(b'`') = tokenizer.current { tokenizer.consume(); State::Fn(Box::new(move |t| sequence_open(t, size + 1))) } else { @@ -137,13 +137,13 @@ fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State { match tokenizer.current { None => State::Nok, - Some('\n') => { + Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); State::Fn(Box::new(move |t| between(t, size_open))) } - Some('`') => { + Some(b'`') => { tokenizer.enter(Token::CodeTextSequence); sequence_close(tokenizer, size_open, 0) } @@ -162,7 +162,7 @@ fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State { /// ``` fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State { match tokenizer.current { - None | Some('\n' | '`') => { + None | Some(b'\n' | b'`') => { tokenizer.exit(Token::CodeTextData); between(tokenizer, size_open) } @@ -181,7 +181,7 @@ fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State { /// ``` fn sequence_close(tokenizer: &mut Tokenizer, size_open: usize, size: usize) -> State { match tokenizer.current { - Some('`') => { + Some(b'`') => { tokenizer.consume(); State::Fn(Box::new(move |t| sequence_close(t, size_open, size + 1))) } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index f2b5ae0..14755c9 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -137,7 +137,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('[') => tokenizer.go( + Some(b'[') => tokenizer.go( |t| { label( t, @@ -162,7 +162,7 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn label_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(':') => { + Some(b':') => { tokenizer.enter(Token::DefinitionMarker); tokenizer.consume(); tokenizer.exit(Token::DefinitionMarker); @@ -231,7 +231,7 @@ fn after(tokenizer: &mut Tokenizer) -> State { /// ``` fn after_whitespace(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::Definition); // You’d be interrupting. 
tokenizer.interrupt = true; @@ -294,7 +294,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn title_after_after_optional_whitespace(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => State::Ok, + None | Some(b'\n') => State::Ok, _ => State::Nok, } } diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index 0585c4c..cdbc192 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -51,7 +51,7 @@ use crate::tokenizer::{State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\\') if tokenizer.parse_state.constructs.hard_break_escape => { + Some(b'\\') if tokenizer.parse_state.constructs.hard_break_escape => { tokenizer.enter(Token::HardBreakEscape); tokenizer.consume(); State::Fn(Box::new(inside)) @@ -69,7 +69,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\n') => { + Some(b'\n') => { tokenizer.exit(Token::HardBreakEscape); State::Ok } diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 7a7cf2e..9a73b77 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -87,7 +87,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn before(tokenizer: &mut Tokenizer) -> State { - if Some('#') == tokenizer.current { + if Some(b'#') == tokenizer.current { tokenizer.enter(Token::HeadingAtxSequence); sequence_open(tokenizer, 0) } else { @@ -103,11 +103,11 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State { match tokenizer.current { - None | Some('\n') if rank > 0 => { + None | Some(b'\n') if rank > 0 => { tokenizer.exit(Token::HeadingAtxSequence); at_break(tokenizer) } - Some('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + Some(b'#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |tokenizer| { sequence_open(tokenizer, rank + 1) @@ -129,15 +129,15 @@ fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::HeadingAtx); tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve)); // Feel free to interrupt. tokenizer.interrupt = false; State::Ok } - Some('\t' | ' ') => tokenizer.go(space_or_tab(), at_break)(tokenizer), - Some('#') => { + Some(b'\t' | b' ') => tokenizer.go(space_or_tab(), at_break)(tokenizer), + Some(b'#') => { tokenizer.enter(Token::HeadingAtxSequence); further_sequence(tokenizer) } @@ -157,7 +157,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn further_sequence(tokenizer: &mut Tokenizer) -> State { - if let Some('#') = tokenizer.current { + if let Some(b'#') = tokenizer.current { tokenizer.consume(); State::Fn(Box::new(further_sequence)) } else { @@ -175,7 +175,7 @@ fn further_sequence(tokenizer: &mut Tokenizer) -> State { fn data(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. 
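With `tokenizer.current` now an `Option<u8>`, checks like the arm that follows become byte-literal patterns; `b'…'` literals are ordinary `u8` values, so alternations and ranges keep working unchanged. A tiny sketch of the style (hypothetical helper, not in the patch):

```rust
/// Sketch of the byte-pattern style this patch converts to: the ASCII
/// bytes that end ATX heading text, as `u8` patterns.
fn is_atx_data_break(current: Option<u8>) -> bool {
    matches!(current, None | Some(b'\t' | b'\n' | b' '))
}
```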
- None | Some('\t' | '\n' | ' ') => { + None | Some(b'\t' | b'\n' | b' ') => { tokenizer.exit(Token::Data); at_break(tokenizer) } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index f9dd3f7..2a4adbf 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -88,23 +88,23 @@ pub enum Kind { } impl Kind { - /// Turn the kind into a [char]. - fn as_char(&self) -> char { + /// Turn the kind into a byte ([u8]). + fn as_byte(&self) -> u8 { match self { - Kind::Dash => '-', - Kind::EqualsTo => '=', + Kind::Dash => b'-', + Kind::EqualsTo => b'=', } } - /// Turn a [char] into a kind. + /// Turn a byte ([u8]) into a kind. /// /// ## Panics /// - /// Panics if `char` is not `-` or `=`. - fn from_char(char: char) -> Kind { - match char { - '-' => Kind::Dash, - '=' => Kind::EqualsTo, - _ => unreachable!("invalid char"), + /// Panics if `byte` is not `-` or `=`. + fn from_byte(byte: u8) -> Kind { + match byte { + b'-' => Kind::Dash, + b'=' => Kind::EqualsTo, + _ => unreachable!("invalid byte"), } } } @@ -148,9 +148,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(char) if matches!(char, '-' | '=') => { + Some(byte) if matches!(byte, b'-' | b'=') => { tokenizer.enter(Token::HeadingSetextUnderline); - inside(tokenizer, Kind::from_char(char)) + inside(tokenizer, Kind::from_byte(byte)) } _ => State::Nok, } @@ -165,7 +165,7 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State { match tokenizer.current { - Some(char) if char == kind.as_char() => { + Some(byte) if byte == kind.as_byte() => { tokenizer.consume(); State::Fn(Box::new(move |t| inside(t, kind))) } @@ -185,7 +185,7 @@ fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State { /// ``` fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { // Feel free to interrupt. tokenizer.interrupt = false; tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve)); diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index e2b66e5..5860c5d 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -108,7 +108,7 @@ use crate::token::Token; use crate::tokenizer::{Point, State, Tokenizer}; use crate::util::slice::{Position, Slice}; -const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '[']; +const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; /// Kind of HTML (flow). #[derive(Debug, PartialEq)] @@ -151,23 +151,23 @@ enum QuoteKind { } impl QuoteKind { - /// Turn the kind into a [char]. - fn as_char(&self) -> char { + /// Turn the kind into a byte ([u8]). + fn as_byte(&self) -> u8 { match self { - QuoteKind::Double => '"', - QuoteKind::Single => '\'', + QuoteKind::Double => b'"', + QuoteKind::Single => b'\'', } } - /// Turn a [char] into a kind. + /// Turn a byte ([u8]) into a kind. /// /// ## Panics /// - /// Panics if `char` is not `"` or `'`. - fn from_char(char: char) -> QuoteKind { - match char { - '"' => QuoteKind::Double, - '\'' => QuoteKind::Single, - _ => unreachable!("invalid char"), + /// Panics if `byte` is not `"` or `'`. + fn from_byte(byte: u8) -> QuoteKind { + match byte { + b'"' => QuoteKind::Double, + b'\'' => QuoteKind::Single, + _ => unreachable!("invalid byte"), } } } @@ -179,8 +179,7 @@ struct Info { kind: Kind, /// Whether this is a start tag (`<` not followed by `/`). 
start_tag: bool, - /// Used depending on `kind` to either collect all parsed characters, or to - /// store expected characters. + /// Used depending on `kind` to collect all parsed bytes. start: Option<Point>, /// Collected index, for various reasons. size: usize, @@ -225,7 +224,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn before(tokenizer: &mut Tokenizer) -> State { - if Some('<') == tokenizer.current { + if Some(b'<') == tokenizer.current { tokenizer.enter(Token::HtmlFlowData); tokenizer.consume(); State::Fn(Box::new(open)) @@ -256,16 +255,16 @@ fn open(tokenizer: &mut Tokenizer) -> State { }; match tokenizer.current { - Some('!') => { + Some(b'!') => { tokenizer.consume(); State::Fn(Box::new(|t| declaration_open(t, info))) } - Some('/') => { + Some(b'/') => { tokenizer.consume(); info.start = Some(tokenizer.point.clone()); State::Fn(Box::new(|t| tag_close_start(t, info))) } - Some('?') => { + Some(b'?') => { info.kind = Kind::Instruction; tokenizer.consume(); // Do not form containers. tokenizer.concrete = true; // While we’re in an instruction instead of a declaration, we’re on a `?` // right now, so we do need to search for `>`, similar to declarations. State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } - Some('A'..='Z' | 'a'..='z') => { + Some(b'A'..=b'Z' | b'a'..=b'z') => { info.start_tag = true; info.start = Some(tokenizer.point.clone()); tag_name(tokenizer, info) @@ -295,18 +294,18 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some('-') => { + Some(b'-') => { tokenizer.consume(); info.kind = Kind::Comment; State::Fn(Box::new(|t| comment_open_inside(t, info))) } - Some('[') => { + Some(b'[') => { tokenizer.consume(); info.kind = Kind::Cdata; info.size = 0; State::Fn(Box::new(|t| cdata_open_inside(t, info))) } - Some('A'..='Z' | 'a'..='z') => { + Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); info.kind = Kind::Declaration; // Do not form containers. tokenizer.concrete = true; State::Fn(Box::new(|t| declaration(t, info))) } _ => State::Nok, } } @@ -325,7 +324,7 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('-') => { + Some(b'-') => { tokenizer.consume(); // Do not form containers.
tokenizer.concrete = true; @@ -343,7 +342,7 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(char) if char == CDATA_SEARCH[info.size] => { + Some(byte) if byte == CDATA_SEARCH[info.size] => { info.size += 1; tokenizer.consume(); @@ -368,7 +367,7 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('A'..='Z' | 'a'..='z') => { + Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| tag_name(t, info))) } @@ -386,11 +385,11 @@ fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - None | Some('\t' | '\n' | ' ' | '/' | '>') => { - let slash = matches!(tokenizer.current, Some('/')); + None | Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => { + let slash = matches!(tokenizer.current, Some(b'/')); let start = info.start.take().unwrap(); let name = Slice::from_position( - &tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position { start: &start, end: &tokenizer.point, @@ -428,7 +427,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { } } } - Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| tag_name(t, info))) } @@ -444,7 +443,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.consume(); // Do not form containers. tokenizer.concrete = true; @@ -462,7 +461,7 @@ fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_closing_tag_after(t, info))) } @@ -491,15 +490,15 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('/') => { + Some(b'/') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_end(t, info))) } - Some('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some(b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) } - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) } @@ -519,7 +518,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat /// ``` fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some(b'-' | b'.' 
| b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) } @@ -538,11 +537,11 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('=') => { + Some(b'=') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) } - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name_after(t, info))) } @@ -561,13 +560,13 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State /// ``` fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - None | Some('<' | '=' | '>' | '`') => State::Nok, - Some(char) if matches!(char, '"' | '\'') => { - info.quote = Some(QuoteKind::from_char(char)); + None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, + Some(byte) if matches!(byte, b'"' | b'\'') => { + info.quote = Some(QuoteKind::from_byte(byte)); tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) } - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) } @@ -585,8 +584,8 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> /// ``` fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => State::Nok, - Some(char) if char == info.quote.as_ref().unwrap().as_char() => { + None | Some(b'\n') => State::Nok, + Some(byte) if byte == info.quote.as_ref().unwrap().as_byte() => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info))) } @@ -605,7 +604,7 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> Sta /// ``` fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\t' | '\n' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => { + None | Some(b'\t' | b'\n' | b' ' | b'"' | b'\'' | b'/' | b'<' | b'=' | b'>' | b'`') => { complete_attribute_name_after(tokenizer, info) } Some(_) => { @@ -624,7 +623,7 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> S /// ``` fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('\t' | ' ' | '/' | '>') => complete_attribute_name_before(tokenizer, info), + Some(b'\t' | b' ' | b'/' | b'>') => complete_attribute_name_before(tokenizer, info), _ => State::Nok, } } @@ -637,7 +636,7 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) /// ``` fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_after(t, info))) } @@ -653,12 +652,12 @@ fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { // Do not form containers. 
tokenizer.concrete = true; continuation(tokenizer, info) } - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_after(t, info))) } @@ -674,27 +673,27 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('-') if info.kind == Kind::Comment => { + Some(b'-') if info.kind == Kind::Comment => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_comment_inside(t, info))) } - Some('<') if info.kind == Kind::Raw => { + Some(b'<') if info.kind == Kind::Raw => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_raw_tag_open(t, info))) } - Some('>') if info.kind == Kind::Declaration => { + Some(b'>') if info.kind == Kind::Declaration => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_close(t, info))) } - Some('?') if info.kind == Kind::Instruction => { + Some(b'?') if info.kind == Kind::Instruction => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } - Some(']') if info.kind == Kind::Cdata => { + Some(b']') if info.kind == Kind::Cdata => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_character_data_inside(t, info))) } - Some('\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { + Some(b'\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { tokenizer.exit(Token::HtmlFlowData); tokenizer.check(blank_line_before, |ok| { if ok { @@ -704,7 +703,7 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { } })(tokenizer) } - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::HtmlFlowData); continuation_start(tokenizer, info) } @@ -741,7 +740,7 @@ fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('\n') => { + Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); @@ -760,7 +759,7 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => continuation_start(tokenizer, info), + None | Some(b'\n') => continuation_start(tokenizer, info), _ => { tokenizer.enter(Token::HtmlFlowData); continuation(tokenizer, info) @@ -776,7 +775,7 @@ fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('-') => { + Some(b'-') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -792,7 +791,7 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some('/') => { + Some(b'/') => { tokenizer.consume(); info.start = Some(tokenizer.point.clone()); State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) @@ -809,12 +808,12 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State /// ``` fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { info.size = 0; let start = info.start.take().unwrap(); let name = Slice::from_position( - 
&tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position { start: &start, end: &tokenizer.point, @@ -830,7 +829,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State continuation(tokenizer, info) } } - Some('A'..='Z' | 'a'..='z') if info.size < HTML_RAW_SIZE_MAX => { + Some(b'A'..=b'Z' | b'a'..=b'z') if info.size < HTML_RAW_SIZE_MAX => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) @@ -850,7 +849,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State /// ``` fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(']') => { + Some(b']') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -874,11 +873,11 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> /// ``` fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_close(t, info))) } - Some('-') if info.kind == Kind::Comment => { + Some(b'-') if info.kind == Kind::Comment => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -894,7 +893,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> Sta /// ``` fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::HtmlFlowData); continuation_after(tokenizer) } diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index b1ad113..f10a476 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -58,7 +58,7 @@ use crate::construct::partial_space_or_tab::space_or_tab; use crate::token::Token; use crate::tokenizer::{State, StateFn, Tokenizer}; -const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '[']; +const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; /// Start of HTML (text) /// @@ -67,7 +67,7 @@ const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '[']; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if Some('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { + if Some(b'<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { tokenizer.enter(Token::HtmlText); tokenizer.enter(Token::HtmlTextData); tokenizer.consume(); @@ -89,19 +89,19 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('!') => { + Some(b'!') => { tokenizer.consume(); State::Fn(Box::new(declaration_open)) } - Some('/') => { + Some(b'/') => { tokenizer.consume(); State::Fn(Box::new(tag_close_start)) } - Some('?') => { + Some(b'?') => { tokenizer.consume(); State::Fn(Box::new(instruction)) } - Some('A'..='Z' | 'a'..='z') => { + Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } @@ -121,15 +121,15 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('-') => { + Some(b'-') => { tokenizer.consume(); State::Fn(Box::new(comment_open_inside)) } - Some('[') => { + Some(b'[') => { tokenizer.consume(); State::Fn(Box::new(|t| cdata_open_inside(t, 0))) } - Some('A'..='Z' | 'a'..='z') => { + Some(b'A'..=b'Z' | b'a'..=b'z') 
=> { tokenizer.consume(); State::Fn(Box::new(declaration)) } @@ -145,7 +145,7 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('-') => { + Some(b'-') => { tokenizer.consume(); State::Fn(Box::new(comment_start)) } @@ -168,8 +168,8 @@ fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('>') => State::Nok, - Some('-') => { + None | Some(b'>') => State::Nok, + Some(b'-') => { tokenizer.consume(); State::Fn(Box::new(comment_start_dash)) } @@ -192,7 +192,7 @@ fn comment_start(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('>') => State::Nok, + None | Some(b'>') => State::Nok, _ => comment(tokenizer), } } @@ -206,8 +206,8 @@ fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { fn comment(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some('\n') => at_line_ending(tokenizer, Box::new(comment)), - Some('-') => { + Some(b'\n') => at_line_ending(tokenizer, Box::new(comment)), + Some(b'-') => { tokenizer.consume(); State::Fn(Box::new(comment_close)) } @@ -226,7 +226,7 @@ fn comment(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('-') => { + Some(b'-') => { tokenizer.consume(); State::Fn(Box::new(end)) } @@ -242,7 +242,7 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State { match tokenizer.current { - Some(char) if char == CDATA_SEARCH[index] => { + Some(byte) if byte == CDATA_SEARCH[index] => { tokenizer.consume(); if index + 1 == CDATA_SEARCH.len() { @@ -264,8 +264,8 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State { fn cdata(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - Some('\n') => at_line_ending(tokenizer, Box::new(cdata)), - Some(']') => { + Some(b'\n') => at_line_ending(tokenizer, Box::new(cdata)), + Some(b']') => { tokenizer.consume(); State::Fn(Box::new(cdata_close)) } @@ -284,7 +284,7 @@ fn cdata(tokenizer: &mut Tokenizer) -> State { /// ``` fn cdata_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(']') => { + Some(b']') => { tokenizer.consume(); State::Fn(Box::new(cdata_end)) } @@ -300,8 +300,8 @@ fn cdata_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn cdata_end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('>') => end(tokenizer), - Some(']') => cdata_close(tokenizer), + Some(b'>') => end(tokenizer), + Some(b']') => cdata_close(tokenizer), _ => cdata(tokenizer), } } @@ -314,8 +314,8 @@ fn cdata_end(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('>') => end(tokenizer), - Some('\n') => at_line_ending(tokenizer, Box::new(declaration)), + None | Some(b'>') => end(tokenizer), + Some(b'\n') => at_line_ending(tokenizer, Box::new(declaration)), _ => { tokenizer.consume(); State::Fn(Box::new(declaration)) @@ -332,8 +332,8 @@ fn declaration(tokenizer: &mut Tokenizer) -> State { fn instruction(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { None => State::Nok, - 
Some('\n') => at_line_ending(tokenizer, Box::new(instruction)), - Some('?') => { + Some(b'\n') => at_line_ending(tokenizer, Box::new(instruction)), + Some(b'?') => { tokenizer.consume(); State::Fn(Box::new(instruction_close)) } @@ -352,7 +352,7 @@ fn instruction(tokenizer: &mut Tokenizer) -> State { /// ``` fn instruction_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('>') => end(tokenizer), + Some(b'>') => end(tokenizer), _ => instruction(tokenizer), } } @@ -365,7 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('A'..='Z' | 'a'..='z') => { + Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -381,7 +381,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -397,8 +397,8 @@ fn tag_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\n') => at_line_ending(tokenizer, Box::new(tag_close_between)), - Some('\t' | ' ') => { + Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_close_between)), + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_close_between)) } @@ -414,11 +414,11 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } - Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), + Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -431,16 +431,16 @@ fn tag_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_between)), - Some('\t' | ' ') => { + Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_open_between)), + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_between)) } - Some('/') => { + Some(b'/') => { tokenizer.consume(); State::Fn(Box::new(end)) } - Some(':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -456,7 +456,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some(b'-' | b'.' 
| b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -473,12 +473,12 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)), - Some('\t' | ' ') => { + Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)), + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name_after)) } - Some('=') => { + Some(b'=') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } @@ -495,15 +495,15 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('<' | '=' | '>' | '`') => State::Nok, - Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)), - Some('\t' | ' ') => { + None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, + Some(b'\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)), + Some(b'\t' | b' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } - Some(char) if char == '"' || char == '\'' => { + Some(byte) if byte == b'"' || byte == b'\'' => { tokenizer.consume(); - State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, char))) + State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte))) } Some(_) => { tokenizer.consume(); @@ -518,14 +518,14 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { /// > | a e /// ^ /// ``` -fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> State { +fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> State { match tokenizer.current { None => State::Nok, - Some('\n') => at_line_ending( + Some(b'\n') => at_line_ending( tokenizer, Box::new(move |t| tag_open_attribute_value_quoted(t, marker)), ), - Some(char) if char == marker => { + Some(byte) if byte == marker => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_quoted_after)) } @@ -546,8 +546,8 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> S /// ``` fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('"' | '\'' | '<' | '=' | '`') => State::Nok, - Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), + None | Some(b'"' | b'\'' | b'<' | b'=' | b'`') => State::Nok, + Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => tag_open_between(tokenizer), Some(_) => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_unquoted)) @@ -564,7 +564,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\t' | '\n' | ' ' | '>' | '/') => tag_open_between(tokenizer), + Some(b'\t' | b'\n' | b' ' | b'>' | b'/') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -577,7 +577,7 @@ fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.consume(); tokenizer.exit(Token::HtmlTextData); tokenizer.exit(Token::HtmlText); @@ -599,7 +599,7 @@ fn 
end(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_line_ending(tokenizer: &mut Tokenizer, return_state: Box) -> State { match tokenizer.current { - Some('\n') => { + Some(b'\n') => { tokenizer.exit(Token::HtmlTextData); tokenizer.enter(Token::LineEnding); tokenizer.consume(); diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 5ea788f..6399f81 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -182,7 +182,7 @@ struct Info { /// > | [a] b /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if Some(']') == tokenizer.current && tokenizer.parse_state.constructs.label_end { + if Some(b']') == tokenizer.current && tokenizer.parse_state.constructs.label_end { let mut label_start_index = None; let mut index = tokenizer.label_start_stack.len(); @@ -217,7 +217,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { // To do: virtual spaces not needed, create a `to_str`? id: normalize_identifier( &Slice::from_position( - &tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position { start: &tokenizer.events[label_start.start.1].point, end: &tokenizer.events[label_end_start - 1].point, @@ -258,7 +258,7 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { // Resource (`[asd](fgh)`)? - Some('(') => tokenizer.attempt(resource, move |is_ok| { + Some(b'(') => tokenizer.attempt(resource, move |is_ok| { Box::new(move |t| { // Also fine if `defined`, as then it’s a valid shortcut. if is_ok || defined { @@ -269,7 +269,7 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State { }) })(tokenizer), // Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference? - Some('[') => tokenizer.attempt(full_reference, move |is_ok| { + Some(b'[') => tokenizer.attempt(full_reference, move |is_ok| { Box::new(move |t| { if is_ok { ok(t, info) @@ -382,7 +382,7 @@ fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State { /// ``` fn resource(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('(') => { + Some(b'(') => { tokenizer.enter(Token::Resource); tokenizer.enter(Token::ResourceMarker); tokenizer.consume(); @@ -411,7 +411,7 @@ fn resource_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn resource_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(')') => resource_end(tokenizer), + Some(b')') => resource_end(tokenizer), _ => tokenizer.go( |t| { destination( @@ -451,7 +451,7 @@ fn destination_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn resource_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('"' | '\'' | '(') => tokenizer.go( + Some(b'"' | b'\'' | b'(') => tokenizer.go( |t| { title( t, @@ -486,7 +486,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn resource_end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(')') => { + Some(b')') => { tokenizer.enter(Token::ResourceMarker); tokenizer.consume(); tokenizer.exit(Token::ResourceMarker); @@ -505,7 +505,7 @@ fn resource_end(tokenizer: &mut Tokenizer) -> State { /// ``` fn full_reference(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('[') => tokenizer.go( + Some(b'[') => tokenizer.go( |t| { label( t, @@ -537,7 +537,7 @@ fn full_reference_after(tokenizer: &mut Tokenizer) -> State { // To do: virtual spaces not needed, create a `to_str`? 
let id = Slice::from_position( - &tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position::from_exit_event(&tokenizer.events, end), ) .serialize(); @@ -563,7 +563,7 @@ fn full_reference_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn collapsed_reference(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('[') => { + Some(b'[') => { tokenizer.enter(Token::Reference); tokenizer.enter(Token::ReferenceMarker); tokenizer.consume(); @@ -584,7 +584,7 @@ fn collapsed_reference(tokenizer: &mut Tokenizer) -> State { /// ``` fn collapsed_reference_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(']') => { + Some(b']') => { tokenizer.enter(Token::ReferenceMarker); tokenizer.consume(); tokenizer.exit(Token::ReferenceMarker); diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index 078026d..d30b8dd 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -40,7 +40,7 @@ use crate::tokenizer::{LabelStart, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('!') if tokenizer.parse_state.constructs.label_start_image => { + Some(b'!') if tokenizer.parse_state.constructs.label_start_image => { tokenizer.enter(Token::LabelImage); tokenizer.enter(Token::LabelImageMarker); tokenizer.consume(); @@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('[') => { + Some(b'[') => { tokenizer.enter(Token::LabelMarker); tokenizer.consume(); tokenizer.exit(Token::LabelMarker); diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs index d7ae1d6..c47941c 100644 --- a/src/construct/label_start_link.rs +++ b/src/construct/label_start_link.rs @@ -39,7 +39,7 @@ use crate::tokenizer::{LabelStart, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('[') if tokenizer.parse_state.constructs.label_start_link => { + Some(b'[') if tokenizer.parse_state.constructs.label_start_link => { let start = tokenizer.events.len(); tokenizer.enter(Token::LabelLink); tokenizer.enter(Token::LabelMarker); diff --git a/src/construct/list.rs b/src/construct/list.rs index 355eeee..9b59130 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -102,19 +102,19 @@ enum Kind { } impl Kind { - /// Turn a [char] into a kind. + /// Turn a byte ([u8]) into a kind. /// /// ## Panics /// - /// Panics if `char` is not `.`, `)`, `*`, `+`, or `-`. - fn from_char(char: char) -> Kind { - match char { - '.' => Kind::Dot, - ')' => Kind::Paren, - '*' => Kind::Asterisk, - '+' => Kind::Plus, - '-' => Kind::Dash, - _ => unreachable!("invalid char"), + /// Panics if `byte` is not `.`, `)`, `*`, `+`, or `-`. + fn from_byte(byte: u8) -> Kind { + match byte { + b'.' => Kind::Dot, + b')' => Kind::Paren, + b'*' => Kind::Asterisk, + b'+' => Kind::Plus, + b'-' => Kind::Dash, + _ => unreachable!("invalid byte"), } } } @@ -149,11 +149,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // Unordered. - Some('*' | '+' | '-') => tokenizer.check(thematic_break, |ok| { + Some(b'*' | b'+' | b'-') => tokenizer.check(thematic_break, |ok| { Box::new(if ok { nok } else { before_unordered }) })(tokenizer), // Ordered. 
- Some(char) if char.is_ascii_digit() && (!tokenizer.interrupt || char == '1') => { + Some(byte) if byte.is_ascii_digit() && (!tokenizer.interrupt || byte == b'1') => { tokenizer.enter(Token::ListItemPrefix); tokenizer.enter(Token::ListItemValue); inside(tokenizer, 0) @@ -183,11 +183,11 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Some(char) if char.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { + Some(byte) if byte.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| inside(t, size + 1))) } - Some('.' | ')') if !tokenizer.interrupt || size < 2 => { + Some(b'.' | b')') if !tokenizer.interrupt || size < 2 => { tokenizer.exit(Token::ListItemValue); marker(tokenizer) } @@ -262,7 +262,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn whitespace_after(tokenizer: &mut Tokenizer) -> State { - if matches!(tokenizer.current, Some('\t' | ' ')) { + if matches!(tokenizer.current, Some(b'\t' | b' ')) { State::Nok } else { State::Ok @@ -277,7 +277,7 @@ fn whitespace_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn prefix_other(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.enter(Token::SpaceOrTab); tokenizer.consume(); tokenizer.exit(Token::SpaceOrTab); @@ -303,7 +303,7 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State { &[Token::ListItem], ); let mut prefix = Slice::from_position( - &tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position { start: &tokenizer.events[start].point, end: &tokenizer.point, @@ -400,13 +400,10 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) { if event.event_type == EventType::Enter { let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1; let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]); - let kind = Kind::from_char( - Slice::from_point( - &tokenizer.parse_state.chars, - &tokenizer.events[marker].point, - ) - .head() - .unwrap(), + let kind = Kind::from_byte( + Slice::from_point(tokenizer.parse_state.bytes, &tokenizer.events[marker].point) + .head() + .unwrap(), ); let current = (kind, balance, index, end); diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 569c609..7b50957 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -84,6 +84,7 @@ pub mod label_start_image; pub mod label_start_link; pub mod list; pub mod paragraph; +pub mod partial_bom; pub mod partial_data; pub mod partial_destination; pub mod partial_label; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 5d230d3..146dc40 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -44,7 +44,7 @@ use crate::util::skip::opt as skip_opt; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { unreachable!("unexpected eol/eof") } _ => { @@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::Paragraph); tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve)); diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs new file mode 100644 index 0000000..be8d6c8 --- /dev/null +++ 
b/src/construct/partial_bom.rs @@ -0,0 +1,54 @@ +//! To do. + +use crate::token::Token; +use crate::tokenizer::{State, Tokenizer}; + +/// Before a BOM. +/// +/// ```text +/// > | 0xEF 0xBB 0xBF +/// ^^^^ +/// ``` +pub fn start(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(0xEF) => { + tokenizer.enter(Token::ByteOrderMark); + tokenizer.consume(); + State::Fn(Box::new(cont)) + } + _ => State::Nok, + } +} + +/// Second byte in BOM. +/// +/// ```text +/// > | 0xEF 0xBB 0xBF +/// ^^^^ +/// ``` +fn cont(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(0xBB) => { + tokenizer.consume(); + State::Fn(Box::new(end)) + } + _ => State::Nok, + } +} + +/// Last byte in BOM. +/// +/// ```text +/// > | 0xEF 0xBB 0xBF +/// ^^^^ +/// ``` +fn end(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some(0xBF) => { + tokenizer.consume(); + tokenizer.exit(Token::ByteOrderMark); + State::Ok + } + _ => State::Nok, + } +} diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 0b66b09..335d7ab 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -15,9 +15,9 @@ use crate::tokenizer::{EventType, State, Tokenizer}; /// > | abc /// ^ /// ``` -pub fn start(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { +pub fn start(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { match tokenizer.current { - Some(char) if stop.contains(&char) => { + Some(byte) if stop.contains(&byte) => { tokenizer.enter(Token::Data); tokenizer.consume(); State::Fn(Box::new(move |t| data(t, stop))) @@ -32,16 +32,16 @@ pub fn start(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { /// > | abc /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { +fn at_break(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { match tokenizer.current { None => State::Ok, - Some('\n') => { + Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); State::Fn(Box::new(move |t| at_break(t, stop))) } - Some(char) if stop.contains(&char) => { + Some(byte) if stop.contains(&byte) => { tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data)); State::Ok } @@ -58,10 +58,10 @@ fn at_break(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { /// > | abc /// ^^^ /// ``` -fn data(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { +fn data(tokenizer: &mut Tokenizer, stop: &'static [u8]) -> State { let done = match tokenizer.current { - None | Some('\n') => true, - Some(char) if stop.contains(&char) => true, + None | Some(b'\n') => true, + Some(byte) if stop.contains(&byte) => true, _ => false, }; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 6447228..0a3721c 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -117,7 +117,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { }; match tokenizer.current { - Some('<') => { + Some(b'<') => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.literal.clone()); tokenizer.enter(info.options.marker.clone()); @@ -125,8 +125,8 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { tokenizer.exit(info.options.marker.clone()); State::Fn(Box::new(|t| enclosed_before(t, info))) } - None | Some(' ' | ')') => State::Nok, - Some(char) if char.is_ascii_control() => State::Nok, + None | Some(b' ' | b')') => 
State::Nok, + Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok, Some(_) => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.raw.clone()); @@ -144,7 +144,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ^ /// ``` fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { - if let Some('>') = tokenizer.current { + if let Some(b'>') = tokenizer.current { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -166,13 +166,13 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('>') => { + Some(b'>') => { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); enclosed_before(tokenizer, info) } - None | Some('\n' | '<') => State::Nok, - Some('\\') => { + None | Some(b'\n' | b'<') => State::Nok, + Some(b'\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed_escape(t, info))) } @@ -191,7 +191,7 @@ fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('<' | '>' | '\\') => { + Some(b'<' | b'>' | b'\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed(t, info))) } @@ -207,7 +207,7 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some('(') => { + Some(b'(') => { if info.balance >= info.options.limit { State::Nok } else { @@ -216,7 +216,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Fn(Box::new(move |t| raw(t, info))) } } - Some(')') => { + Some(b')') => { if info.balance == 0 { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); @@ -229,7 +229,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Fn(Box::new(move |t| raw(t, info))) } } - None | Some('\t' | '\n' | ' ') => { + None | Some(b'\t' | b'\n' | b' ') => { if info.balance > 0 { State::Nok } else { @@ -240,8 +240,8 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Ok } } - Some(char) if char.is_ascii_control() => State::Nok, - Some('\\') => { + Some(byte) if byte != b'\0' && byte.is_ascii_control() => State::Nok, + Some(b'\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw_escape(t, info))) } @@ -260,7 +260,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn raw_escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some('(' | ')' | '\\') => { + Some(b'(' | b')' | b'\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw(t, info))) } diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index ee31533..7e40a2d 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -82,9 +82,9 @@ pub struct Options { struct Info { /// Whether we’ve seen our first `ChunkString`. connect: bool, - /// Whether there are non-blank characters in the label. + /// Whether there are non-blank bytes in the label. data: bool, - /// Number of characters in the label. + /// Number of bytes in the label. size: usize, /// Configuration. 
options: Options, @@ -98,7 +98,7 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Some('[') => { + Some(b'[') => { let info = Info { connect: false, data: false, @@ -124,10 +124,10 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - None | Some('[') => State::Nok, - Some(']') if !info.data => State::Nok, + None | Some(b'[') => State::Nok, + Some(b']') if !info.data => State::Nok, _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok, - Some(']') => { + Some(b']') => { tokenizer.exit(info.options.string.clone()); tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); @@ -135,7 +135,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(info.options.label); State::Ok } - Some('\n') => tokenizer.go( + Some(b'\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -168,7 +168,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - None | Some('\n' | '[' | ']') => { + None | Some(b'\n' | b'[' | b']') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } @@ -176,12 +176,12 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Some('\t' | ' ') => { + Some(b'\t' | b' ') => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| label(t, info))) } - Some('\\') => { + Some(b'\\') => { tokenizer.consume(); info.size += 1; if !info.data { @@ -208,7 +208,7 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some('[' | '\\' | ']') => { + Some(b'[' | b'\\' | b']') => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| label(t, info))) diff --git a/src/construct/partial_non_lazy_continuation.rs b/src/construct/partial_non_lazy_continuation.rs index 068e30f..6005a6c 100644 --- a/src/construct/partial_non_lazy_continuation.rs +++ b/src/construct/partial_non_lazy_continuation.rs @@ -22,7 +22,7 @@ use crate::tokenizer::{State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some('\n') => { + Some(b'\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index 6070ffe..f31cbc6 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -11,9 +11,9 @@ use crate::tokenizer::{ContentType, State, StateFn, Tokenizer}; /// Options to parse `space_or_tab`. #[derive(Debug)] pub struct Options { - /// Minimum allowed characters (inclusive). + /// Minimum allowed bytes (inclusive). pub min: usize, - /// Maximum allowed characters (inclusive). + /// Maximum allowed bytes (inclusive). pub max: usize, /// Token type to use for whitespace events. 
pub kind: Token, @@ -134,7 +134,7 @@ pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box { /// ``` fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some('\t' | ' ') if info.options.max > 0 => { + Some(b'\t' | b' ') if info.options.max > 0 => { tokenizer .enter_with_content(info.options.kind.clone(), info.options.content_type.clone()); @@ -165,7 +165,7 @@ fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some('\t' | ' ') if info.size < info.options.max => { + Some(b'\t' | b' ') if info.size < info.options.max => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| inside(t, info))) @@ -190,7 +190,7 @@ fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn after_space_or_tab(tokenizer: &mut Tokenizer, mut info: EolInfo) -> State { match tokenizer.current { - Some('\n') => { + Some(b'\n') => { tokenizer.enter_with_content(Token::LineEnding, info.options.content_type.clone()); if info.connect { @@ -239,7 +239,7 @@ fn after_eol(tokenizer: &mut Tokenizer, info: EolInfo) -> State { /// ``` fn after_more_space_or_tab(tokenizer: &mut Tokenizer) -> State { // Blank line not allowed. - if matches!(tokenizer.current, None | Some('\n')) { + if matches!(tokenizer.current, None | Some(b'\n')) { State::Nok } else { State::Ok diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 15fc25e..80861af 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -78,29 +78,29 @@ enum Kind { } impl Kind { - /// Turn the kind into a [char]. + /// Turn the kind into a byte ([u8]). /// /// > 👉 **Note**: a closing paren is used for `Kind::Paren`. - fn as_char(&self) -> char { + fn as_byte(&self) -> u8 { match self { - Kind::Paren => ')', - Kind::Double => '"', - Kind::Single => '\'', + Kind::Paren => b')', + Kind::Double => b'"', + Kind::Single => b'\'', } } - /// Turn a [char] into a kind. + /// Turn a byte ([u8]) into a kind. /// /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. /// /// ## Panics /// - /// Panics if `char` is not `(`, `"`, or `'`. - fn from_char(char: char) -> Kind { - match char { - '(' => Kind::Paren, - '"' => Kind::Double, - '\'' => Kind::Single, - _ => unreachable!("invalid char"), + /// Panics if `byte` is not `(`, `"`, or `'`. 
+ fn from_byte(byte: u8) -> Kind { + match byte { + b'(' => Kind::Paren, + b'"' => Kind::Double, + b'\'' => Kind::Single, + _ => unreachable!("invalid byte"), } } } @@ -124,10 +124,10 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Some(char) if matches!(char, '"' | '\'' | '(') => { + Some(byte) if matches!(byte, b'"' | b'\'' | b'(') => { let info = Info { connect: false, - kind: Kind::from_char(char), + kind: Kind::from_byte(byte), options, }; tokenizer.enter(info.options.title.clone()); @@ -150,7 +150,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(char) if char == info.kind.as_char() => { + Some(byte) if byte == info.kind.as_byte() => { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -172,12 +172,12 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Some(char) if char == info.kind.as_char() => { + Some(byte) if byte == info.kind.as_byte() => { tokenizer.exit(info.options.string.clone()); begin(tokenizer, info) } None => State::Nok, - Some('\n') => tokenizer.go( + Some(b'\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -210,15 +210,15 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn title(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(char) if char == info.kind.as_char() => { + Some(byte) if byte == info.kind.as_byte() => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - None | Some('\n') => { + None | Some(b'\n') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Some('\\') => { + Some(b'\\') => { tokenizer.consume(); State::Fn(Box::new(|t| escape(t, info))) } @@ -237,7 +237,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Some(char) if char == info.kind.as_char() => { + Some(byte) if byte == info.kind.as_byte() => { tokenizer.consume(); State::Fn(Box::new(|t| title(t, info))) } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 152824b..13815cb 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -86,25 +86,25 @@ fn trim_data( hard_break: bool, ) { let mut slice = Slice::from_position( - &tokenizer.parse_state.chars, + tokenizer.parse_state.bytes, &Position::from_exit_event(&tokenizer.events, exit_index), ); if trim_end { - let mut index = slice.chars.len(); + let mut index = slice.bytes.len(); let vs = slice.after; let mut spaces_only = vs == 0; while index > 0 { - match slice.chars[index - 1] { - ' ' => {} - '\t' => spaces_only = false, + match slice.bytes[index - 1] { + b' ' => {} + b'\t' => spaces_only = false, _ => break, } index -= 1; } - let diff = slice.chars.len() - index; + let diff = slice.bytes.len() - index; let token_type = if spaces_only && hard_break && exit_index + 1 < tokenizer.events.len() @@ -150,16 +150,16 @@ fn trim_data( ); tokenizer.events[exit_index].point = enter_point; - slice.chars = &slice.chars[..index]; + slice.bytes = &slice.bytes[..index]; } } if trim_start { let mut index = 0; let vs = slice.before; - while index < 
slice.chars.len() { - match slice.chars[index] { - ' ' | '\t' => {} + while index < slice.bytes.len() { + match slice.bytes[index] { + b' ' | b'\t' => {} _ => break, } @@ -168,7 +168,7 @@ fn trim_data( // The whole data is whitespace. // We can be very fast: we only change the token types. - if index == slice.chars.len() { + if index == slice.bytes.len() { tokenizer.events[exit_index - 1].token_type = Token::SpaceOrTab; tokenizer.events[exit_index].token_type = Token::SpaceOrTab; return; diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index bed454b..4fc4dc4 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -83,25 +83,25 @@ enum Kind { } impl Kind { - /// Turn the kind into a [char]. - fn as_char(&self) -> char { + /// Turn the kind into a byte ([u8]). + fn as_byte(&self) -> u8 { match self { - Kind::Asterisk => '*', - Kind::Dash => '-', - Kind::Underscore => '_', + Kind::Asterisk => b'*', + Kind::Dash => b'-', + Kind::Underscore => b'_', } } - /// Turn a [char] into a kind. + /// Turn a byte ([u8]) into a kind. /// /// ## Panics /// - /// Panics if `char` is not `*`, `-`, or `_`. - fn from_char(char: char) -> Kind { - match char { - '*' => Kind::Asterisk, - '-' => Kind::Dash, - '_' => Kind::Underscore, - _ => unreachable!("invalid char"), + /// Panics if `byte` is not `*`, `-`, or `_`. + fn from_byte(byte: u8) -> Kind { + match byte { + b'*' => Kind::Asterisk, + b'-' => Kind::Dash, + b'_' => Kind::Underscore, + _ => unreachable!("invalid byte"), } } } @@ -144,10 +144,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Some(char) if matches!(char, '*' | '-' | '_') => at_break( + Some(byte) if matches!(byte, b'*' | b'-' | b'_') => at_break( tokenizer, Info { - kind: Kind::from_char(char), + kind: Kind::from_byte(byte), size: 0, }, ), @@ -163,13 +163,13 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - None | Some('\n' | '\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { + None | Some(b'\n' | b'\r') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(Token::ThematicBreak); // Feel free to interrupt. 
            tokenizer.interrupt = false;
             State::Ok
         }
-        Some(char) if char == info.kind.as_char() => {
+        Some(byte) if byte == info.kind.as_byte() => {
             tokenizer.enter(Token::ThematicBreakSequence);
             sequence(tokenizer, info)
         }
@@ -185,7 +185,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Some(char) if char == info.kind.as_char() => {
+        Some(byte) if byte == info.kind.as_byte() => {
             tokenizer.consume();
             info.size += 1;
             State::Fn(Box::new(|t| sequence(t, info)))
diff --git a/src/content/document.rs b/src/content/document.rs
index 935c4ef..828431d 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -11,6 +11,7 @@
 use crate::construct::{
     block_quote::{cont as block_quote_cont, start as block_quote},
     list::{cont as list_item_cont, start as list_item},
+    partial_bom::start as bom,
 };
 use crate::content::flow::start as flow;
 use crate::parser::ParseState;
@@ -78,7 +79,7 @@ struct DocumentInfo {
 pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
     let mut tokenizer = Tokenizer::new(point, parse_state);

-    let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before));
+    let state = tokenizer.push(0, parse_state.bytes.len(), Box::new(before));
     tokenizer.flush(state, true);

     let mut index = 0;
@@ -92,7 +93,7 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
             // don’t need virtual spaces.
             let id = normalize_identifier(
                 &Slice::from_position(
-                    &tokenizer.parse_state.chars,
+                    tokenizer.parse_state.bytes,
                     &Position::from_exit_event(&tokenizer.events, index),
                 )
                 .serialize(),
@@ -124,15 +125,7 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
 /// ^
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
-    match tokenizer.current {
-        Some('\u{FEFF}') => {
-            tokenizer.enter(Token::ByteOrderMark);
-            tokenizer.consume();
-            tokenizer.exit(Token::ByteOrderMark);
-            State::Fn(Box::new(start))
-        }
-        _ => start(tokenizer),
-    }
+    tokenizer.attempt_opt(bom, start)(tokenizer)
 }

 /// Before document.
@@ -358,7 +351,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State
     // Parse flow, pausing after eols.
     tokenizer.go_until(
         state,
-        |code| matches!(code, Some('\n')),
+        |code| matches!(code, Some(b'\n')),
         move |state| Box::new(move |t| flow_end(t, info, state)),
     )(tokenizer)
 }
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 09c4e2c..bf4104c 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -88,7 +88,7 @@ fn initial_before(tokenizer: &mut Tokenizer) -> State {
 fn blank_line_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None => State::Ok,
-        Some('\n') => {
+        Some(b'\n') => {
             tokenizer.enter(Token::BlankLineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::BlankLineEnding);
@@ -112,7 +112,7 @@ fn blank_line_after(tokenizer: &mut Tokenizer) -> State {
 fn after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
         None => State::Ok,
-        Some('\n') => {
+        Some(b'\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
diff --git a/src/content/string.rs b/src/content/string.rs
index 8bc2b91..d2aec3f 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -18,7 +18,7 @@ use crate::construct::{
 };
 use crate::tokenizer::{State, Tokenizer};

-const MARKERS: [char; 2] = ['&', '\\'];
+const MARKERS: [u8; 2] = [b'&', b'\\'];

 /// Start of string.
 pub fn start(tokenizer: &mut Tokenizer) -> State {
diff --git a/src/content/text.rs b/src/content/text.rs
index ebdf888..30c98a3 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -30,16 +30,16 @@ use crate::construct::{
 };
 use crate::tokenizer::{State, Tokenizer};

-const MARKERS: [char; 9] = [
-    '!',  // `label_start_image`
-    '&',  // `character_reference`
-    '*',  // `attention`
-    '<',  // `autolink`, `html_text`
-    '[',  // `label_start_link`
-    '\\', // `character_escape`, `hard_break_escape`
-    ']',  // `label_end`
-    '_',  // `attention`
-    '`',  // `code_text`
+const MARKERS: [u8; 9] = [
+    b'!',  // `label_start_image`
+    b'&',  // `character_reference`
+    b'*',  // `attention`
+    b'<',  // `autolink`, `html_text`
+    b'[',  // `label_start_link`
+    b'\\', // `character_escape`, `hard_break_escape`
+    b']',  // `label_end`
+    b'_',  // `attention`
+    b'`',  // `code_text`
 ];

 /// Start of text.
diff --git a/src/lib.rs b/src/lib.rs
index c1b0fa0..750ca36 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -424,5 +424,5 @@ pub fn micromark(value: &str) -> String {
 #[must_use]
 pub fn micromark_with_options(value: &str, options: &Options) -> String {
     let (events, result) = parse(value, options);
-    compile(&events, &result.chars, options)
+    compile(&events, result.bytes, options)
 }
diff --git a/src/parser.rs b/src/parser.rs
index cc9c256..613b206 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -12,7 +12,7 @@
 pub struct ParseState<'a> {
     pub constructs: &'a Constructs,
     /// List of chars.
-    pub chars: Vec<char>,
+    pub bytes: &'a [u8],
     /// Set of defined identifiers.
     pub definitions: Vec<String>,
 }
@@ -20,11 +20,10 @@ pub struct ParseState<'a> {
 /// Turn a string of markdown into events.
 ///
 /// Passes the codes back so the compiler can access the source.
-pub fn parse<'a>(value: &str, options: &'a Options) -> (Vec<Event>, ParseState<'a>) {
+pub fn parse<'a>(value: &'a str, options: &'a Options) -> (Vec<Event>, ParseState<'a>) {
     let mut parse_state = ParseState {
         constructs: &options.constructs,
-        // To do: change to `u8`s?
-        chars: value.chars().collect::<_>(),
+        bytes: value.as_bytes(),
         definitions: vec![],
     };
@@ -38,5 +37,6 @@ pub fn parse<'a>(value: &str, options: &'a Options) -> (Vec<Event>, ParseState<'
         },
     );

+    // To do: return bytes only?
     (events, parse_state)
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3cbad0f..9c5e9f6 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -27,8 +27,8 @@ pub enum ContentType {

 #[derive(Debug, PartialEq)]
 pub enum CharAction {
-    Normal(char),
-    Insert(char),
+    Normal(u8),
+    Insert(u8),
     Ignore,
 }
@@ -42,8 +42,8 @@ pub struct Point {
     pub line: usize,
     /// 1-indexed column number.
     /// This increases up to a tab stop for tabs.
-    /// Some editors count tabs as 1 character, so this position is not always
-    /// the same as editors.
+    /// Some editors count tabs as 1 character, so this position is not the
+    /// same as editors.
     pub column: usize,
     /// 0-indexed position in the document.
     ///
@@ -81,7 +81,7 @@ pub struct Event {
 }

 /// The essence of the state machine is functions: `StateFn`.
-/// It’s responsible for dealing with the current char.
+/// It’s responsible for dealing with the current byte.
 /// It yields a [`State`][].
 pub type StateFn = dyn FnOnce(&mut Tokenizer) -> State;
@@ -157,9 +157,9 @@ struct InternalState {
     /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt.
     stack_len: usize,
     /// Previous code.
-    previous: Option<char>,
+    previous: Option<u8>,
     /// Current code.
-    current: Option<char>,
+    current: Option<u8>,
     /// Current relative and absolute position in the file.
     point: Point,
 }
@@ -173,17 +173,17 @@ pub struct Tokenizer<'a> {
     first_line: usize,
     /// To do.
     line_start: Point,
-    /// Track whether a character is expected to be consumed, and whether it’s
-    /// actually consumed
+    /// Track whether the current byte is already consumed (`true`) or expected
+    /// to be consumed (`false`).
     ///
     /// Tracked to make sure everything’s valid.
     consumed: bool,
     /// Track whether this tokenizer is done.
     resolved: bool,
-    /// Current character code.
-    pub current: Option<char>,
-    /// Previous character code.
-    pub previous: Option<char>,
+    /// Current byte.
+    pub current: Option<u8>,
+    /// Previous byte.
+    pub previous: Option<u8>,
     /// Current relative and absolute place in the file.
     pub point: Point,
     /// Semantic labels of one or more codes in `codes`.
@@ -297,13 +297,13 @@ impl<'a> Tokenizer<'a> {
     }

     /// Prepare for a next code to get consumed.
-    pub fn expect(&mut self, char: Option<char>) {
-        debug_assert!(self.consumed, "expected previous character to be consumed");
+    pub fn expect(&mut self, byte: Option<u8>) {
+        debug_assert!(self.consumed, "expected previous byte to be consumed");
         self.consumed = false;
-        self.current = char;
+        self.current = byte;
     }

-    /// Consume the current character.
+    /// Consume the current byte.
     /// Each [`StateFn`][] is expected to call this to signal that this code is
     /// used, or call a next `StateFn`.
     pub fn consume(&mut self) {
@@ -320,23 +320,23 @@
         self.consumed = true;
     }

-    /// Move to the next (virtual) character.
+    /// Move to the next (virtual) byte.
     pub fn move_one(&mut self) {
-        match char_action(&self.parse_state.chars, &self.point) {
+        match byte_action(self.parse_state.bytes, &self.point) {
             CharAction::Ignore => {
                 self.point.index += 1;
             }
-            CharAction::Insert(char) => {
-                self.previous = Some(char);
+            CharAction::Insert(byte) => {
+                self.previous = Some(byte);
                 self.point.column += 1;
                 self.point.vs += 1;
             }
-            CharAction::Normal(char) => {
-                self.previous = Some(char);
+            CharAction::Normal(byte) => {
+                self.previous = Some(byte);
                 self.point.vs = 0;
                 self.point.index += 1;

-                if char == '\n' {
+                if byte == b'\n' {
                     self.point.line += 1;
                     self.point.column = 1;
@@ -355,7 +355,7 @@
         }
     }

-    /// Move (virtual) characters.
+    /// Move (virtual) bytes.
     pub fn move_to(&mut self, to: (usize, usize)) {
         let (to_index, to_vs) = to;
         while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs {
@@ -382,10 +382,10 @@
     pub fn enter_with_link(&mut self, token_type: Token, link: Option<Link>) {
         let mut point = self.point.clone();

-        // Move back past ignored chars.
+        // Move back past ignored bytes.
         while point.index > 0 {
             point.index -= 1;
-            let action = char_action(&self.parse_state.chars, &point);
+            let action = byte_action(self.parse_state.bytes, &point);
             if !matches!(action, CharAction::Ignore) {
                 point.index += 1;
                 break;
             }
         }
@@ -432,13 +432,13 @@
         // A bit weird, but if we exit right after a line ending, we *don’t* want to consider
         // potential skips.
-        if matches!(self.previous, Some('\n')) {
+        if matches!(self.previous, Some(b'\n')) {
             point = self.line_start.clone();
         } else {
-            // Move back past ignored chars.
+            // Move back past ignored bytes.
            while point.index > 0 {
                 point.index -= 1;
-                let action = char_action(&self.parse_state.chars, &point);
+                let action = byte_action(self.parse_state.bytes, &point);
                 if !matches!(action, CharAction::Ignore) {
                     point.index += 1;
                     break;
                 }
             }
@@ -520,7 +520,7 @@
     pub fn go_until(
         &mut self,
         state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static,
-        until: impl Fn(Option<char>) -> bool + 'static,
+        until: impl Fn(Option<u8>) -> bool + 'static,
         done: impl FnOnce(State) -> Box<StateFn> + 'static,
     ) -> Box<StateFn> {
         attempt_impl(
@@ -653,21 +653,19 @@
         while self.point.index < max {
             match state {
                 State::Ok | State::Nok => break,
-                State::Fn(func) => match char_action(&self.parse_state.chars, &self.point) {
+                State::Fn(func) => match byte_action(self.parse_state.bytes, &self.point) {
                     CharAction::Ignore => {
                         state = State::Fn(Box::new(func));
                         self.move_one();
                     }
-                    CharAction::Insert(char) => {
-                        log::debug!("main: passing (fake): `{:?}` ({:?})", char, self.point);
-                        self.expect(Some(char));
+                    CharAction::Insert(byte) => {
+                        log::debug!("main: passing (fake): `{:?}` ({:?})", byte, self.point);
+                        self.expect(Some(byte));
                         state = func(self);
-                        // self.point.column += 1;
-                        // self.point.vs += 1;
                     }
-                    CharAction::Normal(char) => {
-                        log::debug!("main: passing: `{:?}` ({:?})", char, self.point);
-                        self.expect(Some(char));
+                    CharAction::Normal(byte) => {
+                        log::debug!("main: passing: `{:?}` ({:?})", byte, self.point);
+                        self.expect(Some(byte));
                         state = func(self);
                     }
                 },
@@ -690,28 +688,28 @@
         // To do: clean this?
         // We sometimes move back when flushing, so then we use those codes.
         if self.point.index == max {
-            let char = None;
-            log::debug!("main: flushing eof: `{:?}` ({:?})", char, self.point);
-            self.expect(char);
+            let byte = None;
+            log::debug!("main: flushing eof: `{:?}` ({:?})", byte, self.point);
+            self.expect(byte);
             state = func(self);
         } else {
-            match char_action(&self.parse_state.chars, &self.point) {
+            match byte_action(self.parse_state.bytes, &self.point) {
                 CharAction::Ignore => {
                     state = State::Fn(Box::new(func));
                     self.move_one();
                 }
-                CharAction::Insert(char) => {
+                CharAction::Insert(byte) => {
                     log::debug!(
                         "main: flushing (fake): `{:?}` ({:?})",
-                        char,
+                        byte,
                         self.point
                     );
-                    self.expect(Some(char));
+                    self.expect(Some(byte));
                     state = func(self);
                 }
-                CharAction::Normal(char) => {
-                    log::debug!("main: flushing: `{:?}` ({:?})", char, self.point);
-                    self.expect(Some(char));
+                CharAction::Normal(byte) => {
+                    log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
+                    self.expect(Some(byte));
                     state = func(self);
                 }
             }
@@ -735,22 +733,20 @@
     }
 }

-fn char_action(chars: &[char], point: &Point) -> CharAction {
-    if point.index < chars.len() {
-        let char = chars[point.index];
+fn byte_action(bytes: &[u8], point: &Point) -> CharAction {
+    if point.index < bytes.len() {
+        let byte = bytes[point.index];

-        if char == '\0' {
-            CharAction::Normal(char::REPLACEMENT_CHARACTER)
-        } else if char == '\r' {
+        if byte == b'\r' {
             // CRLF.
-            if point.index < chars.len() - 1 && chars[point.index + 1] == '\n' {
+            if point.index < bytes.len() - 1 && bytes[point.index + 1] == b'\n' {
                 CharAction::Ignore
             }
             // CR.
             else {
-                CharAction::Normal('\n')
+                CharAction::Normal(b'\n')
             }
-        } else if char == '\t' {
+        } else if byte == b'\t' {
             let remainder = point.column % TAB_SIZE;
             let vs = if remainder == 0 {
                 0
             } else {
                 TAB_SIZE - remainder
             };

             // On the tab itself, first send it.
            if point.vs == 0 {
                 if vs == 0 {
-                    CharAction::Normal(char)
+                    CharAction::Normal(byte)
                 } else {
-                    CharAction::Insert(char)
+                    CharAction::Insert(byte)
                 }
             } else if vs == 0 {
-                CharAction::Normal(' ')
+                CharAction::Normal(b' ')
             } else {
-                CharAction::Insert(' ')
+                CharAction::Insert(b' ')
             }
         }
         // VS?
         else {
-            CharAction::Normal(char)
+            CharAction::Normal(byte)
         }
     } else {
         unreachable!("out of bounds")
     }
@@ -786,7 +782,7 @@
 /// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
 fn attempt_impl(
     state: impl FnOnce(&mut Tokenizer) -> State + 'static,
-    pause: Option<Box<dyn Fn(Option<char>) -> bool + 'static>>,
+    pause: Option<Box<dyn Fn(Option<u8>) -> bool + 'static>>,
     start: usize,
     done: impl FnOnce(&mut Tokenizer, State) -> State + 'static,
 ) -> Box<StateFn> {
diff --git a/src/util/slice.rs b/src/util/slice.rs
index 14fd527..cd3641e 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -48,17 +48,21 @@ impl<'a> Position<'a> {
 /// Includes information on virtual spaces before and after the chars.
 #[derive(Debug)]
 pub struct Slice<'a> {
-    pub chars: &'a [char],
+    pub bytes: &'a [u8],
     pub before: usize,
     pub after: usize,
 }

 impl<'a> Slice<'a> {
     /// Get the slice belonging to a position.
-    pub fn from_point(list: &'a [char], point: &Point) -> Slice<'a> {
+    pub fn from_point(bytes: &'a [u8], point: &Point) -> Slice<'a> {
         let mut before = point.vs;
         let mut start = point.index;
-        let end = if start < list.len() { start + 1 } else { start };
+        let end = if start < bytes.len() {
+            start + 1
+        } else {
+            start
+        };

         // If we have virtual spaces before, it means we are past the actual
         // character at that index, and those virtual spaces.
@@ -68,14 +72,14 @@
         };

         Slice {
-            chars: if start < end { &list[start..end] } else { &[] },
+            bytes: if start < end { &bytes[start..end] } else { &[] },
             before,
             after: 0,
         }
     }

     /// Get the slice belonging to a position.
-    pub fn from_position(list: &'a [char], position: &Position) -> Slice<'a> {
+    pub fn from_position(bytes: &'a [u8], position: &Position) -> Slice<'a> {
         let mut before = position.start.vs;
         let mut after = position.end.vs;
         let mut start = position.start.index;
@@ -96,15 +100,16 @@
         }

         Slice {
-            chars: &list[start..end],
+            bytes: &bytes[start..end],
             before,
             after,
         }
     }

     /// To do.
+    // To do: rename to `len`?
     pub fn size(&self) -> usize {
-        self.chars.len() + self.before + self.after
+        self.bytes.len() + self.before + self.after
     }

     // To do:
     // It could be tricky if a slice contains a BOM, right now it’s
     // to implement an `as_str`.

     /// To do.
-    pub fn head(&self) -> Option<char> {
+    pub fn head(&self) -> Option<u8> {
         if self.before > 0 {
-            Some(' ')
-        } else if self.chars.is_empty() {
+            Some(b' ')
+        } else if self.bytes.is_empty() {
             None
         } else {
-            Some(self.chars[0])
+            Some(self.bytes[0])
         }
     }

-    /// To do.
-    pub fn tail(&self) -> Option<char> {
-        if self.after > 0 {
-            Some(' ')
-        } else {
-            let index = self.chars.len();
-            if index > 0 {
-                Some(self.chars[index - 1])
-            } else {
-                None
-            }
-        }
-    }

     /// To do.
     pub fn serialize(&self) -> String {
@@ -144,7 +135,8 @@
             string.push(' ');
             index -= 1;
         }
-        string.push_str(&self.chars.iter().collect::<String>());
+        // To do: invalid UTF8?
+        string.push_str(std::str::from_utf8(self.bytes).unwrap());
         index = self.after;
         while index > 0 {
             string.push(' ');
-- cgit
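The bulk of the patch is the mechanical half of the refactor: `char` literals and ranges in match arms become `u8` byte literals (`b'a'`, `b'A'..=b'Z'`). For ASCII this is behavior-preserving, because UTF-8 continuation bytes are always `>= 0x80` and therefore miss every ASCII range. A minimal sketch of that pattern style over an optional byte, matching the shape of `tokenizer.current` (the `is_ascii_alphanumeric` helper here is illustrative, not part of the crate):

```rust
/// ASCII alphanumeric test over an optional byte, mirroring patterns such as
/// `Some(b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9')` in the constructs above.
fn is_ascii_alphanumeric(current: Option<u8>) -> bool {
    matches!(current, Some(b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9'))
}

fn main() {
    assert!(is_ascii_alphanumeric(Some(b'q')));
    assert!(!is_ascii_alphanumeric(Some(b'-')));
    assert!(!is_ascii_alphanumeric(None));
    // Non-ASCII input shows up as bytes >= 0x80, which miss every ASCII range.
    assert!(!is_ascii_alphanumeric(Some(0xE2)));
}
```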
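`partial_bom.rs` recognizes the UTF-8 byte order mark (`0xEF 0xBB 0xBF`) one byte per state, which is why it needs three `StateFn`s where the old `char`-based code in `document.rs` matched a single `'\u{FEFF}'`. A standalone sketch of the same three-step recognition, independent of the crate's `Tokenizer` machinery (the `Bom` enum and `feed` function are illustrative only):

```rust
/// Illustrative three-state recognizer for the UTF-8 BOM.
#[derive(Debug, PartialEq)]
enum Bom {
    Start, // expect 0xEF
    Two,   // expect 0xBB
    Three, // expect 0xBF
    Ok,    // full BOM seen
    Nok,   // not a BOM
}

fn feed(state: Bom, byte: u8) -> Bom {
    match (state, byte) {
        (Bom::Start, 0xEF) => Bom::Two,
        (Bom::Two, 0xBB) => Bom::Three,
        (Bom::Three, 0xBF) => Bom::Ok,
        _ => Bom::Nok,
    }
}

fn main() {
    // `'\u{FEFF}'` encodes to exactly these three bytes in UTF-8.
    let bytes = "\u{FEFF}hello".as_bytes();
    let mut state = Bom::Start;
    for &byte in &bytes[..3] {
        state = feed(state, byte);
    }
    assert_eq!(state, Bom::Ok);
}
```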
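`byte_action` also takes over line-ending normalization at the byte level: the `\r` of a CRLF pair is ignored (so CRLF collapses to a single `\n`), while a lone `\r` is passed through as `\n`. A sketch of that rule in isolation, where `None` stands in for `CharAction::Ignore` (the `normalized` helper is hypothetical, not the crate's API):

```rust
/// What `byte_action` does to carriage returns, in isolation.
fn normalized(bytes: &[u8], index: usize) -> Option<u8> {
    match bytes[index] {
        b'\r' if bytes.get(index + 1) == Some(&b'\n') => None, // CRLF: drop the CR
        b'\r' => Some(b'\n'),                                  // lone CR becomes LF
        byte => Some(byte),
    }
}

fn main() {
    assert_eq!(normalized(b"a\r\nb", 1), None);        // CR of CRLF is skipped
    assert_eq!(normalized(b"a\r\nb", 2), Some(b'\n')); // LF passes through
    assert_eq!(normalized(b"a\rb", 1), Some(b'\n'));   // lone CR becomes LF
}
```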
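The tab handling in `byte_action` emits the tab byte itself once and then `Insert`s virtual spaces until the next tab stop; `Point::vs` tracks how many of those have been produced so far. The arithmetic, extracted here under the crate's `TAB_SIZE` of 4 and 1-indexed columns:

```rust
const TAB_SIZE: usize = 4;

/// Virtual spaces still owed after consuming a tab at a 1-indexed column
/// (the same `remainder` arithmetic as in `byte_action`).
fn virtual_spaces_after_tab(column: usize) -> usize {
    let remainder = column % TAB_SIZE;
    if remainder == 0 {
        0
    } else {
        TAB_SIZE - remainder
    }
}

fn main() {
    // A tab at column 1 reaches the tab stop at column 5: three virtual spaces.
    assert_eq!(virtual_spaces_after_tab(1), 3);
    // A tab at column 4 is already one step from the stop: none needed.
    assert_eq!(virtual_spaces_after_tab(4), 0);
}
```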
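Finally, `Slice::serialize` now leans on `std::str::from_utf8(...).unwrap()`, which is only safe while every `Position` the tokenizer produces falls on a UTF-8 character boundary; the `// To do: invalid UTF8?` comment in the diff flags exactly that assumption. A sketch of the failure mode using only the standard library, plus a lossy alternative one could reach for if boundaries could not be guaranteed:

```rust
fn main() {
    let bytes = "a→b".as_bytes(); // '→' is 0xE2 0x86 0x92 in UTF-8

    // On a character boundary the bytes round-trip to `&str`.
    assert_eq!(std::str::from_utf8(&bytes[0..1]), Ok("a"));

    // Mid-character the conversion fails; the `unwrap` in `serialize`
    // would turn this into a panic.
    assert!(std::str::from_utf8(&bytes[0..2]).is_err());

    // A lossy conversion stays total, at the cost of U+FFFD replacements.
    assert_eq!(String::from_utf8_lossy(&bytes[0..2]), "a\u{FFFD}");
}
```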