From 0eeff9148e327183e532752f46421a75506dd7a6 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 29 Jul 2022 18:22:59 +0200 Subject: Refactor to improve states * Remove custom kind wrappers, use plain bytes instead * Remove `Into`s, use the explicit expected types instead * Refactor to use `slice.as_str` in most places * Remove unneeded unique check before adding a definition * Use a shared CDATA prefix in constants * Inline byte checks into matches * Pass bytes back from parser instead of whole parse state * Refactor to work more often on bytes * Rename custom `size` to `len` --- src/construct/html_text.rs | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'src/construct/html_text.rs') diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index f10a476..51beda5 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -54,12 +54,11 @@ //! [html_flow]: crate::construct::html_flow //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +use crate::constant::HTML_CDATA_PREFIX; use crate::construct::partial_space_or_tab::space_or_tab; use crate::token::Token; use crate::tokenizer::{State, StateFn, Tokenizer}; -const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'[']; - /// Start of HTML (text) /// /// ```markdown @@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(instruction)) } + // ASCII alphabetical. Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) @@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(comment_open_inside)) } - Some(b'[') => { - tokenizer.consume(); - State::Fn(Box::new(|t| cdata_open_inside(t, 0))) - } + // ASCII alphabetical. 
Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(declaration)) } + Some(b'[') => { + tokenizer.consume(); + State::Fn(Box::new(|t| cdata_open_inside(t, 0))) + } _ => State::Nok, } } @@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State { /// > | a &<]]> b /// ^^^^^^ /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State { - match tokenizer.current { - Some(byte) if byte == CDATA_SEARCH[index] => { - tokenizer.consume(); +fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State { + if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) { + tokenizer.consume(); - if index + 1 == CDATA_SEARCH.len() { - State::Fn(Box::new(cdata)) - } else { - State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1))) - } + if size + 1 == HTML_CDATA_PREFIX.len() { + State::Fn(Box::new(cdata)) + } else { + State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1))) } - _ => State::Nok, + } else { + State::Nok } } @@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphabetical. Some(b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) @@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphanumerical and `-`. Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) @@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphanumerical and `-`. 
Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) @@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(end)) } + // ASCII alphabetical and `:` and `_`. Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) @@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { + // ASCII alphanumerical and `-`, `.`, `:`, and `_`. Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) @@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } - Some(byte) if byte == b'"' || byte == b'\'' => { + Some(b'"' | b'\'') => { + let marker = tokenizer.current.unwrap(); tokenizer.consume(); - State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte))) + State::Fn(Box::new(move |t| { + tag_open_attribute_value_quoted(t, marker) + })) } Some(_) => { tokenizer.consume(); @@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta tokenizer, Box::new(move |t| tag_open_attribute_value_quoted(t, marker)), ), - Some(byte) if byte == marker => { + Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_quoted_after)) } -- cgit