| field | value | date |
|---|---|---|
| author | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200 |
| committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200 |
| commit | 0eeff9148e327183e532752f46421a75506dd7a6 (patch) | |
| tree | 4f0aed04f90aa759ce96a2e87aa719e7fa95c450 /src/construct/html_text.rs | |
| parent | 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff) | |
Refactor to improve states
* Remove custom kind wrappers, use plain bytes instead
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants (see the sketch after this list)
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
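
The shared CDATA prefix and the byte-at-a-time matching style can be pictured with a minimal, self-contained sketch. It assumes `HTML_CDATA_PREFIX` holds the bytes of `CDATA[`, as the removed `CDATA_SEARCH` constant did; the `Cursor` harness and the direct recursion are stand-ins for illustration only, since the real tokenizer threads its position through boxed `State::Fn` continuations.

```rust
// Hypothetical stand-in for the shared constant; in the crate it lives in
// `crate::constant` and replaces the local `CDATA_SEARCH` array.
const HTML_CDATA_PREFIX: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];

// Tiny cursor harness, only here to make the sketch runnable; the real code
// works on the crate's `Tokenizer`.
struct Cursor<'a> {
    bytes: &'a [u8],
    index: usize,
}

impl Cursor<'_> {
    fn current(&self) -> Option<u8> {
        self.bytes.get(self.index).copied()
    }
    fn consume(&mut self) {
        self.index += 1;
    }
}

/// Match the `CDATA[` prefix one byte per call, like `cdata_open_inside`:
/// `size` counts how many prefix bytes have already been seen.
fn cdata_open_inside(cursor: &mut Cursor, size: usize) -> bool {
    if cursor.current() == Some(HTML_CDATA_PREFIX[size]) {
        cursor.consume();
        if size + 1 == HTML_CDATA_PREFIX.len() {
            true // whole prefix matched
        } else {
            // The real state machine returns `State::Fn(Box::new(...))` here
            // instead of recursing directly.
            cdata_open_inside(cursor, size + 1)
        }
    } else {
        false
    }
}

fn main() {
    let mut cursor = Cursor { bytes: b"CDATA[>&<]]>", index: 0 };
    assert!(cdata_open_inside(&mut cursor, 0));
    assert_eq!(cursor.index, HTML_CDATA_PREFIX.len());
}
```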
Diffstat

-rw-r--r-- src/construct/html_text.rs | 46
1 file changed, 27 insertions, 19 deletions
```diff
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index f10a476..51beda5 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -54,12 +54,11 @@
 //! [html_flow]: crate::construct::html_flow
 //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
 
+use crate::constant::HTML_CDATA_PREFIX;
 use crate::construct::partial_space_or_tab::space_or_tab;
 use crate::token::Token;
 use crate::tokenizer::{State, StateFn, Tokenizer};
 
-const CDATA_SEARCH: [u8; 6] = [b'C', b'D', b'A', b'T', b'A', b'['];
-
 /// Start of HTML (text)
 ///
 /// ```markdown
@@ -101,6 +100,7 @@ fn open(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(instruction))
         }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open))
@@ -125,14 +125,15 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(comment_open_inside))
         }
-        Some(b'[') => {
-            tokenizer.consume();
-            State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
-        }
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(declaration))
         }
+        Some(b'[') => {
+            tokenizer.consume();
+            State::Fn(Box::new(|t| cdata_open_inside(t, 0)))
+        }
         _ => State::Nok,
     }
 }
@@ -240,18 +241,17 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State {
 /// > | a <![CDATA[>&<]]> b
 ///          ^^^^^^
 /// ```
-fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State {
-    match tokenizer.current {
-        Some(byte) if byte == CDATA_SEARCH[index] => {
-            tokenizer.consume();
+fn cdata_open_inside(tokenizer: &mut Tokenizer, size: usize) -> State {
+    if tokenizer.current == Some(HTML_CDATA_PREFIX[size]) {
+        tokenizer.consume();
 
-            if index + 1 == CDATA_SEARCH.len() {
-                State::Fn(Box::new(cdata))
-            } else {
-                State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1)))
-            }
+        if size + 1 == HTML_CDATA_PREFIX.len() {
+            State::Fn(Box::new(cdata))
+        } else {
+            State::Fn(Box::new(move |t| cdata_open_inside(t, size + 1)))
         }
-        _ => State::Nok,
+    } else {
+        State::Nok
     }
 }
 
@@ -365,6 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphabetical.
         Some(b'A'..=b'Z' | b'a'..=b'z') => {
            tokenizer.consume();
            State::Fn(Box::new(tag_close))
@@ -381,6 +382,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_close(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_close))
@@ -414,6 +416,7 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_open(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphanumerical and `-`.
         Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open))
@@ -440,6 +443,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(end))
         }
+        // ASCII alphabetical and `:` and `_`.
         Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_name))
@@ -456,6 +460,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
+        // ASCII alphabetical and `-`, `.`, `:`, and `_`.
         Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_name))
@@ -501,9 +506,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_value_before))
         }
-        Some(byte) if byte == b'"' || byte == b'\'' => {
+        Some(b'"' | b'\'') => {
+            let marker = tokenizer.current.unwrap();
             tokenizer.consume();
-            State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, byte)))
+            State::Fn(Box::new(move |t| {
+                tag_open_attribute_value_quoted(t, marker)
+            }))
         }
         Some(_) => {
             tokenizer.consume();
@@ -525,7 +533,7 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: u8) -> Sta
             tokenizer,
             Box::new(move |t| tag_open_attribute_value_quoted(t, marker)),
         ),
-        Some(byte) if byte == marker => {
+        Some(b'"' | b'\'') if tokenizer.current.unwrap() == marker => {
             tokenizer.consume();
             State::Fn(Box::new(tag_open_attribute_value_quoted_after))
         }
```
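
A recurring pattern in the hunks above is worth a note: rather than binding the byte in the pattern and comparing it in a guard (`Some(byte) if byte == b'"' || byte == b'\''`), the new code matches the byte literals directly and re-reads `tokenizer.current` when the concrete marker is needed. A small standalone sketch of the two equivalent forms, with hypothetical function names:

```rust
/// Old style: bind the byte in the pattern, test it in a guard.
fn quote_marker_guard(current: Option<u8>) -> Option<u8> {
    match current {
        Some(byte) if byte == b'"' || byte == b'\'' => Some(byte),
        _ => None,
    }
}

/// New style: match the literals, then take the byte that is known to be there.
fn quote_marker_literal(current: Option<u8>) -> Option<u8> {
    match current {
        Some(b'"' | b'\'') => {
            let marker = current.unwrap();
            Some(marker)
        }
        _ => None,
    }
}

fn main() {
    // Both forms accept exactly the two quote markers and reject everything else.
    for input in [Some(b'"'), Some(b'\''), Some(b'x'), None] {
        assert_eq!(quote_marker_guard(input), quote_marker_literal(input));
    }
}
```

The two forms behave the same; the literal patterns read like the other byte matches in the file, and guards remain only where a previously captured `marker` has to be compared, as in `tag_open_attribute_value_quoted`.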