From f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 28 Jul 2022 16:48:00 +0200 Subject: Refactor to work on `char`s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, a custom char implementation was used. This was easier to work with, as sometimes “virtual” characters are injected, or characters are ignored. This replaces that with working on actual `char`s. In the hope of in the future working on `u8`s, even. This simplifies the state machine somewhat, as only `\n` is fed, regardless of whether it was a CRLF, CR, or LF. It also feeds `' '` instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event. --- src/construct/html_text.rs | 161 ++++++++++++++++++++------------------------- 1 file changed, 70 insertions(+), 91 deletions(-) (limited to 'src/construct/html_text.rs') diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 3ac8d71..b1ad113 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -56,8 +56,9 @@ use crate::construct::partial_space_or_tab::space_or_tab; use crate::token::Token; -use crate::tokenizer::{Code, State, StateFn, Tokenizer}; -use crate::util::codes::parse; +use crate::tokenizer::{State, StateFn, Tokenizer}; + +const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '[']; /// Start of HTML (text) /// @@ -66,7 +67,7 @@ use crate::util::codes::parse; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if Code::Char('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { + if Some('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { tokenizer.enter(Token::HtmlText); tokenizer.enter(Token::HtmlTextData); tokenizer.consume(); @@ -88,19 +89,19 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('!') => { + Some('!') => { tokenizer.consume(); State::Fn(Box::new(declaration_open)) } - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(tag_close_start)) } - Code::Char('?') => { + Some('?') => { tokenizer.consume(); State::Fn(Box::new(instruction)) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } @@ -120,16 +121,15 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_open_inside)) } - Code::Char('[') => { + Some('[') => { tokenizer.consume(); - let buffer = parse("CDATA["); - State::Fn(Box::new(|t| cdata_open_inside(t, buffer, 0))) + State::Fn(Box::new(|t| cdata_open_inside(t, 0))) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(declaration)) } @@ -145,7 +145,7 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_start)) } @@ -168,8 +168,8 @@ fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('>') => State::Nok, - Code::Char('-') => { + None | Some('>') => State::Nok, + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_start_dash)) } @@ -192,7 +192,7 @@ fn comment_start(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('>') => State::Nok, + None | Some('>') => State::Nok, _ => comment(tokenizer), } } @@ -205,11 +205,9 @@ fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(comment)) - } - Code::Char('-') => { + None => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(comment)), + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_close)) } @@ -228,7 +226,7 @@ fn comment(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(end)) } @@ -242,17 +240,18 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State { /// > | a &<]]> b /// ^^^^^^ /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer, buffer: Vec, index: usize) -> State { - if tokenizer.current == buffer[index] { - tokenizer.consume(); +fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State { + match tokenizer.current { + Some(char) if char == CDATA_SEARCH[index] => { + tokenizer.consume(); - if index + 1 == buffer.len() { - State::Fn(Box::new(cdata)) - } else { - State::Fn(Box::new(move |t| cdata_open_inside(t, buffer, index + 1))) + if index + 1 == CDATA_SEARCH.len() { + State::Fn(Box::new(cdata)) + } else { + State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1))) + } } - } else { - State::Nok + _ => State::Nok, } } @@ -264,11 +263,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, buffer: Vec, index: usize) /// ``` fn cdata(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(cdata)) - } - Code::Char(']') => { + None => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(cdata)), + Some(']') => { tokenizer.consume(); State::Fn(Box::new(cdata_close)) } @@ -287,7 +284,7 @@ fn cdata(tokenizer: &mut Tokenizer) -> State { /// ``` fn cdata_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(']') => { + Some(']') => { tokenizer.consume(); State::Fn(Box::new(cdata_end)) } @@ -303,8 +300,8 @@ fn cdata_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn cdata_end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => end(tokenizer), - Code::Char(']') => cdata_close(tokenizer), + Some('>') => end(tokenizer), + Some(']') => cdata_close(tokenizer), _ => cdata(tokenizer), } } @@ -317,10 +314,8 @@ fn cdata_end(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('>') => end(tokenizer), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(declaration)) - } + None | Some('>') => end(tokenizer), + Some('\n') => at_line_ending(tokenizer, Box::new(declaration)), _ => { tokenizer.consume(); State::Fn(Box::new(declaration)) @@ -336,11 +331,9 @@ fn declaration(tokenizer: &mut Tokenizer) -> State { /// ``` fn instruction(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(instruction)) - } - Code::Char('?') => { + None => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(instruction)), + Some('?') => { tokenizer.consume(); State::Fn(Box::new(instruction_close)) } @@ -359,7 +352,7 @@ fn instruction(tokenizer: &mut Tokenizer) -> State { /// ``` fn instruction_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => end(tokenizer), + Some('>') => end(tokenizer), _ => instruction(tokenizer), } } @@ -372,7 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -388,7 +381,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -404,10 +397,8 @@ fn tag_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_close_between)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_close_between)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_close_between)) } @@ -423,13 +414,11 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer), + Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -442,18 +431,16 @@ fn tag_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_between)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_between)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_between)) } - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(end)) } - Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some(':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -469,7 +456,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -486,14 +473,12 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name_after)) } - Code::Char('=') => { + Some('=') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } @@ -510,19 +495,17 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + None | Some('<' | '=' | '>' | '`') => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } - Code::Char(char) if char == '"' || char == '\'' => { + Some(char) if char == '"' || char == '\'' => { tokenizer.consume(); State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, char))) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_unquoted)) } @@ -537,12 +520,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => at_line_ending( + None => State::Nok, + Some('\n') => at_line_ending( tokenizer, Box::new(move |t| tag_open_attribute_value_quoted(t, marker)), ), - Code::Char(char) if char == marker => { + Some(char) if char == marker => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_quoted_after)) } @@ -563,11 +546,9 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> S /// ``` fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => State::Nok, - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer), - Code::Char(_) => { + None | Some('"' | '\'' | '<' | '=' | '`') => State::Nok, + Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), + Some(_) => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_unquoted)) } @@ -583,9 +564,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '>' | '/') => tag_open_between(tokenizer), + Some('\t' | '\n' | ' ' | '>' | '/') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -598,7 +577,7 @@ fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.consume(); tokenizer.exit(Token::HtmlTextData); tokenizer.exit(Token::HtmlText); @@ -620,7 +599,7 @@ fn end(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_line_ending(tokenizer: &mut Tokenizer, return_state: Box) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.exit(Token::HtmlTextData); tokenizer.enter(Token::LineEnding); tokenizer.consume(); -- cgit