author    | Titus Wormer <tituswormer@gmail.com> | 2022-07-28 16:48:00 +0200
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-28 16:48:00 +0200
commit    | f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 (patch)
tree      | c1ac3f22473bd79566d835b2474d2ae9e00d6c55 /src/construct/html_flow.rs
parent    | d729b07712ca9cc91e68af1776dac9d7008a90cb (diff)
download  | markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.gz markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.bz2 markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.zip
Refactor to work on `char`s
Previously, a custom char implementation was used.
This was easier to work with, as sometimes “virtual” characters are injected,
or characters are ignored.
This replaces that with working on actual `char`s, in the hope of
eventually working on `u8`s.
This simplifies the state machine somewhat, as only `\n` is fed, regardless of
whether it was a CRLF, CR, or LF.
It also feeds `' '` instead of virtual spaces.
The BOM, if present, is now available as a `ByteOrderMark` event.
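
The gist of the change, as a minimal standalone sketch (the `Code` enum and the
two state-check functions below are illustrative stand-ins, not the crate's
real API):

```rust
// Before: a custom `Code` enum modelled EOF, CRLF, and "virtual" characters
// explicitly, and every state function had to match all of those variants.
#[allow(dead_code)]
#[derive(Debug)]
enum Code {
    None,                   // end of input
    CarriageReturnLineFeed, // CRLF fed as one unit
    VirtualSpace,           // e.g. a virtual space from an expanded tab
    Char(char),
}

// Illustrative "before": arms spell out every variant that means "line break
// or end of input".
fn is_break_old(current: &Code) -> bool {
    matches!(
        current,
        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
    )
}

// After: the tokenizer feeds plain `Option<char>` — `None` for EOF, a single
// '\n' regardless of whether the input had CRLF, CR, or LF, and ' ' instead
// of virtual spaces.
fn is_break_new(current: Option<char>) -> bool {
    matches!(current, None | Some('\n'))
}

fn main() {
    assert!(is_break_old(&Code::CarriageReturnLineFeed));
    assert!(is_break_new(None));
    assert!(is_break_new(Some('\n')));
}
```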
Diffstat (limited to 'src/construct/html_flow.rs')
-rw-r--r-- | src/construct/html_flow.rs | 224
1 file changed, 111 insertions, 113 deletions
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 24d6f98..238963d 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -105,8 +105,10 @@ use crate::construct::{
     partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
 };
 use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
-use crate::util::codes::{parse, serialize};
+use crate::tokenizer::{Point, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
+
+const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];

 /// Kind of HTML (flow).
 #[derive(Debug, PartialEq)]
@@ -168,17 +170,6 @@ impl QuoteKind {
             _ => unreachable!("invalid char"),
         }
     }
-    /// Turn [Code] into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `code` is not `Code::Char('"' | '\'')`.
-    fn from_code(code: Code) -> QuoteKind {
-        match code {
-            Code::Char(char) => QuoteKind::from_char(char),
-            _ => unreachable!("invalid code"),
-        }
-    }
 }

 /// State needed to parse HTML (flow).
@@ -190,9 +181,9 @@ struct Info {
     start_tag: bool,
     /// Used depending on `kind` to either collect all parsed characters, or to
     /// store expected characters.
-    buffer: Vec<Code>,
-    /// `index` into `buffer` when expecting certain characters.
-    index: usize,
+    start: Option<Point>,
+    /// Collected index, for various reasons.
+    size: usize,
     /// Current quote, when in a double or single quoted attribute value.
     quote: Option<QuoteKind>,
 }
@@ -234,7 +225,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
 /// ^
 /// ```
 fn before(tokenizer: &mut Tokenizer) -> State {
-    if Code::Char('<') == tokenizer.current {
+    if Some('<') == tokenizer.current {
         tokenizer.enter(Token::HtmlFlowData);
         tokenizer.consume();
         State::Fn(Box::new(open))
@@ -259,21 +250,22 @@ fn open(tokenizer: &mut Tokenizer) -> State {
         kind: Kind::Basic,
         // Assume closing tag (or no tag).
         start_tag: false,
-        buffer: vec![],
-        index: 0,
+        start: None,
+        size: 0,
         quote: None,
     };

     match tokenizer.current {
-        Code::Char('!') => {
+        Some('!') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| declaration_open(t, info)))
         }
-        Code::Char('/') => {
+        Some('/') => {
             tokenizer.consume();
+            info.start = Some(tokenizer.point.clone());
             State::Fn(Box::new(|t| tag_close_start(t, info)))
         }
-        Code::Char('?') => {
+        Some('?') => {
             info.kind = Kind::Instruction;
             tokenizer.consume();
             // Do not form containers.
@@ -282,8 +274,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {
             // right now, so we do need to search for `>`, similar to declarations.
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
-        Code::Char('A'..='Z' | 'a'..='z') => {
+        Some('A'..='Z' | 'a'..='z') => {
             info.start_tag = true;
+            info.start = Some(tokenizer.point.clone());
             tag_name(tokenizer, info)
         }
         _ => State::Nok,
@@ -302,19 +295,18 @@ fn open(tokenizer: &mut Tokenizer) -> State {
 /// ```
 fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::Char('-') => {
+        Some('-') => {
             tokenizer.consume();
             info.kind = Kind::Comment;
             State::Fn(Box::new(|t| comment_open_inside(t, info)))
         }
-        Code::Char('[') => {
+        Some('[') => {
             tokenizer.consume();
             info.kind = Kind::Cdata;
-            info.buffer = parse("CDATA[");
-            info.index = 0;
+            info.size = 0;
             State::Fn(Box::new(|t| cdata_open_inside(t, info)))
         }
-        Code::Char('A'..='Z' | 'a'..='z') => {
+        Some('A'..='Z' | 'a'..='z') => {
             tokenizer.consume();
             info.kind = Kind::Declaration;
             // Do not form containers.
@@ -333,7 +325,7 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('-') => {
+        Some('-') => {
             tokenizer.consume();
             // Do not form containers.
             tokenizer.concrete = true;
@@ -350,20 +342,21 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ^^^^^^
 /// ```
 fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
-    if tokenizer.current == info.buffer[info.index] {
-        info.index += 1;
-        tokenizer.consume();
+    match tokenizer.current {
+        Some(char) if char == CDATA_SEARCH[info.size] => {
+            info.size += 1;
+            tokenizer.consume();

-        if info.index == info.buffer.len() {
-            info.buffer.clear();
-            // Do not form containers.
-            tokenizer.concrete = true;
-            State::Fn(Box::new(|t| continuation(t, info)))
-        } else {
-            State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+            if info.size == CDATA_SEARCH.len() {
+                info.size = 0;
+                // Do not form containers.
+                tokenizer.concrete = true;
+                State::Fn(Box::new(|t| continuation(t, info)))
+            } else {
+                State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+            }
         }
-    } else {
-        State::Nok
+        _ => State::Nok,
     }
 }

@@ -373,11 +366,10 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// > | </x>
 /// ^
 /// ```
-fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('A'..='Z' | 'a'..='z') => {
+        Some('A'..='Z' | 'a'..='z') => {
             tokenizer.consume();
-            info.buffer.push(tokenizer.current);
             State::Fn(Box::new(|t| tag_name(t, info)))
         }
         _ => State::Nok,
@@ -394,22 +386,27 @@ fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::None
-        | Code::CarriageReturnLineFeed
-        | Code::VirtualSpace
-        | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
-            let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
-            let name = tag_name_buffer.as_str();
-            let slash = matches!(tokenizer.current, Code::Char('/'));
-
-            info.buffer.clear();
-
-            if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) {
+        None | Some('\t' | '\n' | ' ' | '/' | '>') => {
+            let slash = matches!(tokenizer.current, Some('/'));
+            let start = info.start.take().unwrap();
+            let name = Slice::from_position(
+                &tokenizer.parse_state.chars,
+                &Position {
+                    start: &start,
+                    end: &tokenizer.point,
+                },
+            )
+            .serialize()
+            .trim()
+            .to_lowercase();
+            println!("name: {:?}", name);
+
+            if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
                 info.kind = Kind::Raw;
                 // Do not form containers.
                 tokenizer.concrete = true;
                 continuation(tokenizer, info)
-            } else if HTML_BLOCK_NAMES.contains(&name) {
+            } else if HTML_BLOCK_NAMES.contains(&name.as_str()) {
                 // Basic is assumed, no need to set `kind`.
                 if slash {
                     tokenizer.consume();
@@ -432,12 +429,11 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
                 }
             }
         }
-        Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+        Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
             tokenizer.consume();
-            info.buffer.push(tokenizer.current);
             State::Fn(Box::new(|t| tag_name(t, info)))
         }
-        Code::Char(_) => State::Nok,
+        Some(_) => State::Nok,
     }
 }

@@ -449,7 +445,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.consume();
             // Do not form containers.
             tokenizer.concrete = true;
@@ -467,7 +463,7 @@ fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+        Some('\t' | ' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_closing_tag_after(t, info)))
         }
@@ -496,15 +492,15 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('/') => {
+        Some('/') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_end(t, info)))
         }
-        Code::Char('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
+        Some('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name(t, info)))
         }
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+        Some('\t' | ' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
         }
@@ -524,7 +520,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
+        Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name(t, info)))
         }
@@ -543,11 +539,11 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('=') => {
+        Some('=') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
         }
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+        Some('\t' | ' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))
         }
@@ -566,13 +562,13 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok,
-        Code::Char('"' | '\'') => {
+        None | Some('<' | '=' | '>' | '`') => State::Nok,
+        Some(char) if matches!(char, '"' | '\'') => {
+            info.quote = Some(QuoteKind::from_char(char));
             tokenizer.consume();
-            info.quote = Some(QuoteKind::from_code(tokenizer.current));
             State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
         }
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+        Some('\t' | ' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
         }
@@ -590,8 +586,8 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Nok,
-        Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => {
+        None | Some('\n') => State::Nok,
+        Some(char) if char == info.quote.as_ref().unwrap().as_char() => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))
         }
@@ -610,13 +606,10 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None
-        | Code::CarriageReturnLineFeed
-        | Code::VirtualSpace
-        | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
+        None | Some('\t' | '\n' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
             complete_attribute_name_after(tokenizer, info)
         }
-        Code::Char(_) => {
+        Some(_) => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_attribute_value_unquoted(t, info)))
         }
@@ -632,9 +625,7 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
-            complete_attribute_name_before(tokenizer, info)
-        }
+        Some('\t' | ' ' | '/' | '>') => complete_attribute_name_before(tokenizer, info),
         _ => State::Nok,
     }
 }
@@ -647,7 +638,7 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_after(t, info)))
         }
@@ -663,16 +654,16 @@ fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             // Do not form containers.
             tokenizer.concrete = true;
             continuation(tokenizer, info)
         }
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+        Some('\t' | ' ') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| complete_after(t, info)))
         }
-        Code::Char(_) => State::Nok,
+        Some(_) => State::Nok,
     }
 }

@@ -684,29 +675,27 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('-') if info.kind == Kind::Comment => {
+        Some('-') if info.kind == Kind::Comment => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_comment_inside(t, info)))
         }
-        Code::Char('<') if info.kind == Kind::Raw => {
+        Some('<') if info.kind == Kind::Raw => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_raw_tag_open(t, info)))
         }
-        Code::Char('>') if info.kind == Kind::Declaration => {
+        Some('>') if info.kind == Kind::Declaration => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_close(t, info)))
         }
-        Code::Char('?') if info.kind == Kind::Instruction => {
+        Some('?') if info.kind == Kind::Instruction => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
        }
-        Code::Char(']') if info.kind == Kind::Cdata => {
+        Some(']') if info.kind == Kind::Cdata => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))
         }
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
-            if info.kind == Kind::Basic || info.kind == Kind::Complete =>
-        {
+        Some('\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
             tokenizer.exit(Token::HtmlFlowData);
             tokenizer.check(blank_line_before, |ok| {
                 if ok {
@@ -716,7 +705,7 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
                 }
             })(tokenizer)
         }
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::HtmlFlowData);
             continuation_start(tokenizer, info)
         }
@@ -753,7 +742,7 @@ fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        Some('\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
@@ -772,9 +761,7 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            continuation_start(tokenizer, info)
-        }
+        None | Some('\n') => continuation_start(tokenizer, info),
         _ => {
             tokenizer.enter(Token::HtmlFlowData);
             continuation(tokenizer, info)
@@ -790,7 +777,7 @@ fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('-') => {
+        Some('-') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
@@ -804,10 +791,11 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// > | <script>console.log(1)</script>
 /// ^
 /// ```
-fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State {
+fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::Char('/') => {
+        Some('/') => {
             tokenizer.consume();
+            info.start = Some(tokenizer.point.clone());
             State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
         }
         _ => continuation(tokenizer, info),
@@ -822,24 +810,34 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
-            let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
-            info.buffer.clear();
-
-            if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) {
+        Some('>') => {
+            info.size = 0;
+
+            let start = info.start.take().unwrap();
+            let name = Slice::from_position(
+                &tokenizer.parse_state.chars,
+                &Position {
+                    start: &start,
+                    end: &tokenizer.point,
+                },
+            )
+            .serialize()
+            .to_lowercase();
+
+            if HTML_RAW_NAMES.contains(&name.as_str()) {
                 tokenizer.consume();
                 State::Fn(Box::new(|t| continuation_close(t, info)))
             } else {
                 continuation(tokenizer, info)
             }
         }
-        Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => {
+        Some('A'..='Z' | 'a'..='z') if info.size < HTML_RAW_SIZE_MAX => {
             tokenizer.consume();
-            info.buffer.push(tokenizer.current);
+            info.size += 1;
             State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
         }
         _ => {
-            info.buffer.clear();
+            info.size = 0;
             continuation(tokenizer, info)
         }
     }
@@ -853,7 +851,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
 /// ```
 fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char(']') => {
+        Some(']') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
@@ -877,11 +875,11 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char('>') => {
+        Some('>') => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_close(t, info)))
         }
-        Code::Char('-') if info.kind == Kind::Comment => {
+        Some('-') if info.kind == Kind::Comment => {
             tokenizer.consume();
             State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
         }
@@ -897,7 +895,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::HtmlFlowData);
             continuation_after(tokenizer)
         }
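
A note on the buffer removal visible above: instead of pushing matched characters
into `info.buffer`, the new code remembers a start `Point` and later slices the
source between that point and the current one. A rough standalone sketch of that
idea, assuming a simplified `Point` and a hypothetical `slice_between` helper
standing in for the crate's `Point`, `Position`, and `Slice` types:

```rust
// Simplified stand-in for the tokenizer's `Point`: only the absolute index
// into the character buffer matters for this sketch.
#[derive(Clone, Debug)]
struct Point {
    index: usize,
}

// Rather than collecting characters while parsing, record where the tag name
// starts and serialize the span once it ends (roughly what
// `Slice::from_position(..).serialize()` does in the real code).
fn slice_between(chars: &[char], start: &Point, end: &Point) -> String {
    chars[start.index..end.index].iter().collect()
}

fn main() {
    let chars: Vec<char> = "<script>".chars().collect();
    let start = Point { index: 1 }; // recorded after consuming `<`
    let end = Point { index: 7 };   // current position at `>`
    let name = slice_between(&chars, &start, &end).to_lowercase();
    assert_eq!(name, "script");
}
```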