Diffstat (limited to 'src/construct')
28 files changed, 540 insertions, 614 deletions
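The change that runs through all 28 files is the input representation: constructs no longer match on the tokenizer's `Code` enum but on a plain `Option<char>`. The enum itself was defined outside `src/construct`, so the sketch below infers its old shape from the removed match arms rather than quoting the real definition:

```rust
// Old input item, as implied by removed arms such as
// `Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')`.
#[allow(dead_code)]
enum Code {
    None,                   // end of input (eof)
    CarriageReturnLineFeed, // a "\r\n" pair treated as one unit
    VirtualSpace,           // filler column produced by expanding a tab
    Char(char),             // any other character
}

// New input item: `tokenizer.current` is an `Option<char>`, with `None` as
// eof. Line endings reach constructs as a plain '\n', and virtual spaces no
// longer appear at all ('\t' is matched directly), which is why the old
// three-way arms collapse into `None | Some('\n')` throughout the hunks below.
type Current = Option<char>;
```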
diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 27d7544..65c2f6f 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -52,8 +52,9 @@ //! [html-strong]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-strong-element use crate::token::Token; -use crate::tokenizer::{Code, Event, EventType, Point, State, Tokenizer}; +use crate::tokenizer::{Event, EventType, Point, State, Tokenizer}; use crate::unicode::PUNCTUATION; +use crate::util::slice::Slice; /// Character code kinds. #[derive(Debug, PartialEq)] @@ -128,17 +129,6 @@ impl MarkerKind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind. - /// - /// ## Panics - /// - /// Panics if `code` is not `Code::Char('*' | '_')`. - fn from_code(code: Code) -> MarkerKind { - match code { - Code::Char(char) => MarkerKind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// Attention sequence that we can take markers from. @@ -170,9 +160,9 @@ struct Sequence { /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('*' | '_') if tokenizer.parse_state.constructs.attention => { + Some(char) if tokenizer.parse_state.constructs.attention && matches!(char, '*' | '_') => { tokenizer.enter(Token::AttentionSequence); - inside(tokenizer, MarkerKind::from_code(tokenizer.current)) + inside(tokenizer, MarkerKind::from_char(char)) } _ => State::Nok, } } @@ -185,23 +175,20 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^^ /// ``` fn inside(tokenizer: &mut Tokenizer, marker: MarkerKind) -> State { - match tokenizer.current { - Code::Char(char) if char == marker.as_char() => { - tokenizer.consume(); - State::Fn(Box::new(move |t| inside(t, marker))) - } - _ => { - tokenizer.exit(Token::AttentionSequence); - tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); - State::Ok - } + if tokenizer.current == Some(marker.as_char()) { + tokenizer.consume(); + State::Fn(Box::new(move |t| inside(t, marker))) + } else { + tokenizer.exit(Token::AttentionSequence); + tokenizer.register_resolver("attention".to_string(), Box::new(resolve_attention)); + State::Ok } } /// Resolve attention sequences. #[allow(clippy::too_many_lines)] fn resolve_attention(tokenizer: &mut Tokenizer) { - let codes = &tokenizer.parse_state.codes; + let chars = &tokenizer.parse_state.chars; let mut start = 0; let mut balance = 0; let mut sequences = vec![]; @@ -216,17 +203,21 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { if enter.token_type == Token::AttentionSequence { let end = start + 1; let exit = &tokenizer.events[end]; - let marker = MarkerKind::from_code(codes[enter.point.index]); + let marker = + MarkerKind::from_char(Slice::from_point(chars, &enter.point).head().unwrap()); let before = classify_character(if enter.point.index > 0 { - codes[enter.point.index - 1] - } else { - Code::None - }); - let after = classify_character(if exit.point.index < codes.len() { - codes[exit.point.index] + Slice::from_point( + chars, + &Point { + index: enter.point.index - 1, + ..enter.point + }, + ) + .tail() } else { - Code::None + None }); + let after = classify_character(Slice::from_point(chars, &exit.point).tail()); let open = after == GroupKind::Other || (after == GroupKind::Punctuation && before != GroupKind::Other); // To do: GFM strikethrough?
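The `before`/`after` classification at the end of this hunk is what decides whether a sequence may open or close, and eof is now literally `None`, so the whitespace fallback needs no dedicated arm. A runnable miniature of `classify_character` (rewritten in full in the next hunk), with the `PUNCTUATION` table stubbed down to just the two attention markers so the example stands alone:

```rust
#[derive(Debug, PartialEq)]
enum GroupKind {
    Whitespace,
    Punctuation,
    Other,
}

// Same shape as the rewritten classify_character; the stubbed two-entry
// table is an assumption made for this self-contained sketch.
fn classify_character(char: Option<char>) -> GroupKind {
    match char {
        None => GroupKind::Whitespace,
        Some(char) if char.is_whitespace() => GroupKind::Whitespace,
        Some(char) if ['*', '_'].contains(&char) => GroupKind::Punctuation,
        Some(_) => GroupKind::Other,
    }
}

fn main() {
    assert_eq!(classify_character(None), GroupKind::Whitespace); // eof
    assert_eq!(classify_character(Some('\n')), GroupKind::Whitespace);
    assert_eq!(classify_character(Some('*')), GroupKind::Punctuation);
    assert_eq!(classify_character(Some('a')), GroupKind::Other);
}
```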
@@ -326,9 +317,9 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { let sequence_close = &mut sequences[close]; let close_event_index = sequence_close.event_index; let seq_close_enter = sequence_close.start_point.clone(); + // No need to worry about `VS`, because sequences are only actual characters. sequence_close.size -= take; sequence_close.start_point.column += take; - sequence_close.start_point.offset += take; sequence_close.start_point.index += take; let seq_close_exit = sequence_close.start_point.clone(); @@ -352,9 +343,9 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { let sequence_open = &mut sequences[open]; let open_event_index = sequence_open.event_index; let seq_open_exit = sequence_open.end_point.clone(); + // No need to worry about `VS`, because sequences are only actual characters. sequence_open.size -= take; sequence_open.end_point.column -= take; - sequence_open.end_point.offset -= take; sequence_open.end_point.index -= take; let seq_open_enter = sequence_open.end_point.clone(); @@ -492,20 +483,20 @@ fn resolve_attention(tokenizer: &mut Tokenizer) { /// Used for attention (emphasis, strong), whose sequences can open or close /// based on the class of surrounding characters. /// -/// > 👉 **Note** that eof (`Code::None`) is seen as whitespace. +/// > 👉 **Note** that eof (`None`) is seen as whitespace. /// /// ## References /// /// * [`micromark-util-classify-character` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-util-classify-character/dev/index.js) -fn classify_character(code: Code) -> GroupKind { - match code { +fn classify_character(char: Option<char>) -> GroupKind { + match char { // Custom characters. - Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace => GroupKind::Whitespace, + None => GroupKind::Whitespace, // Unicode whitespace. - Code::Char(char) if char.is_whitespace() => GroupKind::Whitespace, + Some(char) if char.is_whitespace() => GroupKind::Whitespace, // Unicode punctuation. - Code::Char(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation, + Some(char) if PUNCTUATION.contains(&char) => GroupKind::Punctuation, // Everything else. - Code::Char(_) => GroupKind::Other, + Some(_) => GroupKind::Other, } } diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 3933596..399570b 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -103,7 +103,7 @@ use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX}; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of an autolink.
/// @@ -115,7 +115,7 @@ use crate::tokenizer::{Code, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('<') if tokenizer.parse_state.constructs.autolink => { + Some('<') if tokenizer.parse_state.constructs.autolink => { tokenizer.enter(Token::Autolink); tokenizer.enter(Token::AutolinkMarker); tokenizer.consume(); @@ -137,11 +137,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(char) if char.is_ascii_alphabetic() => { + Some(char) if char.is_ascii_alphabetic() => { tokenizer.consume(); State::Fn(Box::new(scheme_or_email_atext)) } - Code::Char(char) if is_ascii_atext(char) => email_atext(tokenizer), + Some(char) if is_ascii_atext(char) => email_atext(tokenizer), _ => State::Nok, } } @@ -156,7 +156,7 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { scheme_inside_or_email_atext(tokenizer, 1) } _ => email_atext(tokenizer), @@ -173,11 +173,11 @@ fn scheme_or_email_atext(tokenizer: &mut Tokenizer) -> State { /// ``` fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Code::Char(':') => { + Some(':') => { tokenizer.consume(); State::Fn(Box::new(url_inside)) } - Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') + Some('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') if size < AUTOLINK_SCHEME_SIZE_MAX => { tokenizer.consume(); @@ -195,15 +195,13 @@ fn scheme_inside_or_email_atext(tokenizer: &mut Tokenizer, size: usize) -> State /// ``` fn url_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.exit(Token::AutolinkProtocol); end(tokenizer) } - Code::Char(char) if char.is_ascii_control() => State::Nok, - Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ') => { - State::Nok - } - Code::Char(_) => { + Some(char) if char.is_ascii_control() => State::Nok, + None | Some(' ') => State::Nok, + Some(_) => { tokenizer.consume(); State::Fn(Box::new(url_inside)) } @@ -218,11 +216,11 @@ fn url_inside(tokenizer: &mut Tokenizer) -> State { /// ``` fn email_atext(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('@') => { + Some('@') => { tokenizer.consume(); State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) } - Code::Char(char) if is_ascii_atext(char) => { + Some(char) if is_ascii_atext(char) => { tokenizer.consume(); State::Fn(Box::new(email_atext)) } @@ -238,7 +236,7 @@ fn email_atext(tokenizer: &mut Tokenizer) -> State { /// ``` fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Code::Char(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, size), + Some(char) if char.is_ascii_alphanumeric() => email_value(tokenizer, size), _ => State::Nok, } } @@ -251,11 +249,11 @@ fn email_at_sign_or_dot(tokenizer: &mut Tokenizer, size: usize) -> State { /// ``` fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Code::Char('.') => { + Some('.') => { tokenizer.consume(); State::Fn(Box::new(|t| email_at_sign_or_dot(t, 0))) } - Code::Char('>') => { + Some('>') => { let index = tokenizer.events.len(); 
tokenizer.exit(Token::AutolinkProtocol); // Change the token type. @@ -277,11 +275,11 @@ fn email_label(tokenizer: &mut Tokenizer, size: usize) -> State { /// ``` fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => { + Some('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| email_value(t, size + 1))) } - Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { + Some(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| email_label(t, size + 1))) } @@ -299,7 +297,7 @@ fn email_value(tokenizer: &mut Tokenizer, size: usize) -> State { /// ``` fn end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.enter(Token::AutolinkMarker); tokenizer.consume(); tokenizer.exit(Token::AutolinkMarker); diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 537ffc1..6780f40 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -33,7 +33,7 @@ //! [flow]: crate::content::flow use crate::construct::partial_space_or_tab::space_or_tab; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of a blank line. /// @@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok, + None | Some('\n') => State::Ok, _ => State::Nok, } } diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs index 3bb4b8b..49a0ea0 100644 --- a/src/construct/block_quote.rs +++ b/src/construct/block_quote.rs @@ -36,7 +36,7 @@ use crate::constant::TAB_SIZE; use crate::construct::partial_space_or_tab::space_or_tab_min_max; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of block quote. 
/// @@ -65,7 +65,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.enter(Token::BlockQuote); cont_before(tokenizer) } @@ -98,7 +98,7 @@ pub fn cont(tokenizer: &mut Tokenizer) -> State { /// ``` fn cont_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.enter(Token::BlockQuotePrefix); tokenizer.enter(Token::BlockQuoteMarker); tokenizer.consume(); @@ -118,17 +118,14 @@ fn cont_before(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn cont_after(tokenizer: &mut Tokenizer) -> State { - match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.enter(Token::SpaceOrTab); - tokenizer.consume(); - tokenizer.exit(Token::SpaceOrTab); - tokenizer.exit(Token::BlockQuotePrefix); - State::Ok - } - _ => { - tokenizer.exit(Token::BlockQuotePrefix); - State::Ok - } + if let Some('\t' | ' ') = tokenizer.current { + tokenizer.enter(Token::SpaceOrTab); + tokenizer.consume(); + tokenizer.exit(Token::SpaceOrTab); + tokenizer.exit(Token::BlockQuotePrefix); + State::Ok + } else { + tokenizer.exit(Token::BlockQuotePrefix); + State::Ok } } diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index 9e9b713..e9263af 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -34,7 +34,7 @@ //! [hard_break_escape]: crate::construct::hard_break_escape use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of a character escape. /// @@ -44,7 +44,7 @@ use crate::tokenizer::{Code, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('\\') if tokenizer.parse_state.constructs.character_escape => { + Some('\\') if tokenizer.parse_state.constructs.character_escape => { tokenizer.enter(Token::CharacterEscape); tokenizer.enter(Token::CharacterEscapeMarker); tokenizer.consume(); @@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(char) if char.is_ascii_punctuation() => { + Some(char) if char.is_ascii_punctuation() => { tokenizer.enter(Token::CharacterEscapeValue); tokenizer.consume(); tokenizer.exit(Token::CharacterEscapeValue); diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 8521f15..59043d1 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -66,7 +66,8 @@ use crate::constant::{ CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, }; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{Point, State, Tokenizer}; +use crate::util::slice::{Position, Slice}; /// Kind of a character reference. #[derive(Debug, Clone, PartialEq)] @@ -120,8 +121,10 @@ impl Kind { /// State needed to parse character references. #[derive(Debug, Clone)] struct Info { - /// All parsed characters. - buffer: String, + /// Place of value start. + start: Point, + /// Size of value. + size: usize, /// Kind of character reference. 
kind: Kind, } @@ -138,7 +141,7 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('&') if tokenizer.parse_state.constructs.character_reference => { + Some('&') if tokenizer.parse_state.constructs.character_reference => { tokenizer.enter(Token::CharacterReference); tokenizer.enter(Token::CharacterReferenceMarker); tokenizer.consume(); @@ -161,18 +164,21 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn open(tokenizer: &mut Tokenizer) -> State { - let info = Info { - buffer: String::new(), - kind: Kind::Named, - }; - if let Code::Char('#') = tokenizer.current { + if let Some('#') = tokenizer.current { tokenizer.enter(Token::CharacterReferenceMarkerNumeric); tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarkerNumeric); - State::Fn(Box::new(|t| numeric(t, info))) + State::Fn(Box::new(numeric)) } else { tokenizer.enter(Token::CharacterReferenceValue); - value(tokenizer, info) + value( + tokenizer, + Info { + start: tokenizer.point.clone(), + size: 0, + kind: Kind::Named, + }, + ) } } @@ -185,17 +191,25 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// > | a	b /// ^ /// ``` -fn numeric(tokenizer: &mut Tokenizer, mut info: Info) -> State { - if let Code::Char('x' | 'X') = tokenizer.current { +fn numeric(tokenizer: &mut Tokenizer) -> State { + if let Some('x' | 'X') = tokenizer.current { tokenizer.enter(Token::CharacterReferenceMarkerHexadecimal); tokenizer.consume(); tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal); tokenizer.enter(Token::CharacterReferenceValue); - info.kind = Kind::Hexadecimal; + let info = Info { + start: tokenizer.point.clone(), + size: 0, + kind: Kind::Hexadecimal, + }; State::Fn(Box::new(|t| value(t, info))) } else { tokenizer.enter(Token::CharacterReferenceValue); - info.kind = Kind::Decimal; + let info = Info { + start: tokenizer.point.clone(), + size: 0, + kind: Kind::Decimal, + }; value(tokenizer, info) } } @@ -215,24 +229,32 @@ fn numeric(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char(';') if !info.buffer.is_empty() => { - let unknown_named = Kind::Named == info.kind - && !CHARACTER_REFERENCES.iter().any(|d| d.0 == info.buffer); + Some(';') if info.size > 0 => { + if Kind::Named == info.kind { + let value = Slice::from_position( + &tokenizer.parse_state.chars, + &Position { + start: &info.start, + end: &tokenizer.point, + }, + ) + .serialize(); - if unknown_named { - State::Nok - } else { - tokenizer.exit(Token::CharacterReferenceValue); - tokenizer.enter(Token::CharacterReferenceMarkerSemi); - tokenizer.consume(); - tokenizer.exit(Token::CharacterReferenceMarkerSemi); - tokenizer.exit(Token::CharacterReference); - State::Ok + if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) { + return State::Nok; + } } + + tokenizer.exit(Token::CharacterReferenceValue); + tokenizer.enter(Token::CharacterReferenceMarkerSemi); + tokenizer.consume(); + tokenizer.exit(Token::CharacterReferenceMarkerSemi); + tokenizer.exit(Token::CharacterReference); + State::Ok } - Code::Char(char) => { - if info.buffer.len() < info.kind.max() && info.kind.allowed(char) { - info.buffer.push(char); + Some(char) => { + if info.size < info.kind.max() && info.kind.allowed(char) { + info.size += 1; tokenizer.consume(); State::Fn(Box::new(|t| value(t, info))) } else { diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 2fea95e..98fa54f 100644 --- 
a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -107,8 +107,8 @@ use crate::construct::{ partial_space_or_tab::{space_or_tab, space_or_tab_min_max}, }; use crate::token::Token; -use crate::tokenizer::{Code, ContentType, State, Tokenizer}; -use crate::util::span::from_exit_event; +use crate::tokenizer::{ContentType, State, Tokenizer}; +use crate::util::slice::{Position, Slice}; /// Kind of fences. #[derive(Debug, Clone, PartialEq)] @@ -155,17 +155,6 @@ impl Kind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind. - /// - /// ## Panics - /// - /// Panics if `code` is not ``Code::Char('~' | '`')``. - fn from_code(code: Code) -> Kind { - match code { - Code::Char(char) => Kind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// State needed to parse code (fenced). @@ -217,20 +206,23 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { if let Some(event) = tail { if event.token_type == Token::SpaceOrTab { - let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); - prefix = span.end_index - span.start_index; + prefix = Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1), + ) + .size(); } } match tokenizer.current { - Code::Char('`' | '~') => { + Some(char) if matches!(char, '`' | '~') => { tokenizer.enter(Token::CodeFencedFenceSequence); sequence_open( tokenizer, Info { prefix, size: 0, - kind: Kind::from_code(tokenizer.current), + kind: Kind::from_char(char), }, ) } @@ -248,7 +240,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.consume(); State::Fn(Box::new(|t| { info.size += 1; @@ -273,7 +265,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. 
tokenizer.concrete = true; @@ -282,7 +274,7 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State { _ => { tokenizer.enter(Token::CodeFencedFenceInfo); tokenizer.enter_with_content(Token::Data, Some(ContentType::String)); - info_inside(tokenizer, info, vec![]) + info_inside(tokenizer, info) } } } @@ -295,9 +287,9 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// | console.log(1) /// | ~~~ /// ``` -fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> State { +fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceInfo); tokenizer.exit(Token::CodeFencedFence); @@ -305,16 +297,15 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> S tokenizer.concrete = true; at_break(tokenizer, info) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceInfo); tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer) } - Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok, - Code::Char(_) => { - codes.push(tokenizer.current); + Some('`') if info.kind == Kind::GraveAccent => State::Nok, + Some(_) => { tokenizer.consume(); - State::Fn(Box::new(|t| info_inside(t, info, codes))) + State::Fn(Box::new(|t| info_inside(t, info))) } } } @@ -329,7 +320,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> S /// ``` fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::CodeFencedFence); // Do not form containers. 
tokenizer.concrete = true; @@ -353,7 +344,7 @@ fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::CodeFencedFenceMeta); tokenizer.exit(Token::CodeFencedFence); @@ -361,7 +352,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info) -> State { tokenizer.concrete = true; at_break(tokenizer, info) } - Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok, + Some('`') if info.kind == Kind::GraveAccent => State::Nok, _ => { tokenizer.consume(); State::Fn(Box::new(|t| meta(t, info))) @@ -422,7 +413,7 @@ fn at_non_lazy_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); @@ -461,7 +452,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.enter(Token::CodeFencedFenceSequence); close_sequence(tokenizer, info, 0) } @@ -479,7 +470,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.consume(); State::Fn(Box::new(move |t| close_sequence(t, info, size + 1))) } @@ -501,7 +492,7 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State { /// ``` fn close_sequence_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::CodeFencedFence); State::Ok } @@ -547,9 +538,7 @@ fn content_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_break(tokenizer, info) - } + None | Some('\n') => at_break(tokenizer, info), _ => { tokenizer.enter(Token::CodeFlowChunk); content_continue(tokenizer, info) @@ -567,7 +556,7 @@ fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn content_continue(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::CodeFlowChunk); at_break(tokenizer, info) } diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index 015c4a0..bb1615c 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -48,7 +48,7 @@ use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::constant::TAB_SIZE; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of code (indented). 
/// @@ -78,11 +78,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => after(tokenizer), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer - .attempt(further_start, |ok| { - Box::new(if ok { at_break } else { after }) - })(tokenizer), + None => after(tokenizer), + Some('\n') => tokenizer.attempt(further_start, |ok| { + Box::new(if ok { at_break } else { after }) + })(tokenizer), _ => { tokenizer.enter(Token::CodeFlowChunk); content(tokenizer) @@ -98,7 +97,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// ``` fn content(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::CodeFlowChunk); at_break(tokenizer) } @@ -134,7 +133,7 @@ fn further_start(tokenizer: &mut Tokenizer) -> State { State::Nok } else { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); @@ -178,7 +177,7 @@ fn further_begin(tokenizer: &mut Tokenizer) -> State { /// ``` fn further_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => further_start(tokenizer), + Some('\n') => further_start(tokenizer), _ => State::Nok, } } diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index f5f92fc..150f63b 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -84,7 +84,7 @@ //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of code (text). 
/// @@ -98,9 +98,9 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { let len = tokenizer.events.len(); match tokenizer.current { - Code::Char('`') + Some('`') if tokenizer.parse_state.constructs.code_text - && (tokenizer.previous != Code::Char('`') + && (tokenizer.previous != Some('`') || (len > 0 && tokenizer.events[len - 1].token_type == Token::CharacterEscape)) => { @@ -119,7 +119,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { - if let Code::Char('`') = tokenizer.current { + if let Some('`') = tokenizer.current { tokenizer.consume(); State::Fn(Box::new(move |t| sequence_open(t, size + 1))) } else { @@ -136,14 +136,14 @@ fn sequence_open(tokenizer: &mut Tokenizer, size: usize) -> State { /// ``` fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None => State::Nok, + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); State::Fn(Box::new(move |t| between(t, size_open))) } - Code::Char('`') => { + Some('`') => { tokenizer.enter(Token::CodeTextSequence); sequence_close(tokenizer, size_open, 0) } @@ -162,7 +162,7 @@ fn between(tokenizer: &mut Tokenizer, size_open: usize) -> State { /// ``` fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '`') => { + None | Some('\n' | '`') => { tokenizer.exit(Token::CodeTextData); between(tokenizer, size_open) } @@ -181,7 +181,7 @@ fn data(tokenizer: &mut Tokenizer, size_open: usize) -> State { /// ``` fn sequence_close(tokenizer: &mut Tokenizer, size_open: usize, size: usize) -> State { match tokenizer.current { - Code::Char('`') => { + Some('`') => { tokenizer.consume(); State::Fn(Box::new(move |t| sequence_close(t, size_open, size + 1))) } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index ffaaa98..f2b5ae0 100--- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -100,7 +100,7 @@ use crate::construct::{ partial_title::{start as title, Options as TitleOptions}, }; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; use crate::util::skip::opt_back as skip_opt_back; /// At the start of a definition. @@ -137,7 +137,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') => tokenizer.go( + Some('[') => tokenizer.go( |t| { label( t, @@ -162,7 +162,7 @@ /// ``` fn label_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(':') => { + Some(':') => { tokenizer.enter(Token::DefinitionMarker); tokenizer.consume(); tokenizer.exit(Token::DefinitionMarker); @@ -231,7 +231,7 @@ fn after(tokenizer: &mut Tokenizer) -> State { /// ``` fn after_whitespace(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::Definition); // You’d be interrupting.
tokenizer.interrupt = true; @@ -294,7 +294,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn title_after_after_optional_whitespace(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Ok, + None | Some('\n') => State::Ok, _ => State::Nok, } } diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index 40a83ef..0585c4c 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -40,7 +40,7 @@ //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of a hard break (escape). /// @@ -51,7 +51,7 @@ use crate::tokenizer::{Code, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('\\') if tokenizer.parse_state.constructs.hard_break_escape => { + Some('\\') if tokenizer.parse_state.constructs.hard_break_escape => { tokenizer.enter(Token::HardBreakEscape); tokenizer.consume(); State::Fn(Box::new(inside)) @@ -69,7 +69,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.exit(Token::HardBreakEscape); State::Ok } diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 5de9a80..7a7cf2e 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -57,7 +57,7 @@ use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; use crate::token::Token; -use crate::tokenizer::{Code, ContentType, Event, EventType, State, Tokenizer}; +use crate::tokenizer::{ContentType, Event, EventType, State, Tokenizer}; /// Start of a heading (atx). /// @@ -87,7 +87,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn before(tokenizer: &mut Tokenizer) -> State { - if Code::Char('#') == tokenizer.current { + if Some('#') == tokenizer.current { tokenizer.enter(Token::HeadingAtxSequence); sequence_open(tokenizer, 0) } else { @@ -103,11 +103,11 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') if rank > 0 => { + None | Some('\n') if rank > 0 => { tokenizer.exit(Token::HeadingAtxSequence); at_break(tokenizer) } - Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { + Some('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |tokenizer| { sequence_open(tokenizer, rank + 1) @@ -129,21 +129,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, rank: usize) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::HeadingAtx); tokenizer.register_resolver("heading_atx".to_string(), Box::new(resolve)); // Feel free to interrupt. 
tokenizer.interrupt = false; State::Ok } - Code::VirtualSpace | Code::Char('\t' | ' ') => { - tokenizer.go(space_or_tab(), at_break)(tokenizer) - } - Code::Char('#') => { + Some('\t' | ' ') => tokenizer.go(space_or_tab(), at_break)(tokenizer), + Some('#') => { tokenizer.enter(Token::HeadingAtxSequence); further_sequence(tokenizer) } - Code::Char(_) => { + Some(_) => { tokenizer.enter_with_content(Token::Data, Some(ContentType::Text)); data(tokenizer) } @@ -159,7 +157,7 @@ fn at_break(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn further_sequence(tokenizer: &mut Tokenizer) -> State { - if let Code::Char('#') = tokenizer.current { + if let Some('#') = tokenizer.current { tokenizer.consume(); State::Fn(Box::new(further_sequence)) } else { @@ -177,7 +175,7 @@ fn further_sequence(tokenizer: &mut Tokenizer) -> State { fn data(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. - Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => { + None | Some('\t' | '\n' | ' ') => { tokenizer.exit(Token::Data); at_break(tokenizer) } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index a0f7545..f9dd3f7 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -60,7 +60,7 @@ use crate::constant::TAB_SIZE; use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::token::Token; -use crate::tokenizer::{Code, EventType, State, Tokenizer}; +use crate::tokenizer::{EventType, State, Tokenizer}; use crate::util::skip::opt_back as skip_opt_back; /// Kind of underline. @@ -148,7 +148,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(char) if char == '-' || char == '=' => { + Some(char) if matches!(char, '-' | '=') => { tokenizer.enter(Token::HeadingSetextUnderline); inside(tokenizer, Kind::from_char(char)) } @@ -165,7 +165,7 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State { match tokenizer.current { - Code::Char(char) if char == kind.as_char() => { + Some(char) if char == kind.as_char() => { tokenizer.consume(); State::Fn(Box::new(move |t| inside(t, kind))) } @@ -185,7 +185,7 @@ fn inside(tokenizer: &mut Tokenizer, kind: Kind) -> State { /// ``` fn after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { // Feel free to interrupt. tokenizer.interrupt = false; tokenizer.register_resolver("heading_setext".to_string(), Box::new(resolve)); diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 24d6f98..238963d 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -105,8 +105,10 @@ use crate::construct::{ partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions}, }; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; -use crate::util::codes::{parse, serialize}; +use crate::tokenizer::{Point, State, Tokenizer}; +use crate::util::slice::{Position, Slice}; + +const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '[']; /// Kind of HTML (flow). #[derive(Debug, PartialEq)] @@ -168,17 +170,6 @@ impl QuoteKind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind.
- /// - /// ## Panics - /// - /// Panics if `code` is not `Code::Char('"' | '\'')`. - fn from_code(code: Code) -> QuoteKind { - match code { - Code::Char(char) => QuoteKind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// State needed to parse HTML (flow). @@ -190,9 +181,9 @@ struct Info { start_tag: bool, /// Used depending on `kind` to either collect all parsed characters, or to /// store expected characters. - buffer: Vec<Code>, - /// `index` into `buffer` when expecting certain characters. - index: usize, + start: Option<Point>, + /// Collected index, for various reasons. + size: usize, /// Current quote, when in a double or single quoted attribute value. quote: Option<QuoteKind>, } @@ -234,7 +225,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn before(tokenizer: &mut Tokenizer) -> State { - if Code::Char('<') == tokenizer.current { + if Some('<') == tokenizer.current { tokenizer.enter(Token::HtmlFlowData); tokenizer.consume(); State::Fn(Box::new(open)) @@ -259,21 +250,22 @@ fn open(tokenizer: &mut Tokenizer) -> State { kind: Kind::Basic, // Assume closing tag (or no tag). start_tag: false, - buffer: vec![], - index: 0, + start: None, + size: 0, quote: None, }; match tokenizer.current { - Code::Char('!') => { + Some('!') => { tokenizer.consume(); State::Fn(Box::new(|t| declaration_open(t, info))) } - Code::Char('/') => { + Some('/') => { tokenizer.consume(); + info.start = Some(tokenizer.point.clone()); State::Fn(Box::new(|t| tag_close_start(t, info))) } - Code::Char('?') => { + Some('?') => { info.kind = Kind::Instruction; tokenizer.consume(); // Do not form containers. @@ -282,8 +274,9 @@ fn open(tokenizer: &mut Tokenizer) -> State { // right now, so we do need to search for `>`, similar to declarations. State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { info.start_tag = true; + info.start = Some(tokenizer.point.clone()); tag_name(tokenizer, info) } _ => State::Nok, @@ -302,19 +295,18 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); info.kind = Kind::Comment; State::Fn(Box::new(|t| comment_open_inside(t, info))) } - Code::Char('[') => { + Some('[') => { tokenizer.consume(); info.kind = Kind::Cdata; - info.buffer = parse("CDATA["); - info.index = 0; + info.size = 0; State::Fn(Box::new(|t| cdata_open_inside(t, info))) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); info.kind = Kind::Declaration; // Do not form containers. @@ -333,7 +325,7 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); // Do not form containers. tokenizer.concrete = true; @@ -350,20 +342,21 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// ^^^^^^ /// ``` fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { - if tokenizer.current == info.buffer[info.index] { - info.index += 1; - tokenizer.consume(); + match tokenizer.current { + Some(char) if char == CDATA_SEARCH[info.size] => { + info.size += 1; + tokenizer.consume(); - if info.index == info.buffer.len() { - info.buffer.clear(); - // Do not form containers. 
- tokenizer.concrete = true; - State::Fn(Box::new(|t| continuation(t, info))) - } else { - State::Fn(Box::new(|t| cdata_open_inside(t, info))) + if info.size == CDATA_SEARCH.len() { + info.size = 0; + // Do not form containers. + tokenizer.concrete = true; + State::Fn(Box::new(|t| continuation(t, info))) + } else { + State::Fn(Box::new(|t| cdata_open_inside(t, info))) + } } - } else { - State::Nok + _ => State::Nok, } } @@ -373,11 +366,10 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// > | </x> /// ^ /// ``` -fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); - info.buffer.push(tokenizer.current); State::Fn(Box::new(|t| tag_name(t, info))) } _ => State::Nok, @@ -394,22 +386,27 @@ fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::None - | Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => { - let tag_name_buffer = serialize(&info.buffer, false).to_lowercase(); - let name = tag_name_buffer.as_str(); - let slash = matches!(tokenizer.current, Code::Char('/')); - - info.buffer.clear(); - - if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) { + None | Some('\t' | '\n' | ' ' | '/' | '>') => { + let slash = matches!(tokenizer.current, Some('/')); + let start = info.start.take().unwrap(); + let name = Slice::from_position( + &tokenizer.parse_state.chars, + &Position { + start: &start, + end: &tokenizer.point, + }, + ) + .serialize() + .trim() + .to_lowercase(); + println!("name: {:?}", name); + + if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) { info.kind = Kind::Raw; // Do not form containers. tokenizer.concrete = true; continuation(tokenizer, info) - } else if HTML_BLOCK_NAMES.contains(&name) { + } else if HTML_BLOCK_NAMES.contains(&name.as_str()) { // Basic is assumed, no need to set `kind`. if slash { tokenizer.consume(); @@ -432,12 +429,11 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { } } } - Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(); - info.buffer.push(tokenizer.current); State::Fn(Box::new(|t| tag_name(t, info))) } - Code::Char(_) => State::Nok, + Some(_) => State::Nok, } } @@ -449,7 +445,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.consume(); // Do not form containers. 
tokenizer.concrete = true; @@ -467,7 +463,7 @@ fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_closing_tag_after(t, info))) } @@ -496,15 +492,15 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_end(t, info))) } - Code::Char('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name_before(t, info))) } @@ -524,7 +520,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat /// ``` fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name(t, info))) } @@ -543,11 +539,11 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('=') => { + Some('=') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_name_after(t, info))) } @@ -566,13 +562,13 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State /// ``` fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok, - Code::Char('"' | '\'') => { + None | Some('<' | '=' | '>' | '`') => State::Nok, + Some(char) if matches!(char, '"' | '\'') => { + info.quote = Some(QuoteKind::from_char(char)); tokenizer.consume(); - info.quote = Some(QuoteKind::from_code(tokenizer.current)); State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info))) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_before(t, info))) } @@ -590,8 +586,8 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> /// ``` fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Nok, - Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => { + None | Some('\n') => State::Nok, + Some(char) if char == info.quote.as_ref().unwrap().as_char() => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info))) } @@ -610,13 +606,10 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> Sta /// ``` fn 
complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None - | Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => { + None | Some('\t' | '\n' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => { complete_attribute_name_after(tokenizer, info) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); State::Fn(Box::new(|t| complete_attribute_value_unquoted(t, info))) } @@ -632,9 +625,7 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> S /// ``` fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => { - complete_attribute_name_before(tokenizer, info) - } + Some('\t' | ' ' | '/' | '>') => complete_attribute_name_before(tokenizer, info), _ => State::Nok, } } @@ -647,7 +638,7 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) /// ``` fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_after(t, info))) } @@ -663,16 +654,16 @@ fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { // Do not form containers. tokenizer.concrete = true; continuation(tokenizer, info) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(|t| complete_after(t, info))) } - Code::Char(_) => State::Nok, + Some(_) => State::Nok, } } @@ -684,29 +675,27 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('-') if info.kind == Kind::Comment => { + Some('-') if info.kind == Kind::Comment => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_comment_inside(t, info))) } - Code::Char('<') if info.kind == Kind::Raw => { + Some('<') if info.kind == Kind::Raw => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_raw_tag_open(t, info))) } - Code::Char('>') if info.kind == Kind::Declaration => { + Some('>') if info.kind == Kind::Declaration => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_close(t, info))) } - Code::Char('?') if info.kind == Kind::Instruction => { + Some('?') if info.kind == Kind::Instruction => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } - Code::Char(']') if info.kind == Kind::Cdata => { + Some(']') if info.kind == Kind::Cdata => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_character_data_inside(t, info))) } - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - if info.kind == Kind::Basic || info.kind == Kind::Complete => - { + Some('\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => { tokenizer.exit(Token::HtmlFlowData); tokenizer.check(blank_line_before, |ok| { if ok { @@ -716,7 +705,7 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State { } })(tokenizer) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::HtmlFlowData); continuation_start(tokenizer, info) } 
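A pattern repeated across these hunks (`character_reference`, and the `tag_name` and `continuation_raw_end_tag` changes above): instead of pushing characters into an owned buffer while scanning, a construct now remembers a start `Point`, counts a size, and slices the shared `parse_state.chars` only when the value is actually needed. A standalone sketch of that idea over a plain `&[char]`; `scan_name` and the pared-down `Point` are hypothetical stand-ins, not the real API:

```rust
// Stand-in for the real Point (src/tokenizer.rs), which also tracks
// line, column, and offset.
#[derive(Clone)]
struct Point {
    index: usize,
}

// Record where a name starts, advance past it, and materialize a String
// only at the end, roughly what
// `Slice::from_position(&parse_state.chars, &Position { start, end }).serialize()`
// does in the hunks above.
fn scan_name(chars: &[char], start: &Point) -> (Point, String) {
    let mut end = start.clone();
    while end.index < chars.len() && chars[end.index].is_ascii_alphanumeric() {
        end.index += 1;
    }
    let name = chars[start.index..end.index].iter().collect();
    (end, name)
}

fn main() {
    let chars: Vec<char> = "script>".chars().collect();
    let (end, name) = scan_name(&chars, &Point { index: 0 });
    assert_eq!((end.index, name.as_str()), (6, "script"));
}
```

The payoff is the same in each construct: no per-character `String` or `Vec` growth on the hot path, while the text stays recoverable later because events carry points into the shared `chars`.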
@@ -753,7 +742,7 @@ fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); @@ -772,9 +761,7 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - continuation_start(tokenizer, info) - } + None | Some('\n') => continuation_start(tokenizer, info), _ => { tokenizer.enter(Token::HtmlFlowData); continuation(tokenizer, info) @@ -790,7 +777,7 @@ fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -804,10 +791,11 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State { /// > | <script>console.log(1)</script> /// ^ /// ``` -fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State { +fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('/') => { + Some('/') => { tokenizer.consume(); + info.start = Some(tokenizer.point.clone()); State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) } _ => continuation(tokenizer, info), @@ -822,24 +810,34 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('>') => { - let tag_name_buffer = serialize(&info.buffer, false).to_lowercase(); - info.buffer.clear(); - - if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) { + Some('>') => { + info.size = 0; + + let start = info.start.take().unwrap(); + let name = Slice::from_position( + &tokenizer.parse_state.chars, + &Position { + start: &start, + end: &tokenizer.point, + }, + ) + .serialize() + .to_lowercase(); + + if HTML_RAW_NAMES.contains(&name.as_str()) { tokenizer.consume(); State::Fn(Box::new(|t| continuation_close(t, info))) } else { continuation(tokenizer, info) } } - Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => { + Some('A'..='Z' | 'a'..='z') if info.size < HTML_RAW_SIZE_MAX => { tokenizer.consume(); - info.buffer.push(tokenizer.current); + info.size += 1; State::Fn(Box::new(|t| continuation_raw_end_tag(t, info))) } _ => { - info.buffer.clear(); + info.size = 0; continuation(tokenizer, info) } } @@ -853,7 +851,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State /// ``` fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(']') => { + Some(']') => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -877,11 +875,11 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> /// ``` fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.consume(); 
State::Fn(Box::new(|t| continuation_close(t, info))) } - Code::Char('-') if info.kind == Kind::Comment => { + Some('-') if info.kind == Kind::Comment => { tokenizer.consume(); State::Fn(Box::new(|t| continuation_declaration_inside(t, info))) } @@ -897,7 +895,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> Sta /// ``` fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::HtmlFlowData); continuation_after(tokenizer) } diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 3ac8d71..b1ad113 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -56,8 +56,9 @@ use crate::construct::partial_space_or_tab::space_or_tab; use crate::token::Token; -use crate::tokenizer::{Code, State, StateFn, Tokenizer}; -use crate::util::codes::parse; +use crate::tokenizer::{State, StateFn, Tokenizer}; + +const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '[']; /// Start of HTML (text) /// @@ -66,7 +67,7 @@ use crate::util::codes::parse; /// ^ /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if Code::Char('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { + if Some('<') == tokenizer.current && tokenizer.parse_state.constructs.html_text { tokenizer.enter(Token::HtmlText); tokenizer.enter(Token::HtmlTextData); tokenizer.consume(); @@ -88,19 +89,19 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('!') => { + Some('!') => { tokenizer.consume(); State::Fn(Box::new(declaration_open)) } - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(tag_close_start)) } - Code::Char('?') => { + Some('?') => { tokenizer.consume(); State::Fn(Box::new(instruction)) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } @@ -120,16 +121,15 @@ fn open(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_open_inside)) } - Code::Char('[') => { + Some('[') => { tokenizer.consume(); - let buffer = parse("CDATA["); - State::Fn(Box::new(|t| cdata_open_inside(t, buffer, 0))) + State::Fn(Box::new(|t| cdata_open_inside(t, 0))) } - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(declaration)) } @@ -145,7 +145,7 @@ fn declaration_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_start)) } @@ -168,8 +168,8 @@ fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('>') => State::Nok, - Code::Char('-') => { + None | Some('>') => State::Nok, + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_start_dash)) } @@ -192,7 +192,7 @@ fn comment_start(tokenizer: &mut Tokenizer) -> State { /// [html_flow]: crate::construct::html_flow fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - 
Code::None | Code::Char('>') => State::Nok, + None | Some('>') => State::Nok, _ => comment(tokenizer), } } @@ -205,11 +205,9 @@ fn comment_start_dash(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(comment)) - } - Code::Char('-') => { + None => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(comment)), + Some('-') => { tokenizer.consume(); State::Fn(Box::new(comment_close)) } @@ -228,7 +226,7 @@ fn comment(tokenizer: &mut Tokenizer) -> State { /// ``` fn comment_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-') => { + Some('-') => { tokenizer.consume(); State::Fn(Box::new(end)) } @@ -242,17 +240,18 @@ fn comment_close(tokenizer: &mut Tokenizer) -> State { /// > | a <![CDATA[>&<]]> b /// ^^^^^^ /// ``` -fn cdata_open_inside(tokenizer: &mut Tokenizer, buffer: Vec<Code>, index: usize) -> State { - if tokenizer.current == buffer[index] { - tokenizer.consume(); +fn cdata_open_inside(tokenizer: &mut Tokenizer, index: usize) -> State { + match tokenizer.current { + Some(char) if char == CDATA_SEARCH[index] => { + tokenizer.consume(); - if index + 1 == buffer.len() { - State::Fn(Box::new(cdata)) - } else { - State::Fn(Box::new(move |t| cdata_open_inside(t, buffer, index + 1))) + if index + 1 == CDATA_SEARCH.len() { + State::Fn(Box::new(cdata)) + } else { + State::Fn(Box::new(move |t| cdata_open_inside(t, index + 1))) + } } - } else { - State::Nok + _ => State::Nok, } } @@ -264,11 +263,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, buffer: Vec<Code>, index: usize) /// ``` fn cdata(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(cdata)) - } - Code::Char(']') => { + None => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(cdata)), + Some(']') => { tokenizer.consume(); State::Fn(Box::new(cdata_close)) } @@ -287,7 +284,7 @@ fn cdata(tokenizer: &mut Tokenizer) -> State { /// ``` fn cdata_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(']') => { + Some(']') => { tokenizer.consume(); State::Fn(Box::new(cdata_end)) } @@ -303,8 +300,8 @@ fn cdata_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn cdata_end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => end(tokenizer), - Code::Char(']') => cdata_close(tokenizer), + Some('>') => end(tokenizer), + Some(']') => cdata_close(tokenizer), _ => cdata(tokenizer), } } @@ -317,10 +314,8 @@ fn cdata_end(tokenizer: &mut Tokenizer) -> State { /// ``` fn declaration(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('>') => end(tokenizer), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(declaration)) - } + None | Some('>') => end(tokenizer), + Some('\n') => at_line_ending(tokenizer, Box::new(declaration)), _ => { tokenizer.consume(); State::Fn(Box::new(declaration)) @@ -336,11 +331,9 @@ fn declaration(tokenizer: &mut Tokenizer) -> State { /// ``` fn instruction(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(instruction)) - } - Code::Char('?') => { + None => 
State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(instruction)), + Some('?') => { tokenizer.consume(); State::Fn(Box::new(instruction_close)) } @@ -359,7 +352,7 @@ fn instruction(tokenizer: &mut Tokenizer) -> State { /// ``` fn instruction_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => end(tokenizer), + Some('>') => end(tokenizer), _ => instruction(tokenizer), } } @@ -372,7 +365,7 @@ fn instruction_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('A'..='Z' | 'a'..='z') => { + Some('A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -388,7 +381,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_close)) } @@ -404,10 +397,8 @@ fn tag_close(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_close_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_close_between)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_close_between)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_close_between)) } @@ -423,13 +414,11 @@ fn tag_close_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { + Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open)) } - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer), + Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -442,18 +431,16 @@ fn tag_open(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_between)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_between)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_between)) } - Code::Char('/') => { + Some('/') => { tokenizer.consume(); State::Fn(Box::new(end)) } - Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some(':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -469,7 +456,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { + Some('-' | '.' 
| '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name)) } @@ -486,14 +473,12 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_name_after)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_name_after)) } - Code::Char('=') => { + Some('=') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } @@ -510,19 +495,17 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)) - } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + None | Some('<' | '=' | '>' | '`') => State::Nok, + Some('\n') => at_line_ending(tokenizer, Box::new(tag_open_attribute_value_before)), + Some('\t' | ' ') => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_before)) } - Code::Char(char) if char == '"' || char == '\'' => { + Some(char) if char == '"' || char == '\'' => { tokenizer.consume(); State::Fn(Box::new(move |t| tag_open_attribute_value_quoted(t, char))) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_unquoted)) } @@ -537,12 +520,12 @@ fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> State { match tokenizer.current { - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => at_line_ending( + None => State::Nok, + Some('\n') => at_line_ending( tokenizer, Box::new(move |t| tag_open_attribute_value_quoted(t, marker)), ), - Code::Char(char) if char == marker => { + Some(char) if char == marker => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_quoted_after)) } @@ -563,11 +546,9 @@ fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer, marker: char) -> S /// ``` fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => State::Nok, - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer), - Code::Char(_) => { + None | Some('"' | '\'' | '<' | '=' | '`') => State::Nok, + Some('\t' | '\n' | ' ' | '/' | '>') => tag_open_between(tokenizer), + Some(_) => { tokenizer.consume(); State::Fn(Box::new(tag_open_attribute_value_unquoted)) } @@ -583,9 +564,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { /// ``` fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ' | '>' | '/') => tag_open_between(tokenizer), + Some('\t' | '\n' | ' ' | '>' | '/') => tag_open_between(tokenizer), _ => State::Nok, } } @@ -598,7 +577,7 @@ fn 
tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.consume(); tokenizer.exit(Token::HtmlTextData); tokenizer.exit(Token::HtmlText); @@ -620,7 +599,7 @@ fn end(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_line_ending(tokenizer: &mut Tokenizer, return_state: Box<StateFn>) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.exit(Token::HtmlTextData); tokenizer.enter(Token::LineEnding); tokenizer.consume(); diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 6f0a707..5ea788f 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -1,4 +1,4 @@ -//! Label end is a construct that occurs in the [text][] content type. +//! Label end is a construct that occurs in the [text][] content type. //! //! It forms with the following BNF: //! @@ -154,10 +154,11 @@ use crate::construct::{ partial_title::{start as title, Options as TitleOptions}, }; use crate::token::Token; -use crate::tokenizer::{Code, Event, EventType, Media, State, Tokenizer}; +use crate::tokenizer::{Event, EventType, Media, State, Tokenizer}; use crate::util::{ normalize_identifier::normalize_identifier, - span::{serialize, Span}, + skip, + slice::{Position, Slice}, }; /// State needed to parse label end. @@ -181,7 +182,7 @@ struct Info { /// > | [a] b /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { - if Code::Char(']') == tokenizer.current && tokenizer.parse_state.constructs.label_end { + if Some(']') == tokenizer.current && tokenizer.parse_state.constructs.label_end { let mut label_start_index = None; let mut index = tokenizer.label_start_stack.len(); @@ -207,19 +208,23 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { } let label_end_start = tokenizer.events.len(); + let info = Info { label_start_index, media: Media { start: label_start.start, end: (label_end_start, label_end_start + 3), - id: normalize_identifier(&serialize( - &tokenizer.parse_state.codes, - &Span { - start_index: tokenizer.events[label_start.start.1].point.index, - end_index: tokenizer.events[label_end_start - 1].point.index, - }, - false, - )), + // To do: virtual spaces not needed, create a `to_str`? + id: normalize_identifier( + &Slice::from_position( + &tokenizer.parse_state.chars, + &Position { + start: &tokenizer.events[label_start.start.1].point, + end: &tokenizer.events[label_end_start - 1].point, + }, + ) + .serialize(), + ), }, }; @@ -253,7 +258,7 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { // Resource (`[asd](fgh)`)? - Code::Char('(') => tokenizer.attempt(resource, move |is_ok| { + Some('(') => tokenizer.attempt(resource, move |is_ok| { Box::new(move |t| { // Also fine if `defined`, as then it's a valid shortcut. if is_ok || defined { @@ -264,7 +269,7 @@ fn after(tokenizer: &mut Tokenizer, info: Info) -> State { }) })(tokenizer), // Full (`[asd][fgh]`) or collapsed (`[asd][]`) reference? 
- Code::Char('[') => tokenizer.attempt(full_reference, move |is_ok| { + Some('[') => tokenizer.attempt(full_reference, move |is_ok| { Box::new(move |t| { if is_ok { ok(t, info) @@ -377,7 +382,7 @@ fn nok(tokenizer: &mut Tokenizer, label_start_index: usize) -> State { /// ``` fn resource(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('(') => { + Some('(') => { tokenizer.enter(Token::Resource); tokenizer.enter(Token::ResourceMarker); tokenizer.consume(); @@ -406,7 +411,7 @@ fn resource_start(tokenizer: &mut Tokenizer) -> State { /// ``` fn resource_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(')') => resource_end(tokenizer), + Some(')') => resource_end(tokenizer), _ => tokenizer.go( |t| { destination( @@ -446,7 +451,7 @@ fn destination_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn resource_between(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('"' | '\'' | '(') => tokenizer.go( + Some('"' | '\'' | '(') => tokenizer.go( |t| { title( t, @@ -481,7 +486,7 @@ fn title_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn resource_end(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(')') => { + Some(')') => { tokenizer.enter(Token::ResourceMarker); tokenizer.consume(); tokenizer.exit(Token::ResourceMarker); @@ -500,7 +505,7 @@ fn resource_end(tokenizer: &mut Tokenizer) -> State { /// ``` fn full_reference(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') => tokenizer.go( + Some('[') => tokenizer.go( |t| { label( t, @@ -524,36 +529,23 @@ fn full_reference(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn full_reference_after(tokenizer: &mut Tokenizer) -> State { - let events = &tokenizer.events; - let mut index = events.len() - 1; - let mut start: Option<usize> = None; - let mut end: Option<usize> = None; - - while index > 0 { - index -= 1; - let event = &events[index]; - if event.token_type == Token::ReferenceString { - if event.event_type == EventType::Exit { - end = Some(event.point.index); - } else { - start = Some(event.point.index); - break; - } - } - } + let end = skip::to_back( + &tokenizer.events, + tokenizer.events.len() - 1, + &[Token::ReferenceString], + ); + + // To do: virtual spaces not needed, create a `to_str`? + let id = Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, end), + ) + .serialize(); if tokenizer .parse_state .definitions - .contains(&normalize_identifier(&serialize( - &tokenizer.parse_state.codes, - &Span { - // Always found, otherwise we don't get here. 
- start_index: start.unwrap(), - end_index: end.unwrap(), - }, - false, - ))) + .contains(&normalize_identifier(&id)) { State::Ok } else { @@ -571,7 +563,7 @@ fn full_reference_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn collapsed_reference(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') => { + Some('[') => { tokenizer.enter(Token::Reference); tokenizer.enter(Token::ReferenceMarker); tokenizer.consume(); @@ -592,7 +584,7 @@ fn collapsed_reference(tokenizer: &mut Tokenizer) -> State { /// ``` fn collapsed_reference_open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char(']') => { + Some(']') => { tokenizer.enter(Token::ReferenceMarker); tokenizer.consume(); tokenizer.exit(Token::ReferenceMarker); @@ -735,7 +727,11 @@ pub fn resolve_media(tokenizer: &mut Tokenizer) { 0, vec![Event { event_type: EventType::Exit, - token_type: Token::Link, + token_type: if group_enter_event.token_type == Token::LabelLink { + Token::Link + } else { + Token::Image + }, point: events[group_end_index].point.clone(), link: None, }], diff --git a/src/construct/label_start_image.rs b/src/construct/label_start_image.rs index 8c12ffe..078026d 100644 --- a/src/construct/label_start_image.rs +++ b/src/construct/label_start_image.rs @@ -30,7 +30,7 @@ use super::label_end::resolve_media; use crate::token::Token; -use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; +use crate::tokenizer::{LabelStart, State, Tokenizer}; /// Start of label (image) start. /// @@ -40,7 +40,7 @@ use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('!') if tokenizer.parse_state.constructs.label_start_image => { + Some('!') if tokenizer.parse_state.constructs.label_start_image => { tokenizer.enter(Token::LabelImage); tokenizer.enter(Token::LabelImageMarker); tokenizer.consume(); @@ -59,7 +59,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` pub fn open(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') => { + Some('[') => { tokenizer.enter(Token::LabelMarker); tokenizer.consume(); tokenizer.exit(Token::LabelMarker); diff --git a/src/construct/label_start_link.rs b/src/construct/label_start_link.rs index e13cd77..d7ae1d6 100644 --- a/src/construct/label_start_link.rs +++ b/src/construct/label_start_link.rs @@ -29,7 +29,7 @@ use super::label_end::resolve_media; use crate::token::Token; -use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; +use crate::tokenizer::{LabelStart, State, Tokenizer}; /// Start of label (link) start. 
/// @@ -39,7 +39,7 @@ use crate::tokenizer::{Code, LabelStart, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('[') if tokenizer.parse_state.constructs.label_start_link => { + Some('[') if tokenizer.parse_state.constructs.label_start_link => { let start = tokenizer.events.len(); tokenizer.enter(Token::LabelLink); tokenizer.enter(Token::LabelMarker); diff --git a/src/construct/list.rs b/src/construct/list.rs index f5bb0ce..355eeee 100644 --- a/src/construct/list.rs +++ b/src/construct/list.rs @@ -50,10 +50,10 @@ use crate::construct::{ thematic_break::start as thematic_break, }; use crate::token::Token; -use crate::tokenizer::{Code, EventType, State, Tokenizer}; +use crate::tokenizer::{EventType, State, Tokenizer}; use crate::util::{ skip, - span::{codes as codes_from_span, from_exit_event}, + slice::{Position, Slice}, }; /// Type of list. @@ -117,17 +117,6 @@ impl Kind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind. - /// - /// ## Panics - /// - /// Panics if `code` is not `Code::Char('.' | ')' | '*' | '+' | '-')`. - fn from_code(code: Code) -> Kind { - match code { - Code::Char(char) => Kind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// Start of list item. @@ -160,11 +149,11 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { // Unordered. - Code::Char('*' | '+' | '-') => tokenizer.check(thematic_break, |ok| { + Some('*' | '+' | '-') => tokenizer.check(thematic_break, |ok| { Box::new(if ok { nok } else { before_unordered }) })(tokenizer), // Ordered. - Code::Char(char) if char.is_ascii_digit() && (!tokenizer.interrupt || char == '1') => { + Some(char) if char.is_ascii_digit() && (!tokenizer.interrupt || char == '1') => { tokenizer.enter(Token::ListItemPrefix); tokenizer.enter(Token::ListItemValue); inside(tokenizer, 0) @@ -194,11 +183,11 @@ fn before_unordered(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, size: usize) -> State { match tokenizer.current { - Code::Char(char) if char.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { + Some(char) if char.is_ascii_digit() && size + 1 < LIST_ITEM_VALUE_SIZE_MAX => { tokenizer.consume(); State::Fn(Box::new(move |t| inside(t, size + 1))) } - Code::Char('.' | ')') if !tokenizer.interrupt || size < 2 => { + Some('.' 
| ')') if !tokenizer.interrupt || size < 2 => { tokenizer.exit(Token::ListItemValue); marker(tokenizer) } @@ -273,10 +262,7 @@ fn whitespace(tokenizer: &mut Tokenizer) -> State { /// ^ /// ``` fn whitespace_after(tokenizer: &mut Tokenizer) -> State { - if matches!( - tokenizer.current, - Code::VirtualSpace | Code::Char('\t' | ' ') - ) { + if matches!(tokenizer.current, Some('\t' | ' ')) { State::Nok } else { State::Ok @@ -291,7 +277,7 @@ fn whitespace_after(tokenizer: &mut Tokenizer) -> State { /// ``` fn prefix_other(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.enter(Token::SpaceOrTab); tokenizer.consume(); tokenizer.exit(Token::SpaceOrTab); @@ -316,8 +302,18 @@ fn after(tokenizer: &mut Tokenizer, blank: bool) -> State { tokenizer.events.len() - 1, &[Token::ListItem], ); - let prefix = tokenizer.point.index - tokenizer.events[start].point.index - + (if blank { 1 } else { 0 }); + let mut prefix = Slice::from_position( + &tokenizer.parse_state.chars, + &Position { + start: &tokenizer.events[start].point, + end: &tokenizer.point, + }, + ) + .size(); + + if blank { + prefix += 1; + } let container = tokenizer.container.as_mut().unwrap(); container.blank_initial = blank; @@ -403,12 +399,15 @@ pub fn resolve_list_item(tokenizer: &mut Tokenizer) { if event.token_type == Token::ListItem { if event.event_type == EventType::Enter { let end = skip::opt(&tokenizer.events, index, &[Token::ListItem]) - 1; - let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]) + 1; - let codes = codes_from_span( - &tokenizer.parse_state.codes, - &from_exit_event(&tokenizer.events, marker), + let marker = skip::to(&tokenizer.events, index, &[Token::ListItemMarker]); + let kind = Kind::from_char( + Slice::from_point( + &tokenizer.parse_state.chars, + &tokenizer.events[marker].point, + ) + .head() + .unwrap(), ); - let kind = Kind::from_code(codes[0]); let current = (kind, balance, index, end); let mut list_index = lists_wip.len(); diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index 4bce6a4..5d230d3 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -33,7 +33,7 @@ //! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element use crate::token::Token; -use crate::tokenizer::{Code, ContentType, EventType, State, Tokenizer}; +use crate::tokenizer::{ContentType, EventType, State, Tokenizer}; use crate::util::skip::opt as skip_opt; /// Before a paragraph. @@ -44,7 +44,7 @@ use crate::util::skip::opt as skip_opt; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { unreachable!("unexpected eol/eof") } _ => { @@ -63,7 +63,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::Data); tokenizer.exit(Token::Paragraph); tokenizer.register_resolver_before("paragraph".to_string(), Box::new(resolve)); diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 4216276..0b66b09 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -7,7 +7,7 @@ //! 
[text]: crate::content::text use crate::token::Token; -use crate::tokenizer::{Code, EventType, State, Tokenizer}; +use crate::tokenizer::{EventType, State, Tokenizer}; /// At the beginning of data. /// @@ -15,13 +15,14 @@ use crate::tokenizer::{Code, EventType, State, Tokenizer}; /// > | abc /// ^ /// ``` -pub fn start(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { - if stop.contains(&tokenizer.current) { - tokenizer.enter(Token::Data); - tokenizer.consume(); - State::Fn(Box::new(move |t| data(t, stop))) - } else { - at_break(tokenizer, stop) +pub fn start(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { + match tokenizer.current { + Some(char) if stop.contains(&char) => { + tokenizer.enter(Token::Data); + tokenizer.consume(); + State::Fn(Box::new(move |t| data(t, stop))) + } + _ => at_break(tokenizer, stop), } } @@ -31,16 +32,16 @@ pub fn start(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { /// > | abc /// ^ /// ``` -fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { +fn at_break(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { match tokenizer.current { - Code::None => State::Ok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None => State::Ok, + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); State::Fn(Box::new(move |t| at_break(t, stop))) } - _ if stop.contains(&tokenizer.current) => { + Some(char) if stop.contains(&char) => { tokenizer.register_resolver_before("data".to_string(), Box::new(resolve_data)); State::Ok } @@ -57,10 +58,10 @@ fn at_break(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { /// > | abc /// ^^^ /// ``` -fn data(tokenizer: &mut Tokenizer, stop: &'static [Code]) -> State { +fn data(tokenizer: &mut Tokenizer, stop: &'static [char]) -> State { let done = match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => true, - _ if stop.contains(&tokenizer.current) => true, + None | Some('\n') => true, + Some(char) if stop.contains(&char) => true, _ => false, }; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 6a984e2..6447228 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -72,7 +72,7 @@ //! [sanitize_uri]: crate::util::sanitize_uri use crate::token::Token; -use crate::tokenizer::{Code, ContentType, State, Tokenizer}; +use crate::tokenizer::{ContentType, State, Tokenizer}; /// Configuration. 
/// @@ -117,7 +117,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { }; match tokenizer.current { - Code::Char('<') => { + Some('<') => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.literal.clone()); tokenizer.enter(info.options.marker.clone()); @@ -125,11 +125,9 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { tokenizer.exit(info.options.marker.clone()); State::Fn(Box::new(|t| enclosed_before(t, info))) } - Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ' | ')') => { - State::Nok - } - Code::Char(char) if char.is_ascii_control() => State::Nok, - Code::Char(_) => { + None | Some(' ' | ')') => State::Nok, + Some(char) if char.is_ascii_control() => State::Nok, + Some(_) => { tokenizer.enter(info.options.destination.clone()); tokenizer.enter(info.options.raw.clone()); tokenizer.enter(info.options.string.clone()); @@ -146,7 +144,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ^ /// ``` fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { - if let Code::Char('>') = tokenizer.current { + if let Some('>') = tokenizer.current { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -168,13 +166,13 @@ fn enclosed_before(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('>') => { + Some('>') => { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); enclosed_before(tokenizer, info) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '<') => State::Nok, - Code::Char('\\') => { + None | Some('\n' | '<') => State::Nok, + Some('\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed_escape(t, info))) } @@ -193,7 +191,7 @@ fn enclosed(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('<' | '>' | '\\') => { + Some('<' | '>' | '\\') => { tokenizer.consume(); State::Fn(Box::new(|t| enclosed(t, info))) } @@ -209,7 +207,7 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('(') => { + Some('(') => { if info.balance >= info.options.limit { State::Nok } else { @@ -218,7 +216,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Fn(Box::new(move |t| raw(t, info))) } } - Code::Char(')') => { + Some(')') => { if info.balance == 0 { tokenizer.exit(Token::Data); tokenizer.exit(info.options.string.clone()); @@ -231,10 +229,7 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Fn(Box::new(move |t| raw(t, info))) } } - Code::None - | Code::CarriageReturnLineFeed - | Code::VirtualSpace - | Code::Char('\t' | '\n' | '\r' | ' ') => { + None | Some('\t' | '\n' | ' ') => { if info.balance > 0 { State::Nok } else { @@ -245,12 +240,12 @@ fn raw(tokenizer: &mut Tokenizer, mut info: Info) -> State { State::Ok } } - Code::Char(char) if char.is_ascii_control() => State::Nok, - Code::Char('\\') => { + Some(char) if char.is_ascii_control() => State::Nok, + Some('\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw_escape(t, info))) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); State::Fn(Box::new(move |t| raw(t, info))) } @@ -265,7 +260,7 @@ fn raw(tokenizer: &mut 
Tokenizer, mut info: Info) -> State { /// ``` fn raw_escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char('(' | ')' | '\\') => { + Some('(' | ')' | '\\') => { tokenizer.consume(); State::Fn(Box::new(move |t| raw(t, info))) } diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 91a0e26..ee31533 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -62,7 +62,7 @@ use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; use crate::constant::LINK_REFERENCE_SIZE_MAX; use crate::subtokenize::link; use crate::token::Token; -use crate::tokenizer::{Code, ContentType, State, Tokenizer}; +use crate::tokenizer::{ContentType, State, Tokenizer}; /// Configuration. /// @@ -98,7 +98,7 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Code::Char('[') => { + Some('[') => { let info = Info { connect: false, data: false, @@ -124,10 +124,10 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::None | Code::Char('[') => State::Nok, - Code::Char(']') if !info.data => State::Nok, + None | Some('[') => State::Nok, + Some(']') if !info.data => State::Nok, _ if info.size > LINK_REFERENCE_SIZE_MAX => State::Nok, - Code::Char(']') => { + Some(']') => { tokenizer.exit(info.options.string.clone()); tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); @@ -135,7 +135,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(info.options.label); State::Ok } - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go( + Some('\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -168,7 +168,7 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '[' | ']') => { + None | Some('\n' | '[' | ']') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } @@ -176,12 +176,12 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Code::VirtualSpace | Code::Char('\t' | ' ') => { + Some('\t' | ' ') => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| label(t, info))) } - Code::Char('\\') => { + Some('\\') => { tokenizer.consume(); info.size += 1; if !info.data { @@ -189,7 +189,7 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { } State::Fn(Box::new(|t| escape(t, info))) } - Code::Char(_) => { + Some(_) => { tokenizer.consume(); info.size += 1; if !info.data { @@ -208,7 +208,7 @@ fn label(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char('[' | '\\' | ']') => { + Some('[' | '\\' | ']') => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| label(t, info))) diff --git a/src/construct/partial_non_lazy_continuation.rs b/src/construct/partial_non_lazy_continuation.rs index bdc22e4..068e30f 100644 --- a/src/construct/partial_non_lazy_continuation.rs +++ b/src/construct/partial_non_lazy_continuation.rs @@ -11,7 +11,7 @@ //! 
[html_flow]: crate::construct::html_flow use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Start of continuation. /// @@ -22,7 +22,7 @@ use crate::tokenizer::{Code, State, Tokenizer}; /// ``` pub fn start(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.enter(Token::LineEnding); tokenizer.consume(); tokenizer.exit(Token::LineEnding); diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index 5f1a917..6070ffe 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -6,7 +6,7 @@ use crate::subtokenize::link; use crate::token::Token; -use crate::tokenizer::{Code, ContentType, State, StateFn, Tokenizer}; +use crate::tokenizer::{ContentType, State, StateFn, Tokenizer}; /// Options to parse `space_or_tab`. #[derive(Debug)] @@ -134,7 +134,7 @@ pub fn space_or_tab_eol_with_options(options: EolOptions) -> Box<StateFn> { /// ``` fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ') if info.options.max > 0 => { + Some('\t' | ' ') if info.options.max > 0 => { tokenizer .enter_with_content(info.options.kind.clone(), info.options.content_type.clone()); @@ -165,7 +165,7 @@ fn start(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::VirtualSpace | Code::Char('\t' | ' ') if info.size < info.options.max => { + Some('\t' | ' ') if info.size < info.options.max => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| inside(t, info))) @@ -190,7 +190,7 @@ fn inside(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn after_space_or_tab(tokenizer: &mut Tokenizer, mut info: EolInfo) -> State { match tokenizer.current { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + Some('\n') => { tokenizer.enter_with_content(Token::LineEnding, info.options.content_type.clone()); if info.connect { @@ -239,10 +239,7 @@ fn after_eol(tokenizer: &mut Tokenizer, info: EolInfo) -> State { /// ``` fn after_more_space_or_tab(tokenizer: &mut Tokenizer) -> State { // Blank line not allowed. - if matches!( - tokenizer.current, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - ) { + if matches!(tokenizer.current, None | Some('\n')) { State::Nok } else { State::Ok diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index e9528fd..15fc25e 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -33,7 +33,7 @@ use super::partial_space_or_tab::{space_or_tab_eol_with_options, EolOptions}; use crate::subtokenize::link; use crate::token::Token; -use crate::tokenizer::{Code, ContentType, State, Tokenizer}; +use crate::tokenizer::{ContentType, State, Tokenizer}; /// Configuration. /// @@ -103,19 +103,6 @@ impl Kind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind. - /// - /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. - /// - /// ## Panics - /// - /// Panics if `code` is not `Code::Char('(' | '"' | '\'')`. - fn from_code(code: Code) -> Kind { - match code { - Code::Char(char) => Kind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// State needed to parse titles. 
@@ -137,10 +124,10 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { match tokenizer.current { - Code::Char('"' | '\'' | '(') => { + Some(char) if matches!(char, '"' | '\'' | '(') => { let info = Info { connect: false, - kind: Kind::from_code(tokenizer.current), + kind: Kind::from_char(char), options, }; tokenizer.enter(info.options.title.clone()); @@ -163,7 +150,7 @@ pub fn start(tokenizer: &mut Tokenizer, options: Options) -> State { /// ``` fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.enter(info.options.marker.clone()); tokenizer.consume(); tokenizer.exit(info.options.marker.clone()); @@ -185,12 +172,12 @@ fn begin(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.exit(info.options.string.clone()); begin(tokenizer, info) } - Code::None => State::Nok, - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go( + None => State::Nok, + Some('\n') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -223,15 +210,15 @@ fn at_break(tokenizer: &mut Tokenizer, mut info: Info) -> State { /// ``` fn title(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + None | Some('\n') => { tokenizer.exit(Token::Data); at_break(tokenizer, info) } - Code::Char('\\') => { + Some('\\') => { tokenizer.consume(); State::Fn(Box::new(|t| escape(t, info))) } @@ -250,7 +237,7 @@ fn title(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn escape(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.consume(); State::Fn(Box::new(|t| title(t, info))) } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 4c94c7d..152824b 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -47,8 +47,8 @@ use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN; use crate::token::Token; -use crate::tokenizer::{Code, Event, EventType, Tokenizer}; -use crate::util::span; +use crate::tokenizer::{Event, EventType, Tokenizer}; +use crate::util::slice::{Position, Slice}; /// To do. 
pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) { @@ -85,30 +85,26 @@ fn trim_data( trim_end: bool, hard_break: bool, ) { - let mut codes = span::codes( - &tokenizer.parse_state.codes, - &span::from_exit_event(&tokenizer.events, exit_index), + let mut slice = Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, exit_index), ); if trim_end { - let mut index = codes.len(); - let mut vs = 0; - let mut spaces_only = true; + let mut index = slice.chars.len(); + let vs = slice.after; + let mut spaces_only = vs == 0; while index > 0 { - match codes[index - 1] { - Code::Char(' ') => {} - Code::Char('\t') => spaces_only = false, - Code::VirtualSpace => { - vs += 1; - spaces_only = false; - } + match slice.chars[index - 1] { + ' ' => {} + '\t' => spaces_only = false, _ => break, } index -= 1; } - let diff = codes.len() - index; + let diff = slice.chars.len() - index; let token_type = if spaces_only && hard_break && exit_index + 1 < tokenizer.events.len() @@ -127,12 +123,12 @@ fn trim_data( return; } - if diff > 0 { + if diff > 0 || vs > 0 { let exit_point = tokenizer.events[exit_index].point.clone(); let mut enter_point = exit_point.clone(); enter_point.index -= diff; - enter_point.column -= diff - vs; - enter_point.offset -= diff - vs; + enter_point.column -= diff; + enter_point.vs = 0; tokenizer.map.add( exit_index + 1, @@ -154,17 +150,16 @@ fn trim_data( ); tokenizer.events[exit_index].point = enter_point; - codes = &codes[..index]; + slice.chars = &slice.chars[..index]; } } if trim_start { let mut index = 0; - let mut vs = 0; - while index < codes.len() { - match codes[index] { - Code::Char(' ' | '\t') => {} - Code::VirtualSpace => vs += 1, + let vs = slice.before; + while index < slice.chars.len() { + match slice.chars[index] { + ' ' | '\t' => {} _ => break, } index += 1; } // The whole data is whitespace. // We can be very fast: we only change the token types. - if index == codes.len() { + if index == slice.chars.len() { tokenizer.events[exit_index - 1].token_type = Token::SpaceOrTab; tokenizer.events[exit_index].token_type = Token::SpaceOrTab; return; } - if index > 0 { + if index > 0 || vs > 0 { let enter_point = tokenizer.events[exit_index - 1].point.clone(); let mut exit_point = enter_point.clone(); exit_point.index += index; - exit_point.column += index - vs; - exit_point.offset += index - vs; + exit_point.column += index; + exit_point.vs = 0; tokenizer.map.add( exit_index - 1, diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 41dc6ae..bed454b 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -51,7 +51,7 @@ use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; use crate::constant::{TAB_SIZE, THEMATIC_BREAK_MARKER_COUNT_MIN}; use crate::token::Token; -use crate::tokenizer::{Code, State, Tokenizer}; +use crate::tokenizer::{State, Tokenizer}; /// Type of thematic break. #[derive(Debug, PartialEq)] @@ -104,19 +104,6 @@ impl Kind { _ => unreachable!("invalid char"), } } - /// Turn [Code] into a kind. - /// - /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. - /// - /// ## Panics - /// - /// Panics if `code` is not `Code::Char('*' | '-' | '_')`. - fn from_code(code: Code) -> Kind { - match code { - Code::Char(char) => Kind::from_char(char), - _ => unreachable!("invalid code"), - } - } } /// State needed to parse thematic breaks. 
@@ -157,10 +144,10 @@ pub fn start(tokenizer: &mut Tokenizer) -> State { /// ``` fn before(tokenizer: &mut Tokenizer) -> State { match tokenizer.current { - Code::Char('*' | '-' | '_') => at_break( + Some(char) if matches!(char, '*' | '-' | '_') => at_break( tokenizer, Info { - kind: Kind::from_code(tokenizer.current), + kind: Kind::from_char(char), size: 0, }, ), @@ -176,15 +163,13 @@ fn before(tokenizer: &mut Tokenizer) -> State { /// ``` fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { match tokenizer.current { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') - if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => - { + None | Some('\n') if info.size >= THEMATIC_BREAK_MARKER_COUNT_MIN => { tokenizer.exit(Token::ThematicBreak); // Feel free to interrupt. tokenizer.interrupt = false; State::Ok } - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.enter(Token::ThematicBreakSequence); sequence(tokenizer, info) } @@ -200,7 +185,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info) -> State { /// ``` fn sequence(tokenizer: &mut Tokenizer, mut info: Info) -> State { match tokenizer.current { - Code::Char(char) if char == info.kind.as_char() => { + Some(char) if char == info.kind.as_char() => { tokenizer.consume(); info.size += 1; State::Fn(Box::new(|t| sequence(t, info)))
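
The hunks above all apply one pattern: `tokenizer.current` becomes a plain `Option<char>` (`None` for eof; the collapsed match arms imply line endings are normalized to `'\n'` before tokenizing), and text is recovered through `Slice`/`Position` views into `parse_state.chars` instead of serializing `Code` spans. Below is a minimal standalone sketch of that pattern; `Point` and `Slice` here are simplified stand-ins, not the crate's actual types.

// Sketch only: simplified stand-ins for the crate's `Point` and `Slice`.
#[derive(Clone, Debug)]
struct Point {
    index: usize,
}

struct Slice<'a> {
    chars: &'a [char],
}

impl<'a> Slice<'a> {
    // Cf. `Slice::from_point` in the diff: a view of the chars at/after a point.
    fn from_point(chars: &'a [char], point: &Point) -> Self {
        Slice {
            chars: &chars[point.index..],
        }
    }

    // Cf. `head()` in the diff: the first char of the slice, if any.
    fn head(&self) -> Option<char> {
        self.chars.first().copied()
    }
}

// Cf. the `None | Some('\n')` arms: eof is `None`; '\r' and CRLF are
// assumed to have been normalized to '\n' earlier in parsing.
fn at_eol_or_eof(current: Option<char>) -> bool {
    matches!(current, None | Some('\n'))
}

fn main() {
    let chars: Vec<char> = "* list item".chars().collect();
    let marker = Slice::from_point(&chars, &Point { index: 0 }).head();
    assert_eq!(marker, Some('*')); // cf. `Kind::from_char(... .head().unwrap())`
    assert!(at_eol_or_eof(None));
    assert!(!at_eol_or_eof(marker));
}

This mirrors, for example, how `list.rs` now reads a marker kind with `Kind::from_char(Slice::from_point(...).head().unwrap())` where it previously used `Kind::from_code(codes[0])`.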