aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct/html_flow.rs
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-28 16:48:00 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-07-28 16:48:00 +0200
commitf7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 (patch)
treec1ac3f22473bd79566d835b2474d2ae9e00d6c55 /src/construct/html_flow.rs
parentd729b07712ca9cc91e68af1776dac9d7008a90cb (diff)
downloadmarkdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.gz
markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.bz2
markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.zip
Refactor to work on `char`s
Previously, a custom char implementation was used. That was easier to work with, as sometimes “virtual” characters are injected, or characters are ignored. This commit replaces it with working on actual `char`s, in the hope of working on `u8`s in the future. This simplifies the state machine somewhat, as only `\n` is fed, regardless of whether it was a CRLF, CR, or LF. It also feeds `' '` instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event.
Diffstat (limited to 'src/construct/html_flow.rs')
-rw-r--r-- src/construct/html_flow.rs 224
1 file changed, 111 insertions, 113 deletions
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 24d6f98..238963d 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -105,8 +105,10 @@ use crate::construct::{
partial_space_or_tab::{space_or_tab_with_options, Options as SpaceOrTabOptions},
};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
-use crate::util::codes::{parse, serialize};
+use crate::tokenizer::{Point, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
+
+const CDATA_SEARCH: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];
/// Kind of HTML (flow).
#[derive(Debug, PartialEq)]
@@ -168,17 +170,6 @@ impl QuoteKind {
_ => unreachable!("invalid char"),
}
}
- /// Turn [Code] into a kind.
- ///
- /// ## Panics
- ///
- /// Panics if `code` is not `Code::Char('"' | '\'')`.
- fn from_code(code: Code) -> QuoteKind {
- match code {
- Code::Char(char) => QuoteKind::from_char(char),
- _ => unreachable!("invalid code"),
- }
- }
}
/// State needed to parse HTML (flow).
@@ -190,9 +181,9 @@ struct Info {
start_tag: bool,
/// Used depending on `kind` to either collect all parsed characters, or to
/// store expected characters.
- buffer: Vec<Code>,
- /// `index` into `buffer` when expecting certain characters.
- index: usize,
+ start: Option<Point>,
+ /// Collected index, for various reasons.
+ size: usize,
/// Current quote, when in a double or single quoted attribute value.
quote: Option<QuoteKind>,
}
@@ -234,7 +225,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ^
/// ```
fn before(tokenizer: &mut Tokenizer) -> State {
- if Code::Char('<') == tokenizer.current {
+ if Some('<') == tokenizer.current {
tokenizer.enter(Token::HtmlFlowData);
tokenizer.consume();
State::Fn(Box::new(open))
@@ -259,21 +250,22 @@ fn open(tokenizer: &mut Tokenizer) -> State {
kind: Kind::Basic,
// Assume closing tag (or no tag).
start_tag: false,
- buffer: vec![],
- index: 0,
+ start: None,
+ size: 0,
quote: None,
};
match tokenizer.current {
- Code::Char('!') => {
+ Some('!') => {
tokenizer.consume();
State::Fn(Box::new(|t| declaration_open(t, info)))
}
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
+ info.start = Some(tokenizer.point.clone());
State::Fn(Box::new(|t| tag_close_start(t, info)))
}
- Code::Char('?') => {
+ Some('?') => {
info.kind = Kind::Instruction;
tokenizer.consume();
// Do not form containers.
@@ -282,8 +274,9 @@ fn open(tokenizer: &mut Tokenizer) -> State {
// right now, so we do need to search for `>`, similar to declarations.
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
info.start_tag = true;
+ info.start = Some(tokenizer.point.clone());
tag_name(tokenizer, info)
}
_ => State::Nok,
@@ -302,19 +295,18 @@ fn open(tokenizer: &mut Tokenizer) -> State {
/// ```
fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
info.kind = Kind::Comment;
State::Fn(Box::new(|t| comment_open_inside(t, info)))
}
- Code::Char('[') => {
+ Some('[') => {
tokenizer.consume();
info.kind = Kind::Cdata;
- info.buffer = parse("CDATA[");
- info.index = 0;
+ info.size = 0;
State::Fn(Box::new(|t| cdata_open_inside(t, info)))
}
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
tokenizer.consume();
info.kind = Kind::Declaration;
// Do not form containers.
@@ -333,7 +325,7 @@ fn declaration_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
// Do not form containers.
tokenizer.concrete = true;
@@ -350,20 +342,21 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ^^^^^^
/// ```
fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
- if tokenizer.current == info.buffer[info.index] {
- info.index += 1;
- tokenizer.consume();
+ match tokenizer.current {
+ Some(char) if char == CDATA_SEARCH[info.size] => {
+ info.size += 1;
+ tokenizer.consume();
- if info.index == info.buffer.len() {
- info.buffer.clear();
- // Do not form containers.
- tokenizer.concrete = true;
- State::Fn(Box::new(|t| continuation(t, info)))
- } else {
- State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+ if info.size == CDATA_SEARCH.len() {
+ info.size = 0;
+ // Do not form containers.
+ tokenizer.concrete = true;
+ State::Fn(Box::new(|t| continuation(t, info)))
+ } else {
+ State::Fn(Box::new(|t| cdata_open_inside(t, info)))
+ }
}
- } else {
- State::Nok
+ _ => State::Nok,
}
}
@@ -373,11 +366,10 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// > | </x>
/// ^
/// ```
-fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
+fn tag_close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('A'..='Z' | 'a'..='z') => {
+ Some('A'..='Z' | 'a'..='z') => {
tokenizer.consume();
- info.buffer.push(tokenizer.current);
State::Fn(Box::new(|t| tag_name(t, info)))
}
_ => State::Nok,
@@ -394,22 +386,27 @@ fn tag_close_start(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::None
- | Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
- let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
- let name = tag_name_buffer.as_str();
- let slash = matches!(tokenizer.current, Code::Char('/'));
-
- info.buffer.clear();
-
- if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name) {
+ None | Some('\t' | '\n' | ' ' | '/' | '>') => {
+ let slash = matches!(tokenizer.current, Some('/'));
+ let start = info.start.take().unwrap();
+ let name = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position {
+ start: &start,
+ end: &tokenizer.point,
+ },
+ )
+ .serialize()
+ .trim()
+ .to_lowercase();
+ println!("name: {:?}", name);
+
+ if !slash && info.start_tag && HTML_RAW_NAMES.contains(&name.as_str()) {
info.kind = Kind::Raw;
// Do not form containers.
tokenizer.concrete = true;
continuation(tokenizer, info)
- } else if HTML_BLOCK_NAMES.contains(&name) {
+ } else if HTML_BLOCK_NAMES.contains(&name.as_str()) {
// Basic is assumed, no need to set `kind`.
if slash {
tokenizer.consume();
@@ -432,12 +429,11 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
}
}
}
- Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+ Some('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
tokenizer.consume();
- info.buffer.push(tokenizer.current);
State::Fn(Box::new(|t| tag_name(t, info)))
}
- Code::Char(_) => State::Nok,
+ Some(_) => State::Nok,
}
}
@@ -449,7 +445,7 @@ fn tag_name(tokenizer: &mut Tokenizer, mut info: Info) -> State {
/// ```
fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.consume();
// Do not form containers.
tokenizer.concrete = true;
@@ -467,7 +463,7 @@ fn basic_self_closing(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_closing_tag_after(t, info)))
}
@@ -496,15 +492,15 @@ fn complete_closing_tag_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_end(t, info)))
}
- Code::Char('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
+ Some('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name_before(t, info)))
}
@@ -524,7 +520,7 @@ fn complete_attribute_name_before(tokenizer: &mut Tokenizer, info: Info) -> Stat
/// ```
fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
+ Some('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name(t, info)))
}
@@ -543,11 +539,11 @@ fn complete_attribute_name(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('=') => {
+ Some('=') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_name_after(t, info)))
}
@@ -566,13 +562,13 @@ fn complete_attribute_name_after(tokenizer: &mut Tokenizer, info: Info) -> State
/// ```
fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::None | Code::Char('<' | '=' | '>' | '`') => State::Nok,
- Code::Char('"' | '\'') => {
+ None | Some('<' | '=' | '>' | '`') => State::Nok,
+ Some(char) if matches!(char, '"' | '\'') => {
+ info.quote = Some(QuoteKind::from_char(char));
tokenizer.consume();
- info.quote = Some(QuoteKind::from_code(tokenizer.current));
State::Fn(Box::new(|t| complete_attribute_value_quoted(t, info)))
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_before(t, info)))
}
@@ -590,8 +586,8 @@ fn complete_attribute_value_before(tokenizer: &mut Tokenizer, mut info: Info) ->
/// ```
fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => State::Nok,
- Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => {
+ None | Some('\n') => State::Nok,
+ Some(char) if char == info.quote.as_ref().unwrap().as_char() => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_quoted_after(t, info)))
}
@@ -610,13 +606,10 @@ fn complete_attribute_value_quoted(tokenizer: &mut Tokenizer, info: Info) -> Sta
/// ```
fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None
- | Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
+ None | Some('\t' | '\n' | ' ' | '"' | '\'' | '/' | '<' | '=' | '>' | '`') => {
complete_attribute_name_after(tokenizer, info)
}
- Code::Char(_) => {
+ Some(_) => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_attribute_value_unquoted(t, info)))
}
@@ -632,9 +625,7 @@ fn complete_attribute_value_unquoted(tokenizer: &mut Tokenizer, info: Info) -> S
/// ```
fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::VirtualSpace | Code::Char('\t' | ' ' | '/' | '>') => {
- complete_attribute_name_before(tokenizer, info)
- }
+ Some('\t' | ' ' | '/' | '>') => complete_attribute_name_before(tokenizer, info),
_ => State::Nok,
}
}
@@ -647,7 +638,7 @@ fn complete_attribute_value_quoted_after(tokenizer: &mut Tokenizer, info: Info)
/// ```
fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_after(t, info)))
}
@@ -663,16 +654,16 @@ fn complete_end(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
// Do not form containers.
tokenizer.concrete = true;
continuation(tokenizer, info)
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
+ Some('\t' | ' ') => {
tokenizer.consume();
State::Fn(Box::new(|t| complete_after(t, info)))
}
- Code::Char(_) => State::Nok,
+ Some(_) => State::Nok,
}
}
@@ -684,29 +675,27 @@ fn complete_after(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-') if info.kind == Kind::Comment => {
+ Some('-') if info.kind == Kind::Comment => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_comment_inside(t, info)))
}
- Code::Char('<') if info.kind == Kind::Raw => {
+ Some('<') if info.kind == Kind::Raw => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_raw_tag_open(t, info)))
}
- Code::Char('>') if info.kind == Kind::Declaration => {
+ Some('>') if info.kind == Kind::Declaration => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_close(t, info)))
}
- Code::Char('?') if info.kind == Kind::Instruction => {
+ Some('?') if info.kind == Kind::Instruction => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
- Code::Char(']') if info.kind == Kind::Cdata => {
+ Some(']') if info.kind == Kind::Cdata => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_character_data_inside(t, info)))
}
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
- if info.kind == Kind::Basic || info.kind == Kind::Complete =>
- {
+ Some('\n') if info.kind == Kind::Basic || info.kind == Kind::Complete => {
tokenizer.exit(Token::HtmlFlowData);
tokenizer.check(blank_line_before, |ok| {
if ok {
@@ -716,7 +705,7 @@ fn continuation(tokenizer: &mut Tokenizer, info: Info) -> State {
}
})(tokenizer)
}
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::HtmlFlowData);
continuation_start(tokenizer, info)
}
@@ -753,7 +742,7 @@ fn continuation_start(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
@@ -772,9 +761,7 @@ fn continuation_start_non_lazy(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- continuation_start(tokenizer, info)
- }
+ None | Some('\n') => continuation_start(tokenizer, info),
_ => {
tokenizer.enter(Token::HtmlFlowData);
continuation(tokenizer, info)
@@ -790,7 +777,7 @@ fn continuation_before(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('-') => {
+ Some('-') => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
@@ -804,10 +791,11 @@ fn continuation_comment_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
/// > | <script>console.log(1)</script>
/// ^
/// ```
-fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State {
+fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('/') => {
+ Some('/') => {
tokenizer.consume();
+ info.start = Some(tokenizer.point.clone());
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => continuation(tokenizer, info),
@@ -822,24 +810,34 @@ fn continuation_raw_tag_open(tokenizer: &mut Tokenizer, info: Info) -> State {
/// ```
fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
- let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
- info.buffer.clear();
-
- if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) {
+ Some('>') => {
+ info.size = 0;
+
+ let start = info.start.take().unwrap();
+ let name = Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position {
+ start: &start,
+ end: &tokenizer.point,
+ },
+ )
+ .serialize()
+ .to_lowercase();
+
+ if HTML_RAW_NAMES.contains(&name.as_str()) {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_close(t, info)))
} else {
continuation(tokenizer, info)
}
}
- Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => {
+ Some('A'..='Z' | 'a'..='z') if info.size < HTML_RAW_SIZE_MAX => {
tokenizer.consume();
- info.buffer.push(tokenizer.current);
+ info.size += 1;
State::Fn(Box::new(|t| continuation_raw_end_tag(t, info)))
}
_ => {
- info.buffer.clear();
+ info.size = 0;
continuation(tokenizer, info)
}
}
@@ -853,7 +851,7 @@ fn continuation_raw_end_tag(tokenizer: &mut Tokenizer, mut info: Info) -> State
/// ```
fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char(']') => {
+ Some(']') => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
@@ -877,11 +875,11 @@ fn continuation_character_data_inside(tokenizer: &mut Tokenizer, info: Info) ->
/// ```
fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::Char('>') => {
+ Some('>') => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_close(t, info)))
}
- Code::Char('-') if info.kind == Kind::Comment => {
+ Some('-') if info.kind == Kind::Comment => {
tokenizer.consume();
State::Fn(Box::new(|t| continuation_declaration_inside(t, info)))
}
@@ -897,7 +895,7 @@ fn continuation_declaration_inside(tokenizer: &mut Tokenizer, info: Info) -> Sta
/// ```
fn continuation_close(tokenizer: &mut Tokenizer, info: Info) -> State {
match tokenizer.current {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None | Some('\n') => {
tokenizer.exit(Token::HtmlFlowData);
continuation_after(tokenizer)
}