From c6f92eaedf197beafef461ee6c2bd067e7160c49 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Tue, 21 Jun 2022 16:06:50 +0200 Subject: Refactor to improve a bunch of states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Improve passing stuff around * Add traits to enums for markers and such * Fix “life time” stuff I didn’t understand --- src/compiler.rs | 58 +++-- src/construct/autolink.rs | 7 +- src/construct/character_reference.rs | 64 +++-- src/construct/code_fenced.rs | 68 +++-- src/construct/code_text.rs | 26 +- src/construct/definition.rs | 12 +- src/construct/heading_setext.rs | 56 +++-- src/construct/html_flow.rs | 462 +++++++++++++--------------------- src/construct/html_text.rs | 20 +- src/construct/paragraph.rs | 46 ++-- src/construct/partial_label.rs | 111 ++++---- src/construct/partial_space_or_tab.rs | 70 +++--- src/construct/partial_title.rs | 91 ++++--- src/construct/thematic_break.rs | 97 +++++-- src/content/string.rs | 8 - src/tokenizer.rs | 1 + 16 files changed, 605 insertions(+), 592 deletions(-) (limited to 'src') diff --git a/src/compiler.rs b/src/compiler.rs index 5c7f6d8..9bc2488 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -9,13 +9,37 @@ use crate::util::{ }; /// To do. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub enum LineEnding { CarriageReturnLineFeed, CarriageReturn, LineFeed, } +impl LineEnding { + /// Turn the line ending into a [str]. + fn as_str(&self) -> &str { + match self { + LineEnding::CarriageReturnLineFeed => "\r\n", + LineEnding::CarriageReturn => "\r", + LineEnding::LineFeed => "\n", + } + } + /// Turn a [Code] into a line ending. + /// + /// ## Panics + /// + /// Panics if `code` is not `\r\n`, `\r`, or `\n`. + fn from_code(code: Code) -> LineEnding { + match code { + Code::CarriageReturnLineFeed => LineEnding::CarriageReturnLineFeed, + Code::Char('\r') => LineEnding::CarriageReturn, + Code::Char('\n') => LineEnding::LineFeed, + _ => unreachable!("invalid code"), + } + } +} + /// Configuration (optional). #[derive(Default, Debug)] pub struct Options { @@ -120,29 +144,20 @@ pub fn compile(events: &[Event], codes: &[Code], options: &Options) -> String { || event.token_type == TokenType::LineEnding) { let codes = codes_from_span(codes, &from_exit_event(events, index)); - let code = *codes.first().unwrap(); - line_ending_inferred = Some(if code == Code::CarriageReturnLineFeed { - LineEnding::CarriageReturnLineFeed - } else if code == Code::Char('\r') { - LineEnding::CarriageReturn - } else { - LineEnding::LineFeed - }); + line_ending_inferred = Some(LineEnding::from_code(*codes.first().unwrap())); break; } index += 1; } - let line_ending_default: LineEnding; - - if let Some(value) = line_ending_inferred { - line_ending_default = value; + let line_ending_default = if let Some(value) = line_ending_inferred { + value } else if let Some(value) = &options.default_line_ending { - line_ending_default = value.clone(); + value.clone() } else { - line_ending_default = LineEnding::LineFeed; - } + LineEnding::LineFeed + }; index = 0; @@ -557,17 +572,8 @@ fn buf_tail(buffers: &mut [Vec]) -> &Vec { /// Add a line ending. fn line_ending(buffers: &mut [Vec], default: &LineEnding) { let tail = buf_tail_mut(buffers); - - println!("xxx: {:?}", default); - - let line_ending = match default { - LineEnding::CarriageReturnLineFeed => "\r\n", - LineEnding::CarriageReturn => "\r", - LineEnding::LineFeed => "\n", - }; - // lastWasTag = false - tail.push(line_ending.to_string()); + tail.push(default.as_str().to_string()); } /// Add a line ending if needed (as in, there’s no eol/eof already). diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index c9596a6..8376b98 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -266,12 +266,11 @@ fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnRes ) } Code::Char('>') => { - let tail_index = tokenizer.events.len(); - let head_index = tokenizer.events.len() - 1; + let index = tokenizer.events.len(); tokenizer.exit(TokenType::AutolinkProtocol); // Change the token type. - tokenizer.events[head_index].token_type = TokenType::AutolinkEmail; - tokenizer.events[tail_index].token_type = TokenType::AutolinkEmail; + tokenizer.events[index - 1].token_type = TokenType::AutolinkEmail; + tokenizer.events[index].token_type = TokenType::AutolinkEmail; end(tokenizer, code) } _ => email_value(tokenizer, code, size), diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index c946dae..decf852 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -59,7 +59,7 @@ use crate::constant::{ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Kind of a character reference. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub enum Kind { /// Numeric decimal character reference (` `). Decimal, @@ -69,6 +69,28 @@ pub enum Kind { Named, } +impl Kind { + /// Get the maximum size of characters allowed in a character reference. + fn max(&self) -> usize { + match self { + Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, + Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, + Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX, + } + } + + /// Check if a char is allowed. + fn allowed(&self, char: char) -> bool { + let check = match self { + Kind::Hexadecimal => char::is_ascii_hexdigit, + Kind::Decimal => char::is_ascii_digit, + Kind::Named => char::is_ascii_alphanumeric, + }; + + check(&char) + } +} + /// State needed to parse character references. #[derive(Debug, Clone)] struct Info { @@ -141,10 +163,10 @@ fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::CharacterReferenceValue); ( - State::Fn(Box::new(|tokenizer, code| { + State::Fn(Box::new(|t, c| { value( - tokenizer, - code, + t, + c, Info { buffer: vec![], kind: Kind::Hexadecimal, @@ -179,7 +201,7 @@ fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// a|23;b /// a&#x|9;b /// ``` -fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { +fn value(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { match code { Code::Char(';') if !info.buffer.is_empty() => { tokenizer.exit(TokenType::CharacterReferenceValue); @@ -198,36 +220,10 @@ fn value(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { (State::Ok, None) } Code::Char(char) => { - let len = info.buffer.len(); - - let cont = match info.kind { - Kind::Hexadecimal - if char.is_ascii_hexdigit() - && len < CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX => - { - true - } - Kind::Decimal - if char.is_ascii_digit() && len < CHARACTER_REFERENCE_DECIMAL_SIZE_MAX => - { - true - } - Kind::Named - if char.is_ascii_alphanumeric() && len < CHARACTER_REFERENCE_NAMED_SIZE_MAX => - { - true - } - _ => false, - }; - - if cont { - let mut clone = info; - clone.buffer.push(char); + if info.buffer.len() < info.kind.max() && info.kind.allowed(char) { + info.buffer.push(char); tokenizer.consume(code); - ( - State::Fn(Box::new(|tokenizer, code| value(tokenizer, code, clone))), - None, - ) + (State::Fn(Box::new(|t, c| value(t, c, info))), None) } else { (State::Nok, None) } diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 30ec911..f79705c 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -99,11 +99,49 @@ use crate::util::span::from_exit_event; #[derive(Debug, Clone, PartialEq)] pub enum Kind { /// Grave accent (tick) code. + /// + /// ## Example + /// + /// ````markdown + /// ```rust + /// println!("I <3 🦀"); + /// ``` + /// ```` GraveAccent, /// Tilde code. + /// + /// ## Example + /// + /// ```markdown + /// ~~~rust + /// println!("I <3 🦀"); + /// ~~~ + /// ``` Tilde, } +impl Kind { + /// Turn the kind into a [char]. + fn as_char(&self) -> char { + match self { + Kind::GraveAccent => '`', + Kind::Tilde => '~', + } + } + /// Turn a [char] into a kind. + /// + /// ## Panics + /// + /// Panics if `char` is not `~` or `` ` ``. + fn from_char(char: char) -> Kind { + match char { + '`' => Kind::GraveAccent, + '~' => Kind::Tilde, + _ => unreachable!("invalid char"), + } + } +} + /// State needed to parse code (fenced). #[derive(Debug, Clone)] struct Info { @@ -160,11 +198,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult Info { prefix, size: 0, - kind: if char == '`' { - Kind::GraveAccent - } else { - Kind::Tilde - }, + kind: Kind::from_char(char), }, ) } @@ -180,14 +214,8 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult /// ~~~ /// ``` fn sequence_open(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { - let marker = if info.kind == Kind::GraveAccent { - '`' - } else { - '~' - }; - match code { - Code::Char(char) if char == marker => { + Code::Char(char) if char == info.kind.as_char() => { tokenizer.consume(code); ( State::Fn(Box::new(|t, c| { @@ -375,14 +403,8 @@ fn close_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResu /// |~~~ /// ``` fn close_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { - let marker = if info.kind == Kind::GraveAccent { - '`' - } else { - '~' - }; - match code { - Code::Char(char) if char == marker => { + Code::Char(char) if char == info.kind.as_char() => { tokenizer.enter(TokenType::CodeFencedFenceSequence); close_sequence(tokenizer, code, info, 0) } @@ -398,14 +420,8 @@ fn close_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnRes /// ~|~~ /// ``` fn close_sequence(tokenizer: &mut Tokenizer, code: Code, info: Info, size: usize) -> StateFnResult { - let marker = if info.kind == Kind::GraveAccent { - '`' - } else { - '~' - }; - match code { - Code::Char(char) if char == marker => { + Code::Char(char) if char == info.kind.as_char() => { tokenizer.consume(code); ( State::Fn(Box::new(move |t, c| close_sequence(t, c, info, size + 1))), diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index 1f34e41..94e0106 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -113,9 +113,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnR if let Code::Char('`') = code { tokenizer.consume(code); ( - State::Fn(Box::new(move |tokenizer, code| { - sequence_open(tokenizer, code, size + 1) - })), + State::Fn(Box::new(move |t, c| sequence_open(t, c, size + 1))), None, ) } else { @@ -138,9 +136,7 @@ fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnRe tokenizer.consume(code); tokenizer.exit(TokenType::CodeTextLineEnding); ( - State::Fn(Box::new(move |tokenizer, code| { - between(tokenizer, code, size_open) - })), + State::Fn(Box::new(move |t, c| between(t, c, size_open))), None, ) } @@ -168,12 +164,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResul } _ => { tokenizer.consume(code); - ( - State::Fn(Box::new(move |tokenizer, code| { - data(tokenizer, code, size_open) - })), - None, - ) + (State::Fn(Box::new(move |t, c| data(t, c, size_open))), None) } } } @@ -193,8 +184,8 @@ fn sequence_close( Code::Char('`') => { tokenizer.consume(code); ( - State::Fn(Box::new(move |tokenizer, code| { - sequence_close(tokenizer, code, size_open, size + 1) + State::Fn(Box::new(move |t, c| { + sequence_close(t, c, size_open, size + 1) })), None, ) @@ -205,12 +196,11 @@ fn sequence_close( (State::Ok, Some(vec![code])) } _ => { - let tail_index = tokenizer.events.len(); - let head_index = tokenizer.events.len() - 1; + let index = tokenizer.events.len(); tokenizer.exit(TokenType::CodeTextSequence); // Change the token type. - tokenizer.events[head_index].token_type = TokenType::CodeTextData; - tokenizer.events[tail_index].token_type = TokenType::CodeTextData; + tokenizer.events[index - 1].token_type = TokenType::CodeTextData; + tokenizer.events[index].token_type = TokenType::CodeTextData; between(tokenizer, code, size_open) } } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 03baee6..61c4d34 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -144,16 +144,14 @@ fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn destination_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { let event = tokenizer.events.last().unwrap(); - // Blank line not ok. - let char_nok = matches!( - code, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') - ); // Whitespace. if (event.token_type == TokenType::LineEnding || event.token_type == TokenType::Whitespace) - && !char_nok - { + // Blank line not ok. + && !matches!( + code, + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') + ) { tokenizer.go(destination, destination_after)(tokenizer, code) } else { (State::Nok, None) diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index a418041..7c41855 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -57,12 +57,49 @@ use crate::util::{link::link, span::from_exit_event}; /// Kind of underline. #[derive(Debug, Clone, PartialEq)] pub enum Kind { - /// Grave accent (tick) code. + /// Dash (rank 2) heading. + /// + /// ## Example + /// + /// ```markdown + /// alpha + /// ----- + /// ``` Dash, - /// Tilde code. + + /// Equals to (rank 1) heading. + /// + /// ## Example + /// + /// ```markdown + /// alpha + /// ===== + /// ``` EqualsTo, } +impl Kind { + /// Turn the kind into a [char]. + fn as_char(&self) -> char { + match self { + Kind::Dash => '-', + Kind::EqualsTo => '=', + } + } + /// Turn a [char] into a kind. + /// + /// ## Panics + /// + /// Panics if `char` is not `-` or `=`. + fn from_char(char: char) -> Kind { + match char { + '-' => Kind::Dash, + '=' => Kind::EqualsTo, + _ => unreachable!("invalid char"), + } + } +} + /// Start of a heading (setext). /// /// ```markdown @@ -232,13 +269,8 @@ fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnRes match code { Code::Char(char) if char == '-' || char == '=' => { - let marker = if char == '-' { - Kind::Dash - } else { - Kind::EqualsTo - }; tokenizer.enter(TokenType::HeadingSetextUnderline); - underline_sequence_inside(tokenizer, code, marker) + underline_sequence_inside(tokenizer, code, Kind::from_char(char)) } _ => (State::Nok, None), } @@ -251,15 +283,11 @@ fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnRes /// =|= /// ``` fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { - let marker = if kind == Kind::Dash { '-' } else { '=' }; - match code { - Code::Char(char) if char == marker => { + Code::Char(char) if char == kind.as_char() => { tokenizer.consume(code); ( - State::Fn(Box::new(move |tokenizer, code| { - underline_sequence_inside(tokenizer, code, kind) - })), + State::Fn(Box::new(move |t, c| underline_sequence_inside(t, c, kind))), None, ) } diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 4819e63..d5937c5 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -97,10 +97,8 @@ use crate::construct::{blank_line::start as blank_line, partial_space_or_tab::sp use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Kind of HTML (flow). -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, PartialEq)] enum Kind { - /// Not yet known. - Unknown, /// Symbol for `