From f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 Mon Sep 17 00:00:00 2001
From: Titus Wormer
Date: Thu, 28 Jul 2022 16:48:00 +0200
Subject: Refactor to work on `char`s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, a custom char implementation was used. This was easier to work
with, as sometimes “virtual” characters were injected, or characters were
ignored. This replaces that with working on actual `char`s, in the hope of
eventually working on `u8`s as well.

This simplifies the state machine somewhat, as only `\n` is fed, regardless
of whether the input contained a CRLF, CR, or LF. It also feeds `' '` instead
of virtual spaces.

The BOM, if present, is now available as a `ByteOrderMark` event.
---
 src/construct/code_fenced.rs | 65 ++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 38 deletions(-)

(limited to 'src/construct/code_fenced.rs')

diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 2fea95e..98fa54f 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -107,8 +107,8 @@ use crate::construct::{
     partial_space_or_tab::{space_or_tab, space_or_tab_min_max},
 };
 use crate::token::Token;
-use crate::tokenizer::{Code, ContentType, State, Tokenizer};
-use crate::util::span::from_exit_event;
+use crate::tokenizer::{ContentType, State, Tokenizer};
+use crate::util::slice::{Position, Slice};
 
 /// Kind of fences.
 #[derive(Debug, Clone, PartialEq)]
@@ -155,17 +155,6 @@ impl Kind {
             _ => unreachable!("invalid char"),
         }
     }
-    /// Turn [Code] into a kind.
-    ///
-    /// ## Panics
-    ///
-    /// Panics if `code` is not ``Code::Char('~' | '`')``.
-    fn from_code(code: Code) -> Kind {
-        match code {
-            Code::Char(char) => Kind::from_char(char),
-            _ => unreachable!("invalid code"),
-        }
-    }
 }
 
 /// State needed to parse code (fenced).
@@ -217,20 +206,23 @@ fn before_sequence_open(tokenizer: &mut Tokenizer) -> State {
 
     if let Some(event) = tail {
         if event.token_type == Token::SpaceOrTab {
-            let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
-            prefix = span.end_index - span.start_index;
+            prefix = Slice::from_position(
+                &tokenizer.parse_state.chars,
+                &Position::from_exit_event(&tokenizer.events, tokenizer.events.len() - 1),
+            )
+            .size();
         }
     }
 
     match tokenizer.current {
-        Code::Char('`' | '~') => {
+        Some(char) if matches!(char, '`' | '~') => {
             tokenizer.enter(Token::CodeFencedFenceSequence);
             sequence_open(
                 tokenizer,
                 Info {
                     prefix,
                     size: 0,
-                    kind: Kind::from_code(tokenizer.current),
+                    kind: Kind::from_char(char),
                 },
             )
         }
@@ -248,7 +240,7 @@
 /// ```
 fn sequence_open(tokenizer: &mut Tokenizer, mut info: Info) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == info.kind.as_char() => {
+        Some(char) if char == info.kind.as_char() => {
             tokenizer.consume();
             State::Fn(Box::new(|t| {
                 info.size += 1;
@@ -273,7 +265,7 @@
 /// ```
 fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFencedFence);
             // Do not form containers.
             tokenizer.concrete = true;
@@ -282,7 +274,7 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info) -> State {
         _ => {
             tokenizer.enter(Token::CodeFencedFenceInfo);
             tokenizer.enter_with_content(Token::Data, Some(ContentType::String));
-            info_inside(tokenizer, info, vec![])
+            info_inside(tokenizer, info)
         }
     }
 }
@@ -295,9 +287,9 @@
 /// | console.log(1)
 /// | ~~~
 /// ```
-fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> State {
+fn info_inside(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::Data);
             tokenizer.exit(Token::CodeFencedFenceInfo);
             tokenizer.exit(Token::CodeFencedFence);
@@ -305,16 +297,15 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> S
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
-        Code::VirtualSpace | Code::Char('\t' | ' ') => {
+        Some('\t' | ' ') => {
             tokenizer.exit(Token::Data);
             tokenizer.exit(Token::CodeFencedFenceInfo);
             tokenizer.attempt_opt(space_or_tab(), |t| meta_before(t, info))(tokenizer)
         }
-        Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok,
-        Code::Char(_) => {
-            codes.push(tokenizer.current);
+        Some('`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some(_) => {
             tokenizer.consume();
-            State::Fn(Box::new(|t| info_inside(t, info, codes)))
+            State::Fn(Box::new(|t| info_inside(t, info)))
         }
     }
 }
@@ -329,7 +320,7 @@ fn info_inside(tokenizer: &mut Tokenizer, info: Info, mut codes: Vec<Code>) -> S
 /// ```
 fn meta_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFencedFence);
             // Do not form containers.
             tokenizer.concrete = true;
@@ -353,7 +344,7 @@
 /// ```
 fn meta(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::Data);
             tokenizer.exit(Token::CodeFencedFenceMeta);
             tokenizer.exit(Token::CodeFencedFence);
@@ -361,7 +352,7 @@
             tokenizer.concrete = true;
             at_break(tokenizer, info)
         }
-        Code::Char('`') if info.kind == Kind::GraveAccent => State::Nok,
+        Some('`') if info.kind == Kind::GraveAccent => State::Nok,
         _ => {
             tokenizer.consume();
             State::Fn(Box::new(|t| meta(t, info)))
@@ -422,7 +413,7 @@ fn at_non_lazy_break(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        Some('\n') => {
             tokenizer.enter(Token::LineEnding);
             tokenizer.consume();
             tokenizer.exit(Token::LineEnding);
@@ -461,7 +452,7 @@ fn close_start(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == info.kind.as_char() => {
+        Some(char) if char == info.kind.as_char() => {
             tokenizer.enter(Token::CodeFencedFenceSequence);
             close_sequence(tokenizer, info, 0)
         }
@@ -479,7 +470,7 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
     match tokenizer.current {
-        Code::Char(char) if char == info.kind.as_char() => {
+        Some(char) if char == info.kind.as_char() => {
             tokenizer.consume();
             State::Fn(Box::new(move |t| close_sequence(t, info, size + 1)))
         }
@@ -501,7 +492,7 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, size: usize) -> State {
 /// ```
 fn close_sequence_after(tokenizer: &mut Tokenizer) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
             tokenizer.exit(Token::CodeFencedFence);
             State::Ok
         }
@@ -547,9 +538,7 @@ fn content_start(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            at_break(tokenizer, info)
-        }
+        None | Some('\n') => at_break(tokenizer, info),
         _ => {
             tokenizer.enter(Token::CodeFlowChunk);
             content_continue(tokenizer, info)
         }
@@ -567,7 +556,7 @@ fn content_begin(tokenizer: &mut Tokenizer, info: Info) -> State {
 /// ```
 fn content_continue(tokenizer: &mut Tokenizer, info: Info) -> State {
     match tokenizer.current {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+        None | Some('\n') => {
            tokenizer.exit(Token::CodeFlowChunk);
            at_break(tokenizer, info)
        }
--
cgit
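
For context, a minimal sketch of the idea behind the refactor, kept separate from the patch above. The types below are simplified stand-ins with hypothetical names, not the crate's actual `Code` enum or tokenizer API:

// Hypothetical, simplified stand-in for the old custom character type.
#[allow(dead_code)]
enum Code {
    None,                   // end of input
    CarriageReturnLineFeed, // CRLF fed to the state machine as one unit
    VirtualSpace,           // injected while expanding tabs
    Char(char),             // an actual character
}

// After the refactor, state functions see plain `Option<char>`: a
// preprocessing step feeds '\n' for CRLF, CR, or LF alike and ' ' for
// what used to be a virtual space, so `None` only ever means end of input.
fn at_line_ending_or_eof(current: Option<char>) -> bool {
    matches!(current, None | Some('\n'))
}

fn main() {
    // The old enum needed four-armed matches such as
    // `Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')`;
    // the new representation collapses them to `None | Some('\n')`.
    assert!(at_line_ending_or_eof(None));
    assert!(at_line_ending_or_eof(Some('\n')));
    assert!(!at_line_ending_or_eof(Some('`')));
}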