diff options
31 files changed, 396 insertions, 372 deletions
@@ -154,7 +154,6 @@ cargo doc --document-private-items - [ ] (3) Check subtokenizer unraveling is ok - [ ] (3) Remove splicing and cloning in subtokenizer - [ ] (3) Pass more references around -- [ ] (1) Remove todos in `span.rs` if not needed - [ ] (1) Get markers from constructs (`string`, `text`) - [ ] (3) Read through rust docs to figure out what useful functions there are, and fix stuff I’m doing manually now @@ -276,3 +275,4 @@ important. - [x] (3) Unicode punctuation - [x] (1) Use rust to crawl unicode - [x] (1) Document attention +- [x] (1) Remove todos in `span.rs` if not needed diff --git a/src/construct/attention.rs b/src/construct/attention.rs index dff8633..2144864 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -109,6 +109,13 @@ enum MarkerKind { } impl MarkerKind { + /// Turn the kind into a [char]. + fn as_char(&self) -> char { + match self { + MarkerKind::Asterisk => '*', + MarkerKind::Underscore => '_', + } + } /// Turn [char] into a kind. /// /// ## Panics @@ -137,14 +144,23 @@ impl MarkerKind { /// Attentention sequence that we can take markers from. #[derive(Debug)] struct Sequence { + /// Marker used in this sequence. marker: MarkerKind, + /// The index into events where this sequence’s `Enter` currently resides. event_index: usize, + /// The (shifted) point where this sequence starts. start_point: Point, + /// The (shifted) index where this sequence starts. start_index: usize, + /// The (shifted) point where this sequence end. end_point: Point, + /// The (shifted) index where this sequence end. end_index: usize, + /// The number of markers we can still use. size: usize, + /// Whether this sequence can open attention. open: bool, + /// Whether this sequence can close attention. 
close: bool, } @@ -155,9 +171,9 @@ struct Sequence { /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::Char(char) if char == '*' || char == '_' => { + Code::Char('*' | '_') => { tokenizer.enter(TokenType::AttentionSequence); - inside(tokenizer, code, char) + inside(tokenizer, code, MarkerKind::from_code(code)) } _ => (State::Nok, None), } @@ -168,9 +184,9 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ```markdown /// *|* /// ``` -fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult { +fn inside(tokenizer: &mut Tokenizer, code: Code, marker: MarkerKind) -> StateFnResult { match code { - Code::Char(char) if char == marker => { + Code::Char(char) if char == marker.as_char() => { tokenizer.consume(code); (State::Fn(Box::new(move |t, c| inside(t, c, marker))), None) } diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 6486a2d..e29bf8b 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -148,17 +148,11 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// a<u|ser@example.com>b /// ``` fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - // Whether this character can be both a protocol and email atext. - let unknown = match code { - Code::Char('+' | '-' | '.') => true, - Code::Char(char) if char.is_ascii_alphanumeric() => true, - _ => false, - }; - - if unknown { - scheme_inside_or_email_atext(tokenizer, code, 1) - } else { - email_atext(tokenizer, code) + match code { + Code::Char('+' | '-' | '.' 
| '0'..='9' | 'A'..='Z' | 'a'..='z') => { + scheme_inside_or_email_atext(tokenizer, code, 1) + } + _ => email_atext(tokenizer, code), } } @@ -173,20 +167,14 @@ fn scheme_inside_or_email_atext( code: Code, size: usize, ) -> StateFnResult { - if let Code::Char(':') = code { - tokenizer.consume(code); - (State::Fn(Box::new(url_inside)), None) - } else { - // Whether this character can be both a protocol and email atext. - let unknown = match code { - Code::Char('+' | '-' | '.') if size < AUTOLINK_SCHEME_SIZE_MAX => true, - Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_SCHEME_SIZE_MAX => { - true - } - _ => false, - }; - - if unknown { + match code { + Code::Char(':') => { + tokenizer.consume(code); + (State::Fn(Box::new(url_inside)), None) + } + Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') + if size < AUTOLINK_SCHEME_SIZE_MAX => + { tokenizer.consume(code); ( State::Fn(Box::new(move |t, c| { @@ -194,9 +182,8 @@ fn scheme_inside_or_email_atext( })), None, ) - } else { - email_atext(tokenizer, code) } + _ => email_atext(tokenizer, code), } } @@ -291,22 +278,22 @@ fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnRes /// a<user.name@ex-|ample.com>b /// ``` fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { - let ok = match code { - Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => true, - Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => true, - _ => false, - }; - - if ok { - tokenizer.consume(code); - let func = if let Code::Char('-') = code { - email_value - } else { - email_label - }; - (State::Fn(Box::new(move |t, c| func(t, c, size + 1))), None) - } else { - (State::Nok, None) + match code { + Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |t, c| email_value(t, c, size + 1))), + None, + ) + } + Code::Char(char) if char.is_ascii_alphanumeric() && size < 
AUTOLINK_DOMAIN_SIZE_MAX => { + tokenizer.consume(code); + ( + State::Fn(Box::new(move |t, c| email_label(t, c, size + 1))), + None, + ) + } + _ => (State::Nok, None), } } @@ -325,7 +312,7 @@ fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::Autolink); (State::Ok, None) } - _ => unreachable!("expected `>` at `end`"), + _ => unreachable!("expected `>`"), } } diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index bc42d21..65e49ca 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -138,21 +138,18 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// a&|#x9;b /// ``` fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let info = Info { + buffer: vec![], + kind: Kind::Named, + }; if let Code::Char('#') = code { tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric); tokenizer.consume(code); tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric); - (State::Fn(Box::new(numeric)), None) + (State::Fn(Box::new(|t, c| numeric(t, c, info))), None) } else { tokenizer.enter(TokenType::CharacterReferenceValue); - value( - tokenizer, - code, - Info { - buffer: vec![], - kind: Kind::Named, - }, - ) + value(tokenizer, code, info) } } @@ -163,37 +160,18 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// a&#|123;b /// a&#|x9;b /// ``` -fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn numeric(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { if let Code::Char('x' | 'X') = code { tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal); tokenizer.consume(code); tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal); tokenizer.enter(TokenType::CharacterReferenceValue); - - ( - State::Fn(Box::new(|t, c| { - value( - t, - c, - Info { - buffer: vec![], - kind: Kind::Hexadecimal, - }, - ) - })), - None, - ) + info.kind = 
Kind::Hexadecimal; + (State::Fn(Box::new(|t, c| value(t, c, info))), None) } else { tokenizer.enter(TokenType::CharacterReferenceValue); - - value( - tokenizer, - code, - Info { - buffer: vec![], - kind: Kind::Decimal, - }, - ) + info.kind = Kind::Decimal; + value(tokenizer, code, info) } } @@ -210,20 +188,19 @@ fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { fn value(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { match code { Code::Char(';') if !info.buffer.is_empty() => { - tokenizer.exit(TokenType::CharacterReferenceValue); - let value = info.buffer.iter().collect::<String>(); - - if let Kind::Named = info.kind { - if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) { - return (State::Nok, None); - } + if Kind::Named == info.kind + && !CHARACTER_REFERENCE_NAMES + .contains(&info.buffer.iter().collect::<String>().as_str()) + { + (State::Nok, None) + } else { + tokenizer.exit(TokenType::CharacterReferenceValue); + tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); + tokenizer.consume(code); + tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); + tokenizer.exit(TokenType::CharacterReference); + (State::Ok, None) } - - tokenizer.enter(TokenType::CharacterReferenceMarkerSemi); - tokenizer.consume(code); - tokenizer.exit(TokenType::CharacterReferenceMarkerSemi); - tokenizer.exit(TokenType::CharacterReference); - (State::Ok, None) } Code::Char(char) => { if info.buffer.len() < info.kind.max() && info.kind.allowed(char) { diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index f2d243a..05266ba 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -151,6 +151,17 @@ impl Kind { _ => unreachable!("invalid char"), } } + /// Turn [Code] into a kind. + /// + /// ## Panics + /// + /// Panics if `code` is not ``Code::Char('~' | '`')``. 
+ fn from_code(code: Code) -> Kind { + match code { + Code::Char(char) => Kind::from_char(char), + _ => unreachable!("invalid code"), + } + } } /// State needed to parse code (fenced). @@ -172,10 +183,6 @@ struct Info { /// console.log(1); /// ~~~ /// ``` -/// -/// Parsing note: normally, the prefix is already stripped. -/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need -/// it. pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::CodeFenced); tokenizer.enter(TokenType::CodeFencedFence); @@ -202,7 +209,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult } match code { - Code::Char(char) if char == '`' || char == '~' => { + Code::Char('`' | '~') => { tokenizer.enter(TokenType::CodeFencedFenceSequence); sequence_open( tokenizer, @@ -210,7 +217,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult Info { prefix, size: 0, - kind: Kind::from_char(char), + kind: Kind::from_code(code), }, ) } @@ -237,11 +244,11 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State None, ) } - _ if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN => (State::Nok, None), - _ => { + _ if info.size >= CODE_FENCED_SEQUENCE_SIZE_MIN => { tokenizer.exit(TokenType::CodeFencedFenceSequence); tokenizer.attempt_opt(space_or_tab(), |t, c| info_before(t, c, info))(tokenizer, code) } + _ => (State::Nok, None), } } @@ -291,7 +298,7 @@ fn info_inside( tokenizer.exit(TokenType::CodeFencedFenceInfo); tokenizer.attempt_opt(space_or_tab(), |t, c| meta_before(t, c, info))(tokenizer, code) } - Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + Code::Char('`') if info.kind == Kind::GraveAccent => (State::Nok, None), Code::Char(_) => { codes.push(code); tokenizer.consume(code); @@ -339,7 +346,7 @@ fn meta(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { 
tokenizer.exit(TokenType::CodeFencedFence); at_break(tokenizer, code, info) } - Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None), + Code::Char('`') if info.kind == Kind::GraveAccent => (State::Nok, None), _ => { tokenizer.consume(code); (State::Fn(Box::new(|t, c| meta(t, c, info))), None) @@ -369,7 +376,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult } }, )(tokenizer, code), - _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code), + _ => unreachable!("expected eof/eol"), } } diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index c595c75..a6dc7eb 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -138,7 +138,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnR fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult { match code { Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.enter(TokenType::CodeTextLineEnding); tokenizer.consume(code); tokenizer.exit(TokenType::CodeTextLineEnding); @@ -165,7 +165,7 @@ fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnRe /// ``` fn data(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '`') => { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '`') => { tokenizer.exit(TokenType::CodeTextData); between(tokenizer, code, size_open) } diff --git a/src/construct/definition.rs b/src/construct/definition.rs index e1afd03..db4a009 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -227,7 +227,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> 
StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::Definition); // You’d be interrupting. tokenizer.interrupt = true; @@ -293,7 +293,7 @@ fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn title_after_after_optional_whitespace(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { (State::Ok, Some(vec![code])) } _ => (State::Nok, None), diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index 1e755a3..212d276 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HardBreakEscape); (State::Ok, Some(vec![code])) } diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs index 6709e51..35a7cab 100644 --- a/src/construct/hard_break_trailing.rs +++ b/src/construct/hard_break_trailing.rs @@ -76,7 +76,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult { None, ) } - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') if size >= HARD_BREAK_PREFIX_SIZE_MIN => { tokenizer.exit(TokenType::HardBreakTrailingSpace); diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 3ce7052..9fa2ace 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -181,7 +181,7 @@ fn 
data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// To do. +/// Resolve heading (atx). pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { let mut edit_map = EditMap::new(); let mut index = 0; diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index df20aa7..211434f 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -179,7 +179,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// To do. +/// Resolve heading (setext). pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { let mut edit_map = EditMap::new(); let mut index = 0; @@ -207,7 +207,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { tokenizer.events[enter].token_type = TokenType::HeadingSetextText; tokenizer.events[exit].token_type = TokenType::HeadingSetextText; - // Add of Enter:HeadingSetext, Exit:HeadingSetext. + // Add Enter:HeadingSetext, Exit:HeadingSetext. let mut heading_enter = tokenizer.events[enter].clone(); heading_enter.token_type = TokenType::HeadingSetext; let mut heading_exit = tokenizer.events[index].clone(); diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index a1bddad..229b0ef 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -103,6 +103,7 @@ use crate::construct::{ blank_line::start as blank_line, partial_space_or_tab::space_or_tab_min_max, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::codes::{parse, serialize}; /// Kind of HTML (flow). #[derive(Debug, PartialEq)] @@ -164,6 +165,17 @@ impl QuoteKind { _ => unreachable!("invalid char"), } } + /// Turn [Code] into a kind. + /// + /// ## Panics + /// + /// Panics if `code` is not `Code::Char('"' | '\'')`. + fn from_code(code: Code) -> QuoteKind { + match code { + Code::Char(char) => QuoteKind::from_char(char), + _ => unreachable!("invalid code"), + } + } } /// State needed to parse HTML (flow). 
@@ -175,7 +187,7 @@ struct Info { start_tag: bool, /// Used depending on `kind` to either collect all parsed characters, or to /// store expected characters. - buffer: Vec<char>, + buffer: Vec<Code>, /// `index` into `buffer` when expecting certain characters. index: usize, /// Current quote, when in a double or single quoted attribute value. @@ -254,7 +266,7 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { None, ) } - Code::Char(char) if char.is_ascii_alphabetic() => { + Code::Char('A'..='Z' | 'a'..='z') => { info.start_tag = true; tag_name(tokenizer, code, info) } @@ -282,14 +294,14 @@ fn declaration_open(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> St Code::Char('[') => { tokenizer.consume(code); info.kind = Kind::Cdata; - info.buffer = vec!['C', 'D', 'A', 'T', 'A', '[']; + info.buffer = parse("CDATA["); info.index = 0; ( State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))), None, ) } - Code::Char(char) if char.is_ascii_alphabetic() => { + Code::Char('A'..='Z' | 'a'..='z') => { tokenizer.consume(code); info.kind = Kind::Declaration; ( @@ -329,22 +341,21 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Sta /// <![CDATA|[>&<]]> /// ``` fn cdata_open_inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { - match code { - Code::Char(char) if char == info.buffer[info.index] => { - info.index += 1; - tokenizer.consume(code); + if code == info.buffer[info.index] { + info.index += 1; + tokenizer.consume(code); - if info.index == info.buffer.len() { - info.buffer.clear(); - (State::Fn(Box::new(|t, c| continuation(t, c, info))), None) - } else { - ( - State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))), - None, - ) - } + if info.index == info.buffer.len() { + info.buffer.clear(); + (State::Fn(Box::new(|t, c| continuation(t, c, info))), None) + } else { + ( + State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))), + None, + ) } - _ => (State::Nok, None), + } else { + 
(State::Nok, None) } } @@ -355,9 +366,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> S /// ``` fn tag_close_start(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { match code { - Code::Char(char) if char.is_ascii_alphabetic() => { + Code::Char('A'..='Z' | 'a'..='z') => { tokenizer.consume(code); - info.buffer.push(char); + info.buffer.push(code); (State::Fn(Box::new(|t, c| tag_name(t, c, info))), None) } _ => (State::Nok, None), @@ -376,13 +387,9 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => { - let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); + let tag_name_buffer = serialize(&info.buffer, false).to_lowercase(); let name = tag_name_buffer.as_str(); - let slash = if let Code::Char(char) = code { - char == '/' - } else { - false - }; + let slash = matches!(code, Code::Char('/')); info.buffer.clear(); @@ -413,9 +420,9 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes } } } - Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { + Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(code); - info.buffer.push(char); + info.buffer.push(code); (State::Fn(Box::new(|t, c| tag_name(t, c, info))), None) } Code::Char(_) => (State::Nok, None), @@ -481,7 +488,7 @@ fn complete_attribute_name_before( tokenizer.consume(code); (State::Fn(Box::new(|t, c| complete_end(t, c, info))), None) } - Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => { + Code::Char('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(code); ( State::Fn(Box::new(|t, c| complete_attribute_name(t, c, info))), @@ -508,13 +515,7 @@ fn complete_attribute_name_before( /// ``` fn complete_attribute_name(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { 
match code { - Code::Char(char) - if char == '-' - || char == '.' - || char == ':' - || char == '_' - || char.is_ascii_alphanumeric() => - { + Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(code); ( State::Fn(Box::new(|t, c| complete_attribute_name(t, c, info))), @@ -571,9 +572,9 @@ fn complete_attribute_value_before( ) -> StateFnResult { match code { Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None), - Code::Char(char) if char == '"' || char == '\'' => { + Code::Char('"' | '\'') => { tokenizer.consume(code); - info.quote = Some(QuoteKind::from_char(char)); + info.quote = Some(QuoteKind::from_code(code)); ( State::Fn(Box::new(|t, c| complete_attribute_value_quoted(t, c, info))), None, @@ -602,7 +603,7 @@ fn complete_attribute_value_quoted( info: Info, ) -> StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => { tokenizer.consume(code); ( @@ -860,7 +861,7 @@ fn continuation_raw_end_tag( ) -> StateFnResult { match code { Code::Char('>') => { - let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase(); + let tag_name_buffer = serialize(&info.buffer, false).to_lowercase(); info.buffer.clear(); if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) { @@ -873,9 +874,9 @@ fn continuation_raw_end_tag( continuation(tokenizer, code, info) } } - Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => { + Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => { tokenizer.consume(code); - info.buffer.push(char); + info.buffer.push(code); ( State::Fn(Box::new(|t, c| continuation_raw_end_tag(t, c, info))), None, diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index 2ac0ccd..0926f48 100644 --- 
a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -56,6 +56,7 @@ use crate::construct::partial_space_or_tab::space_or_tab; use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer}; +use crate::util::codes::parse; /// Start of HTML (text) /// @@ -94,7 +95,7 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); (State::Fn(Box::new(instruction)), None) } - Code::Char(char) if char.is_ascii_alphabetic() => { + Code::Char('A'..='Z' | 'a'..='z') => { tokenizer.consume(code); (State::Fn(Box::new(tag_open)), None) } @@ -117,13 +118,13 @@ fn declaration_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } Code::Char('[') => { tokenizer.consume(code); - let buffer = vec!['C', 'D', 'A', 'T', 'A', '[']; + let buffer = parse("CDATA["); ( State::Fn(Box::new(|t, c| cdata_open_inside(t, c, buffer, 0))), None, ) } - Code::Char(char) if char.is_ascii_alphabetic() => { + Code::Char('A'..='Z' | 'a'..='z') => { tokenizer.consume(code); (State::Fn(Box::new(declaration)), None) } @@ -197,7 +198,7 @@ fn comment_start_dash(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { fn comment(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(comment)) } Code::Char('-') => { @@ -239,25 +240,24 @@ fn comment_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { fn cdata_open_inside( tokenizer: &mut Tokenizer, code: Code, - buffer: Vec<char>, + buffer: Vec<Code>, index: usize, ) -> StateFnResult { - match code { - Code::Char(char) if char == buffer[index] => { - tokenizer.consume(code); + if code == buffer[index] { + tokenizer.consume(code); - if index + 1 == buffer.len() { - (State::Fn(Box::new(cdata)), None) - } else { - ( - State::Fn(Box::new(move |t, c| { - 
cdata_open_inside(t, c, buffer, index + 1) - })), - None, - ) - } + if index + 1 == buffer.len() { + (State::Fn(Box::new(cdata)), None) + } else { + ( + State::Fn(Box::new(move |t, c| { + cdata_open_inside(t, c, buffer, index + 1) + })), + None, + ) } - _ => (State::Nok, None), + } else { + (State::Nok, None) } } @@ -269,7 +269,7 @@ fn cdata_open_inside( fn cdata(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(cdata)) } Code::Char(']') => { @@ -319,7 +319,7 @@ fn cdata_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::Char('>') => end(tokenizer, code), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(declaration)) } _ => { @@ -338,7 +338,7 @@ fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { fn instruction(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(instruction)) } Code::Char('?') => { @@ -372,7 +372,7 @@ fn instruction_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::Char(char) if char.is_ascii_alphabetic() => { + Code::Char('A'..='Z' | 'a'..='z') => { tokenizer.consume(code); (State::Fn(Box::new(tag_close)), None) } @@ -388,7 +388,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn tag_close(tokenizer: &mut 
Tokenizer, code: Code) -> StateFnResult { match code { - Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { + Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(code); (State::Fn(Box::new(tag_close)), None) } @@ -404,7 +404,7 @@ fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(tag_close_between)) } Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -422,13 +422,13 @@ fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => { + Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => { tokenizer.consume(code); (State::Fn(Box::new(tag_open)), None) } Code::CarriageReturnLineFeed | Code::VirtualSpace - | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code), + | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer, code), _ => (State::Nok, None), } } @@ -442,7 +442,7 @@ fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(tag_open_between)) } Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -453,7 +453,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); (State::Fn(Box::new(end)), None) } - Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => { + 
Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(code); (State::Fn(Box::new(tag_open_attribute_name)), None) } @@ -470,13 +470,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::Char(char) - if char == '-' - || char == '.' - || char == ':' - || char == '_' - || char.is_ascii_alphanumeric() => - { + Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => { tokenizer.consume(code); (State::Fn(Box::new(tag_open_attribute_name)), None) } @@ -494,7 +488,7 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResu /// ``` fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(tag_open_attribute_name_after)) } Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -519,7 +513,7 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> State fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { at_line_ending(tokenizer, code, Box::new(tag_open_attribute_value_before)) } Code::VirtualSpace | Code::Char('\t' | ' ') => { @@ -555,7 +549,7 @@ fn tag_open_attribute_value_quoted( ) -> StateFnResult { match code { Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => at_line_ending( + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => at_line_ending( tokenizer, code, Box::new(move |t, c| tag_open_attribute_value_quoted(t, c, marker)), @@ -589,7 +583,7 @@ fn 
tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> S Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None), Code::CarriageReturnLineFeed | Code::VirtualSpace - | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code), + | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer, code), Code::Char(_) => { tokenizer.consume(code); (State::Fn(Box::new(tag_open_attribute_value_unquoted)), None) @@ -607,7 +601,7 @@ fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer, code: Code) match code { Code::CarriageReturnLineFeed | Code::VirtualSpace - | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code), + | Code::Char('\t' | '\n' | '\r' | ' ' | '>' | '/') => tag_open_between(tokenizer, code), _ => (State::Nok, None), } } @@ -646,7 +640,7 @@ fn at_line_ending( return_state: Box<StateFn>, ) -> StateFnResult { match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::HtmlTextData); tokenizer.enter(TokenType::LineEnding); tokenizer.consume(code); @@ -656,7 +650,7 @@ fn at_line_ending( None, ) } - _ => unreachable!("expected line ending"), + _ => unreachable!("expected eol"), } } diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index ae2f4de..5ec278e 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -45,7 +45,7 @@ use crate::util::edit_map::EditMap; pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - unreachable!("unexpected eol/eof at start of paragraph") + unreachable!("unexpected eol/eof") } _ => { tokenizer.enter(TokenType::Paragraph); @@ -99,7 +99,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> { { // Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding, 
Enter:Paragraph. edit_map.add(exit_index, 4, vec![]); - println!("rm {:?} {:?}", exit_index, exit_index + 4); // Add Exit:LineEnding position info to Exit:Data. let line_ending_exit = &tokenizer.events[enter_next_index - 1]; diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 9f99570..555ccaf 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -6,8 +6,6 @@ //! [string]: crate::content::string //! [text]: crate::content::text -// To do: pass token types in? - use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer}; use crate::util::edit_map::EditMap; @@ -34,7 +32,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnR fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult { match code { Code::None => (State::Ok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.enter(TokenType::LineEnding); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); @@ -58,7 +56,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnRe /// ``` fn data(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult { let done = match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => true, + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => true, _ if stop.contains(&code) => true, _ => false, }; diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 8b281c7..31c13ec 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -171,7 +171,7 @@ fn enclosed(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult tokenizer.exit(info.options.string.clone()); enclosed_before(tokenizer, code, info) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '<') 
=> { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '<') => { (State::Nok, None) } Code::Char('\\') => { @@ -235,7 +235,7 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace - | Code::Char('\t' | '\r' | '\n' | ' ') => { + | Code::Char('\t' | '\n' | '\r' | ' ') => { if info.balance > 0 { (State::Nok, None) } else { diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 32182d6..f201f60 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -133,7 +133,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes tokenizer.exit(info.options.label); (State::Ok, None) } - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -165,7 +165,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes /// ``` fn label(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '[' | ']') => { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '[' | ']') => { tokenizer.exit(TokenType::Data); at_break(tokenizer, code, info) } diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs index d2934b3..5b1ec5e 100644 --- a/src/construct/partial_space_or_tab.rs +++ b/src/construct/partial_space_or_tab.rs @@ -195,7 +195,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResul /// ``` fn after_space_or_tab(tokenizer: &mut Tokenizer, code: Code, mut info: EolInfo) -> StateFnResult { match code { - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::CarriageReturnLineFeed | 
Code::Char('\n' | '\r') => { tokenizer.enter_with_content(TokenType::LineEnding, info.options.content_type); if info.connect { @@ -254,7 +254,7 @@ fn after_more_space_or_tab(_tokenizer: &mut Tokenizer, code: Code) -> StateFnRes // Blank line not allowed. if matches!( code, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') ) { (State::Nok, None) } else { diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index caacb0d..010f554 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -102,6 +102,19 @@ impl Kind { _ => unreachable!("invalid char"), } } + /// Turn [Code] into a kind. + /// + /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`. + /// + /// ## Panics + /// + /// Panics if `code` is not `Code::Char('(' | '"' | '\'')`. + fn from_code(code: Code) -> Kind { + match code { + Code::Char(char) => Kind::from_char(char), + _ => unreachable!("invalid code"), + } + } } /// State needed to parse titles. 
@@ -124,10 +137,10 @@ struct Info { /// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code, options: Options) -> StateFnResult { match code { - Code::Char(char) if char == '"' || char == '\'' || char == '(' => { + Code::Char('"' | '\'' | '(') => { let info = Info { connect: false, - kind: Kind::from_char(char), + kind: Kind::from_code(code), options, }; tokenizer.enter(info.options.title.clone()); @@ -180,7 +193,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes begin(tokenizer, code, info) } Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go( + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go( space_or_tab_eol_with_options(EolOptions { content_type: Some(ContentType::String), connect: info.connect, @@ -216,7 +229,7 @@ fn title(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult { tokenizer.exit(TokenType::Data); at_break(tokenizer, code, info) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { tokenizer.exit(TokenType::Data); at_break(tokenizer, code, info) } diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index 62b1205..c9ec564 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -33,7 +33,7 @@ pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { space_or_tab(), if matches!( tokenizer.previous, - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') ) { // If there’s whitespace, and we were at an eol/eof, `ok` ok @@ -48,7 +48,7 @@ pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { fn at_eol(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if matches!( code, - Code::None | Code::CarriageReturnLineFeed | 
Code::Char('\r' | '\n') + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') ) { ok(tokenizer, code) } else { diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs index 8d29157..28aca34 100644 --- a/src/construct/thematic_break.rs +++ b/src/construct/thematic_break.rs @@ -95,7 +95,7 @@ impl Kind { /// /// ## Panics /// - /// Panics if `char` is not `*`, `_`, or `_`. + /// Panics if `char` is not `*`, `-`, or `_`. fn from_char(char: char) -> Kind { match char { '*' => Kind::Asterisk, @@ -104,6 +104,19 @@ _ => unreachable!("invalid char"), } } + /// Turn [Code] into a kind. + /// + /// > 👉 **Note**: valid thematic break markers are `*`, `-`, and `_`. + /// + /// ## Panics + /// + /// Panics if `code` is not `Code::Char('*' | '-' | '_')`. + fn from_code(code: Code) -> Kind { + match code { + Code::Char(char) => Kind::from_char(char), + _ => unreachable!("invalid code"), + } + } } /// State needed to parse thematic breaks. @@ -133,11 +146,11 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - Code::Char(char) if char == '*' || char == '-' || char == '_' => at_break( + Code::Char('*' | '-' | '_') => at_break( tokenizer, code, Info { - kind: Kind::from_char(char), + kind: Kind::from_code(code), size: 0, }, ), diff --git a/src/content/flow.rs b/src/content/flow.rs index 3ff948d..74c6a62 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -140,7 +140,7 @@ fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.interrupt = false; (State::Fn(Box::new(start)), None) } - _ => unreachable!("expected eol/eof after blank line `{:?}`", code), + _ => unreachable!("expected eol/eof"), } } @@ -162,7 +162,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.exit(TokenType::LineEnding); (State::Fn(Box::new(start)), None) } - _ => unreachable!("unexpected 
non-eol/eof after flow `{:?}`", code), + _ => unreachable!("expected eol/eof"), } } diff --git a/src/content/mod.rs b/src/content/mod.rs index 395e41b..ae8ad83 100644 --- a/src/content/mod.rs +++ b/src/content/mod.rs @@ -1,6 +1,5 @@ //! Content types found in markdown. -#[allow(clippy::module_inception)] pub mod flow; pub mod string; pub mod text; diff --git a/src/content/text.rs b/src/content/text.rs index ecb6ae1..cf630f1 100644 --- a/src/content/text.rs +++ b/src/content/text.rs @@ -8,15 +8,15 @@ //! //! * [Attention][crate::construct::attention] //! * [Autolink][crate::construct::autolink] -//! * [HTML (text)][crate::construct::html_text] +//! * [Character escape][crate::construct::character_escape] +//! * [Character reference][crate::construct::character_reference] +//! * [Code (text)][crate::construct::code_text] //! * [Hard break (escape)][crate::construct::hard_break_escape] //! * [Hard break (trailing)][crate::construct::hard_break_trailing] -//! * [Code (text)][crate::construct::code_text] +//! * [HTML (text)][crate::construct::html_text] //! * [Label start (image)][crate::construct::label_start_image] //! * [Label start (link)][crate::construct::label_start_link] //! * [Label end][crate::construct::label_end] -//! * [Character escape][crate::construct::character_escape] -//! * [Character reference][crate::construct::character_reference] use crate::construct::{ attention::start as attention, autolink::start as autolink, diff --git a/src/parser.rs b/src/parser.rs index 89a0de1..32689d6 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -3,7 +3,8 @@ use std::collections::HashSet; // To do: this should start with `containers`, when they’re done. use crate::content::flow::flow; -use crate::tokenizer::{as_codes, Code, Event, Point}; +use crate::tokenizer::{Code, Event, Point}; +use crate::util::codes::parse as parse_codes; /// Information needed, in all content types, when parsing markdown. 
/// @@ -22,7 +23,7 @@ pub struct ParseState { /// Passes the codes back so the compiler can access the source. pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) { let mut parse_state = ParseState { - codes: as_codes(value), + codes: parse_codes(value), definitions: HashSet::new(), }; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 1fa94d7..f0f9ff0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -11,7 +11,6 @@ //! [`attempt`]: Tokenizer::attempt //! [`check`]: Tokenizer::check -use crate::constant::TAB_SIZE; use crate::parser::ParseState; use std::collections::HashMap; @@ -2224,83 +2223,6 @@ fn feed_impl( check_statefn_result((state, None)) } -/// Turn a string into codes. -pub fn as_codes(value: &str) -> Vec<Code> { - let mut codes: Vec<Code> = vec![]; - let mut at_start = true; - let mut at_carriage_return = false; - let mut column = 1; - - for char in value.chars() { - if at_start { - if char == '\u{feff}' { - // Ignore. - continue; - } - - at_start = false; - } - - // Send a CRLF. - if at_carriage_return && '\n' == char { - at_carriage_return = false; - codes.push(Code::CarriageReturnLineFeed); - } else { - // Send the previous CR: we’re not at a next `\n`. - if at_carriage_return { - at_carriage_return = false; - codes.push(Code::Char('\r')); - } - - match char { - // Send a replacement character. - '\0' => { - column += 1; - codes.push(Code::Char('�')); - } - // Send a tab and virtual spaces. - '\t' => { - let remainder = column % TAB_SIZE; - let mut virtual_spaces = if remainder == 0 { - 0 - } else { - TAB_SIZE - remainder - }; - codes.push(Code::Char(char)); - column += 1; - while virtual_spaces > 0 { - codes.push(Code::VirtualSpace); - column += 1; - virtual_spaces -= 1; - } - } - // Send an LF. - '\n' => { - column = 1; - codes.push(Code::Char(char)); - } - // Don’t send anything yet. - '\r' => { - column = 1; - at_carriage_return = true; - } - // Send the char. 
- _ => { - column += 1; - codes.push(Code::Char(char)); - } - } - }; - } - - // Send the last CR: we’re not at a next `\n`. - if at_carriage_return { - codes.push(Code::Char('\r')); - } - - codes -} - /// Check a [`StateFnResult`][], make sure its valid (that there are no bugs), /// and clean a final eof passed back in `remainder`. fn check_statefn_result(result: StateFnResult) -> StateFnResult { diff --git a/src/util/codes.rs b/src/util/codes.rs new file mode 100644 index 0000000..8a46d02 --- /dev/null +++ b/src/util/codes.rs @@ -0,0 +1,126 @@ +//! Utilities to deal with character codes. + +use crate::constant::TAB_SIZE; +use crate::tokenizer::Code; + +/// Turn a string into codes. +pub fn parse(value: &str) -> Vec<Code> { + let mut codes: Vec<Code> = vec![]; + let mut at_start = true; + let mut at_carriage_return = false; + let mut column = 1; + + for char in value.chars() { + if at_start { + if char == '\u{feff}' { + // Ignore. + continue; + } + + at_start = false; + } + + // Send a CRLF. + if at_carriage_return && '\n' == char { + at_carriage_return = false; + codes.push(Code::CarriageReturnLineFeed); + } else { + // Send the previous CR: we’re not at a next `\n`. + if at_carriage_return { + at_carriage_return = false; + codes.push(Code::Char('\r')); + } + + match char { + // Send a replacement character. + '\0' => { + column += 1; + codes.push(Code::Char('�')); + } + // Send a tab and virtual spaces. + '\t' => { + let remainder = column % TAB_SIZE; + let mut virtual_spaces = if remainder == 0 { + 0 + } else { + TAB_SIZE - remainder + }; + codes.push(Code::Char(char)); + column += 1; + while virtual_spaces > 0 { + codes.push(Code::VirtualSpace); + column += 1; + virtual_spaces -= 1; + } + } + // Send an LF. + '\n' => { + column = 1; + codes.push(Code::Char(char)); + } + // Don’t send anything yet. + '\r' => { + column = 1; + at_carriage_return = true; + } + // Send the char. 
+ _ => { + column += 1; + codes.push(Code::Char(char)); + } + } + }; + } + + // Send the last CR: we’re not at a next `\n`. + if at_carriage_return { + codes.push(Code::Char('\r')); + } + + codes +} + +/// Serialize codes, optionally expanding tabs. +pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { + let mut at_tab = false; + let mut index = 0; + let mut value: Vec<char> = vec![]; + + while index < codes.len() { + let code = codes[index]; + let mut at_tab_next = false; + + match code { + Code::CarriageReturnLineFeed => { + value.push('\r'); + value.push('\n'); + } + Code::Char(char) if char == '\n' || char == '\r' => { + value.push(char); + } + Code::Char(char) if char == '\t' => { + at_tab_next = true; + value.push(if expand_tabs { ' ' } else { char }); + } + Code::VirtualSpace => { + if !expand_tabs && at_tab { + index += 1; + continue; + } + value.push(' '); + } + Code::Char(char) => { + value.push(char); + } + Code::None => { + unreachable!("unexpected EOF code in codes"); + } + } + + at_tab = at_tab_next; + + index += 1; + } + + value.into_iter().collect() +} diff --git a/src/util/encode.rs b/src/util/encode.rs index 5762c22..a3bd589 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -21,11 +21,36 @@ /// /// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode) pub fn encode(value: &str) -> String { - // To do: replacing 4 times might just be slow. - // Perhaps we can walk the chars. 
- value - .replace('&', "&") - .replace('"', """) - .replace('<', "<") - .replace('>', ">") + let mut result: Vec<&str> = vec![]; + let mut start = 0; + let mut index = 0; + + for byte in value.bytes() { + if let Some(replacement) = match byte { + b'&' => Some("&"), + b'"' => Some("""), + b'<' => Some("<"), + b'>' => Some(">"), + _ => None, + } { + if start != index { + result.push(&value[start..index]); + } + + result.push(replacement); + start = index + 1; + } + + index += 1; + } + + if start == 0 { + value.to_string() + } else { + if start < index { + result.push(&value[start..index]); + } + + result.join("") + } } diff --git a/src/util/mod.rs b/src/util/mod.rs index 68ef275..d1a0e01 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@ //! Utilities used when compiling markdown. +pub mod codes; pub mod decode_character_reference; pub mod edit_map; pub mod encode; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index 4753f7b..123a3a9 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -39,7 +39,7 @@ pub fn normalize_identifier(value: &str) -> String { // Collapse markdown whitespace and trim it. for char in value.chars() { match char { - '\t' | '\r' | '\n' | ' ' => { + '\t' | '\n' | '\r' | ' ' => { at_whitespace = true; } _ => { diff --git a/src/util/span.rs b/src/util/span.rs index 02811cc..32dd00f 100644 --- a/src/util/span.rs +++ b/src/util/span.rs @@ -1,20 +1,15 @@ //! Utilities to deal with semantic labels. use crate::tokenizer::{Code, Event, EventType}; +use crate::util::codes::serialize as serialize_codes; /// A struct representing the span of an opening and closing event of a token. #[derive(Debug)] pub struct Span { - // To do: probably needed in the future. - // start: Point, /// Absolute offset (and `index` in `codes`) of where this span starts. pub start_index: usize, - // To do: probably needed in the future. 
- // end: Point, /// Absolute offset (and `index` in `codes`) of where this span ends. pub end_index: usize, - // To do: probably needed in the future. - // token_type: TokenType, } /// Get a span from an event. @@ -29,10 +24,8 @@ pub struct Span { /// When `micromark` is used, this function never panics. pub fn from_exit_event(events: &[Event], index: usize) -> Span { let exit = &events[index]; - // let end = exit.point.clone(); let end_index = exit.index; let token_type = exit.token_type.clone(); - // To do: support `enter` events if needed and walk forwards? assert_eq!( exit.event_type, EventType::Exit, @@ -44,11 +37,8 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span { let enter = &events[enter_index]; if enter.event_type == EventType::Enter && enter.token_type == token_type { return Span { - // start: enter.point.clone(), start_index: enter.index, - // end, end_index, - // token_type, }; } @@ -65,48 +55,3 @@ pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String { pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] { &codes[span.start_index..span.end_index] } - -/// Serialize a slice of codes, optionally expanding tabs. 
-fn serialize_codes(codes: &[Code], expand_tabs: bool) -> String { - let mut at_tab = false; - let mut index = 0; - let mut value: Vec<char> = vec![]; - - while index < codes.len() { - let code = codes[index]; - let mut at_tab_next = false; - - match code { - Code::CarriageReturnLineFeed => { - value.push('\r'); - value.push('\n'); - } - Code::Char(char) if char == '\n' || char == '\r' => { - value.push(char); - } - Code::Char(char) if char == '\t' => { - at_tab_next = true; - value.push(if expand_tabs { ' ' } else { char }); - } - Code::VirtualSpace => { - if !expand_tabs && at_tab { - index += 1; - continue; - } - value.push(' '); - } - Code::Char(char) => { - value.push(char); - } - Code::None => { - unreachable!("unexpected EOF code in codes"); - } - } - - at_tab = at_tab_next; - - index += 1; - } - - value.into_iter().collect() -} |