diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 18:22:59 +0200 |
commit | 0eeff9148e327183e532752f46421a75506dd7a6 (patch) | |
tree | 4f0aed04f90aa759ce96a2e87aa719e7fa95c450 /src/construct/character_reference.rs | |
parent | 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (diff) | |
download | markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.gz markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.tar.bz2 markdown-rs-0eeff9148e327183e532752f46421a75506dd7a6.zip |
Refactor to improve states
* Remove custom kind wrappers, use plain bytes instead
* Remove `Into`s, use the explicit expected types instead
* Refactor to use `slice.as_str` in most places
* Remove unneeded unique check before adding a definition
* Use a shared CDATA prefix in constants
* Inline byte checks into matches
* Pass bytes back from parser instead of whole parse state
* Refactor to work more often on bytes
* Rename custom `size` to `len`
Diffstat (limited to 'src/construct/character_reference.rs')
-rw-r--r-- | src/construct/character_reference.rs | 132 |
1 file changed, 46 insertions, 86 deletions
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 90763c1..cd489a4 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -66,67 +66,18 @@ use crate::constant::{ CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, CHARACTER_REFERENCE_NAMED_SIZE_MAX, }; use crate::token::Token; -use crate::tokenizer::{Point, State, Tokenizer}; -use crate::util::slice::{Position, Slice}; - -/// Kind of a character reference. -#[derive(Debug, Clone, PartialEq)] -pub enum Kind { - /// Numeric decimal character reference. - /// - /// ```markdown - /// > | a	b - /// ^^^^^ - /// ``` - Decimal, - /// Numeric hexadecimal character reference. - /// - /// ```markdown - /// > | a{b - /// ^^^^^^ - /// ``` - Hexadecimal, - /// Named character reference. - /// - /// ```markdown - /// > | a&b - /// ^^^^^ - /// ``` - Named, -} - -impl Kind { - /// Get the maximum size of characters allowed in the value of a character - /// reference. - fn max(&self) -> usize { - match self { - Kind::Hexadecimal => CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, - Kind::Decimal => CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, - Kind::Named => CHARACTER_REFERENCE_NAMED_SIZE_MAX, - } - } - - /// Check if a byte ([`u8`]) is allowed. - fn allowed(&self, byte: u8) -> bool { - let check = match self { - Kind::Hexadecimal => u8::is_ascii_hexdigit, - Kind::Decimal => u8::is_ascii_digit, - Kind::Named => u8::is_ascii_alphanumeric, - }; - - check(&byte) - } -} +use crate::tokenizer::{State, Tokenizer}; +use crate::util::slice::Slice; /// State needed to parse character references. #[derive(Debug, Clone)] struct Info { - /// Place of value start. - start: Point, - /// Size of value. - size: usize, - /// Kind of character reference. - kind: Kind, + /// Index of where value starts. + start: usize, + /// Marker of character reference. + marker: u8, + /// Maximum number of characters in the value for this kind. + max: usize, } /// Start of a character reference. 
@@ -174,9 +125,9 @@ fn open(tokenizer: &mut Tokenizer) -> State { value( tokenizer, Info { - start: tokenizer.point.clone(), - size: 0, - kind: Kind::Named, + start: tokenizer.point.index, + marker: b'&', + max: CHARACTER_REFERENCE_NAMED_SIZE_MAX, }, ) } @@ -198,17 +149,17 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { tokenizer.exit(Token::CharacterReferenceMarkerHexadecimal); tokenizer.enter(Token::CharacterReferenceValue); let info = Info { - start: tokenizer.point.clone(), - size: 0, - kind: Kind::Hexadecimal, + start: tokenizer.point.index, + marker: b'x', + max: CHARACTER_REFERENCE_HEXADECIMAL_SIZE_MAX, }; State::Fn(Box::new(|t| value(t, info))) } else { tokenizer.enter(Token::CharacterReferenceValue); let info = Info { - start: tokenizer.point.clone(), - size: 0, - kind: Kind::Decimal, + start: tokenizer.point.index, + marker: b'#', + max: CHARACTER_REFERENCE_DECIMAL_SIZE_MAX, }; value(tokenizer, info) } @@ -227,21 +178,22 @@ fn numeric(tokenizer: &mut Tokenizer) -> State { /// > | a	b /// ^ /// ``` -fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { +fn value(tokenizer: &mut Tokenizer, info: Info) -> State { + let size = tokenizer.point.index - info.start; + match tokenizer.current { - Some(b';') if info.size > 0 => { - if Kind::Named == info.kind { - // To do: fix slice. - let value = Slice::from_position( + Some(b';') if size > 0 => { + // Named. + if info.marker == b'&' { + // Guaranteed to be valid ASCII bytes. 
+ let slice = Slice::from_indices( tokenizer.parse_state.bytes, - &Position { - start: &info.start, - end: &tokenizer.point, - }, - ) - .serialize(); + info.start, + tokenizer.point.index, + ); + let name = slice.as_str(); - if !CHARACTER_REFERENCES.iter().any(|d| d.0 == value) { + if !CHARACTER_REFERENCES.iter().any(|d| d.0 == name) { return State::Nok; } } @@ -253,14 +205,22 @@ fn value(tokenizer: &mut Tokenizer, mut info: Info) -> State { tokenizer.exit(Token::CharacterReference); State::Ok } - Some(byte) => { - if info.size < info.kind.max() && info.kind.allowed(byte) { - info.size += 1; - tokenizer.consume(); - State::Fn(Box::new(|t| value(t, info))) - } else { - State::Nok - } + // ASCII digit, for named, decimal, and hexadecimal references. + Some(b'0'..=b'9') if size < info.max => { + tokenizer.consume(); + State::Fn(Box::new(|t| value(t, info))) + } + // ASCII hex letters, for named and hexadecimal references. + Some(b'A'..=b'F' | b'a'..=b'f') + if matches!(info.marker, b'&' | b'x') && size < info.max => + { + tokenizer.consume(); + State::Fn(Box::new(|t| value(t, info))) + } + // Non-hex ASCII alphabeticals, for named references. + Some(b'G'..=b'Z' | b'g'..=b'z') if info.marker == b'&' && size < info.max => { + tokenizer.consume(); + State::Fn(Box::new(|t| value(t, info))) } _ => State::Nok, } |