diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 10:49:07 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-07-29 10:49:07 +0200 |
commit | 148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f (patch) | |
tree | 7655ffebe0c6a917c3c391edacde03d754f2de4f /src/tokenizer.rs | |
parent | 6f61649ac8d08fff85a99172afbf4cd852dda2e6 (diff) | |
download | markdown-rs-148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f.tar.gz markdown-rs-148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f.tar.bz2 markdown-rs-148ede7f0f42f0ccb1620b13d91f35d0c7d04c2f.zip |
Refactor to work on bytes (`u8`)
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 126 |
1 file changed, 61 insertions, 65 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 3cbad0f..9c5e9f6 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -27,8 +27,8 @@ pub enum ContentType { #[derive(Debug, PartialEq)] pub enum CharAction { - Normal(char), - Insert(char), + Normal(u8), + Insert(u8), Ignore, } @@ -42,8 +42,8 @@ pub struct Point { pub line: usize, /// 1-indexed column number. /// This is increases up to a tab stop for tabs. - /// Some editors count tabs as 1 character, so this position is not always - /// the same as editors. + /// Some editors count tabs as 1 character, so this position is not the + /// same as editors. pub column: usize, /// 0-indexed position in the document. /// @@ -81,7 +81,7 @@ pub struct Event { } /// The essence of the state machine are functions: `StateFn`. -/// It’s responsible for dealing with the current char. +/// It’s responsible for dealing with the current byte. /// It yields a [`State`][]. pub type StateFn = dyn FnOnce(&mut Tokenizer) -> State; @@ -157,9 +157,9 @@ struct InternalState { /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt. stack_len: usize, /// Previous code. - previous: Option<char>, + previous: Option<u8>, /// Current code. - current: Option<char>, + current: Option<u8>, /// Current relative and absolute position in the file. point: Point, } @@ -173,17 +173,17 @@ pub struct Tokenizer<'a> { first_line: usize, /// To do. line_start: Point, - /// Track whether a character is expected to be consumed, and whether it’s - /// actually consumed + /// Track whether the current byte is already consumed (`true`) or expected + /// to be consumed (`false`). /// /// Tracked to make sure everything’s valid. consumed: bool, /// Track whether this tokenizer is done. resolved: bool, - /// Current character code. - pub current: Option<char>, - /// Previous character code. - pub previous: Option<char>, + /// Current byte. + pub current: Option<u8>, + /// Previous byte. 
+ pub previous: Option<u8>, /// Current relative and absolute place in the file. pub point: Point, /// Semantic labels of one or more codes in `codes`. @@ -297,13 +297,13 @@ impl<'a> Tokenizer<'a> { } /// Prepare for a next code to get consumed. - pub fn expect(&mut self, char: Option<char>) { - debug_assert!(self.consumed, "expected previous character to be consumed"); + pub fn expect(&mut self, byte: Option<u8>) { + debug_assert!(self.consumed, "expected previous byte to be consumed"); self.consumed = false; - self.current = char; + self.current = byte; } - /// Consume the current character. + /// Consume the current byte. /// Each [`StateFn`][] is expected to call this to signal that this code is /// used, or call a next `StateFn`. pub fn consume(&mut self) { @@ -320,23 +320,23 @@ impl<'a> Tokenizer<'a> { self.consumed = true; } - /// Move to the next (virtual) character. + /// Move to the next (virtual) byte. pub fn move_one(&mut self) { - match char_action(&self.parse_state.chars, &self.point) { + match byte_action(self.parse_state.bytes, &self.point) { CharAction::Ignore => { self.point.index += 1; } - CharAction::Insert(char) => { - self.previous = Some(char); + CharAction::Insert(byte) => { + self.previous = Some(byte); self.point.column += 1; self.point.vs += 1; } - CharAction::Normal(char) => { - self.previous = Some(char); + CharAction::Normal(byte) => { + self.previous = Some(byte); self.point.vs = 0; self.point.index += 1; - if char == '\n' { + if byte == b'\n' { self.point.line += 1; self.point.column = 1; @@ -355,7 +355,7 @@ impl<'a> Tokenizer<'a> { } } - /// Move (virtual) characters. + /// Move (virtual) bytes. 
pub fn move_to(&mut self, to: (usize, usize)) { let (to_index, to_vs) = to; while self.point.index < to_index || self.point.index == to_index && self.point.vs < to_vs { @@ -382,10 +382,10 @@ impl<'a> Tokenizer<'a> { pub fn enter_with_link(&mut self, token_type: Token, link: Option<Link>) { let mut point = self.point.clone(); - // Move back past ignored chars. + // Move back past ignored bytes. while point.index > 0 { point.index -= 1; - let action = char_action(&self.parse_state.chars, &point); + let action = byte_action(self.parse_state.bytes, &point); if !matches!(action, CharAction::Ignore) { point.index += 1; break; @@ -432,13 +432,13 @@ impl<'a> Tokenizer<'a> { // A bit weird, but if we exit right after a line ending, we *don’t* want to consider // potential skips. - if matches!(self.previous, Some('\n')) { + if matches!(self.previous, Some(b'\n')) { point = self.line_start.clone(); } else { - // Move back past ignored chars. + // Move back past ignored bytes. while point.index > 0 { point.index -= 1; - let action = char_action(&self.parse_state.chars, &point); + let action = byte_action(self.parse_state.bytes, &point); if !matches!(action, CharAction::Ignore) { point.index += 1; break; @@ -520,7 +520,7 @@ impl<'a> Tokenizer<'a> { pub fn go_until( &mut self, state_fn: impl FnOnce(&mut Tokenizer) -> State + 'static, - until: impl Fn(Option<char>) -> bool + 'static, + until: impl Fn(Option<u8>) -> bool + 'static, done: impl FnOnce(State) -> Box<StateFn> + 'static, ) -> Box<StateFn> { attempt_impl( @@ -653,21 +653,19 @@ impl<'a> Tokenizer<'a> { while self.point.index < max { match state { State::Ok | State::Nok => break, - State::Fn(func) => match char_action(&self.parse_state.chars, &self.point) { + State::Fn(func) => match byte_action(self.parse_state.bytes, &self.point) { CharAction::Ignore => { state = State::Fn(Box::new(func)); self.move_one(); } - CharAction::Insert(char) => { - log::debug!("main: passing (fake): `{:?}` ({:?})", char, self.point); - 
self.expect(Some(char)); + CharAction::Insert(byte) => { + log::debug!("main: passing (fake): `{:?}` ({:?})", byte, self.point); + self.expect(Some(byte)); state = func(self); - // self.point.column += 1; - // self.point.vs += 1; } - CharAction::Normal(char) => { - log::debug!("main: passing: `{:?}` ({:?})", char, self.point); - self.expect(Some(char)); + CharAction::Normal(byte) => { + log::debug!("main: passing: `{:?}` ({:?})", byte, self.point); + self.expect(Some(byte)); state = func(self); } }, @@ -690,28 +688,28 @@ impl<'a> Tokenizer<'a> { // To do: clean this? // We sometimes move back when flushing, so then we use those codes. if self.point.index == max { - let char = None; - log::debug!("main: flushing eof: `{:?}` ({:?})", char, self.point); - self.expect(char); + let byte = None; + log::debug!("main: flushing eof: `{:?}` ({:?})", byte, self.point); + self.expect(byte); state = func(self); } else { - match char_action(&self.parse_state.chars, &self.point) { + match byte_action(self.parse_state.bytes, &self.point) { CharAction::Ignore => { state = State::Fn(Box::new(func)); self.move_one(); } - CharAction::Insert(char) => { + CharAction::Insert(byte) => { log::debug!( "main: flushing (fake): `{:?}` ({:?})", - char, + byte, self.point ); - self.expect(Some(char)); + self.expect(Some(byte)); state = func(self); } - CharAction::Normal(char) => { - log::debug!("main: flushing: `{:?}` ({:?})", char, self.point); - self.expect(Some(char)); + CharAction::Normal(byte) => { + log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point); + self.expect(Some(byte)); state = func(self); } } @@ -735,22 +733,20 @@ impl<'a> Tokenizer<'a> { } } -fn char_action(chars: &[char], point: &Point) -> CharAction { - if point.index < chars.len() { - let char = chars[point.index]; +fn byte_action(bytes: &[u8], point: &Point) -> CharAction { + if point.index < bytes.len() { + let byte = bytes[point.index]; - if char == '\0' { - CharAction::Normal(char::REPLACEMENT_CHARACTER) - } 
else if char == '\r' { + if byte == b'\r' { // CRLF. - if point.index < chars.len() - 1 && chars[point.index + 1] == '\n' { + if point.index < bytes.len() - 1 && bytes[point.index + 1] == b'\n' { CharAction::Ignore } // CR. else { - CharAction::Normal('\n') + CharAction::Normal(b'\n') } - } else if char == '\t' { + } else if byte == b'\t' { let remainder = point.column % TAB_SIZE; let vs = if remainder == 0 { 0 @@ -761,19 +757,19 @@ fn char_action(chars: &[char], point: &Point) -> CharAction { // On the tab itself, first send it. if point.vs == 0 { if vs == 0 { - CharAction::Normal(char) + CharAction::Normal(byte) } else { - CharAction::Insert(char) + CharAction::Insert(byte) } } else if vs == 0 { - CharAction::Normal(' ') + CharAction::Normal(b' ') } else { - CharAction::Insert(' ') + CharAction::Insert(b' ') } } // VS? else { - CharAction::Normal(char) + CharAction::Normal(byte) } } else { unreachable!("out of bounds") @@ -786,7 +782,7 @@ fn char_action(chars: &[char], point: &Point) -> CharAction { /// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check]. fn attempt_impl( state: impl FnOnce(&mut Tokenizer) -> State + 'static, - pause: Option<Box<dyn Fn(Option<char>) -> bool + 'static>>, + pause: Option<Box<dyn Fn(Option<u8>) -> bool + 'static>>, start: usize, done: impl FnOnce(&mut Tokenizer, State) -> State + 'static, ) -> Box<StateFn> { |