author    | Titus Wormer <tituswormer@gmail.com> | 2022-06-08 15:52:16 +0200
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-08 15:52:16 +0200
commit    | 4c06c8554c35887f8f5147783953b2b7e7c2327f (patch)
tree      | 1b2463848a3ae4c645f7f1a325877ee829ab65c5 /src/tokenizer.rs
.
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 580
1 file changed, 580 insertions, 0 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 0000000..c8b1440
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,580 @@
+//! The tokenizer glues states from the state machine together.
+//!
+//! It facilitates everything needed to turn codes into tokens and events with
+//! a state machine.
+//! It also enables logic needed for parsing markdown, such as an [`attempt`][]
+//! to parse something, which can succeed or, when unsuccessful, revert the
+//! attempt.
+//! Similarly, a [`check`][] exists, which does the same as an `attempt` but
+//! reverts even if successful.
+//!
+//! [`attempt`]: Tokenizer::attempt
+//! [`check`]: Tokenizer::check
+
+use crate::constant::TAB_SIZE;
+
+/// Semantic label of a span.
+// To do: figure out how to share this so extensions can add their own stuff,
+// though perhaps that’s impossible and we should inline all extensions?
+// To do: document each variant.
+#[derive(Debug, Clone, PartialEq)]
+pub enum TokenType {
+    AtxHeading,
+    AtxHeadingSequence,
+    AtxHeadingWhitespace,
+    AtxHeadingText,
+
+    CharacterEscape,
+    CharacterEscapeMarker,
+    CharacterEscapeValue,
+
+    CharacterReference,
+    CharacterReferenceMarker,
+    CharacterReferenceMarkerNumeric,
+    CharacterReferenceMarkerHexadecimal,
+    CharacterReferenceMarkerSemi,
+    CharacterReferenceValue,
+
+    CodeFenced,
+    CodeFencedFence,
+    CodeFencedFenceSequence,
+    CodeFencedFenceWhitespace,
+    CodeFencedFenceInfo,
+    CodeFencedFenceMeta,
+
+    CodeIndented,
+    CodeIndentedPrefixWhitespace,
+
+    CodeFlowChunk,
+
+    Data,
+
+    HtmlFlow,
+    HtmlFlowData,
+
+    ThematicBreak,
+    ThematicBreakSequence,
+    ThematicBreakWhitespace,
+
+    Whitespace,
+    LineEnding,
+    BlankLineEnding,
+    BlankLineWhitespace,
+
+    Content,
+    ContentPhrasing,
+    ChunkString,
+}
+
+/// Enum representing a character code.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum Code {
+    /// End of the input stream (called eof).
+    None,
+    /// Used to make parsing line endings easier, as it represents both
+    /// `Code::Char('\r')` and `Code::Char('\n')` combined.
+    CarriageReturnLineFeed,
+    /// The expansion of a tab (`Code::Char('\t')`): depending on where the tab
+    /// occurred, it’s followed by 0 to 3 (both inclusive) `Code::VirtualSpace`s.
+    VirtualSpace,
+    /// The most frequent variant of this enum is `Code::Char(char)`, which just
+    /// represents a char, but micromark adds meaning to certain other values.
+    Char(char),
+}
+
+/// A location in the document (`line`/`column`/`offset`).
+///
+/// The interface for the location in the document comes from unist `Point`:
+/// <https://github.com/syntax-tree/unist#point>.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Point {
+    /// 1-indexed line number.
+    pub line: usize,
+    /// 1-indexed column number.
+    /// Note that this increases up to a tab stop for tabs.
+    /// Some editors count tabs as 1 character, so this position is not always
+    /// the same as in editors.
+    pub column: usize,
+    /// 0-indexed position in the document.
+    pub offset: usize,
+}
+
+/// Possible event types.
+#[derive(Debug, PartialEq)]
+pub enum EventType {
+    /// The start of something.
+    Enter,
+    /// The end of something.
+    Exit,
+}
+
+/// Something semantic happening somewhere.
+#[derive(Debug)]
+pub struct Event {
+    pub event_type: EventType,
+    pub token_type: TokenType,
+    pub point: Point,
+    pub index: usize,
+}
+
+/// The essence of the state machine is functions: `StateFn`.
+/// Each one is responsible for dealing with the single passed [`Code`][].
+/// It yields a [`StateFnResult`][].
+pub type StateFn = dyn FnOnce(&mut Tokenizer, Code) -> StateFnResult;
+/// Each [`StateFn`][] yields something back: primarily the state.
+/// In certain cases, it can also yield back up parsed codes that were passed down.
+pub type StateFnResult = (State, Option<Vec<Code>>);
+
+/// The result of a state.
+pub enum State {
+    /// There is a future state: a boxed [`StateFn`][] to pass the next code to.
+    Fn(Box<StateFn>),
+    /// The state is successful.
+    Ok,
+    /// The state is not successful.
+    Nok,
+}
+
+/// The internal state of a tokenizer; not to be confused with states from the
+/// state machine, this instead is all the information about where we currently
+/// are and what’s going on.
+#[derive(Debug, Clone)]
+struct InternalState {
+    /// Length of `events`. We only add to events, so reverting will just pop stuff off.
+    events_len: usize,
+    /// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt.
+    stack_len: usize,
+    /// Current code.
+    current: Code,
+    /// `index` in codes of the current code.
+    index: usize,
+    /// Current relative and absolute position in the file.
+    point: Point,
+}
+
+/// A tokenizer itself.
+#[derive(Debug)]
+pub struct Tokenizer {
+    /// Track whether a character is expected to be consumed, and whether it’s
+    /// actually consumed.
+    ///
+    /// Tracked to make sure everything’s valid.
+    consumed: bool,
+    /// Semantic labels of one or more codes in `codes`.
+    pub events: Vec<Event>,
+    /// Hierarchy of semantic labels.
+    ///
+    /// Tracked to make sure everything’s valid.
+    stack: Vec<TokenType>,
+    /// Current character code.
+    current: Code,
+    /// `index` in codes of the current code.
+    index: usize,
+    /// Current relative and absolute place in the file.
+    point: Point,
+}
+
+impl Tokenizer {
+    /// Create a new tokenizer.
+    pub fn new() -> Tokenizer {
+        Tokenizer {
+            current: Code::None,
+            index: 0,
+            consumed: true,
+            point: Point {
+                line: 1,
+                column: 1,
+                offset: 0,
+            },
+            stack: vec![],
+            events: vec![],
+        }
+    }
+
+    /// Prepare for a next code to get consumed.
+    fn expect(&mut self, code: Code) {
+        assert!(self.consumed, "expected previous character to be consumed");
+        self.consumed = false;
+        self.current = code;
+    }
+
+    /// Consume the current character.
+    /// Each [`StateFn`][] is expected to call this to signal that this code is
+    /// used, or call a next `StateFn`.
+    pub fn consume(&mut self, code: Code) {
+        assert_eq!(
+            code, self.current,
+            "expected given code to equal expected code"
+        );
+        log::debug!("consume: `{:?}` ({:?})", code, self.point);
+        assert!(!self.consumed, "expected code to not have been consumed: this might be because `x(code)` instead of `x` was returned");
+
+        match code {
+            Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+                self.point.line += 1;
+                self.point.column = 1;
+                self.point.offset += if code == Code::CarriageReturnLineFeed {
+                    2
+                } else {
+                    1
+                };
+                // To do: accountForPotentialSkip()
+                log::debug!("position: after eol: `{:?}`", self.point);
+            }
+            Code::VirtualSpace => {
+                // Empty.
+            }
+            _ => {
+                self.point.column += 1;
+                self.point.offset += 1;
+            }
+        }
+
+        self.index += 1;
+        // Mark as consumed.
+        self.consumed = true;
+    }
+
+    /// Mark the start of a semantic label.
+    pub fn enter(&mut self, token_type: TokenType) {
+        log::debug!("enter `{:?}` ({:?})", token_type, self.point);
+        let event = Event {
+            event_type: EventType::Enter,
+            token_type: token_type.clone(),
+            point: self.point.clone(),
+            index: self.index,
+        };
+
+        self.events.push(event);
+        self.stack.push(token_type);
+    }
+
+    /// Mark the end of a semantic label.
+    pub fn exit(&mut self, token_type: TokenType) {
+        let token_on_stack = self.stack.pop().expect("cannot close w/o open tokens");
+
+        assert_eq!(
+            token_on_stack, token_type,
+            "expected exit TokenType to match current TokenType"
+        );
+
+        let ev = self.events.last().expect("cannot close w/o open event");
+
+        let point = self.point.clone();
+
+        assert!(
+            token_on_stack != ev.token_type || ev.point != point,
+            "expected non-empty TokenType"
+        );
+
+        log::debug!("exit `{:?}` ({:?})", token_type, self.point);
+        let event = Event {
+            event_type: EventType::Exit,
+            token_type,
+            point,
+            index: self.index,
+        };
+
+        self.events.push(event);
+    }
+
+    /// Capture the internal state.
+    fn capture(&mut self) -> InternalState {
+        InternalState {
+            index: self.index,
+            current: self.current,
+            point: self.point.clone(),
+            events_len: self.events.len(),
+            stack_len: self.stack.len(),
+        }
+    }
+
+    /// Apply the internal state.
+    fn free(&mut self, previous: InternalState) {
+        self.index = previous.index;
+        self.current = previous.current;
+        self.point = previous.point;
+        assert!(
+            self.events.len() >= previous.events_len,
+            "expected to restore less events than before"
+        );
+        self.events.truncate(previous.events_len);
+        assert!(
+            self.stack.len() >= previous.stack_len,
+            "expected to restore less stack items than before"
+        );
+        self.stack.truncate(previous.stack_len);
+    }
+
+    /// Check if `state` and its future states are successful or not.
+    ///
+    /// This captures the current state of the tokenizer, returns a wrapped
+    /// state that captures all codes and feeds them to `state` and its future
+    /// states until it yields [`State::Ok`][] or [`State::Nok`][].
+    /// It then applies the captured state, calls `done`, and feeds all
+    /// captured codes to its future states.
+    pub fn check(
+        &mut self,
+        state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+    ) -> Box<StateFn> {
+        let previous = self.capture();
+
+        attempt_impl(
+            state,
+            vec![],
+            |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| {
+                let codes = result.0;
+                tokenizer.free(previous);
+                log::debug!(
+                    "check: {:?}, codes: {:?}, at {:?}",
+                    ok,
+                    codes,
+                    tokenizer.point
+                );
+                let result = done(ok);
+                tokenizer.feed(codes, result, false)
+            },
+        )
+    }
+
+    /// Attempt to parse with `state` and its future states, reverting if
+    /// unsuccessful.
+    ///
+    /// This captures the current state of the tokenizer, returns a wrapped
+    /// state that captures all codes and feeds them to `state` and its future
+    /// states until it yields [`State::Ok`][], at which point it calls `done`
+    /// and yields its result.
+    /// If instead [`State::Nok`][] was yielded, the captured state is applied,
+    /// `done` is called, and all captured codes are fed to its future states.
+    pub fn attempt(
+        &mut self,
+        state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+    ) -> Box<StateFn> {
+        let previous = self.capture();
+
+        attempt_impl(
+            state,
+            vec![],
+            |result: (Vec<Code>, Vec<Code>), ok, tokenizer: &mut Tokenizer| {
+                let codes = if ok {
+                    result.1
+                } else {
+                    tokenizer.free(previous);
+                    result.0
+                };
+
+                log::debug!(
+                    "attempt: {:?}, codes: {:?}, at {:?}",
+                    ok,
+                    codes,
+                    tokenizer.point
+                );
+                let result = done(ok);
+                tokenizer.feed(codes, result, false)
+            },
+        )
+    }
+
+    /// Feed a list of `codes` into `start`.
+    ///
+    /// This is set up to support repeatedly calling `feed`, and thus streaming
+    /// markdown into the state machine, and normally pauses after feeding.
+    /// When `drain: true` is passed, the EOF is fed.
+    pub fn feed(
+        &mut self,
+        codes: Vec<Code>,
+        start: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        drain: bool,
+    ) -> StateFnResult {
+        let mut codes = codes;
+        let mut state = State::Fn(Box::new(start));
+        let mut index = 0;
+
+        self.consumed = true;
+
+        while index < codes.len() {
+            let code = codes[index];
+
+            match state {
+                State::Nok | State::Ok => {
+                    break;
+                }
+                State::Fn(func) => {
+                    log::debug!("main: passing `{:?}`", code);
+                    self.expect(code);
+                    let (next, remainder) = check_statefn_result(func(self, code));
+                    state = next;
+                    index = index + 1
+                        - (if let Some(ref x) = remainder {
+                            x.len()
+                        } else {
+                            0
+                        });
+                }
+            }
+        }
+
+        // Yield to a higher loop if we shouldn’t feed EOFs.
+        if !drain {
+            return (state, Some(codes.split_off(index)));
+        }
+
+        loop {
+            // Feed EOF.
+            match state {
+                State::Ok | State::Nok => break,
+                State::Fn(func) => {
+                    let code = Code::None;
+                    log::debug!("main: passing eof");
+                    self.expect(code);
+                    let (next, remainder) = check_statefn_result(func(self, code));
+
+                    if let Some(ref x) = remainder {
+                        if !x.is_empty() {
+                            // To do: handle?
+                            unreachable!("drain:remainder {:?}", x);
+                        }
+                    }
+
+                    state = next;
+                }
+            }
+        }
+
+        check_statefn_result((state, None))
+    }
+}
+
+/// Internal utility to wrap states to also capture codes.
+///
+/// Recurses into itself.
+/// Used in [`Tokenizer::attempt`][Tokenizer::attempt] and [`Tokenizer::check`][Tokenizer::check].
+fn attempt_impl(
+    state: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+    codes: Vec<Code>,
+    done: impl FnOnce((Vec<Code>, Vec<Code>), bool, &mut Tokenizer) -> StateFnResult + 'static,
+) -> Box<StateFn> {
+    Box::new(|tokenizer, code| {
+        let mut codes = codes;
+
+        let (next, remainder) = check_statefn_result(state(tokenizer, code));
+
+        match code {
+            Code::None => {}
+            _ => {
+                codes.push(code);
+            }
+        }
+
+        // To do: `remainder` must never be bigger than `codes`, I guess?
+        // To do: `remainder` probably has to be taken *from* `codes`, in a similar vein to the `Ok` handling below.
+        match next {
+            State::Ok => {
+                let remaining = if let Some(x) = remainder { x } else { vec![] };
+                check_statefn_result(done((codes, remaining), true, tokenizer))
+            }
+            State::Nok => check_statefn_result(done((codes, vec![]), false, tokenizer)),
+            State::Fn(func) => {
+                check_statefn_result((State::Fn(attempt_impl(func, codes, done)), None))
+            }
+        }
+    })
+}
+
+/// Turn a string into codes.
+// To do: handle BOM at start?
+pub fn as_codes(value: &str) -> Vec<Code> {
+    let mut codes: Vec<Code> = vec![];
+    let mut at_carriage_return = false;
+    let mut column = 1;
+
+    for char in value.chars() {
+        // Send a CRLF.
+        if at_carriage_return && '\n' == char {
+            at_carriage_return = false;
+            codes.push(Code::CarriageReturnLineFeed);
+        } else {
+            // Send the previous CR: we’re not at a next `\n`.
+            if at_carriage_return {
+                at_carriage_return = false;
+                codes.push(Code::Char('\r'));
+            }
+
+            match char {
+                // Send a replacement character.
+                '\0' => {
+                    column += 1;
+                    codes.push(Code::Char('�'));
+                }
+                // Send a tab and virtual spaces.
+                '\t' => {
+                    // To do: is this correct?
+                    let virtual_spaces = TAB_SIZE - (column % TAB_SIZE);
+                    println!("tabs, expand {:?}, {:?}", column, virtual_spaces);
+                    codes.push(Code::Char(char));
+                    column += 1;
+                    let mut index = 0;
+                    while index < virtual_spaces {
+                        codes.push(Code::VirtualSpace);
+                        column += 1;
+                        index += 1;
+                    }
+                }
+                // Send an LF.
+                '\n' => {
+                    column = 1;
+                    codes.push(Code::Char(char));
+                }
+                // Don’t send anything yet.
+                '\r' => {
+                    column = 1;
+                    at_carriage_return = true;
+                }
+                // Send the char.
+                _ => {
+                    column += 1;
+                    codes.push(Code::Char(char));
+                }
+            }
+        };
+    }
+
+    // To do: handle a final CR?
+
+    codes
+}
+
+/// Check a [`StateFnResult`][], make sure it’s valid (that there are no bugs),
+/// and clean a final eof passed back in `remainder`.
+fn check_statefn_result(result: StateFnResult) -> StateFnResult {
+    let (state, mut remainder) = result;
+
+    match state {
+        State::Nok | State::Fn(_) => {
+            if let Some(ref x) = remainder {
+                assert_eq!(
+                    x.len(),
+                    0,
+                    "expected `None` to be passed back as remainder from `State::Nok`, `State::Fn`"
+                );
+            }
+        }
+        State::Ok => {}
+    }
+
+    // Remove an eof.
+    // For convenience, feeding back an eof is allowed, but cleaned here.
+    // Most states handle eof and eol in the same branch, and hence pass
+    // all back.
+    // This might not be needed, because if EOF is passed back, we’re at the EOF.
+    // But they’re not supposed to be in codes, so here we remove them.
+    if let Some(ref mut list) = remainder {
+        if Some(&Code::None) == list.last() {
+            list.pop();
+        }
+    }
+
+    (state, remainder)
+}
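
The commit itself ships no usage example, so the following is a minimal sketch, not part of the diff and assuming the module is reachable as `crate::tokenizer`, of how the pieces fit together: `as_codes` turns a string into `Code`s, a toy `StateFn` wraps the whole input in a single `Data` token, and `feed` drives the state machine, including the final EOF when `drain` is `true`.

```rust
use crate::tokenizer::{as_codes, Code, State, StateFnResult, TokenType, Tokenizer};

/// Toy state: open a `Data` token on the first code, then keep consuming.
fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    match code {
        Code::None => (State::Ok, None),
        _ => {
            tokenizer.enter(TokenType::Data);
            data(tokenizer, code)
        }
    }
}

/// Consume every code until the EOF (`Code::None`), then close the token.
fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    match code {
        Code::None => {
            tokenizer.exit(TokenType::Data);
            (State::Ok, None)
        }
        _ => {
            tokenizer.consume(code);
            (State::Fn(Box::new(data)), None)
        }
    }
}

fn main() {
    // Tabs become a char plus virtual spaces; CRLF pairs become one code.
    let codes = as_codes("hello\tworld\r\n");
    let mut tokenizer = Tokenizer::new();
    // `drain: true` also feeds the EOF so the state machine can finish.
    let (state, _remainder) = tokenizer.feed(codes, start, true);
    assert!(matches!(state, State::Ok));
    // `tokenizer.events` now holds an `Enter`/`Exit` pair for `TokenType::Data`.
}
```

Returning `State::Fn(Box::new(data))` rather than calling `data(code)` directly is what lets `feed` pause between codes; it is also why `consume` asserts when `x(code)` instead of `x` is returned.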
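Similarly, the `attempt`/`check` doc comments describe the revert mechanism, but the constructs that use it live in other files. Below is a hedged sketch of the wiring with hypothetical `blank_line`, `after_blank`, and `not_blank` states (none of these names are in this commit): `attempt` hands back a boxed state that is invoked immediately with the current code; on `State::Nok` the captured tokenizer state is restored and the captured codes are replayed into the failure branch.

```rust
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};

/// Hypothetical entry point: try `blank_line`; if that attempt fails, the
/// tokenizer reverts and the same codes are re-fed to `not_blank`.
fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    tokenizer.attempt(blank_line, |ok| {
        Box::new(if ok { after_blank } else { not_blank })
    })(tokenizer, code)
}

/// Hypothetical construct: ok on an EOF or line ending, nok otherwise.
/// The code is passed back up so the `done` branch can deal with it.
fn blank_line(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    match code {
        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
            (State::Ok, Some(vec![code]))
        }
        _ => (State::Nok, None),
    }
}

/// The attempt succeeded: the line ending comes back here and is tokenized.
fn after_blank(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    match code {
        Code::None => (State::Ok, None),
        _ => {
            tokenizer.enter(TokenType::BlankLineEnding);
            tokenizer.consume(code);
            tokenizer.exit(TokenType::BlankLineEnding);
            (State::Ok, None)
        }
    }
}

/// The attempt failed: nothing was kept, so consume as plain content instead.
fn not_blank(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
    match code {
        Code::None => (State::Ok, None),
        _ => {
            tokenizer.consume(code);
            (State::Fn(Box::new(not_blank)), None)
        }
    }
}
```

The `|ok| Box::new(if ok { … } else { … })` shape works because both branches coerce to the same fn pointer type before being boxed up as a `StateFn`.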