diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-10 16:29:56 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-10 16:29:56 +0200 |
commit | 5133042973f31a3992f216e591d840bb491bfd45 (patch) | |
tree | 810a44ac1d98f65dd2eedd0d9e8387eac0753e25 /src/tokenizer.rs | |
parent | 021d5f989ae41ae39a9b937b498141d9dc70d894 (diff) | |
download | markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.gz markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.bz2 markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.zip |
Add proper support for subtokenization
- Add “content” content type
- Add paragraph
- Add skips
- Add linked tokens
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r-- | src/tokenizer.rs | 40 |
1 file changed, 37 insertions, 3 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 35e768e..1746a19 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -12,6 +12,7 @@ //! [`check`]: Tokenizer::check use crate::constant::TAB_SIZE; +use std::collections::HashMap; /// Semantic label of a span. // To do: figure out how to share this so extensions can add their own stuff, @@ -64,7 +65,10 @@ pub enum TokenType { Content, ContentChunk, + Paragraph, + ChunkString, + ChunkText, } /// Enum representing a character code. @@ -101,7 +105,7 @@ pub struct Point { } /// Possible event types. -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] pub enum EventType { /// The start of something. Enter, @@ -110,12 +114,14 @@ pub enum EventType { } /// Something semantic happening somewhere. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Event { pub event_type: EventType, pub token_type: TokenType, pub point: Point, pub index: usize, + pub previous: Option<usize>, + pub next: Option<usize>, } /// The essence of the state machine are functions: `StateFn`. @@ -156,6 +162,7 @@ struct InternalState { /// A tokenizer itself. 
#[derive(Debug)] pub struct Tokenizer { + column_start: HashMap<usize, usize>, /// Track whether a character is expected to be consumed, and whether it’s /// actually consumed /// @@ -180,6 +187,7 @@ impl Tokenizer { pub fn new(point: Point, index: usize) -> Tokenizer { Tokenizer { current: Code::None, + column_start: HashMap::new(), index, consumed: true, point, @@ -195,6 +203,28 @@ impl Tokenizer { self.current = code; } + pub fn define_skip(&mut self, point: &Point, index: usize) { + self.column_start.insert(point.line, point.column); + self.account_for_potential_skip(); + log::debug!("position: define skip: `{:?}` ({:?})", point, index); + } + + fn account_for_potential_skip(&mut self) { + println!("account?: {:?} {:?}", self.point, self.index); + match self.column_start.get(&self.point.line) { + None => {} + Some(next_column) => { + if self.point.column == 1 { + let col = *next_column; + self.point.column = col; + self.point.offset += col - 1; + self.index += col - 1; + println!("account! {:?} {:?}", self.point, self.index); + } + } + }; + } + /// Consume the current character. /// Each [`StateFn`][] is expected to call this to signal that this code is /// used, or call a next `StateFn`. @@ -215,7 +245,7 @@ impl Tokenizer { } else { 1 }; - // To do: accountForPotentialSkip() + self.account_for_potential_skip(); log::debug!("position: after eol: `{:?}`", self.point); } Code::VirtualSpace => { @@ -240,6 +270,8 @@ impl Tokenizer { token_type: token_type.clone(), point: self.point.clone(), index: self.index, + previous: None, + next: None, }; self.events.push(event); @@ -270,6 +302,8 @@ impl Tokenizer { token_type, point, index: self.index, + previous: None, + next: None, }; self.events.push(event); |