about summary refs log tree commit diff stats
path: root/src/tokenizer.rs
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-10 16:29:56 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-10 16:29:56 +0200
commit5133042973f31a3992f216e591d840bb491bfd45 (patch)
tree810a44ac1d98f65dd2eedd0d9e8387eac0753e25 /src/tokenizer.rs
parent021d5f989ae41ae39a9b937b498141d9dc70d894 (diff)
downloadmarkdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.gz
markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.bz2
markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.zip
Add proper support for subtokenization
- Add “content” content type - Add paragraph - Add skips - Add linked tokens
Diffstat (limited to 'src/tokenizer.rs')
-rw-r--r--src/tokenizer.rs40
1 file changed, 37 insertions, 3 deletions
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 35e768e..1746a19 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -12,6 +12,7 @@
//! [`check`]: Tokenizer::check
use crate::constant::TAB_SIZE;
+use std::collections::HashMap;
/// Semantic label of a span.
// To do: figure out how to share this so extensions can add their own stuff,
@@ -64,7 +65,10 @@ pub enum TokenType {
Content,
ContentChunk,
+ Paragraph,
+
ChunkString,
+ ChunkText,
}
/// Enum representing a character code.
@@ -101,7 +105,7 @@ pub struct Point {
}
/// Possible event types.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
pub enum EventType {
/// The start of something.
Enter,
@@ -110,12 +114,14 @@ pub enum EventType {
}
/// Something semantic happening somewhere.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct Event {
pub event_type: EventType,
pub token_type: TokenType,
pub point: Point,
pub index: usize,
+ pub previous: Option<usize>,
+ pub next: Option<usize>,
}
/// The essence of the state machine are functions: `StateFn`.
@@ -156,6 +162,7 @@ struct InternalState {
/// A tokenizer itself.
#[derive(Debug)]
pub struct Tokenizer {
+ column_start: HashMap<usize, usize>,
/// Track whether a character is expected to be consumed, and whether it’s
/// actually consumed
///
@@ -180,6 +187,7 @@ impl Tokenizer {
pub fn new(point: Point, index: usize) -> Tokenizer {
Tokenizer {
current: Code::None,
+ column_start: HashMap::new(),
index,
consumed: true,
point,
@@ -195,6 +203,28 @@ impl Tokenizer {
self.current = code;
}
+ pub fn define_skip(&mut self, point: &Point, index: usize) {
+ self.column_start.insert(point.line, point.column);
+ self.account_for_potential_skip();
+ log::debug!("position: define skip: `{:?}` ({:?})", point, index);
+ }
+
+ fn account_for_potential_skip(&mut self) {
+ println!("account?: {:?} {:?}", self.point, self.index);
+ match self.column_start.get(&self.point.line) {
+ None => {}
+ Some(next_column) => {
+ if self.point.column == 1 {
+ let col = *next_column;
+ self.point.column = col;
+ self.point.offset += col - 1;
+ self.index += col - 1;
+ println!("account! {:?} {:?}", self.point, self.index);
+ }
+ }
+ };
+ }
+
/// Consume the current character.
/// Each [`StateFn`][] is expected to call this to signal that this code is
/// used, or call a next `StateFn`.
@@ -215,7 +245,7 @@ impl Tokenizer {
} else {
1
};
- // To do: accountForPotentialSkip()
+ self.account_for_potential_skip();
log::debug!("position: after eol: `{:?}`", self.point);
}
Code::VirtualSpace => {
@@ -240,6 +270,8 @@ impl Tokenizer {
token_type: token_type.clone(),
point: self.point.clone(),
index: self.index,
+ previous: None,
+ next: None,
};
self.events.push(event);
@@ -270,6 +302,8 @@ impl Tokenizer {
token_type,
point,
index: self.index,
+ previous: None,
+ next: None,
};
self.events.push(event);