aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-09 15:01:46 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-09 15:01:46 +0200
commit021d5f989ae41ae39a9b937b498141d9dc70d894 (patch)
tree8009a01d69cbd4f8200ffd34fc4031265b67406e
parent344c3db875056d4aec509f24fb2dbeaf7e2a14b6 (diff)
downloadmarkdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.tar.gz
markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.tar.bz2
markdown-rs-021d5f989ae41ae39a9b937b498141d9dc70d894.zip
Add basic subtokenization, string content in fenced code
-rw-r--r--src/content/flow.rs23
-rw-r--r--src/content/string.rs20
-rw-r--r--src/lib.rs1
-rw-r--r--src/parser.rs12
-rw-r--r--src/subtokenize.rs67
-rw-r--r--src/tokenizer.rs15
-rw-r--r--src/util.rs8
-rw-r--r--tests/code_fenced.rs11
8 files changed, 105 insertions, 52 deletions
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 693ffb5..6f94424 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -26,28 +26,17 @@ use crate::construct::{
html_flow::start as html_flow, partial_whitespace::start as whitespace,
thematic_break::start as thematic_break,
};
-use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+use crate::subtokenize::subtokenize;
+use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
use crate::util::get_span;
/// Turn `codes` as the flow content type into events.
// To do: remove this `allow` when all the content types are glued together.
#[allow(dead_code)]
-pub fn flow(codes: &[Code]) -> Vec<Event> {
- let mut tokenizer = Tokenizer::new();
- let (state, remainder) = tokenizer.feed(codes, Box::new(start), true);
-
- if let Some(ref x) = remainder {
- if !x.is_empty() {
- unreachable!("expected no final remainder {:?}", x);
- }
- }
-
- match state {
- State::Ok => {}
- _ => unreachable!("expected final state to be `State::Ok`"),
- }
-
- tokenizer.events
+pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
+ let mut tokenizer = Tokenizer::new(point, index);
+ tokenizer.feed(codes, Box::new(start), true);
+ subtokenize(tokenizer.events, codes)
}
/// Before flow.
diff --git a/src/content/string.rs b/src/content/string.rs
index 1239a36..64f544b 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -13,26 +13,14 @@
use crate::construct::{
character_escape::start as character_escape, character_reference::start as character_reference,
};
-use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
/// Turn `codes` as the string content type into events.
// To do: remove this `allow` when all the content types are glued together.
#[allow(dead_code)]
-pub fn string(codes: &[Code]) -> Vec<Event> {
- let mut tokenizer = Tokenizer::new();
- let (state, remainder) = tokenizer.feed(codes, Box::new(before), true);
-
- if let Some(ref x) = remainder {
- if !x.is_empty() {
- unreachable!("expected no final remainder {:?}", x);
- }
- }
-
- match state {
- State::Ok => {}
- _ => unreachable!("expected final state to be `State::Ok`"),
- }
-
+pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
+ let mut tokenizer = Tokenizer::new(point, index);
+ tokenizer.feed(codes, Box::new(before), true);
tokenizer.events
}
diff --git a/src/lib.rs b/src/lib.rs
index 1624a22..cf0b05b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod constant;
mod construct;
mod content;
mod parser;
+mod subtokenize;
mod tokenizer;
mod util;
diff --git a/src/parser.rs b/src/parser.rs
index e156e33..5648942 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,13 +2,21 @@
// To do: this should start with `containers`, when they’re done.
// To do: definitions and such will mean more data has to be passed around.
use crate::content::flow::flow;
-use crate::tokenizer::{as_codes, Code, Event};
+use crate::tokenizer::{as_codes, Code, Event, Point};
/// Turn a string of markdown into events.
/// Passes the codes back so the compiler can access the source.
pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) {
let codes = as_codes(value);
// To do: pass a reference to this around, and slices in the (back)feeding. Might be tough.
- let events = flow(&codes);
+ let events = flow(
+ &codes,
+ Point {
+ line: 1,
+ column: 1,
+ offset: 0,
+ },
+ 0,
+ );
(events, codes)
}
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
new file mode 100644
index 0000000..c1a8435
--- /dev/null
+++ b/src/subtokenize.rs
@@ -0,0 +1,67 @@
+use crate::content::string::string;
+use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::util::{slice_codes, Span};
+
+pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> Vec<Event> {
+ let mut events = events;
+ let mut index = 0;
+
+ // println!("before");
+ // while index < events.len() {
+ // let event = &events[index];
+ // println!(
+ // "ev1: {:?} {:?} {:?}",
+ // event.event_type, event.token_type, index
+ // );
+ // index += 1;
+ // }
+ //
+ // index = 0;
+ //
+ // println!("change");
+
+ while index < events.len() {
+ let event = &events[index];
+
+ // println!(
+ // "ev2: {:?} {:?} {:?}",
+ // event.event_type, event.token_type, index
+ // );
+
+ if event.event_type == EventType::Enter && event.token_type == TokenType::ChunkString {
+ let exit = &events[index + 1];
+
+ assert_eq!(
+ exit.event_type,
+ EventType::Exit,
+            "expected `enter` of `{:?}` to be followed by an `exit` event",
+ event.token_type
+ );
+ assert_eq!(
+ exit.token_type, event.token_type,
+ "expected `exit` of `{:?}` to follow its `enter` event",
+ event.token_type
+ );
+
+ let subevents = string(
+ slice_codes(
+ codes,
+ &Span {
+ start_index: event.index,
+ end_index: exit.index,
+ },
+ ),
+ event.point.clone(),
+ event.index,
+ );
+ let len = subevents.len();
+ // To do: recursion needed?
+ events.splice(index..(index + 2), subevents);
+ index += len;
+ } else {
+ index += 1;
+ }
+ }
+
+ events
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index faee8d9..35e768e 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -177,16 +177,12 @@ pub struct Tokenizer {
impl Tokenizer {
/// Create a new tokenizer.
- pub fn new() -> Tokenizer {
+ pub fn new(point: Point, index: usize) -> Tokenizer {
Tokenizer {
current: Code::None,
- index: 0,
+ index,
consumed: true,
- point: Point {
- line: 1,
- column: 1,
- offset: 0,
- },
+ point,
stack: vec![],
events: vec![],
}
@@ -499,6 +495,11 @@ impl Tokenizer {
}
}
+ match state {
+ State::Ok => {}
+ _ => unreachable!("expected final state to be `State::Ok`"),
+ }
+
check_statefn_result((state, None))
}
}
diff --git a/src/util.rs b/src/util.rs
index 47359a3..5a916cd 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -165,12 +165,12 @@ pub fn get_span(events: &[Event], index: usize) -> Span {
assert_eq!(
exit.event_type,
EventType::Exit,
- "expected get_span to be called on `exit` event"
+ "expected `get_span` to be called on `exit` event"
);
- let mut start_index = index - 1;
+ let mut enter_index = index - 1;
loop {
- let enter = &events[start_index];
+ let enter = &events[enter_index];
if enter.event_type == EventType::Enter && enter.token_type == token_type {
return Span {
// start: enter.point.clone(),
@@ -181,7 +181,7 @@ pub fn get_span(events: &[Event], index: usize) -> Span {
};
}
- start_index -= 1;
+ enter_index -= 1;
}
}
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
index 46fa9cb..6419f67 100644
--- a/tests/code_fenced.rs
+++ b/tests/code_fenced.rs
@@ -219,12 +219,11 @@ fn code_fenced() {
"should support an eof in the prefix, in content"
);
- // To do: strings.
- // assert_eq!(
- // micromark("```j\\+s&copy;"),
- // "<pre><code class=\"language-j+s©\"></code></pre>\n",
- // "should support character escapes and character references in info strings"
- // );
+ assert_eq!(
+ micromark("```j\\+s&copy;"),
+ "<pre><code class=\"language-j+s©\"></code></pre>\n",
+ "should support character escapes and character references in info strings"
+ );
assert_eq!(
micromark(" ```\naaa\n ```"),