From 021d5f989ae41ae39a9b937b498141d9dc70d894 Mon Sep 17 00:00:00 2001
From: Titus Wormer <tituswormer@gmail.com>
Date: Thu, 9 Jun 2022 15:01:46 +0200
Subject: Add basic subtokenization, string content in fenced code

---
 src/content/flow.rs   | 23 +++++-------------
 src/content/string.rs | 20 +++-------------
 src/lib.rs            |  1 +
 src/parser.rs         | 12 +++++++--
 src/subtokenize.rs    | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/tokenizer.rs      | 15 ++++++------
 src/util.rs           |  8 +++---
 tests/code_fenced.rs  | 11 ++++-----
 8 files changed, 105 insertions(+), 52 deletions(-)
 create mode 100644 src/subtokenize.rs

diff --git a/src/content/flow.rs b/src/content/flow.rs
index 693ffb5..6f94424 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -26,28 +26,17 @@ use crate::construct::{
     html_flow::start as html_flow, partial_whitespace::start as whitespace,
     thematic_break::start as thematic_break,
 };
-use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+use crate::subtokenize::subtokenize;
+use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
 use crate::util::get_span;
 
 /// Turn `codes` as the flow content type into events.
 // To do: remove this `allow` when all the content types are glued together.
 #[allow(dead_code)]
-pub fn flow(codes: &[Code]) -> Vec<Event> {
-    let mut tokenizer = Tokenizer::new();
-    let (state, remainder) = tokenizer.feed(codes, Box::new(start), true);
-
-    if let Some(ref x) = remainder {
-        if !x.is_empty() {
-            unreachable!("expected no final remainder {:?}", x);
-        }
-    }
-
-    match state {
-        State::Ok => {}
-        _ => unreachable!("expected final state to be `State::Ok`"),
-    }
-
-    tokenizer.events
+pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
+    let mut tokenizer = Tokenizer::new(point, index);
+    tokenizer.feed(codes, Box::new(start), true);
+    subtokenize(tokenizer.events, codes)
 }
 
 /// Before flow.
diff --git a/src/content/string.rs b/src/content/string.rs
index 1239a36..64f544b 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -13,26 +13,14 @@ use crate::construct::{
     character_escape::start as character_escape,
     character_reference::start as character_reference,
 };
-use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
 
 /// Turn `codes` as the string content type into events.
 // To do: remove this `allow` when all the content types are glued together.
 #[allow(dead_code)]
-pub fn string(codes: &[Code]) -> Vec<Event> {
-    let mut tokenizer = Tokenizer::new();
-    let (state, remainder) = tokenizer.feed(codes, Box::new(before), true);
-
-    if let Some(ref x) = remainder {
-        if !x.is_empty() {
-            unreachable!("expected no final remainder {:?}", x);
-        }
-    }
-
-    match state {
-        State::Ok => {}
-        _ => unreachable!("expected final state to be `State::Ok`"),
-    }
-
+pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
+    let mut tokenizer = Tokenizer::new(point, index);
+    tokenizer.feed(codes, Box::new(before), true);
     tokenizer.events
 }
diff --git a/src/lib.rs b/src/lib.rs
index 1624a22..cf0b05b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@ mod constant;
 mod construct;
 mod content;
 mod parser;
+mod subtokenize;
 mod tokenizer;
 mod util;
diff --git a/src/parser.rs b/src/parser.rs
index e156e33..5648942 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,13 +2,21 @@
 // To do: this should start with `containers`, when they’re done.
 // To do: definitions and such will mean more data has to be passed around.
 use crate::content::flow::flow;
-use crate::tokenizer::{as_codes, Code, Event};
+use crate::tokenizer::{as_codes, Code, Event, Point};
 
 /// Turn a string of markdown into events.
 /// Passes the codes back so the compiler can access the source.
 pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) {
     let codes = as_codes(value);
     // To do: pass a reference to this around, and slices in the (back)feeding. Might be tough.
-    let events = flow(&codes);
+    let events = flow(
+        &codes,
+        Point {
+            line: 1,
+            column: 1,
+            offset: 0,
+        },
+        0,
+    );
     (events, codes)
 }
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
new file mode 100644
index 0000000..c1a8435
--- /dev/null
+++ b/src/subtokenize.rs
@@ -0,0 +1,67 @@
+use crate::content::string::string;
+use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::util::{slice_codes, Span};
+
+pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> Vec<Event> {
+    let mut events = events;
+    let mut index = 0;
+
+    // println!("before");
+    // while index < events.len() {
+    //     let event = &events[index];
+    //     println!(
+    //         "ev1: {:?} {:?} {:?}",
+    //         event.event_type, event.token_type, index
+    //     );
+    //     index += 1;
+    // }
+    //
+    // index = 0;
+    //
+    // println!("change");
+
+    while index < events.len() {
+        let event = &events[index];
+
+        // println!(
+        //     "ev2: {:?} {:?} {:?}",
+        //     event.event_type, event.token_type, index
+        // );
+
+        if event.event_type == EventType::Enter && event.token_type == TokenType::ChunkString {
+            let exit = &events[index + 1];
+
+            assert_eq!(
+                exit.event_type,
+                EventType::Exit,
+                "expected `enter` of `{:?}` to be followed by an `exit` event",
+                event.token_type
+            );
+            assert_eq!(
+                exit.token_type, event.token_type,
+                "expected `exit` of `{:?}` to follow its `enter` event",
+                event.token_type
+            );
+
+            let subevents = string(
+                slice_codes(
+                    codes,
+                    &Span {
+                        start_index: event.index,
+                        end_index: exit.index,
+                    },
+                ),
+                event.point.clone(),
+                event.index,
+            );
+            let len = subevents.len();
+            // To do: recursion needed?
+            events.splice(index..(index + 2), subevents);
+            index += len;
+        } else {
+            index += 1;
+        }
+    }
+
+    events
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index faee8d9..35e768e 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -177,16 +177,12 @@ pub struct Tokenizer {
 
 impl Tokenizer {
     /// Create a new tokenizer.
-    pub fn new() -> Tokenizer {
+    pub fn new(point: Point, index: usize) -> Tokenizer {
         Tokenizer {
             current: Code::None,
-            index: 0,
+            index,
             consumed: true,
-            point: Point {
-                line: 1,
-                column: 1,
-                offset: 0,
-            },
+            point,
             stack: vec![],
             events: vec![],
         }
@@ -499,6 +495,11 @@ impl Tokenizer {
             }
         }
 
+        match state {
+            State::Ok => {}
+            _ => unreachable!("expected final state to be `State::Ok`"),
+        }
+
         check_statefn_result((state, None))
     }
 }
diff --git a/src/util.rs b/src/util.rs
index 47359a3..5a916cd 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -165,12 +165,12 @@ pub fn get_span(events: &[Event], index: usize) -> Span {
     assert_eq!(
         exit.event_type,
         EventType::Exit,
-        "expected get_span to be called on `exit` event"
+        "expected `get_span` to be called on `exit` event"
     );
-    let mut start_index = index - 1;
+    let mut enter_index = index - 1;
 
     loop {
-        let enter = &events[start_index];
+        let enter = &events[enter_index];
         if enter.event_type == EventType::Enter && enter.token_type == token_type {
             return Span {
                 // start: enter.point.clone(),
@@ -181,7 +181,7 @@ pub fn get_span(events: &[Event], index: usize) -> Span {
             };
         }
 
-        start_index -= 1;
+        enter_index -= 1;
     }
 }
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
index 46fa9cb..6419f67 100644
--- a/tests/code_fenced.rs
+++ b/tests/code_fenced.rs
@@ -219,12 +219,11 @@ fn code_fenced() {
         "should support an eof in the prefix, in content"
     );
 
-    // To do: strings.
-    // assert_eq!(
-    //     micromark("```j\\+s&copy;"),
-    //     "<pre><code class=\"language-j+s©\"></code></pre>\n",
-    //     "should support character escapes and character references in info strings"
-    // );
+    assert_eq!(
+        micromark("```j\\+s&copy;"),
+        "<pre><code class=\"language-j+s©\"></code></pre>\n",
+        "should support character escapes and character references in info strings"
+    );
 
     assert_eq!(
         micromark(" ```\naaa\n ```"),
-- 
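The heart of the new `src/subtokenize.rs` is the splice: each `ChunkString` enter/exit pair is cut out of the event stream and replaced by the events produced by retokenizing that span as string content, after which `index += len` jumps past the spliced-in events so they are not rescanned. Below is a minimal, self-contained sketch of that pattern; `Ev`, `splice_chunks`, and the stubbed `retokenize` closure are hypothetical stand-ins for illustration, not the crate's actual `Event`/`Tokenizer` API.

// Simplified stand-in for the crate's event type; the real `Event` also
// carries a token type, a `Point`, and an index into the codes.
#[derive(Clone, Debug, PartialEq)]
enum Ev {
    Enter(&'static str),
    Exit(&'static str),
    Char(char),
}

// Replace each `Enter("chunk")`/`Exit("chunk")` pair with the events a
// nested tokenizer produced for that span, then advance past the new
// events so the pass stays linear and never rescans them.
fn splice_chunks(mut events: Vec<Ev>, retokenize: impl Fn() -> Vec<Ev>) -> Vec<Ev> {
    let mut index = 0;
    while index < events.len() {
        if events[index] == Ev::Enter("chunk") {
            // As in the patch: a chunk enter must be directly followed by its exit.
            assert_eq!(events[index + 1], Ev::Exit("chunk"));
            let subevents = retokenize();
            let len = subevents.len();
            events.splice(index..index + 2, subevents);
            index += len;
        } else {
            index += 1;
        }
    }
    events
}

fn main() {
    let events = vec![Ev::Enter("chunk"), Ev::Exit("chunk"), Ev::Char('!')];
    // Pretend the nested string tokenizer found one character escape.
    let out = splice_chunks(events, || {
        vec![Ev::Enter("escape"), Ev::Char('+'), Ev::Exit("escape")]
    });
    assert_eq!(out.len(), 4);
}

Skipping ahead by the length of the spliced-in events is what keeps the pass single-level; whether chunks can nest and force a recursive pass is exactly the `// To do: recursion needed?` question left open in the patch.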