Diffstat
-rw-r--r--  src/compiler.rs         |  23
-rw-r--r--  src/content/content.rs  |  84
-rw-r--r--  src/content/flow.rs     |  45
-rw-r--r--  src/content/mod.rs      |   2
-rw-r--r--  src/content/string.rs   |  42
-rw-r--r--  src/subtokenize.rs      | 166
-rw-r--r--  src/tokenizer.rs        |  40
7 files changed, 279 insertions, 123 deletions
diff --git a/src/compiler.rs b/src/compiler.rs index 3632d29..05a56e1 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -38,7 +38,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St match event.event_type { EventType::Enter => match token_type { - TokenType::Content => { + TokenType::Paragraph => { buf_tail_mut(buffers).push("<p>".to_string()); } TokenType::CodeIndented => { @@ -62,7 +62,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St ignore_encode = true; } } - TokenType::ContentChunk + TokenType::Content | TokenType::AtxHeading | TokenType::AtxHeadingSequence | TokenType::AtxHeadingWhitespace @@ -79,7 +79,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::HtmlFlowData | TokenType::CodeFencedFence | TokenType::CodeFencedFenceSequence - | TokenType::ChunkString + | TokenType::ChunkText | TokenType::CodeFencedFenceWhitespace | TokenType::Data | TokenType::CharacterEscape @@ -97,7 +97,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St } }, EventType::Exit => match token_type { - TokenType::ThematicBreakSequence + TokenType::Content + | TokenType::ThematicBreakSequence | TokenType::ThematicBreakWhitespace | TokenType::CodeIndentedPrefixWhitespace | TokenType::BlankLineEnding @@ -120,7 +121,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St // last_was_tag = false; buf_tail_mut(buffers).push(res); } - TokenType::Content => { + TokenType::Paragraph => { buf_tail_mut(buffers).push("</p>".to_string()); } TokenType::CodeIndented | TokenType::CodeFenced => { @@ -278,17 +279,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St character_reference_kind = None; } - // To do: `ContentPhrasing` should be parsed as phrasing first. // This branch below currently acts as the resulting `data` tokens. - // To do: initial and final whitespace should be handled in `text`. - TokenType::ContentChunk => { - // last_was_tag = false; - buf_tail_mut(buffers).push(encode( - slice_serialize(codes, &get_span(events, index), false).trim(), - )); - } - // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported. - TokenType::ChunkString | TokenType::Data | TokenType::CharacterEscapeValue => { + // To do: `ChunkText` does not belong here. Remove it when subtokenization is supported. + TokenType::ChunkText | TokenType::Data | TokenType::CharacterEscapeValue => { // last_was_tag = false; buf_tail_mut(buffers).push(encode(&slice_serialize( codes, diff --git a/src/content/content.rs b/src/content/content.rs new file mode 100644 index 0000000..7bf692f --- /dev/null +++ b/src/content/content.rs @@ -0,0 +1,84 @@ +//! The `content`, ahum, content type. +//! +//! **Content** is zero or more definitions, and then zero or one paragraph. +//! It’s a weird one, and needed to make certain edge cases around definitions +//! spec compliant. +//! Definitions are unlike other things in markdown, in that they behave like +//! **text** in that they can contain arbitrary line endings, but *have* to end +//! at a line ending. +//! If they end in something else, the whole definition instead is seen as a +//! paragraph. +//! +//! The constructs found in content are: +//! +//! * Definition +//! * Paragraph + +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Before content. 
+/// +/// ```markdown +/// |[x]: y +/// |asd +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("expected non-eol/eof"); + } + _ => paragraph_initial(tokenizer, code) + // To do: definition. + // _ => tokenizer.attempt(definition, |ok| { + // Box::new(if ok { + // a + // } else { + // b + // }) + // })(tokenizer, code), + } +} + +/// Before a paragraph. +/// +/// ```markdown +/// |asd +/// ``` +fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("expected non-eol/eof"); + } + _ => { + tokenizer.enter(TokenType::Paragraph); + tokenizer.enter(TokenType::ChunkText); + data(tokenizer, code) + } + } +} + +/// In a line in a paragraph. +/// +/// ```markdown +/// |\& +/// |qwe +/// ``` +fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => { + tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::Paragraph); + (State::Ok, None) + } + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.consume(code); + tokenizer.exit(TokenType::ChunkText); + tokenizer.enter(TokenType::ChunkText); + (State::Fn(Box::new(data)), None) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(data)), None) + } + } +} diff --git a/src/content/flow.rs b/src/content/flow.rs index 6f94424..0d1bd22 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -31,8 +31,6 @@ use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Toke use crate::util::get_span; /// Turn `codes` as the flow content type into events. -// To do: remove this `allow` when all the content types are glued together. -#[allow(dead_code)] pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { let mut tokenizer = Tokenizer::new(point, index); tokenizer.feed(codes, Box::new(start), true); @@ -49,7 +47,7 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { /// | bravo /// |*** /// ``` -fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), _ => tokenizer.attempt(blank_line, |ok| { @@ -168,7 +166,7 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { _ => { tokenizer.enter(TokenType::Content); tokenizer.enter(TokenType::ContentChunk); - content(tokenizer, code) + content(tokenizer, code, tokenizer.events.len() - 1) } } } @@ -178,21 +176,26 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// al|pha /// ``` // To do: lift limitations as documented above. 
-fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult { match code { - Code::None => { - tokenizer.exit(TokenType::ContentChunk); - content_end(tokenizer, code) - } + Code::None => content_end(tokenizer, code), Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.exit(TokenType::ContentChunk); - tokenizer.check(continuation_construct, |ok| { - Box::new(if ok { content_continue } else { content_end }) + tokenizer.check(continuation_construct, move |ok| { + Box::new(move |t, c| { + if ok { + content_continue(t, c, previous) + } else { + content_end(t, c) + } + }) })(tokenizer, code) } _ => { tokenizer.consume(code); - (State::Fn(Box::new(content)), None) + ( + State::Fn(Box::new(move |t, c| content(t, c, previous))), + None, + ) } } } @@ -254,17 +257,21 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> } } -fn content_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - // To do: should this be part of the content chunk? - // That’s what `micromark-js` does. - tokenizer.enter(TokenType::LineEnding); +fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult { tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); + tokenizer.exit(TokenType::ContentChunk); tokenizer.enter(TokenType::ContentChunk); - (State::Fn(Box::new(content)), None) + let next_index = tokenizer.events.len() - 1; + tokenizer.events[previous_index].next = Some(next_index); + tokenizer.events[next_index].previous = Some(previous_index); + ( + State::Fn(Box::new(move |t, c| content(t, c, next_index))), + None, + ) } fn content_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::ContentChunk); tokenizer.exit(TokenType::Content); after(tokenizer, code) } diff --git a/src/content/mod.rs b/src/content/mod.rs index d5771a3..4c0a7f4 100644 --- a/src/content/mod.rs +++ b/src/content/mod.rs @@ -1,4 +1,6 @@ //! Content types found in markdown. +#[allow(clippy::module_inception)] +pub mod content; pub mod flow; pub mod string; diff --git a/src/content/string.rs b/src/content/string.rs index 64f544b..ff9e3fc 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -5,7 +5,7 @@ //! It exists in things such as identifiers (media references, definitions), //! titles, URLs, code (fenced) info and meta parts. //! -//! The constructs found in strin are: +//! The constructs found in string are: //! //! * [Character escape][crate::construct::character_escape] //! * [Character reference][crate::construct::character_reference] @@ -13,16 +13,7 @@ use crate::construct::{ character_escape::start as character_escape, character_reference::start as character_reference, }; -use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer}; - -/// Turn `codes` as the string content type into events. -// To do: remove this `allow` when all the content types are glued together. -#[allow(dead_code)] -pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> { - let mut tokenizer = Tokenizer::new(point, index); - tokenizer.feed(codes, Box::new(before), true); - tokenizer.events -} +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Before string. 
/// @@ -33,33 +24,12 @@ pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> { /// |\& /// |qwe /// ``` -fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None => (State::Ok, None), - _ => tokenizer.attempt(character_reference, |ok| { - Box::new(if ok { - before - } else { - before_not_character_reference - }) - })(tokenizer, code), - } -} - -/// Before string, not at a character reference. -/// -/// Assume character escape. -/// -/// ```markdown -/// |\& -/// |qwe -/// ``` -fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), - _ => tokenizer.attempt(character_escape, |ok| { + _ => tokenizer.attempt_2(character_reference, character_escape, |ok| { Box::new(if ok { - before + start } else { before_not_character_escape }) @@ -98,7 +68,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { // To do: somehow get these markers from constructs. Code::Char('&' | '\\') => { tokenizer.exit(TokenType::Data); - before(tokenizer, code) + start(tokenizer, code) } _ => { tokenizer.consume(code); diff --git a/src/subtokenize.rs b/src/subtokenize.rs index c1a8435..adf843f 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -1,66 +1,132 @@ -use crate::content::string::string; -use crate::tokenizer::{Code, Event, EventType, TokenType}; +use crate::content::content::start as content; +use crate::content::string::start as string; +use crate::tokenizer::{ + Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer, +}; use crate::util::{slice_codes, Span}; +use std::collections::HashMap; pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> Vec<Event> { let mut events = events; let mut index = 0; - - // println!("before"); - // while index < events.len() { - // let event = &events[index]; - // println!( - // "ev1: {:?} {:?} {:?}", - // event.event_type, event.token_type, index - // ); - // index += 1; - // } - // - // index = 0; - // - // println!("change"); + // Map of first chunks its tokenizer. + let mut head_to_tokenizer: HashMap<usize, Tokenizer> = HashMap::new(); + // Map of chunks to their head and corresponding range of events. + let mut link_to_info: HashMap<usize, (usize, usize, usize)> = HashMap::new(); while index < events.len() { let event = &events[index]; - // println!( - // "ev2: {:?} {:?} {:?}", - // event.event_type, event.token_type, index - // ); + // Find each first opening chunk. + if (event.token_type == TokenType::ChunkString + || event.token_type == TokenType::ContentChunk) && + event.event_type == EventType::Enter && + // No need to enter linked events again. + event.previous == None + { + // Index into `events` pointing to a chunk. + let mut index_opt: Option<usize> = Some(index); + // Subtokenizer. + let mut tokenizer = Tokenizer::new(event.point.clone(), event.index); + // Substate. + let mut result: StateFnResult = ( + State::Fn(Box::new(if event.token_type == TokenType::ContentChunk { + content + } else { + string + })), + None, + ); + // Indices into `codes` of each end of chunk. + let mut ends: Vec<usize> = vec![]; - if event.event_type == EventType::Enter && event.token_type == TokenType::ChunkString { - let exit = &events[index + 1]; + // Loop through chunks to pass them in order to the subtokenizer. 
+ while let Some(index_ptr) = index_opt { + let enter = &events[index_ptr]; + let span = Span { + start_index: enter.index, + end_index: events[index_ptr + 1].index, + }; + ends.push(span.end_index); - assert_eq!( - exit.event_type, - EventType::Exit, - "expected `enter` of `{:?}` to be follow by an `exit` event", - event.token_type - ); - assert_eq!( - exit.token_type, event.token_type, - "expected `exit` of `{:?}` to follow its `enter` event", - event.token_type - ); + if enter.previous != None { + tokenizer.define_skip(&enter.point, span.start_index); + } - let subevents = string( - slice_codes( - codes, - &Span { - start_index: event.index, - end_index: exit.index, - }, - ), - event.point.clone(), - event.index, - ); - let len = subevents.len(); - // To do: recursion needed? - events.splice(index..(index + 2), subevents); - index += len; - } else { - index += 1; + let func: Box<StateFn> = match result.0 { + State::Fn(func) => func, + _ => unreachable!("cannot be ok/nok"), + }; + + result = tokenizer.feed(slice_codes(codes, &span), func, enter.next == None); + + if let Some(ref x) = result.1 { + if !x.is_empty() { + // To do: handle? + unreachable!("subtokenize:remainder {:?}", x); + } + } + + index_opt = enter.next; + } + + // Now, loop through all subevents (and `ends`), to figure out + // which parts belong where. + // Current index. + let mut subindex = 0; + // Index into subevents that starts the current slice. + let mut last_start = 0; + // Counter into `ends`. + let mut end_index = 0; + let mut index_opt: Option<usize> = Some(index); + + while subindex < tokenizer.events.len() { + let subevent = &tokenizer.events[subindex]; + + // Find the first event that starts after the end we’re looking + // for. + // To do: is this logic correct? + if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] { + let link = index_opt.unwrap(); + link_to_info.insert(link, (index, last_start, subindex)); + + last_start = subindex; + end_index += 1; + index_opt = events[link].next; + } + + subindex += 1; + } + + let link = index_opt.unwrap(); + link_to_info.insert(link, (index, last_start, subindex)); + head_to_tokenizer.insert(index, tokenizer); } + + index += 1; + } + + // Now that we fed everything into a tokenizer, and we know which parts + // belong where, the final task is to splice the events from each + // tokenizer into the current events. + // To do: instead of splicing, it might be possible to create a new `events` + // from each slice and slices from events? + let mut index = events.len() - 1; + + while index > 0 { + let slice_opt = link_to_info.get(&index); + + if let Some(slice) = slice_opt { + let (head, start, end) = *slice; + // If there’s a slice at this index, it must also point to a head, + // and that head must have a tokenizer. + let tokenizer = head_to_tokenizer.get(&head).unwrap(); + + // To do: figure out a way that moves instead of clones? + events.splice(index..(index + 2), tokenizer.events[start..end].to_vec()); + } + + index -= 1; } events diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 35e768e..1746a19 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -12,6 +12,7 @@ //! [`check`]: Tokenizer::check use crate::constant::TAB_SIZE; +use std::collections::HashMap; /// Semantic label of a span. // To do: figure out how to share this so extensions can add their own stuff, @@ -64,7 +65,10 @@ pub enum TokenType { Content, ContentChunk, + Paragraph, + ChunkString, + ChunkText, } /// Enum representing a character code. 
@@ -101,7 +105,7 @@ pub struct Point { } /// Possible event types. -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] pub enum EventType { /// The start of something. Enter, @@ -110,12 +114,14 @@ pub enum EventType { } /// Something semantic happening somewhere. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Event { pub event_type: EventType, pub token_type: TokenType, pub point: Point, pub index: usize, + pub previous: Option<usize>, + pub next: Option<usize>, } /// The essence of the state machine are functions: `StateFn`. @@ -156,6 +162,7 @@ struct InternalState { /// A tokenizer itself. #[derive(Debug)] pub struct Tokenizer { + column_start: HashMap<usize, usize>, /// Track whether a character is expected to be consumed, and whether it’s /// actually consumed /// @@ -180,6 +187,7 @@ impl Tokenizer { pub fn new(point: Point, index: usize) -> Tokenizer { Tokenizer { current: Code::None, + column_start: HashMap::new(), index, consumed: true, point, @@ -195,6 +203,28 @@ impl Tokenizer { self.current = code; } + pub fn define_skip(&mut self, point: &Point, index: usize) { + self.column_start.insert(point.line, point.column); + self.account_for_potential_skip(); + log::debug!("position: define skip: `{:?}` ({:?})", point, index); + } + + fn account_for_potential_skip(&mut self) { + println!("account?: {:?} {:?}", self.point, self.index); + match self.column_start.get(&self.point.line) { + None => {} + Some(next_column) => { + if self.point.column == 1 { + let col = *next_column; + self.point.column = col; + self.point.offset += col - 1; + self.index += col - 1; + println!("account! {:?} {:?}", self.point, self.index); + } + } + }; + } + /// Consume the current character. /// Each [`StateFn`][] is expected to call this to signal that this code is /// used, or call a next `StateFn`. @@ -215,7 +245,7 @@ impl Tokenizer { } else { 1 }; - // To do: accountForPotentialSkip() + self.account_for_potential_skip(); log::debug!("position: after eol: `{:?}`", self.point); } Code::VirtualSpace => { @@ -240,6 +270,8 @@ impl Tokenizer { token_type: token_type.clone(), point: self.point.clone(), index: self.index, + previous: None, + next: None, }; self.events.push(event); @@ -270,6 +302,8 @@ impl Tokenizer { token_type, point, index: self.index, + previous: None, + next: None, }; self.events.push(event); |
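
Note: to make the subtokenization change in src/subtokenize.rs easier to follow, here is a minimal, hypothetical Rust sketch of the chunk-linking idea this commit introduces: sibling chunk events point at one another through the new `previous`/`next` fields, so the chunks of one content span can be fed to a sub-tokenizer in order and its events later spliced back over the chunk pairs. The `Chunk` struct, `feed_chain`, and the string concatenation below stand in for the real `Event` fields and `Tokenizer::feed`; none of this is the crate's actual API.

// Hypothetical, simplified model of linked chunks (not the real types).
#[derive(Clone, Debug)]
struct Chunk {
    text: String,
    previous: Option<usize>,
    next: Option<usize>,
}

/// Walk a chain of linked chunks starting at `head` and concatenate their
/// text in order -- a stand-in for feeding each chunk to the sub-tokenizer.
fn feed_chain(chunks: &[Chunk], head: usize) -> String {
    let mut out = String::new();
    let mut index = Some(head);
    while let Some(i) = index {
        out.push_str(&chunks[i].text);
        index = chunks[i].next;
    }
    out
}

fn main() {
    // Two chunks of one paragraph, linked head -> tail.
    let chunks = vec![
        Chunk { text: "alpha\n".into(), previous: None, next: Some(1) },
        Chunk { text: "bravo".into(), previous: Some(0), next: None },
    ];
    assert_eq!(feed_chain(&chunks, 0), "alpha\nbravo");
    println!("fed: {:?}", feed_chain(&chunks, 0));
}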
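
Likewise, a small sketch of what `define_skip` / `account_for_potential_skip` appear to do in src/tokenizer.rs, as far as this diff shows: when a later chunk starts at a column other than 1 on its line, that column is recorded, and when the position tracker crosses a line ending onto that line it jumps the column and index forward so positions keep matching the source. `Position`, `after_eol`, and the example columns below are invented for illustration and are not the real `Tokenizer` fields.

use std::collections::HashMap;

// Hypothetical position tracker with per-line skip columns (illustration only).
#[derive(Debug)]
struct Position {
    line: usize,
    column: usize,
    index: usize,
    column_start: HashMap<usize, usize>,
}

impl Position {
    /// Remember that the chunk on `line` starts at `column`.
    fn define_skip(&mut self, line: usize, column: usize) {
        self.column_start.insert(line, column);
    }

    /// Advance past a line ending, applying a recorded skip if one exists.
    fn after_eol(&mut self) {
        self.line += 1;
        self.column = 1;
        self.index += 1;
        if let Some(col) = self.column_start.get(&self.line).copied() {
            if self.column == 1 {
                self.index += col - 1;
                self.column = col;
            }
        }
    }
}

fn main() {
    let mut pos = Position { line: 1, column: 1, index: 0, column_start: HashMap::new() };
    // Say the chunk on line 2 starts at column 3 (e.g. after a `> ` prefix).
    pos.define_skip(2, 3);
    pos.after_eol();
    assert_eq!((pos.line, pos.column, pos.index), (2, 3, 3));
}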