From 17f4eec55ad0a5f74aedbcff6c2f0119ad52e584 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 10 Jun 2022 16:47:43 +0200 Subject: Add text content type * Add character reference and character escapes in text * Add recursive subtokenization --- readme.md | 6 ++-- src/compiler.rs | 4 +-- src/content/flow.rs | 14 +++++---- src/content/mod.rs | 1 + src/content/string.rs | 10 ++----- src/content/text.rs | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/subtokenize.rs | 14 ++++++--- src/tokenizer.rs | 2 +- 8 files changed, 107 insertions(+), 24 deletions(-) create mode 100644 src/content/text.rs diff --git a/readme.md b/readme.md index cf42885..527170d 100644 --- a/readme.md +++ b/readme.md @@ -123,18 +123,18 @@ cargo doc --document-private-items - [ ] (8) container - [ ] block quote - [ ] list -- [ ] (1) flow +- [x] (1) flow - [x] blank line - [x] code (fenced) - [x] code (indented) - - [ ] content + - [x] content - [x] heading (atx) - [x] html (flow) - [x] thematic break - [ ] (3) content - [ ] definition - [ ] heading (setext) - - [ ] paragraph + - [x] paragraph - [ ] (5) text - [ ] attention (strong, emphasis) (text) - [ ] autolink diff --git a/src/compiler.rs b/src/compiler.rs index 05a56e1..48983b6 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -79,7 +79,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St | TokenType::HtmlFlowData | TokenType::CodeFencedFence | TokenType::CodeFencedFenceSequence - | TokenType::ChunkText | TokenType::CodeFencedFenceWhitespace | TokenType::Data | TokenType::CharacterEscape @@ -280,8 +279,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St character_reference_kind = None; } // This branch below currently acts as the resulting `data` tokens. - // To do: `ChunkText` does not belong here. Remove it when subtokenization is supported. - TokenType::ChunkText | TokenType::Data | TokenType::CharacterEscapeValue => { + TokenType::Data | TokenType::CharacterEscapeValue => { // last_was_tag = false; buf_tail_mut(buffers).push(encode(&slice_serialize( codes, diff --git a/src/content/flow.rs b/src/content/flow.rs index 0d1bd22..6fa8c25 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -34,7 +34,11 @@ use crate::util::get_span; pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec { let mut tokenizer = Tokenizer::new(point, index); tokenizer.feed(codes, Box::new(start), true); - subtokenize(tokenizer.events, codes) + let mut result = (tokenizer.events, false); + while !result.1 { + result = subtokenize(result.0, codes); + } + result.0 } /// Before flow. @@ -165,7 +169,7 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } _ => { tokenizer.enter(TokenType::Content); - tokenizer.enter(TokenType::ContentChunk); + tokenizer.enter(TokenType::ChunkContent); content(tokenizer, code, tokenizer.events.len() - 1) } } @@ -259,8 +263,8 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult { tokenizer.consume(code); - tokenizer.exit(TokenType::ContentChunk); - tokenizer.enter(TokenType::ContentChunk); + tokenizer.exit(TokenType::ChunkContent); + tokenizer.enter(TokenType::ChunkContent); let next_index = tokenizer.events.len() - 1; tokenizer.events[previous_index].next = Some(next_index); tokenizer.events[next_index].previous = Some(previous_index); @@ -271,7 +275,7 @@ fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize } fn content_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::ContentChunk); + tokenizer.exit(TokenType::ChunkContent); tokenizer.exit(TokenType::Content); after(tokenizer, code) } diff --git a/src/content/mod.rs b/src/content/mod.rs index 4c0a7f4..d13df79 100644 --- a/src/content/mod.rs +++ b/src/content/mod.rs @@ -4,3 +4,4 @@ pub mod content; pub mod flow; pub mod string; +pub mod text; diff --git a/src/content/string.rs b/src/content/string.rs index ff9e3fc..2723785 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -17,8 +17,6 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Before string. /// -/// First we assume character reference. -/// /// ```markdown /// |& /// |\& @@ -28,11 +26,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), _ => tokenizer.attempt_2(character_reference, character_escape, |ok| { - Box::new(if ok { - start - } else { - before_not_character_escape - }) + Box::new(if ok { start } else { before_data }) })(tokenizer, code), } } @@ -44,7 +38,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ```markdown /// |qwe /// ``` -fn before_not_character_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if let Code::None = code { (State::Ok, None) } else { diff --git a/src/content/text.rs b/src/content/text.rs new file mode 100644 index 0000000..2c93b18 --- /dev/null +++ b/src/content/text.rs @@ -0,0 +1,80 @@ +//! The text content type. +//! +//! **Text** contains phrasing content such as attention (emphasis, strong), +//! media (links, images), and actual text. +//! +//! The constructs found in text are: +//! +//! * Autolink +//! * Attention +//! * HTML (text) +//! * Hard break escape +//! * Code (text) +//! * Line ending +//! * Label start (image) +//! * Label start (link) +//! * [Character escape][crate::construct::character_escape] +//! * [Character reference][crate::construct::character_reference] + +use crate::construct::{ + character_escape::start as character_escape, character_reference::start as character_reference, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Before text. +/// +/// First we assume character reference. +/// +/// ```markdown +/// |& +/// |\& +/// |qwe +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt_2(character_reference, character_escape, |ok| { + Box::new(if ok { start } else { before_data }) + })(tokenizer, code), + } +} + +/// Before text. +/// +/// We’re at data. +/// +/// ```markdown +/// |qwe +/// ``` +fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::None = code { + (State::Ok, None) + } else { + tokenizer.enter(TokenType::Data); + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } +} + +/// In data. +/// +/// ```markdown +/// q|w|e +/// ``` +fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => { + tokenizer.exit(TokenType::Data); + (State::Ok, None) + } + // To do: somehow get these markers from constructs. + Code::Char('&' | '\\') => { + tokenizer.exit(TokenType::Data); + start(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } + } +} diff --git a/src/subtokenize.rs b/src/subtokenize.rs index adf843f..d72eb69 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -1,37 +1,43 @@ use crate::content::content::start as content; use crate::content::string::start as string; +use crate::content::text::start as text; use crate::tokenizer::{ Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer, }; use crate::util::{slice_codes, Span}; use std::collections::HashMap; -pub fn subtokenize(events: Vec, codes: &[Code]) -> Vec { +pub fn subtokenize(events: Vec, codes: &[Code]) -> (Vec, bool) { let mut events = events; let mut index = 0; // Map of first chunks its tokenizer. let mut head_to_tokenizer: HashMap = HashMap::new(); // Map of chunks to their head and corresponding range of events. let mut link_to_info: HashMap = HashMap::new(); + let mut done = true; while index < events.len() { let event = &events[index]; // Find each first opening chunk. if (event.token_type == TokenType::ChunkString - || event.token_type == TokenType::ContentChunk) && + || event.token_type == TokenType::ChunkText + || event.token_type == TokenType::ChunkContent) && event.event_type == EventType::Enter && // No need to enter linked events again. event.previous == None { + done = false; // Index into `events` pointing to a chunk. let mut index_opt: Option = Some(index); // Subtokenizer. let mut tokenizer = Tokenizer::new(event.point.clone(), event.index); // Substate. let mut result: StateFnResult = ( - State::Fn(Box::new(if event.token_type == TokenType::ContentChunk { + State::Fn(Box::new(if event.token_type == TokenType::ChunkContent { content + } else if event.token_type == TokenType::ChunkText { + text } else { string })), @@ -129,5 +135,5 @@ pub fn subtokenize(events: Vec, codes: &[Code]) -> Vec { index -= 1; } - events + (events, done) } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 1746a19..4d235ed 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -63,10 +63,10 @@ pub enum TokenType { BlankLineWhitespace, Content, - ContentChunk, Paragraph, + ChunkContent, ChunkString, ChunkText, } -- cgit