From 17f4eec55ad0a5f74aedbcff6c2f0119ad52e584 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Fri, 10 Jun 2022 16:47:43 +0200 Subject: Add text content type * Add character reference and character escapes in text * Add recursive subtokenization --- src/content/flow.rs | 14 +++++---- src/content/mod.rs | 1 + src/content/string.rs | 10 ++----- src/content/text.rs | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 13 deletions(-) create mode 100644 src/content/text.rs (limited to 'src/content') diff --git a/src/content/flow.rs b/src/content/flow.rs index 0d1bd22..6fa8c25 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -34,7 +34,11 @@ use crate::util::get_span; pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec { let mut tokenizer = Tokenizer::new(point, index); tokenizer.feed(codes, Box::new(start), true); - subtokenize(tokenizer.events, codes) + let mut result = (tokenizer.events, false); + while !result.1 { + result = subtokenize(result.0, codes); + } + result.0 } /// Before flow. 
@@ -165,7 +169,7 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } _ => { tokenizer.enter(TokenType::Content); - tokenizer.enter(TokenType::ContentChunk); + tokenizer.enter(TokenType::ChunkContent); content(tokenizer, code, tokenizer.events.len() - 1) } } @@ -259,8 +263,8 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult { tokenizer.consume(code); - tokenizer.exit(TokenType::ContentChunk); - tokenizer.enter(TokenType::ContentChunk); + tokenizer.exit(TokenType::ChunkContent); + tokenizer.enter(TokenType::ChunkContent); let next_index = tokenizer.events.len() - 1; tokenizer.events[previous_index].next = Some(next_index); tokenizer.events[next_index].previous = Some(previous_index); @@ -271,7 +275,7 @@ fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize } fn content_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::ContentChunk); + tokenizer.exit(TokenType::ChunkContent); tokenizer.exit(TokenType::Content); after(tokenizer, code) } diff --git a/src/content/mod.rs b/src/content/mod.rs index 4c0a7f4..d13df79 100644 --- a/src/content/mod.rs +++ b/src/content/mod.rs @@ -4,3 +4,4 @@ pub mod content; pub mod flow; pub mod string; +pub mod text; diff --git a/src/content/string.rs b/src/content/string.rs index ff9e3fc..2723785 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -17,8 +17,6 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; /// Before string. /// -/// First we assume character reference. 
-/// /// ```markdown /// |& /// |\& @@ -28,11 +26,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), _ => tokenizer.attempt_2(character_reference, character_escape, |ok| { - Box::new(if ok { - start - } else { - before_not_character_escape - }) + Box::new(if ok { start } else { before_data }) })(tokenizer, code), } } @@ -44,7 +38,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ```markdown /// |qwe /// ``` -fn before_not_character_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if let Code::None = code { (State::Ok, None) } else { diff --git a/src/content/text.rs b/src/content/text.rs new file mode 100644 index 0000000..2c93b18 --- /dev/null +++ b/src/content/text.rs @@ -0,0 +1,80 @@ +//! The text content type. +//! +//! **Text** contains phrasing content such as attention (emphasis, strong), +//! media (links, images), and actual text. +//! +//! The constructs found in text are: +//! +//! * Autolink +//! * Attention +//! * HTML (text) +//! * Hard break escape +//! * Code (text) +//! * Line ending +//! * Label start (image) +//! * Label start (link) +//! * [Character escape][crate::construct::character_escape] +//! * [Character reference][crate::construct::character_reference] + +use crate::construct::{ + character_escape::start as character_escape, character_reference::start as character_reference, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; + +/// Before text. +/// +/// First we assume character reference. 
+/// +/// ```markdown +/// |& +/// |\& +/// |qwe +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt_2(character_reference, character_escape, |ok| { + Box::new(if ok { start } else { before_data }) + })(tokenizer, code), + } +} + +/// Before data. +/// +/// We’re at data. +/// +/// ```markdown +/// |qwe +/// ``` +fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::None = code { + (State::Ok, None) + } else { + tokenizer.enter(TokenType::Data); + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } +} + +/// In data. +/// +/// ```markdown +/// q|w|e +/// ``` +fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => { + tokenizer.exit(TokenType::Data); + (State::Ok, None) + } + // To do: somehow get these markers from constructs. + Code::Char('&' | '\\') => { + tokenizer.exit(TokenType::Data); + start(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } + } +} -- cgit