diff options
Diffstat (limited to '')
-rw-r--r-- | src/content/flow.rs | 258 | ||||
-rw-r--r-- | src/content/mod.rs | 4 | ||||
-rw-r--r-- | src/content/string.rs | 120 |
3 files changed, 382 insertions, 0 deletions
diff --git a/src/content/flow.rs b/src/content/flow.rs new file mode 100644 index 0000000..21c5721 --- /dev/null +++ b/src/content/flow.rs @@ -0,0 +1,258 @@ +//! The flow content type. +//! +//! **Flow** represents the sections, such as headings, code, and content, which +//! is parsed per line. +//! An example is HTML, which has a certain starting condition (such as +//! `<script>` on its own line), then continues for a while, until an end +//! condition is found (such as `</style>`). +//! If that line with an end condition is never found, that flow goes until +//! the end. +//! +//! The constructs found in flow are: +//! +//! * [Blank line][crate::construct::blank_line] +//! * [Code (fenced)][crate::construct::code_fenced] +//! * [Code (indented)][crate::construct::code_indented] +//! * [Heading (atx)][crate::construct::heading_atx] +//! * [HTML (flow)][crate::construct::html_flow] +//! * [Thematic break][crate::construct::thematic_break] +//! +//! <!-- To do: `setext` in content? Link to content. --> + +use crate::construct::{ + blank_line::start as blank_line, code_fenced::start as code_fenced, + code_indented::start as code_indented, heading_atx::start as heading_atx, + html_flow::start as html_flow, partial_whitespace::start as whitespace, + thematic_break::start as thematic_break, +}; +use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer}; + +/// Turn `codes` as the flow content type into events. +// To do: remove this `allow` when all the content types are glued together. +#[allow(dead_code)] +pub fn flow(codes: Vec<Code>) -> Vec<Event> { + let mut tokenizer = Tokenizer::new(); + let (state, remainder) = tokenizer.feed(codes, Box::new(start), true); + + if let Some(ref x) = remainder { + if !x.is_empty() { + unreachable!("expected no final remainder {:?}", x); + } + } + + match state { + State::Ok => {} + _ => unreachable!("expected final state to be `State::Ok`"), + } + + tokenizer.events +} + +/// Before flow. +/// +/// First we assume a blank line. +// +/// ```markdown +/// | +/// |## alpha +/// | bravo +/// |*** +/// ``` +fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt(blank_line, |ok| { + Box::new(if ok { blank_line_after } else { initial_before }) + })(tokenizer, code), + } +} + +/// After a blank line. +/// +/// Move to `start` afterwards. +/// +/// ```markdown +/// ␠␠| +/// ``` +fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::BlankLineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::BlankLineEnding); + (State::Fn(Box::new(start)), None) + } + _ => unreachable!("expected eol/eof after blank line `{:?}`", code), + } +} + +/// Before flow (initial). +/// +/// “Initial” flow means unprefixed flow, so right at the start of a line. +/// Interestingly, the only flow (initial) construct is indented code. +/// Move to `before` afterwards. +/// +/// ```markdown +/// |qwe +/// | asd +/// ``` +fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt(code_indented, |ok| { + Box::new(if ok { + after + } else { + initial_before_not_code_indented + }) + })(tokenizer, code), + } +} + +/// After a flow construct. +/// +/// ```markdown +/// ## alpha| +/// | +/// ~~~js +/// asd +/// ~~~| +/// ``` +fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(start)), None) + } + _ => unreachable!("unexpected non-eol/eof after flow `{:?}`", code), + } +} + +/// Before flow (initial), but not at code (indented). +/// +/// ```markdown +/// |qwe +/// ``` +fn initial_before_not_code_indented(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt(code_fenced, |ok| { + Box::new(if ok { + after + } else { + initial_before_not_code_fenced + }) + })(tokenizer, code), + } +} + +/// Before flow (initial), but not at code (fenced). +/// +/// ```markdown +/// |qwe +/// ``` +fn initial_before_not_code_fenced(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt(html_flow, |ok| Box::new(if ok { after } else { before }))( + tokenizer, code, + ), + } +} + +/// Before flow, but not at code (indented) or code (fenced). +/// +/// Compared to flow (initial), normal flow can be arbitrarily prefixed. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(before_after_prefix), + )(tokenizer, code) +} + +/// Before flow, after potential whitespace. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt(heading_atx, |ok| { + Box::new(if ok { after } else { before_not_heading_atx }) + })(tokenizer, code) +} + +/// Before flow, but not before a heading (atx) +/// +/// ```markdown +/// |qwe +/// ``` +pub fn before_not_heading_atx(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt(thematic_break, |ok| { + Box::new(if ok { after } else { before_not_thematic_break }) + })(tokenizer, code) +} + +/// Before flow, but not before a heading (atx) or thematic break. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn before_not_thematic_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt(html_flow, |ok| { + Box::new(if ok { after } else { content_before }) + })(tokenizer, code) +} + +/// Before flow, but not before a heading (atx) or thematic break. +/// +/// At this point, we’re at content (zero or more definitions and zero or one +/// paragraph/setext heading). +/// +/// ```markdown +/// |qwe +/// ``` +// To do: currently only parses a single line. +// To do: +// - Multiline +// - One or more definitions. +// - Setext heading. +fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + after(tokenizer, code) + } + _ => { + tokenizer.enter(TokenType::Content); + tokenizer.enter(TokenType::ContentPhrasing); + tokenizer.consume(code); + (State::Fn(Box::new(content)), None) + } + } +} +/// In content. +/// +/// ```markdown +/// al|pha +/// ``` +// To do: lift limitations as documented above. +fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::Char('\n' | '\r') => { + tokenizer.exit(TokenType::ContentPhrasing); + tokenizer.exit(TokenType::Content); + after(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(content)), None) + } + } +} diff --git a/src/content/mod.rs b/src/content/mod.rs new file mode 100644 index 0000000..d5771a3 --- /dev/null +++ b/src/content/mod.rs @@ -0,0 +1,4 @@ +//! Content types found in markdown. + +pub mod flow; +pub mod string; diff --git a/src/content/string.rs b/src/content/string.rs new file mode 100644 index 0000000..a8a81b2 --- /dev/null +++ b/src/content/string.rs @@ -0,0 +1,120 @@ +//! The string content type. +//! +//! **String** is a limited **text** like content type which only allows +//! character escapes and character references. +//! It exists in things such as identifiers (media references, definitions), +//! titles, URLs, code (fenced) info and meta parts. +//! +//! The constructs found in strin are: +//! +//! * [Character escape][crate::construct::character_escape] +//! * [Character reference][crate::construct::character_reference] + +use crate::construct::{ + character_escape::start as character_escape, character_reference::start as character_reference, +}; +use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer}; + +/// Turn `codes` as the string content type into events. +// To do: remove this `allow` when all the content types are glued together. +#[allow(dead_code)] +pub fn string(codes: Vec<Code>) -> Vec<Event> { + let mut tokenizer = Tokenizer::new(); + let (state, remainder) = tokenizer.feed(codes, Box::new(before), true); + + if let Some(ref x) = remainder { + if !x.is_empty() { + unreachable!("expected no final remainder {:?}", x); + } + } + + match state { + State::Ok => {} + _ => unreachable!("expected final state to be `State::Ok`"), + } + + tokenizer.events +} + +/// Before string. +/// +/// First we assume character reference. +/// +/// ```markdown +/// |& +/// |\& +/// |qwe +/// ``` +fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt(character_reference, |ok| { + Box::new(if ok { + before + } else { + before_not_character_reference + }) + })(tokenizer, code), + } +} + +/// Before string, not at a character reference. +/// +/// Assume character escape. +/// +/// ```markdown +/// |\& +/// |qwe +/// ``` +fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => (State::Ok, None), + _ => tokenizer.attempt(character_escape, |ok| { + Box::new(if ok { + before + } else { + before_not_character_escape + }) + })(tokenizer, code), + } +} + +/// Before string, not at a character reference or character escape. +/// +/// We’re at data. +/// +/// ```markdown +/// |qwe +/// ``` +fn before_not_character_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + if let Code::None = code { + (State::Ok, None) + } else { + tokenizer.enter(TokenType::Data); + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } +} + +/// In data. +/// +/// ```markdown +/// q|w|e +/// ``` +fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => { + tokenizer.exit(TokenType::Data); + (State::Ok, None) + } + // To do: somehow get these markers from constructs. + Code::Char('&' | '\\') => { + tokenizer.exit(TokenType::Data); + before(tokenizer, code) + } + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } + } +} |