author    Titus Wormer <tituswormer@gmail.com>  2022-06-08 15:52:16 +0200
committer Titus Wormer <tituswormer@gmail.com>  2022-06-08 15:52:16 +0200
commit    4c06c8554c35887f8f5147783953b2b7e7c2327f (patch)
tree      1b2463848a3ae4c645f7f1a325877ee829ab65c5 /src/content
.
Diffstat (limited to '')
-rw-r--r--  src/content/flow.rs    258
-rw-r--r--  src/content/mod.rs       4
-rw-r--r--  src/content/string.rs  120
3 files changed, 382 insertions, 0 deletions
diff --git a/src/content/flow.rs b/src/content/flow.rs
new file mode 100644
index 0000000..21c5721
--- /dev/null
+++ b/src/content/flow.rs
@@ -0,0 +1,258 @@
+//! The flow content type.
+//!
+//! **Flow** represents the sections, such as headings, code, and content, which
+//! are parsed per line.
+//! An example is HTML, which has a certain starting condition (such as
+//! `<script>` on its own line), then continues for a while, until an end
+//! condition is found (such as `</style>`).
+//! If that line with an end condition is never found, that flow goes until
+//! the end.
+//!
+//! The constructs found in flow are:
+//!
+//! * [Blank line][crate::construct::blank_line]
+//! * [Code (fenced)][crate::construct::code_fenced]
+//! * [Code (indented)][crate::construct::code_indented]
+//! * [Heading (atx)][crate::construct::heading_atx]
+//! * [HTML (flow)][crate::construct::html_flow]
+//! * [Thematic break][crate::construct::thematic_break]
+//!
+//! <!-- To do: `setext` in content? Link to content. -->
+
+use crate::construct::{
+ blank_line::start as blank_line, code_fenced::start as code_fenced,
+ code_indented::start as code_indented, heading_atx::start as heading_atx,
+ html_flow::start as html_flow, partial_whitespace::start as whitespace,
+ thematic_break::start as thematic_break,
+};
+use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+
+/// Turn `codes`, as the flow content type, into events.
+// To do: remove this `allow` when all the content types are glued together.
+#[allow(dead_code)]
+pub fn flow(codes: Vec<Code>) -> Vec<Event> {
+ let mut tokenizer = Tokenizer::new();
+ let (state, remainder) = tokenizer.feed(codes, Box::new(start), true);
+
+ if let Some(ref x) = remainder {
+ if !x.is_empty() {
+ unreachable!("expected no final remainder {:?}", x);
+ }
+ }
+
+ match state {
+ State::Ok => {}
+ _ => unreachable!("expected final state to be `State::Ok`"),
+ }
+
+ tokenizer.events
+}
+
+/// Before flow.
+///
+/// First we assume a blank line.
+///
+/// ```markdown
+/// |
+/// |## alpha
+/// | bravo
+/// |***
+/// ```
+fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(blank_line, |ok| {
+ Box::new(if ok { blank_line_after } else { initial_before })
+ })(tokenizer, code),
+ }
+}
+
+/// After a blank line.
+///
+/// Move to `start` afterwards.
+///
+/// ```markdown
+/// ␠␠|
+/// ```
+fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::BlankLineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::BlankLineEnding);
+ (State::Fn(Box::new(start)), None)
+ }
+ _ => unreachable!("expected eol/eof after blank line `{:?}`", code),
+ }
+}
+
+/// Before flow (initial).
+///
+/// “Initial” flow means unprefixed flow, so right at the start of a line.
+/// Interestingly, the only flow (initial) construct is indented code.
+/// Move to `before` afterwards.
+///
+/// ```markdown
+/// |qwe
+/// | asd
+/// ```
+fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(code_indented, |ok| {
+ Box::new(if ok {
+ after
+ } else {
+ initial_before_not_code_indented
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// After a flow construct.
+///
+/// ```markdown
+/// ## alpha|
+/// |
+/// ~~~js
+/// asd
+/// ~~~|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(start)), None)
+ }
+ _ => unreachable!("unexpected non-eol/eof after flow `{:?}`", code),
+ }
+}
+
+/// Before flow (initial), but not at code (indented).
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn initial_before_not_code_indented(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(code_fenced, |ok| {
+ Box::new(if ok {
+ after
+ } else {
+ initial_before_not_code_fenced
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// Before flow (initial), but not at code (fenced).
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn initial_before_not_code_fenced(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(html_flow, |ok| Box::new(if ok { after } else { before }))(
+ tokenizer, code,
+ ),
+ }
+}
+
+/// Before flow, but not at code (indented) or code (fenced).
+///
+/// Compared to flow (initial), normal flow can be arbitrarily prefixed.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(before_after_prefix),
+ )(tokenizer, code)
+}
+
+/// Before flow, after potential whitespace.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(heading_atx, |ok| {
+ Box::new(if ok { after } else { before_not_heading_atx })
+ })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx).
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_not_heading_atx(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(thematic_break, |ok| {
+ Box::new(if ok { after } else { before_not_thematic_break })
+ })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx) or thematic break.
+///
+/// ```markdown
+/// |qwe
+/// ```
+pub fn before_not_thematic_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(html_flow, |ok| {
+ Box::new(if ok { after } else { content_before })
+ })(tokenizer, code)
+}
+
+/// Before flow, but not before a heading (atx), thematic break, or HTML (flow).
+///
+/// At this point, we’re at content (zero or more definitions and zero or one
+/// paragraph/setext heading).
+///
+/// ```markdown
+/// |qwe
+/// ```
+// To do: currently only parses a single line.
+// To do:
+// - Multiline
+// - One or more definitions.
+// - Setext heading.
+fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ after(tokenizer, code)
+ }
+ _ => {
+ tokenizer.enter(TokenType::Content);
+ tokenizer.enter(TokenType::ContentPhrasing);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(content)), None)
+ }
+ }
+}
+/// In content.
+///
+/// ```markdown
+/// al|pha
+/// ```
+// To do: lift limitations as documented above.
+fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::ContentPhrasing);
+ tokenizer.exit(TokenType::Content);
+ after(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(content)), None)
+ }
+ }
+}
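
The whole of `flow.rs` above is built on one state-as-function pattern: each parsing state is a function that receives a single `Code`, may consume it, and returns either a final `State::Ok` or the next state wrapped in `State::Fn`. The snippet below is a minimal, self-contained sketch of that pattern only; `Code`, `State`, and the single `start` state here are simplified stand-ins for illustration, not the crate's actual `Tokenizer` API.

```rust
// Minimal sketch of the state-as-function pattern used in `flow.rs`.
// `Code`, `State`, and `start` are simplified stand-ins, not the crate's types.

enum Code {
    /// End of input.
    None,
    /// A single character.
    Char(char),
}

enum State {
    /// Done: the state machine finished successfully.
    Ok,
    /// Not done: call this function with the next code.
    Fn(Box<dyn FnOnce(&mut Vec<char>, Code) -> State>),
}

/// The single state in this toy machine: collect characters until the end.
fn start(buffer: &mut Vec<char>, code: Code) -> State {
    match code {
        Code::None => State::Ok,
        Code::Char(c) => {
            buffer.push(c);
            // Stay in this state for the next code.
            State::Fn(Box::new(start))
        }
    }
}

fn main() {
    let mut buffer = Vec::new();
    let mut codes = "## alpha"
        .chars()
        .map(Code::Char)
        .chain(std::iter::once(Code::None));

    // Feed codes one at a time, following whatever state each step returns.
    let mut state = State::Fn(Box::new(start));
    while let State::Fn(func) = state {
        let code = codes.next().expect("a state asked for a code after `Code::None`");
        state = func(&mut buffer, code);
    }

    // The loop only exits once a state function returned `State::Ok`.
    println!("consumed: {:?}", buffer.into_iter().collect::<String>());
}
```
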
diff --git a/src/content/mod.rs b/src/content/mod.rs
new file mode 100644
index 0000000..d5771a3
--- /dev/null
+++ b/src/content/mod.rs
@@ -0,0 +1,4 @@
+//! Content types found in markdown.
+
+pub mod flow;
+pub mod string;
diff --git a/src/content/string.rs b/src/content/string.rs
new file mode 100644
index 0000000..a8a81b2
--- /dev/null
+++ b/src/content/string.rs
@@ -0,0 +1,120 @@
+//! The string content type.
+//!
+//! **String** is a limited **text**-like content type which only allows
+//! character escapes and character references.
+//! It exists in things such as identifiers (media references, definitions),
+//! titles, URLs, code (fenced) info and meta parts.
+//!
+//! The constructs found in string are:
+//!
+//! * [Character escape][crate::construct::character_escape]
+//! * [Character reference][crate::construct::character_reference]
+
+use crate::construct::{
+ character_escape::start as character_escape, character_reference::start as character_reference,
+};
+use crate::tokenizer::{Code, Event, State, StateFnResult, TokenType, Tokenizer};
+
+/// Turn `codes`, as the string content type, into events.
+// To do: remove this `allow` when all the content types are glued together.
+#[allow(dead_code)]
+pub fn string(codes: Vec<Code>) -> Vec<Event> {
+ let mut tokenizer = Tokenizer::new();
+ let (state, remainder) = tokenizer.feed(codes, Box::new(before), true);
+
+ if let Some(ref x) = remainder {
+ if !x.is_empty() {
+ unreachable!("expected no final remainder {:?}", x);
+ }
+ }
+
+ match state {
+ State::Ok => {}
+ _ => unreachable!("expected final state to be `State::Ok`"),
+ }
+
+ tokenizer.events
+}
+
+/// Before string.
+///
+/// First we assume a character reference.
+///
+/// ```markdown
+/// |&amp;
+/// |\&
+/// |qwe
+/// ```
+fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(character_reference, |ok| {
+ Box::new(if ok {
+ before
+ } else {
+ before_not_character_reference
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// Before string, not at a character reference.
+///
+/// Assume character escape.
+///
+/// ```markdown
+/// |\&
+/// |qwe
+/// ```
+fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ _ => tokenizer.attempt(character_escape, |ok| {
+ Box::new(if ok {
+ before
+ } else {
+ before_not_character_escape
+ })
+ })(tokenizer, code),
+ }
+}
+
+/// Before string, not at a character reference or character escape.
+///
+/// We’re at data.
+///
+/// ```markdown
+/// |qwe
+/// ```
+fn before_not_character_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ if let Code::None = code {
+ (State::Ok, None)
+ } else {
+ tokenizer.enter(TokenType::Data);
+ tokenizer.consume(code);
+ (State::Fn(Box::new(in_data)), None)
+ }
+}
+
+/// In data.
+///
+/// ```markdown
+/// q|w|e
+/// ```
+fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => {
+ tokenizer.exit(TokenType::Data);
+ (State::Ok, None)
+ }
+ // To do: somehow get these markers from constructs.
+ Code::Char('&' | '\\') => {
+ tokenizer.exit(TokenType::Data);
+ before(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(in_data)), None)
+ }
+ }
+}
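
As a rough illustration of what `string.rs` above does, the sketch below alternates between trying a character reference, trying a character escape, and collecting plain data, in the same order as `before` → `before_not_character_reference` → `before_not_character_escape`. The `Token` shapes and the two little recognizers are illustrative assumptions that cover only the simplest forms, not the crate's real constructs.

```rust
// Sketch of the string content flow: at each position, first try a character
// reference, then a character escape, and otherwise collect data until the
// next `&` or `\` marker. `Token` and both recognizers are stand-ins.

#[derive(Debug)]
enum Token {
    CharacterReference(String), // e.g. `&amp;`
    CharacterEscape(char),      // e.g. `\&` escapes `&`
    Data(String),               // everything else
}

/// Recognize `&name;` (named references only) at `index`.
fn character_reference(chars: &[char], index: usize) -> Option<(Token, usize)> {
    if chars.get(index) != Some(&'&') {
        return None;
    }
    let end = chars[index + 1..].iter().position(|c| *c == ';')?;
    let name: String = chars[index + 1..index + 1 + end].iter().collect();
    if name.is_empty() || !name.chars().all(|c| c.is_ascii_alphanumeric()) {
        return None;
    }
    // Skip past `&`, the name, and `;`.
    Some((Token::CharacterReference(format!("&{};", name)), index + end + 2))
}

/// Recognize a backslash followed by ASCII punctuation at `index`.
fn character_escape(chars: &[char], index: usize) -> Option<(Token, usize)> {
    if chars.get(index) != Some(&'\\') {
        return None;
    }
    let next = *chars.get(index + 1)?;
    if next.is_ascii_punctuation() {
        Some((Token::CharacterEscape(next), index + 2))
    } else {
        None
    }
}

/// Turn a value, as the string content type, into tokens.
fn string(value: &str) -> Vec<Token> {
    let chars: Vec<char> = value.chars().collect();
    let mut tokens = Vec::new();
    let mut data = String::new();
    let mut index = 0;

    while index < chars.len() {
        // First assume a character reference, then a character escape
        // (mirroring `before` and `before_not_character_reference`).
        let attempt = character_reference(&chars, index)
            .or_else(|| character_escape(&chars, index));

        if let Some((token, next)) = attempt {
            if !data.is_empty() {
                tokens.push(Token::Data(std::mem::take(&mut data)));
            }
            tokens.push(token);
            index = next;
        } else {
            // Otherwise this is data.
            data.push(chars[index]);
            index += 1;
        }
    }

    if !data.is_empty() {
        tokens.push(Token::Data(data));
    }

    tokens
}

fn main() {
    println!("{:?}", string(r"a \& b &amp; c"));
    // [Data("a "), CharacterEscape('&'), Data(" b "), CharacterReference("&amp;"), Data(" c")]
}
```
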