diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-20 13:40:23 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-20 13:40:23 +0200 |
commit | 61271d73128f8553f8c4c17927828cde52a25eba (patch) | |
tree | 5b812e04f9f9311ae22209843db257f34fc90d8d /src | |
parent | 262aec96cece3e9dd55828397b8ec859e7cff606 (diff) | |
download | markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.tar.gz markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.tar.bz2 markdown-rs-61271d73128f8553f8c4c17927828cde52a25eba.zip |
Add paragraph
Diffstat (limited to 'src')
-rw-r--r-- | src/construct/blank_line.rs | 5 | ||||
-rw-r--r-- | src/construct/code_fenced.rs | 5 | ||||
-rw-r--r-- | src/construct/definition.rs | 5 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 4 | ||||
-rw-r--r-- | src/construct/html_flow.rs | 3 | ||||
-rw-r--r-- | src/construct/mod.rs | 3 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 177 | ||||
-rw-r--r-- | src/construct/partial_destination.rs | 5 | ||||
-rw-r--r-- | src/construct/partial_label.rs | 5 | ||||
-rw-r--r-- | src/construct/partial_title.rs | 5 | ||||
-rw-r--r-- | src/content/flow.rs | 139 |
11 files changed, 215 insertions, 141 deletions
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 7f794b9..fdb1ee0 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -6,7 +6,7 @@ //! blank_line ::= *(' ' '\t') //! ``` //! -//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! Blank lines are sometimes needed, such as to differentiate a [paragraph][] //! from another paragraph. //! In several cases, blank lines are not needed between flow constructs, //! such as between two [heading (atx)][heading-atx]s. @@ -24,9 +24,10 @@ //! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) //! //! [flow]: crate::content::flow +//! [paragraph]: crate::construct::paragraph //! [heading-atx]: crate::construct::heading_atx //! -//! <!-- To do: link `list`, `paragraph` --> +//! <!-- To do: link `list` --> use crate::construct::partial_whitespace::start as whitespace; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 28ac20b..ba76aa8 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -64,7 +64,8 @@ //! ``` //! //! The `info` and `meta` parts are interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! In markdown, it is also possible to use [code (text)][code_text] in the //! [text][] content type. @@ -84,6 +85,8 @@ //! [text]: crate::content::text //! [code_indented]: crate::construct::code_indented //! [code_text]: crate::construct::code_text +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 65c0991..f7f8acd 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -38,7 +38,8 @@ //! //! The `label`, `destination`, and `title` parts are interpreted as the //! [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! ## References //! @@ -47,6 +48,8 @@ //! //! [flow]: crate::content::flow //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element //! //! <!-- To do: link link (reference) --> diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 3ff6fea..ab8b6a5 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -3,9 +3,9 @@ //! They’re formed with the following BNF: //! //! ```bnf -//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab +//! heading_atx ::= 1*6'#' [ 1*space_or_tab text [ 1*space_or_tab 1*'#' ] ] *space_or_tab //! -//! code ::= . ; any unicode code point (other than line endings). +//! text ::= code - eol //! space_or_tab ::= ' ' | '\t' //! ``` //! diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index da4517d..5adac7d 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -63,7 +63,7 @@ //! //! The **complete** production of HTML (flow) is not allowed to interrupt //! content. -//! That means that a blank line is needed between a paragraph and it. +//! That means that a blank line is needed between a [paragraph][] and it. //! However, [HTML (text)][html_text] has a similar production, which will //! typically kick-in instead. //! @@ -87,6 +87,7 @@ //! //! [flow]: crate::content::flow //! [html_text]: crate::construct::html_text +//! [paragraph]: crate::construct::paragraph //! [html_raw_names]: crate::constant::HTML_RAW_NAMES //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 3195205..1debb74 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -35,7 +35,7 @@ //! * label start (image) //! * label start (link) //! * list -//! * paragraph +//! * [paragraph][] //! * [thematic break][thematic_break] //! //! Each construct maintained here is explained with a BNF diagram. @@ -67,6 +67,7 @@ pub mod heading_atx; pub mod heading_setext; pub mod html_flow; pub mod html_text; +pub mod paragraph; pub mod partial_destination; pub mod partial_label; pub mod partial_title; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs new file mode 100644 index 0000000..50ef627 --- /dev/null +++ b/src/construct/paragraph.rs @@ -0,0 +1,177 @@ +//! Paragraph is a construct that occurs in the [flow] content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! ; Restriction: lines cannot start other flow constructs. +//! ; Restriction: lines cannot be blank. +//! paragraph ::= 1*line *( eol 1*line ) +//! ``` +//! +//! Paragraphs in markdown relate to the `<p>` element in HTML. +//! See [*§ 4.4.1 The `p` element* in the HTML spec][html] for more info. +//! +//! Paragraphs can contain line endings and whitespace, but they are not +//! allowed to contain blank lines, or to be blank themselves. +//! +//! The paragraph is interpreted as the [text][] content type. +//! That means that [autolinks][autolink], [code (text)][code_text], etc are allowed. +//! +//! ## References +//! +//! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) +//! * [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs) +//! +//! [flow]: crate::content::flow +//! [text]: crate::content::text +//! [autolink]: crate::construct::autolink +//! [code_text]: crate::construct::code_text +//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element + +use crate::constant::TAB_SIZE; +use crate::construct::{ + code_fenced::start as code_fenced, heading_atx::start as heading_atx, + html_flow::start as html_flow, partial_whitespace::start as whitespace, + thematic_break::start as thematic_break, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::span::from_exit_event; + +/// Before a paragraph. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("unexpected eol/eof at start of paragraph") + } + _ => { + tokenizer.enter(TokenType::Paragraph); + tokenizer.enter(TokenType::ChunkText); + inside(tokenizer, code) + } + } +} + +/// In a paragraph. +/// +/// ```markdown +/// al|pha +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => end(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer + .check(interrupt, |ok| { + Box::new(if ok { at_line_ending } else { end }) + })(tokenizer, code), + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(inside)), None) + } + } +} + +/// At a line ending, not interrupting. +/// +/// ```markdown +/// alpha| +/// bravo. +/// ``` +fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.consume(code); + tokenizer.exit(TokenType::ChunkText); + tokenizer.enter(TokenType::ChunkText); + let next_index = tokenizer.events.len() - 1; + tokenizer.events[next_index - 2].next = Some(next_index); + tokenizer.events[next_index].previous = Some(next_index - 2); + (State::Fn(Box::new(inside)), None) +} + +/// At a line ending, done. +/// +/// ```markdown +/// alpha| +/// *** +/// ``` +fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::Paragraph); + (State::Ok, Some(vec![code])) +} + +/// Before a potential interruption. +/// +/// ```markdown +/// alpha| +/// *** +/// ``` +fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(interrupt_initial)), None) + } + _ => unreachable!("expected eol"), + } +} + +/// After a line ending. +/// +/// ```markdown +/// alpha| +/// ~~~js +/// ~~~ +/// ``` +fn interrupt_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt_2(code_fenced, html_flow, |ok| { + if ok { + Box::new(|_tokenizer, _code| (State::Nok, None)) + } else { + Box::new(|tokenizer, code| { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(interrupt_start), + )(tokenizer, code) + }) + } + })(tokenizer, code) +} + +/// After a line ending, after optional whitespace. +/// +/// ```markdown +/// alpha| +/// # bravo +/// ``` +fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + match code { + // Blank lines are not allowed in paragraph. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), + // To do: If code is disabled, indented lines are allowed. + _ if prefix >= TAB_SIZE => (State::Ok, None), + // To do: definitions, setext headings, etc? + _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| { + let result = if ok { + (State::Nok, None) + } else { + (State::Ok, None) + }; + Box::new(|_t, _c| result) + })(tokenizer, code), + } +} diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index a2f638b..58d07c1 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -26,7 +26,8 @@ //! URLs. //! //! The destination is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! ## References //! @@ -34,6 +35,8 @@ //! //! [definition]: crate::construct::definition //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! //! <!-- To do: link label end. --> diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index f7ce8d7..4997390 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -19,7 +19,8 @@ //! contain blank lines, and they must not be blank themselves. //! //! The label is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! > 👉 **Note**: this label relates to, but is not, the initial “label” of //! > what is know as a reference in markdown: @@ -46,6 +47,8 @@ //! //! [definition]: crate::construct::definition //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! [link_reference_size_max]: crate::constant::LINK_REFERENCE_SIZE_MAX //! //! <!-- To do: link label end, label starts. --> diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 7b5fa64..19ba8d4 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -17,7 +17,8 @@ //! They are allowed to be blank themselves. //! //! The title is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! ## References //! @@ -25,6 +26,8 @@ //! //! [definition]: crate::construct::definition //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! //! <!-- To do: link label end. --> diff --git a/src/content/flow.rs b/src/content/flow.rs index 58be61d..22aa77f 100644 --- a/src/content/flow.rs +++ b/src/content/flow.rs @@ -19,17 +19,15 @@ //! * [HTML (flow)][crate::construct::html_flow] //! * [Thematic break][crate::construct::thematic_break] -use crate::constant::TAB_SIZE; use crate::construct::{ blank_line::start as blank_line, code_fenced::start as code_fenced, code_indented::start as code_indented, definition::start as definition, heading_atx::start as heading_atx, heading_setext::start as heading_setext, - html_flow::start as html_flow, partial_whitespace::start as whitespace, - thematic_break::start as thematic_break, + html_flow::start as html_flow, paragraph::start as paragraph, + partial_whitespace::start as whitespace, thematic_break::start as thematic_break, }; use crate::subtokenize::subtokenize; use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer}; -use crate::util::span::from_exit_event; /// Turn `codes` as the flow content type into events. pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { @@ -52,7 +50,7 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> { /// | bravo /// |*** /// ``` -pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::None => (State::Ok, None), _ => tokenizer.attempt(blank_line, |ok| { @@ -132,7 +130,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ```markdown /// |qwe /// ``` -pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.attempt( |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), |_ok| Box::new(before_after_prefix), @@ -145,140 +143,21 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |# asd /// |*** /// ``` -pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.attempt_4( heading_atx, thematic_break, definition, heading_setext, - |ok| Box::new(if ok { after } else { paragraph_before }), + |ok| Box::new(if ok { after } else { before_paragraph }), )(tokenizer, code) } /// Before a paragraph. /// /// ```markdown -/// |qwe +/// |asd /// ``` -fn paragraph_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - after(tokenizer, code) - } - _ => { - tokenizer.enter(TokenType::Paragraph); - tokenizer.enter(TokenType::ChunkText); - paragraph_inside(tokenizer, code, tokenizer.events.len() - 1) - } - } -} - -/// In a paragraph. -/// -/// ```markdown -/// al|pha -/// ``` -fn paragraph_inside(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult { - match code { - Code::None => paragraph_end(tokenizer, code), - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.check(continuation_construct, move |ok| { - Box::new(move |t, c| { - if ok { - paragraph_continue(t, c, previous) - } else { - paragraph_end(t, c) - } - }) - })(tokenizer, code) - } - _ => { - tokenizer.consume(code); - ( - State::Fn(Box::new(move |t, c| paragraph_inside(t, c, previous))), - None, - ) - } - } -} - -fn continuation_construct(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - match code { - Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - ( - State::Fn(Box::new(continuation_construct_initial_before)), - None, - ) - } - _ => unreachable!("expected eol"), - } -} - -fn continuation_construct_initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.attempt_2(code_fenced, html_flow, |ok| { - if ok { - Box::new(|_tokenizer, _code| (State::Nok, None)) - } else { - Box::new(|tokenizer, code| { - tokenizer.attempt( - |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), - |_ok| Box::new(continuation_construct_after_prefix), - )(tokenizer, code) - }) - } - })(tokenizer, code) -} - -fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let tail = tokenizer.events.last(); - let mut prefix = 0; - - if let Some(event) = tail { - if event.token_type == TokenType::Whitespace { - let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); - prefix = span.end_index - span.start_index; - } - } - - match code { - // Blank lines are not allowed in paragraph. - Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), - // To do: If code is disabled, indented lines are part of the paragraph. - _ if prefix >= TAB_SIZE => (State::Ok, None), - // To do: definitions, setext headings, etc? - _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| { - let result = if ok { - (State::Nok, None) - } else { - (State::Ok, None) - }; - Box::new(|_t, _c| result) - })(tokenizer, code), - } -} - -fn paragraph_continue( - tokenizer: &mut Tokenizer, - code: Code, - previous_index: usize, -) -> StateFnResult { - tokenizer.consume(code); - tokenizer.exit(TokenType::ChunkText); - tokenizer.enter(TokenType::ChunkText); - let next_index = tokenizer.events.len() - 1; - tokenizer.events[previous_index].next = Some(next_index); - tokenizer.events[next_index].previous = Some(previous_index); - ( - State::Fn(Box::new(move |t, c| paragraph_inside(t, c, next_index))), - None, - ) -} - -fn paragraph_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - tokenizer.exit(TokenType::ChunkText); - tokenizer.exit(TokenType::Paragraph); - after(tokenizer, code) +fn before_paragraph(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.go(paragraph, after)(tokenizer, code) } |