diff options
Diffstat (limited to '')
-rw-r--r-- | src/construct/blank_line.rs | 5 | ||||
-rw-r--r-- | src/construct/code_fenced.rs | 5 | ||||
-rw-r--r-- | src/construct/definition.rs | 5 | ||||
-rw-r--r-- | src/construct/heading_atx.rs | 4 | ||||
-rw-r--r-- | src/construct/html_flow.rs | 3 | ||||
-rw-r--r-- | src/construct/mod.rs | 3 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 177 | ||||
-rw-r--r-- | src/construct/partial_destination.rs | 5 | ||||
-rw-r--r-- | src/construct/partial_label.rs | 5 | ||||
-rw-r--r-- | src/construct/partial_title.rs | 5 |
10 files changed, 206 insertions, 11 deletions
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 7f794b9..fdb1ee0 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -6,7 +6,7 @@ //! blank_line ::= *(' ' '\t') //! ``` //! -//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! Blank lines are sometimes needed, such as to differentiate a [paragraph][] //! from another paragraph. //! In several cases, blank lines are not needed between flow constructs, //! such as between two [heading (atx)][heading-atx]s. @@ -24,9 +24,10 @@ //! * [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines) //! //! [flow]: crate::content::flow +//! [paragraph]: crate::construct::paragraph //! [heading-atx]: crate::construct::heading_atx //! -//! <!-- To do: link `list`, `paragraph` --> +//! <!-- To do: link `list` --> use crate::construct::partial_whitespace::start as whitespace; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 28ac20b..ba76aa8 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -64,7 +64,8 @@ //! ``` //! //! The `info` and `meta` parts are interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! In markdown, it is also possible to use [code (text)][code_text] in the //! [text][] content type. @@ -84,6 +85,8 @@ //! [text]: crate::content::text //! [code_indented]: crate::construct::code_indented //! [code_text]: crate::construct::code_text +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element //! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 65c0991..f7f8acd 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -38,7 +38,8 @@ //! //! The `label`, `destination`, and `title` parts are interpreted as the //! [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! ## References //! @@ -47,6 +48,8 @@ //! //! [flow]: crate::content::flow //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element //! //! <!-- To do: link link (reference) --> diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 3ff6fea..ab8b6a5 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -3,9 +3,9 @@ //! They’re formed with the following BNF: //! //! ```bnf -//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab +//! heading_atx ::= 1*6'#' [ 1*space_or_tab text [ 1*space_or_tab 1*'#' ] ] *space_or_tab //! -//! code ::= . ; any unicode code point (other than line endings). +//! text ::= code - eol //! space_or_tab ::= ' ' | '\t' //! ``` //! diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index da4517d..5adac7d 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -63,7 +63,7 @@ //! //! The **complete** production of HTML (flow) is not allowed to interrupt //! content. -//! That means that a blank line is needed between a paragraph and it. +//! That means that a blank line is needed between a [paragraph][] and it. //! However, [HTML (text)][html_text] has a similar production, which will //! typically kick-in instead. //! @@ -87,6 +87,7 @@ //! //! [flow]: crate::content::flow //! [html_text]: crate::construct::html_text +//! [paragraph]: crate::construct::paragraph //! [html_raw_names]: crate::constant::HTML_RAW_NAMES //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 3195205..1debb74 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -35,7 +35,7 @@ //! * label start (image) //! * label start (link) //! * list -//! * paragraph +//! * [paragraph][] //! * [thematic break][thematic_break] //! //! Each construct maintained here is explained with a BNF diagram. @@ -67,6 +67,7 @@ pub mod heading_atx; pub mod heading_setext; pub mod html_flow; pub mod html_text; +pub mod paragraph; pub mod partial_destination; pub mod partial_label; pub mod partial_title; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs new file mode 100644 index 0000000..50ef627 --- /dev/null +++ b/src/construct/paragraph.rs @@ -0,0 +1,177 @@ +//! Paragraph is a construct that occurs in the [flow] content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! ; Restriction: lines cannot start other flow constructs. +//! ; Restriction: lines cannot be blank. +//! paragraph ::= 1*line *( eol 1*line ) +//! ``` +//! +//! Paragraphs in markdown relate to the `<p>` element in HTML. +//! See [*§ 4.4.1 The `p` element* in the HTML spec][html] for more info. +//! +//! Paragraphs can contain line endings and whitespace, but they are not +//! allowed to contain blank lines, or to be blank themselves. +//! +//! The paragraph is interpreted as the [text][] content type. +//! That means that [autolinks][autolink], [code (text)][code_text], etc are allowed. +//! +//! ## References +//! +//! * [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) +//! * [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs) +//! +//! [flow]: crate::content::flow +//! [text]: crate::content::text +//! [autolink]: crate::construct::autolink +//! [code_text]: crate::construct::code_text +//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element + +use crate::constant::TAB_SIZE; +use crate::construct::{ + code_fenced::start as code_fenced, heading_atx::start as heading_atx, + html_flow::start as html_flow, partial_whitespace::start as whitespace, + thematic_break::start as thematic_break, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::span::from_exit_event; + +/// Before a paragraph. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + unreachable!("unexpected eol/eof at start of paragraph") + } + _ => { + tokenizer.enter(TokenType::Paragraph); + tokenizer.enter(TokenType::ChunkText); + inside(tokenizer, code) + } + } +} + +/// In a paragraph. +/// +/// ```markdown +/// al|pha +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::None => end(tokenizer, code), + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer + .check(interrupt, |ok| { + Box::new(if ok { at_line_ending } else { end }) + })(tokenizer, code), + _ => { + tokenizer.consume(code); + (State::Fn(Box::new(inside)), None) + } + } +} + +/// At a line ending, not interrupting. +/// +/// ```markdown +/// alpha| +/// bravo. +/// ``` +fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.consume(code); + tokenizer.exit(TokenType::ChunkText); + tokenizer.enter(TokenType::ChunkText); + let next_index = tokenizer.events.len() - 1; + tokenizer.events[next_index - 2].next = Some(next_index); + tokenizer.events[next_index].previous = Some(next_index - 2); + (State::Fn(Box::new(inside)), None) +} + +/// At a line ending, done. +/// +/// ```markdown +/// alpha| +/// *** +/// ``` +fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.exit(TokenType::ChunkText); + tokenizer.exit(TokenType::Paragraph); + (State::Ok, Some(vec![code])) +} + +/// Before a potential interruption. +/// +/// ```markdown +/// alpha| +/// *** +/// ``` +fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + match code { + Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(interrupt_initial)), None) + } + _ => unreachable!("expected eol"), + } +} + +/// After a line ending. +/// +/// ```markdown +/// alpha| +/// ~~~js +/// ~~~ +/// ``` +fn interrupt_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + tokenizer.attempt_2(code_fenced, html_flow, |ok| { + if ok { + Box::new(|_tokenizer, _code| (State::Nok, None)) + } else { + Box::new(|tokenizer, code| { + tokenizer.attempt( + |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), + |_ok| Box::new(interrupt_start), + )(tokenizer, code) + }) + } + })(tokenizer, code) +} + +/// After a line ending, after optional whitespace. +/// +/// ```markdown +/// alpha| +/// # bravo +/// ``` +fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { + let tail = tokenizer.events.last(); + let mut prefix = 0; + + if let Some(event) = tail { + if event.token_type == TokenType::Whitespace { + let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); + prefix = span.end_index - span.start_index; + } + } + + match code { + // Blank lines are not allowed in paragraph. + Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), + // To do: If code is disabled, indented lines are allowed. + _ if prefix >= TAB_SIZE => (State::Ok, None), + // To do: definitions, setext headings, etc? + _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| { + let result = if ok { + (State::Nok, None) + } else { + (State::Ok, None) + }; + Box::new(|_t, _c| result) + })(tokenizer, code), + } +} diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index a2f638b..58d07c1 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -26,7 +26,8 @@ //! URLs. //! //! The destination is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! ## References //! @@ -34,6 +35,8 @@ //! //! [definition]: crate::construct::definition //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! //! <!-- To do: link label end. --> diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index f7ce8d7..4997390 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -19,7 +19,8 @@ //! contain blank lines, and they must not be blank themselves. //! //! The label is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! > 👉 **Note**: this label relates to, but is not, the initial “label” of //! > what is know as a reference in markdown: @@ -46,6 +47,8 @@ //! //! [definition]: crate::construct::definition //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! [link_reference_size_max]: crate::constant::LINK_REFERENCE_SIZE_MAX //! //! <!-- To do: link label end, label starts. --> diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 7b5fa64..19ba8d4 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -17,7 +17,8 @@ //! They are allowed to be blank themselves. //! //! The title is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed. //! //! ## References //! @@ -25,6 +26,8 @@ //! //! [definition]: crate::construct::definition //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference //! //! <!-- To do: link label end. --> |