diff options
Diffstat (limited to '')
| -rw-r--r-- | src/construct/blank_line.rs | 5 | ||||
| -rw-r--r-- | src/construct/code_fenced.rs | 5 | ||||
| -rw-r--r-- | src/construct/definition.rs | 5 | ||||
| -rw-r--r-- | src/construct/heading_atx.rs | 4 | ||||
| -rw-r--r-- | src/construct/html_flow.rs | 3 | ||||
| -rw-r--r-- | src/construct/mod.rs | 3 | ||||
| -rw-r--r-- | src/construct/paragraph.rs | 177 | ||||
| -rw-r--r-- | src/construct/partial_destination.rs | 5 | ||||
| -rw-r--r-- | src/construct/partial_label.rs | 5 | ||||
| -rw-r--r-- | src/construct/partial_title.rs | 5 | 
10 files changed, 206 insertions, 11 deletions
| diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 7f794b9..fdb1ee0 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -6,7 +6,7 @@  //! blank_line ::= *(' ' '\t')  //! ```  //! -//! Blank lines are sometimes needed, such as to differentiate a paragraph +//! Blank lines are sometimes needed, such as to differentiate a [paragraph][]  //! from another paragraph.  //! In several cases, blank lines are not needed between flow constructs,  //! such as between two [heading (atx)][heading-atx]s. @@ -24,9 +24,10 @@  //! *   [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines)  //!  //! [flow]: crate::content::flow +//! [paragraph]: crate::construct::paragraph  //! [heading-atx]: crate::construct::heading_atx  //! -//! <!-- To do: link `list`, `paragraph` --> +//! <!-- To do: link `list` -->  use crate::construct::partial_whitespace::start as whitespace;  use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 28ac20b..ba76aa8 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -64,7 +64,8 @@  //! ```  //!  //! The `info` and `meta` parts are interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed.  //!  //! In markdown, it is also possible to use [code (text)][code_text] in the  //! [text][] content type. @@ -84,6 +85,8 @@  //! [text]: crate::content::text  //! [code_indented]: crate::construct::code_indented  //! [code_text]: crate::construct::code_text +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference  //! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element  //! 
[html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 65c0991..f7f8acd 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -38,7 +38,8 @@  //!  //! The `label`, `destination`, and `title` parts are interpreted as the  //! [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed.  //!  //! ## References  //! @@ -47,6 +48,8 @@  //!  //! [flow]: crate::content::flow  //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference  //! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element  //!  //! <!-- To do: link link (reference) --> diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 3ff6fea..ab8b6a5 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -3,9 +3,9 @@  //! They’re formed with the following BNF:  //!  //! ```bnf -//! heading_atx ::= 1*6'#' [ 1*space_or_tab code [ 1*space_or_tab 1*'#' ] ] *space_or_tab +//! heading_atx ::= 1*6'#' [ 1*space_or_tab text [ 1*space_or_tab 1*'#' ] ] *space_or_tab  //! -//! code ::= . ; any unicode code point (other than line endings). +//! text ::= code - eol  //! space_or_tab ::= ' ' | '\t'  //! ```  //! diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index da4517d..5adac7d 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -63,7 +63,7 @@  //!  //! The **complete** production of HTML (flow) is not allowed to interrupt  //! content. -//! That means that a blank line is needed between a paragraph and it. +//! That means that a blank line is needed between a [paragraph][] and it.  //! 
However, [HTML (text)][html_text] has a similar production, which will  //! typically kick-in instead.  //! @@ -87,6 +87,7 @@  //!  //! [flow]: crate::content::flow  //! [html_text]: crate::construct::html_text +//! [paragraph]: crate::construct::paragraph  //! [html_raw_names]: crate::constant::HTML_RAW_NAMES  //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES  //! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 3195205..1debb74 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -35,7 +35,7 @@  //! *   label start (image)  //! *   label start (link)  //! *   list -//! *   paragraph +//! *   [paragraph][]  //! *   [thematic break][thematic_break]  //!  //! Each construct maintained here is explained with a BNF diagram. @@ -67,6 +67,7 @@ pub mod heading_atx;  pub mod heading_setext;  pub mod html_flow;  pub mod html_text; +pub mod paragraph;  pub mod partial_destination;  pub mod partial_label;  pub mod partial_title; diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs new file mode 100644 index 0000000..50ef627 --- /dev/null +++ b/src/construct/paragraph.rs @@ -0,0 +1,177 @@ +//! Paragraph is a construct that occurs in the [flow] content type. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! ; Restriction: lines cannot start other flow constructs. +//! ; Restriction: lines cannot be blank. +//! paragraph ::= 1*line *( eol 1*line ) +//! ``` +//! +//! Paragraphs in markdown relate to the `<p>` element in HTML. +//! See [*§ 4.4.1 The `p` element* in the HTML spec][html] for more info. +//! +//! Paragraphs can contain line endings and whitespace, but they are not +//! allowed to contain blank lines, or to be blank themselves. +//! +//! The paragraph is interpreted as the [text][] content type. +//! That means that [autolinks][autolink], [code (text)][code_text], etc are allowed. +//! +//! ## References +//! +//! 
*   [`content.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/content.js) +//! *   [*§ 4.8 Paragraphs* in `CommonMark`](https://spec.commonmark.org/0.30/#paragraphs) +//! +//! [flow]: crate::content::flow +//! [text]: crate::content::text +//! [autolink]: crate::construct::autolink +//! [code_text]: crate::construct::code_text +//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element + +use crate::constant::TAB_SIZE; +use crate::construct::{ +    code_fenced::start as code_fenced, heading_atx::start as heading_atx, +    html_flow::start as html_flow, partial_whitespace::start as whitespace, +    thematic_break::start as thematic_break, +}; +use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::span::from_exit_event; + +/// Before a paragraph. +/// +/// ```markdown +/// |qwe +/// ``` +pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            unreachable!("unexpected eol/eof at start of paragraph") +        } +        _ => { +            tokenizer.enter(TokenType::Paragraph); +            tokenizer.enter(TokenType::ChunkText); +            inside(tokenizer, code) +        } +    } +} + +/// In a paragraph. +/// +/// ```markdown +/// al|pha +/// ``` +fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::None => end(tokenizer, code), +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer +            .check(interrupt, |ok| { +                Box::new(if ok { at_line_ending } else { end }) +            })(tokenizer, code), +        _ => { +            tokenizer.consume(code); +            (State::Fn(Box::new(inside)), None) +        } +    } +} + +/// At a line ending, not interrupting. +/// +/// ```markdown +/// alpha| +/// bravo. 
+/// ``` +fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.consume(code); +    tokenizer.exit(TokenType::ChunkText); +    tokenizer.enter(TokenType::ChunkText); +    let next_index = tokenizer.events.len() - 1; +    tokenizer.events[next_index - 2].next = Some(next_index); +    tokenizer.events[next_index].previous = Some(next_index - 2); +    (State::Fn(Box::new(inside)), None) +} + +/// At a line ending, done. +/// +/// ```markdown +/// alpha| +/// *** +/// ``` +fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.exit(TokenType::ChunkText); +    tokenizer.exit(TokenType::Paragraph); +    (State::Ok, Some(vec![code])) +} + +/// Before a potential interruption. +/// +/// ```markdown +/// alpha| +/// *** +/// ``` +fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    match code { +        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { +            tokenizer.enter(TokenType::LineEnding); +            tokenizer.consume(code); +            tokenizer.exit(TokenType::LineEnding); +            (State::Fn(Box::new(interrupt_initial)), None) +        } +        _ => unreachable!("expected eol"), +    } +} + +/// After a line ending. +/// +/// ```markdown +/// alpha| +/// ~~~js +/// ~~~ +/// ``` +fn interrupt_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    tokenizer.attempt_2(code_fenced, html_flow, |ok| { +        if ok { +            Box::new(|_tokenizer, _code| (State::Nok, None)) +        } else { +            Box::new(|tokenizer, code| { +                tokenizer.attempt( +                    |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace), +                    |_ok| Box::new(interrupt_start), +                )(tokenizer, code) +            }) +        } +    })(tokenizer, code) +} + +/// After a line ending, after optional whitespace. 
+/// +/// ```markdown +/// alpha| +/// # bravo +/// ``` +fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { +    let tail = tokenizer.events.last(); +    let mut prefix = 0; + +    if let Some(event) = tail { +        if event.token_type == TokenType::Whitespace { +            let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1); +            prefix = span.end_index - span.start_index; +        } +    } + +    match code { +        // Blank lines are not allowed in paragraph. +        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), +        // To do: If code is disabled, indented lines are allowed. +        _ if prefix >= TAB_SIZE => (State::Ok, None), +        // To do: definitions, setext headings, etc? +        _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| { +            let result = if ok { +                (State::Nok, None) +            } else { +                (State::Ok, None) +            }; +            Box::new(|_t, _c| result) +        })(tokenizer, code), +    } +} diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index a2f638b..58d07c1 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -26,7 +26,8 @@  //! URLs.  //!  //! The destination is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed.  //!  //! ## References  //! @@ -34,6 +35,8 @@  //!  //! [definition]: crate::construct::definition  //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference  //!  //! <!-- To do: link label end. 
--> diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index f7ce8d7..4997390 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -19,7 +19,8 @@  //! contain blank lines, and they must not be blank themselves.  //!  //! The label is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed.  //!  //! > 👉 **Note**: this label relates to, but is not, the initial “label” of  //! > what is known as a reference in markdown: @@ -46,6 +47,8 @@  //!  //! [definition]: crate::construct::definition  //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference  //! [link_reference_size_max]: crate::constant::LINK_REFERENCE_SIZE_MAX  //!  //! <!-- To do: link label end, label starts. --> diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 7b5fa64..19ba8d4 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -17,7 +17,8 @@  //! They are allowed to be blank themselves.  //!  //! The title is interpreted as the [string][] content type. -//! That means that character escapes and character reference are allowed. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed.  //!  //! ## References  //! @@ -25,6 +26,8 @@  //!  //! [definition]: crate::construct::definition  //! [string]: crate::content::string +//! [character_escape]: crate::construct::character_escape +//! [character_reference]: crate::construct::character_reference  //!  //! <!-- To do: link label end. --> | 
