-rw-r--r--  readme.md                    | 25
-rw-r--r--  src/compiler.rs              |  2
-rw-r--r--  src/constant.rs              |  5
-rw-r--r--  src/construct/code_fenced.rs |  2
-rw-r--r--  src/construct/html_text.rs   |  6
-rw-r--r--  src/construct/mod.rs         |  1
-rw-r--r--  src/content/content.rs       | 70
-rw-r--r--  src/content/flow.rs          | 54
-rw-r--r--  src/content/mod.rs           |  1
-rw-r--r--  src/content/string.rs        |  4
-rw-r--r--  src/subtokenize.rs           |  9
-rw-r--r--  src/tokenizer.rs             |  2
12 files changed, 50 insertions(+), 131 deletions(-)
@@ -46,9 +46,9 @@ cargo doc --document-private-items
 
 ### Some major obstacles
 
-- [ ] (8) Can content (and to a lesser extent string and text) operate more
-  performantly than checking whether other flow constructs start a line,
-  before exiting and actually attempting flow constructs?
+- [ ] (8) Can paragraphs (and to a lesser extent string data and text data)
+  operate more performantly than checking whether other flow constructs
+  start a line, before exiting and actually attempting flow constructs?
 - [ ] (5) Figure out sharing definition and identifiers, and references before
   definitions
 - [ ] (3) Interrupting: sometimes flow can or cannot start depending on the
@@ -57,8 +57,8 @@ cargo doc --document-private-items
   subtokenization is solved
 - [ ] (3) Concrete constructs: HTML or code (fenced) cannot be “pierced” into
   by containers
-- [ ] (3) Lazy lines, in containers, in flow and content in a paragraph, a line
-  does not need to be indented
+- [ ] (3) Lazy lines, in containers, in flow in a paragraph, a line does not
+  need to be indented
 - [ ] (5) There’s a lot of rust-related choosing whether to pass (mutable)
   references or whatever around that should be refactored
 - [ ] (5) Figure out extensions
@@ -66,11 +66,9 @@ cargo doc --document-private-items
 
 ### Small things
 
-- [ ] (1) Remove `content` content type, as it is no longer needed
 - [ ] (1) Connect `ChunkString` in label, destination, title
 - [ ] (1) Add support for line endings in `string`
 - [ ] (1) Add docs to subtokenize
-- [ ] (1) Add module docs to content
 - [ ] (1) Add module docs to parser
 - [ ] (1) Add overview docs on how everything works
 - [ ] (1) Move safe protocols to constants
@@ -109,8 +107,7 @@ cargo doc --document-private-items
 - [x] character reference
 - [x] code (fenced)
 - [x] code (indented)
-- [x] (1) code (text)
-- [ ] (3) content
+- [x] code (text)
 - [x] definition
 - [x] hard break (escape)
 - [x] hard break (trailing)
@@ -134,14 +131,12 @@ cargo doc --document-private-items
   - [x] blank line
   - [x] code (fenced)
   - [x] code (indented)
-  - [x] content
   - [x] definition
   - [x] heading (atx)
   - [x] heading (setext)
   - [x] html (flow)
-  - [x] thematic break
-- [x] content
   - [x] paragraph
+  - [x] thematic break
 - [ ] (5) text
   - [ ] attention (strong, emphasis) (text)
   - [x] autolink
@@ -170,10 +165,10 @@ cargo doc --document-private-items
 - [x] (1) Add examples to `CompileOptions` docs
 - [x] (3) Fix deep subtokenization
 - [x] (1) text in heading
-- [x] (1) Setext headings: can they be solved in content, or do they have to be
-  solved in flow somehow
+- [x] (1) Setext headings, solved in flow
 - [x] (1) Add docs to partials
 - [x] (1) Remove all `pub fn`s from constructs, except for start
+- [x] (1) Remove `content` content type, as it is no longer needed
 
 ### Extensions
 
@@ -188,7 +183,7 @@ important.
   — [`micromark-extension-frontmatter`](https://github.com/micromark/micromark-extension-frontmatter)
 - [ ] (3) autolink literal (GFM) (text)
   — [`micromark-extension-gfm-autolink-literal`](https://github.com/micromark/micromark-extension-gfm-autolink-literal)
-- [ ] (3) footnote (GFM) (content, text)
+- [ ] (3) footnote (GFM) (flow, text)
   — [`micromark-extension-gfm-footnote`](https://github.com/micromark/micromark-extension-gfm-footnote)
 - [ ] (3) strikethrough (GFM) (text)
   — [`micromark-extension-gfm-strikethrough`](https://github.com/micromark/micromark-extension-gfm-strikethrough)
diff --git a/src/compiler.rs b/src/compiler.rs
index be5d0fe..59fcd22 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -126,7 +126,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
             | TokenType::CodeTextData
             | TokenType::CodeTextLineEnding
             | TokenType::CodeTextSequence
-            | TokenType::Content
             | TokenType::Data
             | TokenType::DefinitionLabel
             | TokenType::DefinitionLabelMarker
@@ -213,7 +212,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
             | TokenType::CodeFencedFenceWhitespace
             | TokenType::CodeIndentedPrefixWhitespace
             | TokenType::CodeTextSequence
-            | TokenType::Content
             | TokenType::DefinitionLabel
             | TokenType::DefinitionLabelMarker
             | TokenType::DefinitionLabelData
diff --git a/src/constant.rs b/src/constant.rs
index 1f833c2..e7594b9 100644
--- a/src/constant.rs
+++ b/src/constant.rs
@@ -103,8 +103,9 @@ pub const HTML_RAW_SIZE_MAX: usize = 8;
 /// List of HTML tag names that form the **basic** production of
 /// [HTML (flow)][html_flow].
 ///
-/// The **basic** production allows interleaving HTML and markdown with blank lines
-/// and allows flow (block) elements to interrupt content.
+/// The **basic** production allows interleaving HTML and markdown with blank
+/// lines and allows flow (block) elements to interrupt definitions, paragraphs,
+/// and heading (setext).
 /// Tag name matching must be performed insensitive to case, and thus this list
 /// includes lowercase tag names.
 ///
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index 12c8bd6..28ac20b 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -27,7 +27,7 @@
 //! The above grammar does not show how whitespace is handled.
 //! To parse code (fenced), let `X` be the number of whitespace characters
 //! before the opening fence sequence.
-//! Each line of content is then allowed (not required) to be indented with up
+//! Each line of text is then allowed (not required) to be indented with up
 //! to `X` spaces or tabs, which are then ignored as an indent instead of being
 //! considered as part of the code.
 //! This indent does not affect the closing fence.
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index d50a8ce..93b4b62 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -632,7 +632,7 @@ fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 
 /// At an allowed line ending.
 ///
-/// > **Note**: we can’t have blank lines in content, so no need to worry about
+/// > **Note**: we can’t have blank lines in text, so no need to worry about
 /// > empty tokens.
 ///
 /// ```markdown
@@ -661,7 +661,7 @@ fn at_line_ending(
 
 /// After a line ending.
 ///
-/// > **Note**: we can’t have blank lines in content, so no need to worry about
+/// > **Note**: we can’t have blank lines in text, so no need to worry about
 /// > empty tokens.
 ///
 /// ```markdown
@@ -681,7 +681,7 @@ fn after_line_ending(
 
 /// After a line ending, after indent.
 ///
-/// > **Note**: we can’t have blank lines in content, so no need to worry about
+/// > **Note**: we can’t have blank lines in text, so no need to worry about
 /// > empty tokens.
 ///
 /// ```markdown
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index a5e95bc..3195205 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -24,7 +24,6 @@
 //! * [code (fenced)][code_fenced]
 //! * [code (indented)][code_indented]
 //! * [code (text)][code_text]
-//! * content
 //! * [definition][]
 //! * [hard break (escape)][hard_break_escape]
 //! * [hard break (trailing)][hard_break_trailing]
diff --git a/src/content/content.rs b/src/content/content.rs
deleted file mode 100644
index 86bc290..0000000
--- a/src/content/content.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-//! The `content`, ahum, content type.
-//!
-//! **Content** is zero or more definitions, and then zero or one paragraph.
-//! It’s a weird one, and needed to make certain edge cases around definitions
-//! spec compliant.
-//! Definitions are unlike other things in markdown, in that they behave like
-//! **text** in that they can contain arbitrary line endings, but *have* to end
-//! at a line ending.
-//! If they end in something else, the whole definition instead is seen as a
-//! paragraph.
-//!
-//! The constructs found in content are:
-//!
-//! * Definition
-//! * Paragraph
-
-use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
-
-/// Before a paragraph.
-///
-/// ```markdown
-/// |asd
-/// ```
-pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    match code {
-        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            unreachable!("expected non-eol/eof");
-        }
-        _ => {
-            tokenizer.enter(TokenType::Paragraph);
-            tokenizer.enter(TokenType::ChunkText);
-            inside(tokenizer, code, tokenizer.events.len() - 1)
-        }
-    }
-}
-
-/// In a line in a paragraph.
-///
-/// ```markdown
-/// |\&
-/// |qwe
-/// ```
-fn inside(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
-    match code {
-        Code::None => {
-            tokenizer.exit(TokenType::ChunkText);
-            tokenizer.exit(TokenType::Paragraph);
-            (State::Ok, None)
-        }
-        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            tokenizer.consume(code);
-            tokenizer.exit(TokenType::ChunkText);
-            tokenizer.enter(TokenType::ChunkText);
-            let next_index = tokenizer.events.len() - 1;
-            tokenizer.events[previous_index].next = Some(next_index);
-            tokenizer.events[next_index].previous = Some(previous_index);
-            (
-                State::Fn(Box::new(move |t, c| inside(t, c, next_index))),
-                None,
-            )
-        }
-        _ => {
-            tokenizer.consume(code);
-            (
-                State::Fn(Box::new(move |t, c| inside(t, c, previous_index))),
-                None,
-            )
-        }
-    }
-}
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 3fab523..58be61d 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -1,7 +1,7 @@
 //! The flow content type.
 //!
-//! **Flow** represents the sections, such as headings, code, and content, which
-//! is parsed per line.
+//! **Flow** represents the sections, such as headings and code, which are
+//! parsed per line.
 //! An example is HTML, which has a certain starting condition (such as
 //! `<script>` on its own line), then continues for a while, until an end
 //! condition is found (such as `</style>`).
@@ -18,8 +18,6 @@
 //! * [Heading (setext)][crate::construct::heading_setext]
 //! * [HTML (flow)][crate::construct::html_flow]
 //! * [Thematic break][crate::construct::thematic_break]
-//!
-//! <!-- To do: Link to content. -->
 
 use crate::constant::TAB_SIZE;
 use crate::construct::{
@@ -153,45 +151,43 @@ pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResu
         thematic_break,
         definition,
         heading_setext,
-        |ok| Box::new(if ok { after } else { content_before }),
+        |ok| Box::new(if ok { after } else { paragraph_before }),
     )(tokenizer, code)
 }
 
-/// Before content.
+/// Before a paragraph.
 ///
 /// ```markdown
 /// |qwe
 /// ```
-///
-// To do: we don’t need content anymore in `micromark-rs` it seems?
-fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn paragraph_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
             after(tokenizer, code)
         }
         _ => {
-            tokenizer.enter(TokenType::Content);
-            tokenizer.enter(TokenType::ChunkContent);
-            content(tokenizer, code, tokenizer.events.len() - 1)
+            tokenizer.enter(TokenType::Paragraph);
+            tokenizer.enter(TokenType::ChunkText);
+            paragraph_inside(tokenizer, code, tokenizer.events.len() - 1)
         }
     }
 }
 
-/// In content.
+/// In a paragraph.
 ///
 /// ```markdown
 /// al|pha
 /// ```
-fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult {
+fn paragraph_inside(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult {
     match code {
-        Code::None => content_end(tokenizer, code),
+        Code::None => paragraph_end(tokenizer, code),
         Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
             tokenizer.check(continuation_construct, move |ok| {
                 Box::new(move |t, c| {
                     if ok {
-                        content_continue(t, c, previous)
+                        paragraph_continue(t, c, previous)
                     } else {
-                        content_end(t, c)
+                        paragraph_end(t, c)
                     }
                 })
             })(tokenizer, code)
@@ -199,7 +195,7 @@ fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnRes
         _ => {
             tokenizer.consume(code);
             (
-                State::Fn(Box::new(move |t, c| content(t, c, previous))),
+                State::Fn(Box::new(move |t, c| paragraph_inside(t, c, previous))),
                 None,
             )
         }
@@ -248,9 +244,9 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) ->
     }
 
     match code {
-        // Blank lines are not allowed in content.
+        // Blank lines are not allowed in paragraph.
         Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
-        // To do: If code is disabled, indented lines are part of the content.
+        // To do: If code is disabled, indented lines are part of the paragraph.
        _ if prefix >= TAB_SIZE => (State::Ok, None),
         // To do: definitions, setext headings, etc?
         _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
@@ -264,21 +260,25 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) ->
     }
 }
 
-fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
+fn paragraph_continue(
+    tokenizer: &mut Tokenizer,
+    code: Code,
+    previous_index: usize,
+) -> StateFnResult {
     tokenizer.consume(code);
-    tokenizer.exit(TokenType::ChunkContent);
-    tokenizer.enter(TokenType::ChunkContent);
+    tokenizer.exit(TokenType::ChunkText);
+    tokenizer.enter(TokenType::ChunkText);
     let next_index = tokenizer.events.len() - 1;
     tokenizer.events[previous_index].next = Some(next_index);
     tokenizer.events[next_index].previous = Some(previous_index);
     (
-        State::Fn(Box::new(move |t, c| content(t, c, next_index))),
+        State::Fn(Box::new(move |t, c| paragraph_inside(t, c, next_index))),
         None,
     )
 }
 
-fn content_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    tokenizer.exit(TokenType::ChunkContent);
-    tokenizer.exit(TokenType::Content);
+fn paragraph_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::ChunkText);
+    tokenizer.exit(TokenType::Paragraph);
     after(tokenizer, code)
 }
diff --git a/src/content/mod.rs b/src/content/mod.rs
index d13df79..395e41b 100644
--- a/src/content/mod.rs
+++ b/src/content/mod.rs
@@ -1,7 +1,6 @@
 //! Content types found in markdown.
 
 #[allow(clippy::module_inception)]
-pub mod content;
 pub mod flow;
 pub mod string;
 pub mod text;
diff --git a/src/content/string.rs b/src/content/string.rs
index e8134c4..f591cd7 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -1,6 +1,6 @@
 //! The string content type.
 //!
-//! **String** is a limited **text** like content type which only allows
+//! **String** is a limited [text][] like content type which only allows
 //! character escapes and character references.
 //! It exists in things such as identifiers (media references, definitions),
 //! titles, URLs, code (fenced) info and meta parts.
@@ -9,6 +9,8 @@
 //!
 //! * [Character escape][crate::construct::character_escape]
 //! * [Character reference][crate::construct::character_reference]
+//!
+//! [text]: crate::content::text
 
 use crate::construct::{
     character_escape::start as character_escape, character_reference::start as character_reference,
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index 71a84e1..4a29a01 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -1,4 +1,4 @@
-use crate::content::{content::start as content, string::start as string, text::start as text};
+use crate::content::{string::start as string, text::start as text};
 use crate::tokenizer::{
     Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer,
 };
@@ -20,8 +20,7 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
 
         // Find each first opening chunk.
         if (event.token_type == TokenType::ChunkString
-            || event.token_type == TokenType::ChunkText
-            || event.token_type == TokenType::ChunkContent) &&
+            || event.token_type == TokenType::ChunkText) &&
             event.event_type == EventType::Enter &&
             // No need to enter linked events again.
             event.previous == None
@@ -33,9 +32,7 @@ pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> (Vec<Event>, bool) {
             let mut tokenizer = Tokenizer::new(event.point.clone(), event.index);
             // Substate.
             let mut result: StateFnResult = (
-                State::Fn(Box::new(if event.token_type == TokenType::ChunkContent {
-                    content
-                } else if event.token_type == TokenType::ChunkString {
+                State::Fn(Box::new(if event.token_type == TokenType::ChunkString {
                     string
                 } else {
                     text
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9884986..c1bb61b 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -48,7 +48,6 @@ pub enum TokenType {
     CodeTextSequence,
     CodeTextLineEnding,
     CodeTextData,
-    Content,
     Data,
     Definition,
     DefinitionLabel,
@@ -86,7 +85,6 @@ pub enum TokenType {
     Whitespace,
 
     // Chunks are tokenizer, but unraveled by `subtokenize`.
-    ChunkContent,
    ChunkString,
     ChunkText,
 }
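
For context, the paragraph code kept by this commit tokenizes each line of a paragraph into its own `ChunkText` token and stitches the enter events together through `previous`/`next` indices, which `subtokenize` later follows to re-tokenize the chunks as one continuous text stream. Below is a minimal, self-contained sketch of just that linking idea; it is not from the commit, the `Event` struct is simplified (the real one in src/tokenizer.rs carries more fields), and the `link` helper is hypothetical.

```rust
/// Simplified stand-in for the crate's `Event`: only the link fields.
#[derive(Debug)]
struct Event {
    /// Index of the previous linked chunk-enter event, if any.
    previous: Option<usize>,
    /// Index of the next linked chunk-enter event, if any.
    next: Option<usize>,
}

/// Push a new chunk-enter event and wire it to the previous one,
/// mirroring the index bookkeeping done in `paragraph_continue` above.
fn link(events: &mut Vec<Event>, previous_index: usize) -> usize {
    let next_index = events.len();
    events.push(Event {
        previous: Some(previous_index),
        next: None,
    });
    events[previous_index].next = Some(next_index);
    next_index
}

fn main() {
    // One chunk-enter event per paragraph line; the first has no link yet.
    let mut events = vec![Event {
        previous: None,
        next: None,
    }];
    let second = link(&mut events, 0);
    let third = link(&mut events, second);

    // Walking `next` from the first chunk visits every line in order,
    // which is what `subtokenize` relies on to feed the chunks to `text`.
    assert_eq!(events[0].next, Some(second));
    assert_eq!(events[second].next, Some(third));
    println!("{events:?}");
}
```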