diff options
author | Titus Wormer <tituswormer@gmail.com> | 2022-06-20 18:14:15 +0200 |
---|---|---|
committer | Titus Wormer <tituswormer@gmail.com> | 2022-06-20 18:14:15 +0200 |
commit | c7a46d3cc22bd0b029ff97623cee31c6ec38fdfb (patch) | |
tree | cc8991179de6efc57fbefdcd5fbebaeb872e878e | |
parent | 65dd765cceee8bdccc74c08066eec59a579a16b1 (diff) | |
download | markdown-rs-c7a46d3cc22bd0b029ff97623cee31c6ec38fdfb.tar.gz markdown-rs-c7a46d3cc22bd0b029ff97623cee31c6ec38fdfb.tar.bz2 markdown-rs-c7a46d3cc22bd0b029ff97623cee31c6ec38fdfb.zip |
Add support for line endings in string
Diffstat (limited to '')
-rw-r--r-- | readme.md | 4 | ||||
-rw-r--r-- | src/construct/heading_setext.rs | 27 | ||||
-rw-r--r-- | src/construct/paragraph.rs | 6 | ||||
-rw-r--r-- | src/construct/partial_destination.rs | 6 | ||||
-rw-r--r-- | src/construct/partial_label.rs | 84 | ||||
-rw-r--r-- | src/construct/partial_title.rs | 33 | ||||
-rw-r--r-- | src/content/string.rs | 32 | ||||
-rw-r--r-- | src/util/link.rs | 8 | ||||
-rw-r--r-- | src/util/mod.rs | 1 | ||||
-rw-r--r-- | tests/character_reference.rs | 2 |
10 files changed, 136 insertions, 67 deletions
@@ -66,8 +66,6 @@ cargo doc --document-private-items ### Small things -- [ ] (1) Connect `ChunkString` in label, destination, title -- [ ] (1) Add support for line endings in `string` - [ ] (1) Add docs to subtokenize - [ ] (1) Add module docs to parser - [ ] (1) Add overview docs on how everything works @@ -171,6 +169,8 @@ cargo doc --document-private-items - [x] (1) Remove `content` content type, as it is no longer needed - [x] (1) Paragraph - [x] (1) Parse whitespace in each flow construct +- [x] (1) Connect `ChunkString` in label, destination, title +- [x] (1) Add support for line endings in `string` ### Extensions diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 64647cb..579fa71 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -52,7 +52,7 @@ use crate::constant::TAB_SIZE; use crate::construct::partial_space_or_tab::space_or_tab_opt; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; -use crate::util::span::from_exit_event; +use crate::util::{link::link, span::from_exit_event}; /// Kind of underline. #[derive(Debug, Clone, PartialEq)] @@ -133,16 +133,12 @@ fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => { - let next = tokenizer.events.len(); - let previous = next - 2; - tokenizer.enter(TokenType::LineEnding); + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); - tokenizer.events[previous].next = Some(next); - tokenizer.events[next].previous = Some(previous); - ( State::Fn(Box::new(tokenizer.go(space_or_tab_opt(), text_line_start))), None, @@ -160,27 +156,20 @@ fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// == /// ``` fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - let next = tokenizer.events.len() - 2; - let previous = next - 2; + let index = tokenizer.events.len() - 2; // Link the whitespace, if it exists. - if tokenizer.events[next].token_type == TokenType::Whitespace { - tokenizer.events[previous].next = Some(next); - tokenizer.events[next].previous = Some(previous); + if tokenizer.events[index].token_type == TokenType::Whitespace { + link(&mut tokenizer.events, index); } match code { // Blank lines not allowed. Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None), _ => { - let next = tokenizer.events.len(); - let previous = next - 2; - tokenizer.enter(TokenType::ChunkText); - - tokenizer.events[previous].next = Some(next); - tokenizer.events[next].previous = Some(previous); - + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); text_inside(tokenizer, code) } } diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs index fa18f28..b00188d 100644 --- a/src/construct/paragraph.rs +++ b/src/construct/paragraph.rs @@ -35,6 +35,7 @@ use crate::construct::{ partial_space_or_tab::space_or_tab_min_max, thematic_break::start as thematic_break, }; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::link::link; /// Before a paragraph. /// @@ -83,9 +84,8 @@ fn at_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); tokenizer.exit(TokenType::ChunkText); tokenizer.enter(TokenType::ChunkText); - let next_index = tokenizer.events.len() - 1; - tokenizer.events[next_index - 2].next = Some(next_index); - tokenizer.events[next_index].previous = Some(next_index - 2); + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); (State::Fn(Box::new(inside)), None) } diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index bc95055..901a10d 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -18,8 +18,8 @@ //! They are counted with a counter that starts at `0`, and is incremented //! every time `(` occurs and decremented every time `)` occurs. //! If `)` is found when the counter is `0`, the destination closes immediately -//! after it. -//! Escaped parens do not count. +//! before it. +//! Escaped parens do not count in balancing. //! //! It is recommended to use the enclosed variant of destinations, as it allows //! arbitrary parens, and also allows for whitespace and other characters in @@ -68,7 +68,6 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::DefinitionDestination); tokenizer.enter(TokenType::DefinitionDestinationRaw); tokenizer.enter(TokenType::DefinitionDestinationString); - // To do: link. tokenizer.enter(TokenType::ChunkString); raw(tokenizer, code, 0) } @@ -90,7 +89,6 @@ fn enclosed_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { (State::Ok, None) } else { tokenizer.enter(TokenType::DefinitionDestinationString); - // To do: link. tokenizer.enter(TokenType::ChunkString); enclosed(tokenizer, code) } diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index 4997390..55efd13 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -56,7 +56,9 @@ // To do: pass token types in. use crate::constant::LINK_REFERENCE_SIZE_MAX; +use crate::construct::partial_space_or_tab::space_or_tab_opt; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::link::link; /// Before a label. /// @@ -71,7 +73,10 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.consume(code); tokenizer.exit(TokenType::DefinitionLabelMarker); tokenizer.enter(TokenType::DefinitionLabelData); - (State::Fn(Box::new(|t, c| at_break(t, c, false, 0))), None) + ( + State::Fn(Box::new(|t, c| at_break(t, c, false, 0, false))), + None, + ) } _ => (State::Nok, None), } @@ -83,7 +88,13 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// [|a] /// [a|] /// ``` -fn at_break(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult { +fn at_break( + tokenizer: &mut Tokenizer, + code: Code, + data: bool, + size: usize, + connect: bool, +) -> StateFnResult { match code { Code::None | Code::Char('[') => (State::Nok, None), Code::Char(']') if !data => (State::Nok, None), @@ -96,24 +107,57 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> S tokenizer.exit(TokenType::DefinitionLabel); (State::Ok, None) } - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - // To do: limit blank lines. - ( - State::Fn(Box::new(move |t, c| at_break(t, c, data, size))), - None, - ) - } _ => { tokenizer.enter(TokenType::ChunkString); - // To do: link. + + if connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } + label(tokenizer, code, data, size) } } } +/// After a line ending. +/// +/// ```markdown +/// [a +/// |b] +/// ``` +fn line_start( + tokenizer: &mut Tokenizer, + code: Code, + data: bool, + size: usize, + connect: bool, +) -> StateFnResult { + tokenizer.go(space_or_tab_opt(), move |t, c| { + line_begin(t, c, data, size, connect) + })(tokenizer, code) +} + +/// After a line ending, after optional whitespace. +/// +/// ```markdown +/// [a +/// |b] +/// ``` +fn line_begin( + tokenizer: &mut Tokenizer, + code: Code, + data: bool, + size: usize, + connect: bool, +) -> StateFnResult { + match code { + // Blank line not allowed. + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), + _ => at_break(tokenizer, code, data, size, connect), + } +} + /// In a label, in text. /// /// ```markdown @@ -121,13 +165,21 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> S /// ``` fn label(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult { match code { - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '[' | ']') => { + Code::None | Code::Char('[' | ']') => { tokenizer.exit(TokenType::ChunkString); - at_break(tokenizer, code, data, size) + at_break(tokenizer, code, data, size, true) } _ if size > LINK_REFERENCE_SIZE_MAX => { tokenizer.exit(TokenType::ChunkString); - at_break(tokenizer, code, data, size) + at_break(tokenizer, code, data, size, true) + } + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.consume(code); + tokenizer.exit(TokenType::ChunkString); + ( + State::Fn(Box::new(move |t, c| line_start(t, c, data, size + 1, true))), + None, + ) } Code::VirtualSpace | Code::Char('\t' | ' ') => { tokenizer.consume(code); diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 0669c8e..322a3e6 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -35,6 +35,7 @@ use crate::construct::partial_space_or_tab::space_or_tab_opt; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +use crate::util::link::link; /// Type of title. #[derive(Debug, Clone, PartialEq)] @@ -102,7 +103,7 @@ fn begin(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { } _ => { tokenizer.enter(TokenType::DefinitionTitleString); - at_break(tokenizer, code, kind) + at_break(tokenizer, code, kind, false) } } } @@ -115,22 +116,19 @@ fn begin(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { /// (a| /// b) /// ``` -fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { +fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: Kind, connect: bool) -> StateFnResult { match code { Code::Char(char) if char == kind_to_marker(&kind) => { tokenizer.exit(TokenType::DefinitionTitleString); begin(tokenizer, code, kind) } Code::None => (State::Nok, None), - Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { - tokenizer.enter(TokenType::LineEnding); - tokenizer.consume(code); - tokenizer.exit(TokenType::LineEnding); - (State::Fn(Box::new(|t, c| line_start(t, c, kind))), None) - } _ => { - // To do: link. tokenizer.enter(TokenType::ChunkString); + if connect { + let index = tokenizer.events.len() - 1; + link(&mut tokenizer.events, index); + } title(tokenizer, code, kind) } } @@ -156,7 +154,7 @@ fn line_begin(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResul match code { // Blank line not allowed. Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), - _ => at_break(tokenizer, code, kind), + _ => at_break(tokenizer, code, kind, true), } } @@ -169,11 +167,20 @@ fn title(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { Code::Char(char) if char == kind_to_marker(&kind) => { tokenizer.exit(TokenType::ChunkString); - at_break(tokenizer, code, kind) + at_break(tokenizer, code, kind, true) } - Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + Code::None => { + tokenizer.exit(TokenType::ChunkString); + at_break(tokenizer, code, kind, true) + } + // To do: limit blank lines. + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.consume(code); tokenizer.exit(TokenType::ChunkString); - at_break(tokenizer, code, kind) + ( + State::Fn(Box::new(move |t, c| line_start(t, c, kind))), + None, + ) } Code::Char('\\') => { tokenizer.consume(code); diff --git a/src/content/string.rs b/src/content/string.rs index f591cd7..efb6e60 100644 --- a/src/content/string.rs +++ b/src/content/string.rs @@ -43,13 +43,28 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// |qwe /// ``` fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { - if let Code::None = code { - (State::Ok, None) - } else { - tokenizer.enter(TokenType::Data); - tokenizer.consume(code); - (State::Fn(Box::new(in_data)), None) + match code { + Code::None => (State::Ok, None), + Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { + tokenizer.enter(TokenType::LineEnding); + tokenizer.consume(code); + tokenizer.exit(TokenType::LineEnding); + (State::Fn(Box::new(start)), None) + } + _ => { + tokenizer.enter(TokenType::Data); + tokenizer.consume(code); + (State::Fn(Box::new(in_data)), None) + } } + + // if let Code::None = code { + // (State::Ok, None) + // } else { + // tokenizer.enter(TokenType::Data); + // tokenizer.consume(code); + // (State::Fn(Box::new(in_data)), None) + // } } /// In data. @@ -59,10 +74,9 @@ fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { /// ``` fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { - // To do: line endings. - Code::None => { + Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { tokenizer.exit(TokenType::Data); - (State::Ok, None) + before_data(tokenizer, code) } // To do: somehow get these markers from constructs. Code::Char('&' | '\\') => { diff --git a/src/util/link.rs b/src/util/link.rs new file mode 100644 index 0000000..917ce4d --- /dev/null +++ b/src/util/link.rs @@ -0,0 +1,8 @@ +//! To do. + +use crate::tokenizer::Event; + +pub fn link(events: &mut [Event], index: usize) { + events[index - 2].next = Some(index); + events[index].previous = Some(index - 2); +} diff --git a/src/util/mod.rs b/src/util/mod.rs index c3db267..5439c62 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -2,5 +2,6 @@ pub mod decode_character_reference; pub mod encode; +pub mod link; pub mod sanitize_uri; pub mod span; diff --git a/tests/character_reference.rs b/tests/character_reference.rs index bcd0aca..f2337ab 100644 --- a/tests/character_reference.rs +++ b/tests/character_reference.rs @@ -100,7 +100,7 @@ fn character_reference() { // "should not support character references as construct markers (2)" // ); - // To do: link. + // To do: link (resource). // assert_eq!( // micromark("[a](url "tit")"), // "<p>[a](url "tit")</p>", |