-rw-r--r--  readme.md                                2
-rw-r--r--  src/compiler.rs                         10
-rw-r--r--  src/construct/blank_line.rs             13
-rw-r--r--  src/construct/code_fenced.rs           169
-rw-r--r--  src/construct/code_indented.rs          96
-rw-r--r--  src/construct/definition.rs             96
-rw-r--r--  src/construct/heading_atx.rs            52
-rw-r--r--  src/construct/heading_setext.rs         45
-rw-r--r--  src/construct/html_flow.rs               7
-rw-r--r--  src/construct/html_text.rs               9
-rw-r--r--  src/construct/mod.rs                     2
-rw-r--r--  src/construct/paragraph.rs              68
-rw-r--r--  src/construct/partial_destination.rs     2
-rw-r--r--  src/construct/partial_space_or_tab.rs   98
-rw-r--r--  src/construct/partial_title.rs           7
-rw-r--r--  src/construct/partial_whitespace.rs     64
-rw-r--r--  src/construct/thematic_break.rs         50
-rw-r--r--  src/content/flow.rs                     45
-rw-r--r--  src/tokenizer.rs                        32
-rw-r--r--  tests/autolink.rs                        2

20 files changed, 326 insertions, 543 deletions
diff --git a/readme.md b/readme.md
index d6b20af..224c5d0 100644
--- a/readme.md
+++ b/readme.md
@@ -66,7 +66,6 @@ cargo doc --document-private-items
### Small things
-- [ ] (1) Parse whitespace in each flow construct
- [ ] (1) Connect `ChunkString` in label, destination, title
- [ ] (1) Add support for line endings in `string`
- [ ] (1) Add docs to subtokenize
@@ -171,6 +170,7 @@ cargo doc --document-private-items
- [x] (1) Remove all `pub fn`s from constructs, except for start
- [x] (1) Remove `content` content type, as it is no longer needed
- [x] (1) Paragraph
+- [x] (1) Parse whitespace in each flow construct
### Extensions
diff --git a/src/compiler.rs b/src/compiler.rs
index 59fcd22..366dcd9 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -108,7 +108,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::AutolinkMarker
| TokenType::AutolinkProtocol
| TokenType::BlankLineEnding
- | TokenType::BlankLineWhitespace
| TokenType::CharacterEscape
| TokenType::CharacterEscapeMarker
| TokenType::CharacterEscapeValue
@@ -118,10 +117,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::CharacterReferenceMarkerNumeric
| TokenType::CharacterReferenceMarkerSemi
| TokenType::CharacterReferenceValue
- | TokenType::CodeIndentedPrefixWhitespace
| TokenType::CodeFencedFence
| TokenType::CodeFencedFenceSequence
- | TokenType::CodeFencedFenceWhitespace
| TokenType::CodeFlowChunk
| TokenType::CodeTextData
| TokenType::CodeTextLineEnding
@@ -153,7 +150,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::LineEnding
| TokenType::ThematicBreak
| TokenType::ThematicBreakSequence
- | TokenType::ThematicBreakWhitespace
| TokenType::Whitespace => {
// Ignore.
}
@@ -172,7 +168,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
TokenType::CodeFenced => {
code_flow_seen_data = Some(false);
line_ending_if_needed(buffers);
- // Note: no `>`, which is added later.
+ // Note that no `>` is used, which is added later.
buf_tail_mut(buffers).push("<pre><code".to_string());
code_fenced_fences_count = Some(0);
}
@@ -203,14 +199,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
TokenType::Autolink
| TokenType::AutolinkMarker
| TokenType::BlankLineEnding
- | TokenType::BlankLineWhitespace
| TokenType::CharacterEscape
| TokenType::CharacterEscapeMarker
| TokenType::CharacterReference
| TokenType::CharacterReferenceMarkerSemi
| TokenType::CodeFencedFenceSequence
- | TokenType::CodeFencedFenceWhitespace
- | TokenType::CodeIndentedPrefixWhitespace
| TokenType::CodeTextSequence
| TokenType::DefinitionLabel
| TokenType::DefinitionLabelMarker
@@ -228,7 +221,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::HardBreakTrailingSpace
| TokenType::HeadingSetext
| TokenType::ThematicBreakSequence
- | TokenType::ThematicBreakWhitespace
| TokenType::Whitespace => {
// Ignore.
}
diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs
index fdb1ee0..86091d9 100644
--- a/src/construct/blank_line.rs
+++ b/src/construct/blank_line.rs
@@ -29,27 +29,24 @@
//!
//! <!-- To do: link `list` -->
-use crate::construct::partial_whitespace::start as whitespace;
-use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::construct::partial_space_or_tab::space_or_tab_opt;
+use crate::tokenizer::{Code, State, StateFnResult, Tokenizer};
/// Start of a blank line.
///
-/// Note: `␠` represents a space character.
+/// > πŸ‘‰ **Note**: `␠` represents a space character.
///
/// ```markdown
/// |␠␠
/// |
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::BlankLineWhitespace),
- |_ok| Box::new(after),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), after)(tokenizer, code)
}
/// After zero or more spaces or tabs, before a line ending or EOF.
///
-/// Note: `␠` represents a space character.
+/// > πŸ‘‰ **Note**: `␠` represents a space character.
///
/// ```markdown
/// |␠␠
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index ba76aa8..30ec911 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -91,7 +91,7 @@
//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};
-use crate::construct::partial_whitespace::start as whitespace;
+use crate::construct::partial_space_or_tab::{space_or_tab_min_max, space_or_tab_opt};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
use crate::util::span::from_exit_event;
@@ -130,10 +130,7 @@ struct Info {
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::CodeFenced);
tokenizer.enter(TokenType::CodeFencedFence);
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(before_sequence_open),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), before_sequence_open)(tokenizer, code)
}
/// Inside the opening fence, after an optional prefix, before a sequence.
@@ -159,6 +156,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult
tokenizer.enter(TokenType::CodeFencedFenceSequence);
sequence_open(
tokenizer,
+ code,
Info {
prefix,
size: 0,
@@ -168,7 +166,6 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult
Kind::Tilde
},
},
- code,
)
}
_ => (State::Nok, None),
@@ -182,7 +179,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult
/// console.log(1);
/// ~~~
/// ```
-fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+fn sequence_open(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
let marker = if info.kind == Kind::GraveAccent {
'`'
} else {
@@ -193,26 +190,18 @@ fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnRe
Code::Char(char) if char == marker => {
tokenizer.consume(code);
(
- State::Fn(Box::new(|tokenizer, code| {
+ State::Fn(Box::new(|t, c| {
let mut info = info;
info.size += 1;
- sequence_open(tokenizer, info, code)
+ sequence_open(t, c, info)
})),
None,
)
}
+ _ if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN => (State::Nok, None),
_ => {
- if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN {
- (State::Nok, None)
- } else {
- tokenizer.exit(TokenType::CodeFencedFenceSequence);
- tokenizer.attempt(
- |tokenizer, code| {
- whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace)
- },
- |_ok| Box::new(|tokenizer, code| info_before(tokenizer, info, code)),
- )(tokenizer, code)
- }
+ tokenizer.exit(TokenType::CodeFencedFenceSequence);
+ tokenizer.go(space_or_tab_opt(), |t, c| info_before(t, c, info))(tokenizer, code)
}
}
}
@@ -224,16 +213,16 @@ fn sequence_open(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnRe
/// console.log(1);
/// ~~~
/// ```
-fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+fn info_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::CodeFencedFence);
- at_break(tokenizer, info, code)
+ at_break(tokenizer, code, info)
}
_ => {
tokenizer.enter(TokenType::CodeFencedFenceInfo);
tokenizer.enter(TokenType::ChunkString);
- info_inside(tokenizer, info, code, vec![])
+ info_inside(tokenizer, code, info, vec![])
}
}
}
@@ -247,8 +236,8 @@ fn info_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResu
/// ```
fn info_inside(
tokenizer: &mut Tokenizer,
- info: Info,
code: Code,
+ info: Info,
codes: Vec<Code>,
) -> StateFnResult {
match code {
@@ -256,15 +245,12 @@ fn info_inside(
tokenizer.exit(TokenType::ChunkString);
tokenizer.exit(TokenType::CodeFencedFenceInfo);
tokenizer.exit(TokenType::CodeFencedFence);
- at_break(tokenizer, info, code)
+ at_break(tokenizer, code, info)
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
tokenizer.exit(TokenType::ChunkString);
tokenizer.exit(TokenType::CodeFencedFenceInfo);
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace),
- |_ok| Box::new(|tokenizer, code| meta_before(tokenizer, info, code)),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), |t, c| meta_before(t, c, info))(tokenizer, code)
}
Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
Code::Char(_) => {
@@ -272,9 +258,7 @@ fn info_inside(
codes.push(code);
tokenizer.consume(code);
(
- State::Fn(Box::new(|tokenizer, code| {
- info_inside(tokenizer, info, code, codes)
- })),
+ State::Fn(Box::new(|t, c| info_inside(t, c, info, codes))),
None,
)
}
@@ -288,16 +272,16 @@ fn info_inside(
/// console.log(1);
/// ~~~
/// ```
-fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+fn meta_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::CodeFencedFence);
- at_break(tokenizer, info, code)
+ at_break(tokenizer, code, info)
}
_ => {
tokenizer.enter(TokenType::CodeFencedFenceMeta);
tokenizer.enter(TokenType::ChunkString);
- meta(tokenizer, info, code)
+ meta(tokenizer, code, info)
}
}
}
@@ -309,21 +293,18 @@ fn meta_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResu
/// console.log(1);
/// ~~~
/// ```
-fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+fn meta(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::ChunkString);
tokenizer.exit(TokenType::CodeFencedFenceMeta);
tokenizer.exit(TokenType::CodeFencedFence);
- at_break(tokenizer, info, code)
+ at_break(tokenizer, code, info)
}
Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
_ => {
tokenizer.consume(code);
- (
- State::Fn(Box::new(|tokenizer, code| meta(tokenizer, info, code))),
- None,
- )
+ (State::Fn(Box::new(|t, c| meta(t, c, info))), None)
}
}
}
@@ -335,7 +316,7 @@ fn meta(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
/// aa|
/// ~~~
/// ```
-fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
let clone = info.clone();
match code {
@@ -345,12 +326,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult
tokenizer.enter(TokenType::LineEnding);
tokenizer.consume(code);
tokenizer.exit(TokenType::LineEnding);
- (
- State::Fn(Box::new(|tokenizer, code| {
- close_before(tokenizer, info, code)
- })),
- None,
- )
+ (State::Fn(Box::new(|t, c| close_start(t, c, info))), None)
},
|ok| {
if ok {
@@ -360,12 +336,7 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult
tokenizer.enter(TokenType::LineEnding);
tokenizer.consume(code);
tokenizer.exit(TokenType::LineEnding);
- (
- State::Fn(Box::new(|tokenizer, code| {
- content_start(tokenizer, clone, code)
- })),
- None,
- )
+ (State::Fn(Box::new(|t, c| content_start(t, c, clone))), None)
})
}
},
@@ -385,12 +356,11 @@ fn at_break(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult
/// console.log('1')
/// | ~~~
/// ```
-fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+fn close_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
tokenizer.enter(TokenType::CodeFencedFence);
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(|tokenizer, code| close_sequence_before(tokenizer, info, code)),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_min_max(0, TAB_SIZE - 1), |t, c| {
+ close_before(t, c, info)
+ })(tokenizer, code)
}
/// In a closing fence, after optional whitespace, before sequence.
@@ -404,31 +374,17 @@ fn close_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnRes
/// console.log('1')
/// |~~~
/// ```
-fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
- let tail = tokenizer.events.last();
- let mut prefix = 0;
+fn close_before(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
let marker = if info.kind == Kind::GraveAccent {
'`'
} else {
'~'
};
- if let Some(event) = tail {
- if event.token_type == TokenType::Whitespace {
- let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
- prefix = span.end_index - span.start_index;
- }
- }
-
- // To do: 4+ should be okay if code (indented) is turned off!
- if prefix >= TAB_SIZE {
- return (State::Nok, None);
- }
-
match code {
Code::Char(char) if char == marker => {
tokenizer.enter(TokenType::CodeFencedFenceSequence);
- close_sequence(tokenizer, info, code, 0)
+ close_sequence(tokenizer, code, info, 0)
}
_ => (State::Nok, None),
}
@@ -441,7 +397,7 @@ fn close_sequence_before(tokenizer: &mut Tokenizer, info: Info, code: Code) -> S
/// console.log('1')
/// ~|~~
/// ```
-fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize) -> StateFnResult {
+fn close_sequence(tokenizer: &mut Tokenizer, code: Code, info: Info, size: usize) -> StateFnResult {
let marker = if info.kind == Kind::GraveAccent {
'`'
} else {
@@ -452,18 +408,13 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize
Code::Char(char) if char == marker => {
tokenizer.consume(code);
(
- State::Fn(Box::new(move |tokenizer, code| {
- close_sequence(tokenizer, info, code, size + 1)
- })),
+ State::Fn(Box::new(move |t, c| close_sequence(t, c, info, size + 1))),
None,
)
}
_ if size >= CODE_FENCED_SEQUENCE_SIZE_MIN && size >= info.size => {
tokenizer.exit(TokenType::CodeFencedFenceSequence);
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::CodeFencedFenceWhitespace),
- |_ok| Box::new(close_whitespace_after),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), close_sequence_after)(tokenizer, code)
}
_ => (State::Nok, None),
}
@@ -476,7 +427,7 @@ fn close_sequence(tokenizer: &mut Tokenizer, info: Info, code: Code, size: usize
/// console.log('1')
/// ~~~ |
/// ```
-fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn close_sequence_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::CodeFencedFence);
@@ -493,53 +444,27 @@ fn close_whitespace_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResul
/// |aa
/// ~~~
/// ```
-fn content_start(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
- match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- at_break(tokenizer, info, code)
- }
- Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > 0 => {
- tokenizer.enter(TokenType::Whitespace);
- content_prefix(tokenizer, info, 0, code)
- }
- _ => {
- tokenizer.enter(TokenType::CodeFlowChunk);
- content_continue(tokenizer, info, code)
- }
- }
+fn content_start(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
+ tokenizer.go(space_or_tab_min_max(0, info.prefix), |t, c| {
+ content_begin(t, c, info)
+ })(tokenizer, code)
}
-/// Before code content, in a prefix.
+/// Before code content, after a prefix.
///
/// ```markdown
/// ~~~js
/// | aa
/// ~~~
/// ```
-fn content_prefix(
- tokenizer: &mut Tokenizer,
- info: Info,
- prefix: usize,
- code: Code,
-) -> StateFnResult {
+fn content_begin(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
match code {
- Code::VirtualSpace | Code::Char('\t' | ' ') if info.prefix > prefix => {
- tokenizer.consume(code);
- (
- State::Fn(Box::new(move |tokenizer, code| {
- content_prefix(tokenizer, info, prefix + 1, code)
- })),
- None,
- )
- }
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.exit(TokenType::Whitespace);
- at_break(tokenizer, info, code)
+ at_break(tokenizer, code, info)
}
_ => {
- tokenizer.exit(TokenType::Whitespace);
tokenizer.enter(TokenType::CodeFlowChunk);
- content_continue(tokenizer, info, code)
+ content_continue(tokenizer, code, info)
}
}
}
@@ -553,18 +478,16 @@ fn content_prefix(
/// ab|
/// ~~~
/// ```
-fn content_continue(tokenizer: &mut Tokenizer, info: Info, code: Code) -> StateFnResult {
+fn content_continue(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::CodeFlowChunk);
- at_break(tokenizer, info, code)
+ at_break(tokenizer, code, info)
}
_ => {
tokenizer.consume(code);
(
- State::Fn(Box::new(|tokenizer, code| {
- content_continue(tokenizer, info, code)
- })),
+ State::Fn(Box::new(|t, c| content_continue(t, c, info))),
None,
)
}
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 55b8901..64956be 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -38,6 +38,7 @@
//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+use super::partial_space_or_tab::{space_or_tab_min_max, space_or_tab_opt};
use crate::constant::TAB_SIZE;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -46,46 +47,13 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// ```markdown
/// | asd
/// ```
-pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::VirtualSpace | Code::Char(' ' | '\t') => {
- tokenizer.enter(TokenType::CodeIndented);
- tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
- indent(tokenizer, code, 0)
- }
- _ => (State::Nok, None),
- }
-}
-
-/// Inside the initial whitespace.
-///
-/// ```markdown
-/// | asd
-/// | asd
-/// | asd
-/// |asd
-/// ```
///
/// > **Parsing note**: it is not needed to check if this first line is a
/// > filled line (that it has a non-whitespace character), because blank lines
/// > are parsed already, so we never run into that.
-fn indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
- match code {
- _ if size == TAB_SIZE => {
- tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
- at_break(tokenizer, code)
- }
- Code::VirtualSpace | Code::Char(' ' | '\t') => {
- tokenizer.consume(code);
- (
- State::Fn(Box::new(move |tokenizer, code| {
- indent(tokenizer, code, size + 1)
- })),
- None,
- )
- }
- _ => (State::Nok, None),
- }
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::CodeIndented);
+ tokenizer.go(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), at_break)(tokenizer, code)
}
/// At a break.
@@ -153,39 +121,45 @@ fn further_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.exit(TokenType::LineEnding);
(State::Fn(Box::new(further_start)), None)
}
- Code::VirtualSpace | Code::Char(' ' | '\t') => {
- tokenizer.enter(TokenType::CodeIndentedPrefixWhitespace);
- further_indent(tokenizer, code, 0)
- }
- _ => (State::Nok, None),
+ _ => tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
+ Box::new(if ok { further_end } else { further_begin })
+ })(tokenizer, code),
}
}
-/// Inside further whitespace.
+/// After a proper indent.
///
/// ```markdown
/// asd
-/// | asd
+/// |asd
/// ```
-fn further_indent(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+fn further_end(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ (State::Ok, Some(vec![code]))
+}
+
+/// At the beginning of a line that is not indented enough.
+///
+/// > πŸ‘‰ **Note**: `␠` represents a space character.
+///
+/// ```markdown
+/// asd
+/// |␠␠
+/// asd
+/// ```
+fn further_begin(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.go(space_or_tab_opt(), further_after)(tokenizer, code)
+}
+
+/// After whitespace.
+///
+/// ```markdown
+/// asd
+/// ␠␠|
+/// asd
+/// ```
+fn further_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- _ if size == TAB_SIZE => {
- tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
- (State::Ok, Some(vec![code]))
- }
- Code::VirtualSpace | Code::Char(' ' | '\t') => {
- tokenizer.consume(code);
- (
- State::Fn(Box::new(move |tokenizer, code| {
- further_indent(tokenizer, code, size + 1)
- })),
- None,
- )
- }
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.exit(TokenType::CodeIndentedPrefixWhitespace);
- further_start(tokenizer, code)
- }
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => further_start(tokenizer, code),
_ => (State::Nok, None),
}
}
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index f7f8acd..03baee6 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -58,7 +58,7 @@
use crate::construct::{
partial_destination::start as destination, partial_label::start as label,
- partial_title::start as title, partial_whitespace::start as whitespace,
+ partial_space_or_tab::space_or_tab_opt, partial_title::start as title,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -68,11 +68,18 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// |[a]: b "c"
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::Definition);
+ tokenizer.go(space_or_tab_opt(), before)(tokenizer, code)
+}
+
+/// At the start of a definition, after whitespace.
+///
+/// ```markdown
+/// |[a]: b "c"
+/// ```
+pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::Char('[') => {
- tokenizer.enter(TokenType::Definition);
- tokenizer.go(label, label_after)(tokenizer, code)
- }
+ Code::Char('[') => tokenizer.go(label, label_after)(tokenizer, code),
_ => (State::Nok, None),
}
}
@@ -93,27 +100,15 @@ fn label_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::DefinitionMarker);
tokenizer.consume(code);
tokenizer.exit(TokenType::DefinitionMarker);
- (State::Fn(Box::new(marker_after)), None)
+ (
+ State::Fn(Box::new(tokenizer.go(space_or_tab_opt(), marker_after))),
+ None,
+ )
}
_ => (State::Nok, None),
}
}
-/// After the marker of a definition.
-///
-/// ```markdown
-/// [a]:| b "c"
-///
-/// [a]:| ␊
-/// b "c"
-/// ```
-fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |t, c| whitespace(t, c, TokenType::Whitespace),
- |_ok| Box::new(marker_after_optional_whitespace),
- )(tokenizer, code)
-}
-
/// After the marker, after whitespace.
///
/// ```markdown
@@ -122,31 +117,23 @@ fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// [a]: |␊
/// b "c"
/// ```
-fn marker_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn marker_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
tokenizer.enter(TokenType::LineEnding);
tokenizer.consume(code);
tokenizer.exit(TokenType::LineEnding);
- (State::Fn(Box::new(marker_after_optional_line_ending)), None)
+ (
+ State::Fn(Box::new(
+ tokenizer.go(space_or_tab_opt(), destination_before),
+ )),
+ None,
+ )
}
_ => destination_before(tokenizer, code),
}
}
-/// After the marker, after a line ending.
-///
-/// ```markdown
-/// [a]:
-/// | b "c"
-/// ```
-fn marker_after_optional_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |t, c| whitespace(t, c, TokenType::Whitespace),
- |_ok| Box::new(destination_before),
- )(tokenizer, code)
-}
-
/// Before a destination.
///
/// ```markdown
@@ -163,8 +150,9 @@ fn destination_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
);
- if !char_nok
- && (event.token_type == TokenType::LineEnding || event.token_type == TokenType::Whitespace)
+ // Whitespace.
+ if (event.token_type == TokenType::LineEnding || event.token_type == TokenType::Whitespace)
+ && !char_nok
{
tokenizer.go(destination, destination_after)(tokenizer, code)
} else {
@@ -191,10 +179,7 @@ fn destination_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// [a]: b "c"|
/// ```
fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |t, c| whitespace(t, c, TokenType::Whitespace),
- |_ok| Box::new(after_whitespace),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), after_whitespace)(tokenizer, code)
}
/// After a definition, after optional whitespace.
@@ -222,10 +207,7 @@ fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// "c"
/// ```
fn title_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |t, c| whitespace(t, c, TokenType::Whitespace),
- |_ok| Box::new(title_before_after_optional_whitespace),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), title_before_after_optional_whitespace)(tokenizer, code)
}
/// Before a title, after optional whitespace.
@@ -243,7 +225,9 @@ fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code)
tokenizer.consume(code);
tokenizer.exit(TokenType::LineEnding);
(
- State::Fn(Box::new(title_before_after_optional_line_ending)),
+ State::Fn(Box::new(
+ tokenizer.go(space_or_tab_opt(), title_before_marker),
+ )),
None,
)
}
@@ -257,19 +241,6 @@ fn title_before_after_optional_whitespace(tokenizer: &mut Tokenizer, code: Code)
/// [a]: b␊
/// | "c"
/// ```
-fn title_before_after_optional_line_ending(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |t, c| whitespace(t, c, TokenType::Whitespace),
- |_ok| Box::new(title_before_marker),
- )(tokenizer, code)
-}
-
-/// Before a title, after a line ending.
-///
-/// ```markdown
-/// [a]: b␊
-/// | "c"
-/// ```
fn title_before_marker(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
let event = tokenizer.events.last().unwrap();
@@ -289,10 +260,7 @@ fn title_before_marker(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// "c"|
/// ```
fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |t, c| whitespace(t, c, TokenType::Whitespace),
- |_ok| Box::new(title_after_after_optional_whitespace),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), title_after_after_optional_whitespace)(tokenizer, code)
}
/// After a title, after optional whitespace.
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index ab8b6a5..12d4193 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -47,6 +47,7 @@
//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
//! [atx]: http://www.aaronsw.com/2002/atx/
+use super::partial_space_or_tab::{space_or_tab, space_or_tab_opt};
use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -56,8 +57,17 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// |## alpha
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::HeadingAtx);
+ tokenizer.go(space_or_tab_opt(), before)(tokenizer, code)
+}
+
+/// Start of a heading (atx), after whitespace.
+///
+/// ```markdown
+/// |## alpha
+/// ```
+pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
if Code::Char('#') == code {
- tokenizer.enter(TokenType::HeadingAtx);
tokenizer.enter(TokenType::HeadingAtxSequence);
sequence_open(tokenizer, code, 0)
} else {
@@ -72,12 +82,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnResult {
match code {
- Code::None
- | Code::CarriageReturnLineFeed
- | Code::VirtualSpace
- | Code::Char('\t' | '\n' | '\r' | ' ')
- if rank > 0 =>
- {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') if rank > 0 => {
tokenizer.exit(TokenType::HeadingAtxSequence);
at_break(tokenizer, code)
}
@@ -90,6 +95,13 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR
None,
)
}
+ _ if rank > 0 => {
+ tokenizer.exit(TokenType::HeadingAtxSequence);
+ tokenizer.go(
+ space_or_tab(TokenType::HeadingAtxWhitespace, 1, usize::MAX),
+ at_break,
+ )(tokenizer, code)
+ }
_ => (State::Nok, None),
}
}
@@ -109,10 +121,10 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.exit(TokenType::HeadingAtx);
(State::Ok, Some(vec![code]))
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.enter(TokenType::HeadingAtxWhitespace);
- whitespace(tokenizer, code)
- }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.go(
+ space_or_tab(TokenType::HeadingAtxWhitespace, 1, usize::MAX),
+ at_break,
+ )(tokenizer, code),
Code::Char('#') => {
tokenizer.enter(TokenType::HeadingAtxSequence);
further_sequence(tokenizer, code)
@@ -141,24 +153,6 @@ fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
-/// In whitespace.
-///
-/// ```markdown
-/// ## alpha | bravo
-/// ```
-fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- match code {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.consume(code);
- (State::Fn(Box::new(whitespace)), None)
- }
- _ => {
- tokenizer.exit(TokenType::HeadingAtxWhitespace);
- at_break(tokenizer, code)
- }
- }
-}
-
/// In text.
///
/// ```markdown
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index f4c6001..64647cb 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -50,7 +50,7 @@
//! [atx]: http://www.aaronsw.com/2002/atx/
use crate::constant::TAB_SIZE;
-use crate::construct::partial_whitespace::start as whitespace;
+use crate::construct::partial_space_or_tab::space_or_tab_opt;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
use crate::util::span::from_exit_event;
@@ -70,12 +70,22 @@ pub enum Kind {
/// ==
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::HeadingSetext);
+ tokenizer.go(space_or_tab_opt(), before)(tokenizer, code)
+}
+
+/// Start of a heading (setext), after whitespace.
+///
+/// ```markdown
+/// |alpha
+/// ==
+/// ```
+pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
unreachable!("expected non-eol/eof");
}
_ => {
- tokenizer.enter(TokenType::HeadingSetext);
tokenizer.enter(TokenType::HeadingSetextText);
tokenizer.enter(TokenType::ChunkText);
text_inside(tokenizer, code)
@@ -134,10 +144,7 @@ fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.events[next].previous = Some(previous);
(
- State::Fn(Box::new(tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(text_line_start),
- ))),
+ State::Fn(Box::new(tokenizer.go(space_or_tab_opt(), text_line_start))),
None,
)
}
@@ -202,25 +209,17 @@ fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::LineEnding);
tokenizer.consume(code);
tokenizer.exit(TokenType::LineEnding);
- (State::Fn(Box::new(underline_start)), None)
+ (
+ State::Fn(Box::new(
+ tokenizer.go(space_or_tab_opt(), underline_sequence_start),
+ )),
+ None,
+ )
}
_ => unreachable!("expected eol"),
}
}
-/// After a line ending, presumably an underline.
-///
-/// ```markdown
-/// alpha
-/// |==
-/// ```
-fn underline_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(underline_sequence_start),
- )(tokenizer, code)
-}
-
/// After optional whitespace, presumably an underline.
///
/// ```markdown
@@ -276,11 +275,7 @@ fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind)
None,
)
}
- Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(underline_after),
- )(tokenizer, code),
- _ => underline_after(tokenizer, code),
+ _ => tokenizer.go(space_or_tab_opt(), underline_after)(tokenizer, code),
}
}
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index 5adac7d..4819e63 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -93,7 +93,7 @@
//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
use crate::constant::{HTML_BLOCK_NAMES, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX};
-use crate::construct::{blank_line::start as blank_line, partial_whitespace::start as whitespace};
+use crate::construct::{blank_line::start as blank_line, partial_space_or_tab::space_or_tab_opt};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// Kind of HTML (flow).
@@ -155,10 +155,7 @@ struct Info {
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::HtmlFlow);
tokenizer.enter(TokenType::HtmlFlowData);
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(before),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), before)(tokenizer, code)
}
/// After optional whitespace, before `<`.
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index 93b4b62..a91113f 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -49,7 +49,7 @@
//! [html_flow]: crate::construct::html_flow
//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
-use crate::construct::partial_whitespace::start as whitespace;
+use crate::construct::partial_space_or_tab::space_or_tab_opt;
use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer};
/// Start of HTML (text)
@@ -673,10 +673,9 @@ fn after_line_ending(
code: Code,
return_state: Box<StateFn>,
) -> StateFnResult {
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(|t, c| after_line_ending_prefix(t, c, return_state)),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), |t, c| {
+ after_line_ending_prefix(t, c, return_state)
+ })(tokenizer, code)
}
/// After a line ending, after indent.
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 1debb74..407dc6b 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -70,6 +70,6 @@ pub mod html_text;
pub mod paragraph;
pub mod partial_destination;
pub mod partial_label;
+pub mod partial_space_or_tab;
pub mod partial_title;
-pub mod partial_whitespace;
pub mod thematic_break;
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index 50ef627..fa18f28 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -30,12 +30,11 @@
use crate::constant::TAB_SIZE;
use crate::construct::{
- code_fenced::start as code_fenced, heading_atx::start as heading_atx,
- html_flow::start as html_flow, partial_whitespace::start as whitespace,
- thematic_break::start as thematic_break,
+ blank_line::start as blank_line, code_fenced::start as code_fenced,
+ heading_atx::start as heading_atx, html_flow::start as html_flow,
+ partial_space_or_tab::space_or_tab_min_max, thematic_break::start as thematic_break,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
-use crate::util::span::from_exit_event;
/// Before a paragraph.
///
@@ -114,7 +113,7 @@ fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::LineEnding);
tokenizer.consume(code);
tokenizer.exit(TokenType::LineEnding);
- (State::Fn(Box::new(interrupt_initial)), None)
+ (State::Fn(Box::new(interrupt_start)), None)
}
_ => unreachable!("expected eol"),
}
@@ -123,55 +122,30 @@ fn interrupt(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// After a line ending.
///
/// ```markdown
-/// alpha|
-/// ~~~js
+/// alpha
+/// |~~~js
/// ~~~
/// ```
-fn interrupt_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt_2(code_fenced, html_flow, |ok| {
+fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // To do: If code is disabled, indented lines are allowed to interrupt.
+ tokenizer.attempt(space_or_tab_min_max(TAB_SIZE, TAB_SIZE), |ok| {
if ok {
- Box::new(|_tokenizer, _code| (State::Nok, None))
+ Box::new(|_t, code| (State::Ok, Some(vec![code])))
} else {
Box::new(|tokenizer, code| {
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(interrupt_start),
+ tokenizer.attempt_5(
+ blank_line,
+ code_fenced,
+ html_flow,
+ heading_atx,
+ thematic_break,
+ |ok| {
+ Box::new(move |_t, code| {
+ (if ok { State::Nok } else { State::Ok }, Some(vec![code]))
+ })
+ },
)(tokenizer, code)
})
}
})(tokenizer, code)
}
-
-/// After a line ending, after optional whitespace.
-///
-/// ```markdown
-/// alpha|
-/// # bravo
-/// ```
-fn interrupt_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- let tail = tokenizer.events.last();
- let mut prefix = 0;
-
- if let Some(event) = tail {
- if event.token_type == TokenType::Whitespace {
- let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
- prefix = span.end_index - span.start_index;
- }
- }
-
- match code {
- // Blank lines are not allowed in paragraph.
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
- // To do: If code is disabled, indented lines are allowed.
- _ if prefix >= TAB_SIZE => (State::Ok, None),
- // To do: definitions, setext headings, etc?
- _ => tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
- let result = if ok {
- (State::Nok, None)
- } else {
- (State::Ok, None)
- };
- Box::new(|_t, _c| result)
- })(tokenizer, code),
- }
-}
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 58d07c1..bc95055 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -60,7 +60,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.exit(TokenType::DefinitionDestinationLiteralMarker);
(State::Fn(Box::new(enclosed_before)), None)
}
- Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(')') => {
+ Code::None | Code::CarriageReturnLineFeed | Code::VirtualSpace | Code::Char(' ' | ')') => {
(State::Nok, None)
}
Code::Char(char) if char.is_ascii_control() => (State::Nok, None),
diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs
new file mode 100644
index 0000000..40ece49
--- /dev/null
+++ b/src/construct/partial_space_or_tab.rs
@@ -0,0 +1,98 @@
+//! Several helpers to parse whitespace (`space_or_tab`).
+//!
+//! ## References
+//!
+//! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js)
+
+use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer};
+
+/// Optional `space_or_tab`
+///
+/// ```bnf
+/// space_or_tab_opt ::= *( ' ' '\t' )
+/// ```
+pub fn space_or_tab_opt() -> Box<StateFn> {
+ space_or_tab_min_max(0, usize::MAX)
+}
+
+/// Between `x` and `y` `space_or_tab`
+///
+/// ```bnf
+/// space_or_tab_min_max ::= x*y( ' ' '\t' )
+/// ```
+pub fn space_or_tab_min_max(min: usize, max: usize) -> Box<StateFn> {
+ space_or_tab(TokenType::Whitespace, min, max)
+}
+
+/// Between `x` and `y` `space_or_tab`, with the given token type.
+///
+/// ```bnf
+/// space_or_tab ::= x*y( ' ' '\t' )
+/// ```
+pub fn space_or_tab(kind: TokenType, min: usize, max: usize) -> Box<StateFn> {
+ Box::new(move |t, c| start(t, c, kind, min, max))
+}
+
+/// Before whitespace.
+///
+/// ```markdown
+/// alpha| bravo
+/// ```
+fn start(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ kind: TokenType,
+ min: usize,
+ max: usize,
+) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') if max > 0 => {
+ tokenizer.enter(kind.clone());
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ inside(tokenizer, code, kind, min, max, 1)
+ })),
+ None,
+ )
+ }
+ _ => (
+ if min == 0 { State::Ok } else { State::Nok },
+ Some(vec![code]),
+ ),
+ }
+}
+
+/// In whitespace.
+///
+/// ```markdown
+/// alpha |bravo
+/// alpha | bravo
+/// ```
+fn inside(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ kind: TokenType,
+ min: usize,
+ max: usize,
+ size: usize,
+) -> StateFnResult {
+ match code {
+ Code::VirtualSpace | Code::Char('\t' | ' ') if size < max => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ inside(tokenizer, code, kind, min, max, size + 1)
+ })),
+ None,
+ )
+ }
+ _ => {
+ tokenizer.exit(kind);
+ (
+ if size >= min { State::Ok } else { State::Nok },
+ Some(vec![code]),
+ )
+ }
+ }
+}
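
The helpers added above replace the ad-hoc whitespace states deleted elsewhere in this diff. A rough, self-contained sketch of the min/max bound they enforce is shown below; it operates on a plain `&str` instead of the tokenizer's `Code` stream, ignores tab expansion into `VirtualSpace`, and uses illustrative names only (it is not the crate's API).

```rust
/// Sketch: consume up to `max` leading spaces/tabs; succeed only if at least
/// `min` were found, returning the count and the remaining input.
fn space_or_tab_min_max(input: &str, min: usize, max: usize) -> Option<(usize, &str)> {
    let mut size = 0;
    let mut rest = input;
    while size < max {
        match rest.chars().next() {
            Some(c) if c == ' ' || c == '\t' => {
                size += 1;
                rest = &rest[c.len_utf8()..];
            }
            _ => break,
        }
    }
    if size >= min {
        Some((size, rest)) // cf. `State::Ok`
    } else {
        None // cf. `State::Nok`
    }
}

fn main() {
    // Indented code wants an exact TAB_SIZE (4) prefix:
    assert_eq!(space_or_tab_min_max("    code", 4, 4), Some((4, "code")));
    // `space_or_tab_opt()` is the min 0, max `usize::MAX` case:
    assert_eq!(space_or_tab_min_max("***", 0, usize::MAX), Some((0, "***")));
    // Too little indentation fails:
    assert_eq!(space_or_tab_min_max("  ~~~", 4, 4), None);
}
```
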
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index 19ba8d4..0669c8e 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -33,7 +33,7 @@
// To do: pass token types in.
-use crate::construct::partial_whitespace::start as whitespace;
+use crate::construct::partial_space_or_tab::space_or_tab_opt;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// Type of title.
@@ -143,10 +143,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult
/// |b"
/// ```
fn line_start(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult {
- tokenizer.attempt(
- |t, c| whitespace(t, c, TokenType::Whitespace),
- |_ok| Box::new(|t, c| line_begin(t, c, kind)),
- )(tokenizer, code)
+ tokenizer.go(space_or_tab_opt(), |t, c| line_begin(t, c, kind))(tokenizer, code)
}
/// After a line ending, after optional whitespace.
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
deleted file mode 100644
index b8cf9a7..0000000
--- a/src/construct/partial_whitespace.rs
+++ /dev/null
@@ -1,64 +0,0 @@
-//! A little helper to parse `space_or_tab`
-//!
-//! They’re formed with the following BNF:
-//!
-//! ```bnf
-//! space_or_tab ::= 1*(' ' '\t')
-//! ```
-//!
-//! Depending on where whitespace can occur, it can be optional (or not),
-//! and present in the rendered result (or not).
-//!
-//! ## References
-//!
-//! * [`micromark-factory-space/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-space/dev/index.js)
-
-use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
-
-// To do: should `token_type` be a `Some`, with `None` defaulting to something?
-// To do: should `max: Some(usize)` be added?
-
-/// Before whitespace.
-///
-/// ```markdown
-/// alpha| bravo
-/// ```
-pub fn start(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
- match code {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- // To do: lifetimes.
- let clone = token_type.clone();
- tokenizer.enter(token_type);
- tokenizer.consume(code);
- (
- State::Fn(Box::new(|tokenizer, code| inside(tokenizer, code, clone))),
- None,
- )
- }
- _ => (State::Nok, None),
- }
-}
-
-/// In whitespace.
-///
-/// ```markdown
-/// alpha |bravo
-/// alpha | bravo
-/// ```
-fn inside(tokenizer: &mut Tokenizer, code: Code, token_type: TokenType) -> StateFnResult {
- match code {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.consume(code);
- (
- State::Fn(Box::new(|tokenizer, code| {
- inside(tokenizer, code, token_type)
- })),
- None,
- )
- }
- _ => {
- tokenizer.exit(token_type);
- (State::Ok, Some(vec![code]))
- }
- }
-}
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index bc41991..abf733d 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -44,6 +44,7 @@
//!
//! <!-- To do: link `lists` -->
+use super::partial_space_or_tab::space_or_tab_opt;
use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -53,9 +54,18 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// |***
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.enter(TokenType::ThematicBreak);
+ tokenizer.go(space_or_tab_opt(), before)(tokenizer, code)
+}
+
+/// Start of a thematic break, after whitespace.
+///
+/// ```markdown
+/// |***
+/// ```
+pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::Char(char) if char == '*' || char == '-' || char == '_' => {
- tokenizer.enter(TokenType::ThematicBreak);
at_break(tokenizer, code, char, 0)
}
_ => (State::Nok, None),
@@ -71,20 +81,16 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn at_break(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
match code {
- Code::Char(char) if char == marker => {
- tokenizer.enter(TokenType::ThematicBreakSequence);
- sequence(tokenizer, code, marker, size)
- }
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.enter(TokenType::ThematicBreakWhitespace);
- whitespace(tokenizer, code, marker, size)
- }
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
if size >= THEMATIC_BREAK_MARKER_COUNT_MIN =>
{
tokenizer.exit(TokenType::ThematicBreak);
(State::Ok, Some(vec![code]))
}
+ Code::Char(char) if char == marker => {
+ tokenizer.enter(TokenType::ThematicBreakSequence);
+ sequence(tokenizer, code, marker, size)
+ }
_ => (State::Nok, None),
}
}
@@ -109,31 +115,9 @@ fn sequence(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) ->
}
_ => {
tokenizer.exit(TokenType::ThematicBreakSequence);
- at_break(tokenizer, code, marker, size)
- }
- }
-}
-
-/// In whitespace.
-///
-/// ```markdown
-/// * |* *
-/// * | * *
-/// ```
-fn whitespace(tokenizer: &mut Tokenizer, code: Code, marker: char, size: usize) -> StateFnResult {
- match code {
- Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.consume(code);
- (
- State::Fn(Box::new(move |tokenizer, code| {
- whitespace(tokenizer, code, marker, size)
- })),
- None,
+ tokenizer.go(space_or_tab_opt(), move |t, c| at_break(t, c, marker, size))(
+ tokenizer, code,
)
}
- _ => {
- tokenizer.exit(TokenType::ThematicBreakWhitespace);
- at_break(tokenizer, code, marker, size)
- }
}
}
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 22aa77f..f4af4ea 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -24,7 +24,7 @@ use crate::construct::{
code_indented::start as code_indented, definition::start as definition,
heading_atx::start as heading_atx, heading_setext::start as heading_setext,
html_flow::start as html_flow, paragraph::start as paragraph,
- partial_whitespace::start as whitespace, thematic_break::start as thematic_break,
+ thematic_break::start as thematic_break,
};
use crate::subtokenize::subtokenize;
use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
@@ -95,9 +95,16 @@ fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
// To do: should all flow just start before the prefix?
- _ => tokenizer.attempt_3(code_indented, code_fenced, html_flow, |ok| {
- Box::new(if ok { after } else { before })
- })(tokenizer, code),
+ _ => tokenizer.attempt_7(
+ code_indented,
+ code_fenced,
+ html_flow,
+ heading_atx,
+ thematic_break,
+ definition,
+ heading_setext,
+ |ok| Box::new(if ok { after } else { before_paragraph }),
+ )(tokenizer, code),
}
}
@@ -123,36 +130,6 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
-/// Before flow, but not at code (indented) or code (fenced).
-///
-/// Compared to flow (initial), normal flow can be arbitrarily prefixed.
-///
-/// ```markdown
-/// |qwe
-/// ```
-fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt(
- |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
- |_ok| Box::new(before_after_prefix),
- )(tokenizer, code)
-}
-
-/// Before flow, after potential whitespace.
-///
-/// ```markdown
-/// |# asd
-/// |***
-/// ```
-fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt_4(
- heading_atx,
- thematic_break,
- definition,
- heading_setext,
- |ok| Box::new(if ok { after } else { before_paragraph }),
- )(tokenizer, code)
-}
-
/// Before a paragraph.
///
/// ```markdown
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index c1bb61b..de27d12 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -25,7 +25,6 @@ pub enum TokenType {
AutolinkProtocol,
AutolinkEmail,
BlankLineEnding,
- BlankLineWhitespace,
CharacterEscape,
CharacterEscapeMarker,
CharacterEscapeValue,
@@ -38,12 +37,10 @@ pub enum TokenType {
CodeFenced,
CodeFencedFence,
CodeFencedFenceSequence,
- CodeFencedFenceWhitespace,
CodeFencedFenceInfo,
CodeFencedFenceMeta,
CodeFlowChunk,
CodeIndented,
- CodeIndentedPrefixWhitespace,
CodeText,
CodeTextSequence,
CodeTextLineEnding,
@@ -81,7 +78,6 @@ pub enum TokenType {
Paragraph,
ThematicBreak,
ThematicBreakSequence,
- ThematicBreakWhitespace,
Whitespace,
// Chunks are tokenizer, but unraveled by `subtokenize`.
@@ -114,7 +110,7 @@ pub struct Point {
/// 1-indexed line number.
pub line: usize,
/// 1-indexed column number.
- /// Note that this is increases up to a tab stop for tabs.
+ /// This is increases up to a tab stop for tabs.
/// Some editors count tabs as 1 character, so this position is not always
/// the same as editors.
pub column: usize,
@@ -485,32 +481,14 @@ impl Tokenizer {
)
}
- pub fn attempt_3(
- &mut self,
- a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
- b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
- c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
- done: impl FnOnce(bool) -> Box<StateFn> + 'static,
- ) -> Box<StateFn> {
- self.call_multiple(
- false,
- Some(Box::new(a)),
- Some(Box::new(b)),
- Some(Box::new(c)),
- None,
- None,
- None,
- None,
- done,
- )
- }
-
- pub fn attempt_4(
+ #[allow(clippy::too_many_arguments, clippy::many_single_char_names)]
+ pub fn attempt_5(
&mut self,
a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ e: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
done: impl FnOnce(bool) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
self.call_multiple(
@@ -519,7 +497,7 @@ impl Tokenizer {
Some(Box::new(b)),
Some(Box::new(c)),
Some(Box::new(d)),
- None,
+ Some(Box::new(e)),
None,
None,
done,
diff --git a/tests/autolink.rs b/tests/autolink.rs
index 9d394d7..51873ed 100644
--- a/tests/autolink.rs
+++ b/tests/autolink.rs
@@ -7,7 +7,7 @@ const DANGER: &CompileOptions = &CompileOptions {
};
#[test]
-fn autolink() {
+fn code_fenced() {
assert_eq!(
micromark("```\n<\n >\n```"),
"<pre><code>&lt;\n &gt;\n</code></pre>",