aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-16 19:04:16 +0200
committerLibravatar Titus Wormer <tituswormer@gmail.com>2022-06-16 19:04:16 +0200
commit60ea2fd3a09f10fa28bf48575736b47afebf3221 (patch)
treef7aae5cec9181f7ff5df23e648fe1da22a94209f
parentef14d6581848ba5052d3389bb61fc96645551eef (diff)
downloadmarkdown-rs-60ea2fd3a09f10fa28bf48575736b47afebf3221.tar.gz
markdown-rs-60ea2fd3a09f10fa28bf48575736b47afebf3221.tar.bz2
markdown-rs-60ea2fd3a09f10fa28bf48575736b47afebf3221.zip
Add heading (setext)
-rw-r--r--readme.md8
-rw-r--r--src/compiler.rs126
-rw-r--r--src/construct/heading_atx.rs32
-rw-r--r--src/construct/heading_setext.rs301
-rw-r--r--src/construct/mod.rs3
-rw-r--r--src/construct/thematic_break.rs5
-rw-r--r--src/content/content.rs22
-rw-r--r--src/content/flow.rs19
-rw-r--r--src/tokenizer.rs11
-rw-r--r--tests/character_escape.rs2
-rw-r--r--tests/code_fenced.rs11
-rw-r--r--tests/code_indented.rs11
-rw-r--r--tests/heading_setext.rs279
-rw-r--r--tests/thematic_break.rs11
14 files changed, 727 insertions, 114 deletions
diff --git a/readme.md b/readme.md
index 0cbff1e..20ce174 100644
--- a/readme.md
+++ b/readme.md
@@ -46,8 +46,6 @@ cargo doc --document-private-items
### Some major obstacles
-- [ ] (1) Setext headings: can they be solved in content, or do they have to be
- solved in flow somehow
- [ ] (8) Can content (and to a lesser extent string and text) operate more
performantly than checking whether other flow constructs start a line,
before exiting and actually attempting flow constructs?
@@ -114,7 +112,7 @@ cargo doc --document-private-items
- [x] hard break (escape)
- [x] hard break (trailing)
- [x] heading (atx)
-- [ ] (1) heading (setext)
+- [x] heading (setext)
- [x] html (flow)
- [x] html (text)
- [ ] (3) label end
@@ -135,11 +133,11 @@ cargo doc --document-private-items
- [x] code (indented)
- [x] content
- [x] heading (atx)
+ - [x] heading (setext)
- [x] html (flow)
- [x] thematic break
- [ ] (3) content
- [ ] definition
- - [ ] heading (setext)
- [x] paragraph
- [ ] (5) text
- [ ] attention (strong, emphasis) (text)
@@ -169,6 +167,8 @@ cargo doc --document-private-items
- [x] (1) Add examples to `CompileOptions` docs
- [x] (3) Fix deep subtokenization
- [x] (1) text in heading
+- [x] (1) Setext headings: can they be solved in content, or do they have to be
+ solved in flow somehow
### Extensions
diff --git a/src/compiler.rs b/src/compiler.rs
index 50c06e1..9941fa5 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -5,7 +5,7 @@ use crate::util::{
decode_character_reference::{decode_named, decode_numeric},
encode::encode,
sanitize_uri::sanitize_uri,
- span::{from_exit_event, serialize},
+ span::{codes as codes_from_span, from_exit_event, serialize},
};
/// Configuration (optional).
@@ -78,6 +78,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
let buffers: &mut Vec<Vec<String>> = &mut vec![vec![]];
let mut atx_opening_sequence_size: Option<usize> = None;
let mut atx_heading_buffer: Option<String> = None;
+ let mut heading_setext_buffer: Option<String> = None;
let mut code_flow_seen_data: Option<bool> = None;
let mut code_fenced_fences_count: Option<usize> = None;
let mut slurp_one_line_ending = false;
@@ -102,10 +103,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
match event.event_type {
EventType::Enter => match token_type {
- TokenType::AtxHeading
- | TokenType::AtxHeadingSequence
- | TokenType::AtxHeadingWhitespace
- | TokenType::Autolink
+ TokenType::Autolink
| TokenType::AutolinkEmail
| TokenType::AutolinkMarker
| TokenType::AutolinkProtocol
@@ -134,6 +132,11 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::HardBreakEscapeMarker
| TokenType::HardBreakTrailing
| TokenType::HardBreakTrailingSpace
+ | TokenType::HeadingAtx
+ | TokenType::HeadingAtxSequence
+ | TokenType::HeadingAtxWhitespace
+ | TokenType::HeadingSetext
+ | TokenType::HeadingSetextUnderline
| TokenType::HtmlFlowData
| TokenType::HtmlTextData
| TokenType::LineEnding
@@ -143,9 +146,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::Whitespace => {
// Ignore.
}
- TokenType::AtxHeadingText
- | TokenType::CodeFencedFenceInfo
- | TokenType::CodeFencedFenceMeta => {
+ TokenType::CodeFencedFenceInfo
+ | TokenType::CodeFencedFenceMeta
+ | TokenType::HeadingAtxText
+ | TokenType::HeadingSetextText => {
buffer(buffers);
}
TokenType::CodeIndented => {
@@ -199,6 +203,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::Content
| TokenType::HardBreakEscapeMarker
| TokenType::HardBreakTrailingSpace
+ | TokenType::HeadingSetext
| TokenType::ThematicBreakSequence
| TokenType::ThematicBreakWhitespace
| TokenType::Whitespace => {
@@ -213,52 +218,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
false,
)));
}
- TokenType::AtxHeading => {
- let rank = atx_opening_sequence_size
- .expect("`atx_opening_sequence_size` must be set in headings");
- buf_tail_mut(buffers).push(format!("</h{}>", rank));
- atx_opening_sequence_size = None;
- atx_heading_buffer = None;
- }
- // `AtxHeadingWhitespace` is ignored after the opening sequence,
- // before the closing sequence, and after the closing sequence.
- // But it is used around intermediate sequences.
- // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`.
- // `AtxHeadingSequence` is ignored as the opening and closing sequence,
- // but not when intermediate.
- TokenType::AtxHeadingSequence | TokenType::AtxHeadingWhitespace => {
- if let Some(buf) = atx_heading_buffer {
- atx_heading_buffer = Some(
- buf.to_string()
- + &encode(&serialize(
- codes,
- &from_exit_event(events, index),
- false,
- )),
- );
- }
-
- // First fence we see.
- if None == atx_opening_sequence_size {
- let rank = serialize(codes, &from_exit_event(events, index), false).len();
- atx_opening_sequence_size = Some(rank);
- buf_tail_mut(buffers).push(format!("<h{}>", rank));
- }
- }
- TokenType::AtxHeadingText => {
- let result = resume(buffers);
-
- if let Some(ref buf) = atx_heading_buffer {
- if !buf.is_empty() {
- buf_tail_mut(buffers).push(encode(buf));
- atx_heading_buffer = Some("".to_string());
- }
- } else {
- atx_heading_buffer = Some("".to_string());
- }
-
- buf_tail_mut(buffers).push(encode(&result));
- }
TokenType::AutolinkEmail => {
let slice = serialize(codes, &from_exit_event(events, index), false);
let buf = buf_tail_mut(buffers);
@@ -394,11 +353,68 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
TokenType::CodeTextLineEnding => {
buf_tail_mut(buffers).push(" ".to_string());
}
-
TokenType::HardBreakEscape | TokenType::HardBreakTrailing => {
buf_tail_mut(buffers).push("<br />".to_string());
}
+ TokenType::HeadingAtx => {
+ let rank = atx_opening_sequence_size
+ .expect("`atx_opening_sequence_size` must be set in headings");
+ buf_tail_mut(buffers).push(format!("</h{}>", rank));
+ atx_opening_sequence_size = None;
+ atx_heading_buffer = None;
+ }
+ // `HeadingAtxWhitespace` is ignored after the opening sequence,
+ // before the closing sequence, and after the closing sequence.
+ // But it is used around intermediate sequences.
+ // `atx_heading_buffer` is set to `Some` by the first `HeadingAtxText`.
+ // `HeadingAtxSequence` is ignored as the opening and closing sequence,
+ // but not when intermediate.
+ TokenType::HeadingAtxSequence | TokenType::HeadingAtxWhitespace => {
+ if let Some(buf) = atx_heading_buffer {
+ atx_heading_buffer = Some(
+ buf.to_string()
+ + &encode(&serialize(
+ codes,
+ &from_exit_event(events, index),
+ false,
+ )),
+ );
+ }
+
+ // First fence we see.
+ if None == atx_opening_sequence_size {
+ let rank = serialize(codes, &from_exit_event(events, index), false).len();
+ atx_opening_sequence_size = Some(rank);
+ buf_tail_mut(buffers).push(format!("<h{}>", rank));
+ }
+ }
+ TokenType::HeadingAtxText => {
+ let result = resume(buffers);
+ if let Some(ref buf) = atx_heading_buffer {
+ if !buf.is_empty() {
+ buf_tail_mut(buffers).push(encode(buf));
+ atx_heading_buffer = Some("".to_string());
+ }
+ } else {
+ atx_heading_buffer = Some("".to_string());
+ }
+
+ buf_tail_mut(buffers).push(encode(&result));
+ }
+ TokenType::HeadingSetextText => {
+ heading_setext_buffer = Some(resume(buffers));
+ slurp_one_line_ending = true;
+ }
+ TokenType::HeadingSetextUnderline => {
+ let text = heading_setext_buffer
+ .expect("`atx_opening_sequence_size` must be set in headings");
+ let head = codes_from_span(codes, &from_exit_event(events, index))[0];
+ let level: usize = if head == Code::Char('-') { 2 } else { 1 };
+
+ heading_setext_buffer = None;
+ buf_tail_mut(buffers).push(format!("<h{}>{}</h{}>", level, text, level));
+ }
TokenType::HtmlFlow | TokenType::HtmlText => {
ignore_encode = false;
}
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 1a9ed03..3ff6fea 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -18,9 +18,11 @@
//! In older markdown versions, this was not required, and headings would form
//! without it.
//!
-//! In markdown, it is also possible to create headings with the setext heading
-//! construct.
-//! The benefit of setext headings is that their text can include line endings.
+//! In markdown, it is also possible to create headings with a
+//! [heading (setext)][heading_setext] construct.
+//! The benefit of setext headings is that their text can include line endings,
+//! and by extension also hard breaks (e.g., with
+//! [hard break (escape)][hard_break_escape]).
//! However, their limit is that they cannot form `<h3>` through `<h6>`
//! headings.
//! Due to this limitation, it is recommended to use atx headings.
@@ -39,11 +41,11 @@
//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings)
//!
//! [flow]: crate::content::flow
+//! [heading_setext]: crate::construct::heading_setext
+//! [hard_break_escape]: crate::construct::hard_break_escape
//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
//! [atx]: http://www.aaronsw.com/2002/atx/
-//!
-//! <!-- To do: link `setext` -->
use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -55,8 +57,8 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
if Code::Char('#') == code {
- tokenizer.enter(TokenType::AtxHeading);
- tokenizer.enter(TokenType::AtxHeadingSequence);
+ tokenizer.enter(TokenType::HeadingAtx);
+ tokenizer.enter(TokenType::HeadingAtxSequence);
sequence_open(tokenizer, code, 0)
} else {
(State::Nok, None)
@@ -76,7 +78,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR
| Code::Char('\t' | '\n' | '\r' | ' ')
if rank > 0 =>
{
- tokenizer.exit(TokenType::AtxHeadingSequence);
+ tokenizer.exit(TokenType::HeadingAtxSequence);
at_break(tokenizer, code)
}
Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
@@ -104,19 +106,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR
fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.exit(TokenType::AtxHeading);
+ tokenizer.exit(TokenType::HeadingAtx);
(State::Ok, Some(vec![code]))
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.enter(TokenType::AtxHeadingWhitespace);
+ tokenizer.enter(TokenType::HeadingAtxWhitespace);
whitespace(tokenizer, code)
}
Code::Char('#') => {
- tokenizer.enter(TokenType::AtxHeadingSequence);
+ tokenizer.enter(TokenType::HeadingAtxSequence);
further_sequence(tokenizer, code)
}
Code::Char(_) => {
- tokenizer.enter(TokenType::AtxHeadingText);
+ tokenizer.enter(TokenType::HeadingAtxText);
tokenizer.enter(TokenType::ChunkText);
data(tokenizer, code)
}
@@ -134,7 +136,7 @@ fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.consume(code);
(State::Fn(Box::new(further_sequence)), None)
} else {
- tokenizer.exit(TokenType::AtxHeadingSequence);
+ tokenizer.exit(TokenType::HeadingAtxSequence);
at_break(tokenizer, code)
}
}
@@ -151,7 +153,7 @@ fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
(State::Fn(Box::new(whitespace)), None)
}
_ => {
- tokenizer.exit(TokenType::AtxHeadingWhitespace);
+ tokenizer.exit(TokenType::HeadingAtxWhitespace);
at_break(tokenizer, code)
}
}
@@ -167,7 +169,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
// Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
tokenizer.exit(TokenType::ChunkText);
- tokenizer.exit(TokenType::AtxHeadingText);
+ tokenizer.exit(TokenType::HeadingAtxText);
at_break(tokenizer, code)
}
_ => {
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
new file mode 100644
index 0000000..8cc4f6d
--- /dev/null
+++ b/src/construct/heading_setext.rs
@@ -0,0 +1,301 @@
+//! Heading (setext) is a construct that occurs in the [flow] content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! heading_setext ::= line *(eol line) eol whitespace_optional (1*'-' | 1*'=') whitespace_optional
+//!
+//! whitespace ::= 1*space_or_tab
+//! whitespace_optional ::= [ whitespace ]
+//! line ::= code - eol
+//! eol ::= '\r' | '\r\n' | '\n'
+//! ```
+//!
+//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in
+//! HTML.
+//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
+//! HTML spec][html] for more info.
+//!
+//! In markdown, it is also possible to create headings with a
+//! [heading (atx)][heading_atx] construct.
+//! The benefit of setext headings is that their text can include line endings,
+//! and by extension also hard breaks (e.g., with
+//! [hard break (escape)][hard_break_escape]).
+//! However, their limit is that they cannot form `<h3>` through `<h6>`
+//! headings.
+//! Due to this limitation, it is recommended to use atx headings.
+//!
+//! [Thematic breaks][thematic_break] formed with dashes (without whitespace)
+//! can also form heading (setext).
+//!
+//! > 🏛 **Background**: the word *setext* originates from a small markup
+//! > language by Ian Feldman from 1991.
+//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info.
+//! > The word *atx* originates from a tiny markup language by Aaron Swartz
+//! > from 2002.
+//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
+//! > more info.
+//!
+//! ## References
+//!
+//! * [`setext-underline.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/setext-underline.js)
+//! * [*§ 4.3 Setext headings* in `CommonMark`](https://spec.commonmark.org/0.30/#setext-headings)
+//!
+//! [flow]: crate::content::flow
+//! [heading_atx]: crate::construct::heading_atx
+//! [thematic_break]: crate::construct::thematic_break
+//! [hard_break_escape]: crate::construct::hard_break_escape
+//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
+//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
+//! [atx]: http://www.aaronsw.com/2002/atx/
+
+use crate::constant::TAB_SIZE;
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::span::from_exit_event;
+
+/// Kind of underline.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Kind {
+ /// Dash (`-`) code.
+ Dash,
+ /// Equals to (`=`) code.
+ EqualsTo,
+}
+
+/// Start of a heading (setext).
+///
+/// ```markdown
+/// |alpha
+/// ==
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ unreachable!("expected non-eol/eof");
+ }
+ _ => {
+ tokenizer.enter(TokenType::HeadingSetext);
+ tokenizer.enter(TokenType::HeadingSetextText);
+ tokenizer.enter(TokenType::ChunkText);
+ text_inside(tokenizer, code)
+ }
+ }
+}
+
+/// Inside text.
+///
+/// ```markdown
+/// al|pha
+/// bra|vo
+/// ==
+/// ```
+pub fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::ChunkText);
+ tokenizer.exit(TokenType::HeadingSetextText);
+ tokenizer.attempt(underline_before, |ok| {
+ Box::new(if ok { after } else { text_continue })
+ })(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(text_inside)), None)
+ }
+ }
+}
+
+/// At a line ending, not at an underline.
+///
+/// ```markdown
+/// alpha
+/// |bravo
+/// ==
+/// ```
+fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // Needed to connect the text.
+ // To do: does it work?
+ tokenizer.enter(TokenType::HeadingSetextText);
+ tokenizer.events.pop();
+ tokenizer.events.pop();
+
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ let next = tokenizer.events.len();
+ let previous = next - 2;
+
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+
+ tokenizer.events[previous].next = Some(next);
+ tokenizer.events[next].previous = Some(previous);
+
+ (
+ State::Fn(Box::new(tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(text_line_start),
+ ))),
+ None,
+ )
+ }
+ _ => unreachable!("expected eol"),
+ }
+}
+
+/// At a line ending after whitespace, not at an underline.
+///
+/// ```markdown
+/// alpha
+/// |bravo
+/// ==
+/// ```
+fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let next = tokenizer.events.len() - 2;
+ let previous = next - 2;
+
+ // Link the whitespace, if it exists.
+ if tokenizer.events[next].token_type == TokenType::Whitespace {
+ tokenizer.events[previous].next = Some(next);
+ tokenizer.events[next].previous = Some(previous);
+ }
+
+ match code {
+ // Blank lines not allowed.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
+ _ => {
+ let next = tokenizer.events.len();
+ let previous = next - 2;
+
+ tokenizer.enter(TokenType::ChunkText);
+
+ tokenizer.events[previous].next = Some(next);
+ tokenizer.events[next].previous = Some(previous);
+
+ text_inside(tokenizer, code)
+ }
+ }
+}
+
+/// After a heading (setext).
+///
+/// ```markdown
+/// alpha
+/// ==|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.exit(TokenType::HeadingSetext);
+ (State::Ok, Some(vec![code]))
+}
+
+/// At a line ending, presumably an underline.
+///
+/// ```markdown
+/// alpha|
+/// ==
+/// ```
+fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(underline_start)), None)
+ }
+ _ => unreachable!("expected eol"),
+ }
+}
+
+/// After a line ending, presumably an underline.
+///
+/// ```markdown
+/// alpha
+/// |==
+/// ```
+fn underline_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(underline_sequence_start),
+ )(tokenizer, code)
+}
+
+/// After optional whitespace, presumably an underline.
+///
+/// ```markdown
+/// alpha
+/// |==
+/// ```
+fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let tail = tokenizer.events.last();
+ let mut prefix = 0;
+
+ if let Some(event) = tail {
+ if event.token_type == TokenType::Whitespace {
+ let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
+ prefix = span.end_index - span.start_index;
+ }
+ }
+
+ // To do: 4+ should be okay if code (indented) is turned off!
+ if prefix >= TAB_SIZE {
+ return (State::Nok, None);
+ }
+
+ match code {
+ Code::Char(char) if char == '-' || char == '=' => {
+ let marker = if char == '-' {
+ Kind::Dash
+ } else {
+ Kind::EqualsTo
+ };
+ tokenizer.enter(TokenType::HeadingSetextUnderline);
+ underline_sequence_inside(tokenizer, code, marker)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In an underline sequence.
+///
+/// ```markdown
+/// alpha
+/// =|=
+/// ```
+fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult {
+ let marker = if kind == Kind::Dash { '-' } else { '=' };
+
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ underline_sequence_inside(tokenizer, code, kind)
+ })),
+ None,
+ )
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(underline_after),
+ )(tokenizer, code),
+ _ => underline_after(tokenizer, code),
+ }
+}
+
+/// After an underline sequence, after optional whitespace.
+///
+/// ```markdown
+/// alpha
+/// ==|
+/// ```
+fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::HeadingSetextUnderline);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 880d055..ca1149f 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -29,7 +29,7 @@
//! * [hard break (escape)][hard_break_escape]
//! * [hard break (trailing)][hard_break_trailing]
//! * [heading (atx)][heading_atx]
-//! * heading (setext)
+//! * [heading (setext)][heading_setext]
//! * [html (flow)][html_flow]
//! * [html (text)][html_text]
//! * label end
@@ -64,6 +64,7 @@ pub mod code_text;
pub mod hard_break_escape;
pub mod hard_break_trailing;
pub mod heading_atx;
+pub mod heading_setext;
pub mod html_flow;
pub mod html_text;
pub mod partial_whitespace;
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 7a4f71a..bc41991 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -24,7 +24,7 @@
//! For these reasons, it is recommend to not use spaces or tabs between the
//! markers.
//! Thematic breaks formed with dashes (without whitespace) can also form
-//! setext headings.
+//! [heading (setext)][heading_setext].
//! As dashes and underscores frequently occur in natural language and URLs, it
//! is recommended to use asterisks for thematic breaks to distinguish from
//! such use.
@@ -39,9 +39,10 @@
//! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks)
//!
//! [flow]: crate::content::flow
+//! [heading_setext]: crate::construct::heading_setext
//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element
//!
-//! <!-- To do: link `lists`, `setext heading` -->
+//! <!-- To do: link `lists` -->
use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
diff --git a/src/content/content.rs b/src/content/content.rs
index 4660fbe..4ca69ee 100644
--- a/src/content/content.rs
+++ b/src/content/content.rs
@@ -27,7 +27,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
unreachable!("expected non-eol/eof");
}
- _ => paragraph_initial(tokenizer, code)
+ _ => after_definitions(tokenizer, code)
// To do: definition.
// _ => tokenizer.attempt(definition, |ok| {
// Box::new(if ok {
@@ -44,10 +44,26 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```markdown
/// |asd
/// ```
+fn after_definitions(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ unreachable!("to do: handle eol after definition");
+ }
+ _ => paragraph_initial(tokenizer, code),
+ }
+}
+
+/// Before a paragraph.
+///
+/// ```markdown
+/// |asd
+/// ```
fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- unreachable!("expected non-eol/eof");
+ Code::None => (State::Ok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ unreachable!("to do: handle eol after definition");
}
_ => {
tokenizer.enter(TokenType::Paragraph);
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 4d2ece1..d7509d7 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -14,17 +14,18 @@
//! * [Code (fenced)][crate::construct::code_fenced]
//! * [Code (indented)][crate::construct::code_indented]
//! * [Heading (atx)][crate::construct::heading_atx]
+//! * [Heading (setext)][crate::construct::heading_setext]
//! * [HTML (flow)][crate::construct::html_flow]
//! * [Thematic break][crate::construct::thematic_break]
//!
-//! <!-- To do: `setext` in content? Link to content. -->
+//! <!-- To do: Link to content. -->
use crate::constant::TAB_SIZE;
use crate::construct::{
blank_line::start as blank_line, code_fenced::start as code_fenced,
code_indented::start as code_indented, heading_atx::start as heading_atx,
- html_flow::start as html_flow, partial_whitespace::start as whitespace,
- thematic_break::start as thematic_break,
+ heading_setext::start as heading_setext, html_flow::start as html_flow,
+ partial_whitespace::start as whitespace, thematic_break::start as thematic_break,
};
use crate::subtokenize::subtokenize;
use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
@@ -144,24 +145,20 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// |***
/// ```
pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
+ tokenizer.attempt_3(heading_atx, thematic_break, heading_setext, |ok| {
Box::new(if ok { after } else { content_before })
})(tokenizer, code)
}
-/// Before flow, but not before a heading (atx) or thematic break.
-///
-/// At this point, we’re at content (zero or more definitions and zero or one
-/// paragraph/setext heading).
+/// Before content.
///
/// ```markdown
/// |qwe
/// ```
-// To do: currently only parses a single line.
+///
// To do:
// - Multiline
// - One or more definitions.
-// - Setext heading.
fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
@@ -174,12 +171,12 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
}
+
/// In content.
///
/// ```markdown
/// al|pha
/// ```
-// To do: lift limitations as documented above.
fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult {
match code {
Code::None => content_end(tokenizer, code),
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 0aae480..fc9e177 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -24,10 +24,6 @@ pub enum TokenType {
AutolinkMarker,
AutolinkProtocol,
AutolinkEmail,
- AtxHeading,
- AtxHeadingSequence,
- AtxHeadingWhitespace,
- AtxHeadingText,
BlankLineEnding,
BlankLineWhitespace,
CharacterEscape,
@@ -58,6 +54,13 @@ pub enum TokenType {
HardBreakEscapeMarker,
HardBreakTrailing,
HardBreakTrailingSpace,
+ HeadingAtx,
+ HeadingAtxSequence,
+ HeadingAtxWhitespace,
+ HeadingAtxText,
+ HeadingSetext,
+ HeadingSetextText,
+ HeadingSetextUnderline,
HtmlFlow,
HtmlFlowData,
HtmlText,
diff --git a/tests/character_escape.rs b/tests/character_escape.rs
index c81760d..9e2a5c8 100644
--- a/tests/character_escape.rs
+++ b/tests/character_escape.rs
@@ -24,7 +24,7 @@ fn character_escape() {
assert_eq!(
micromark(
- "\\*not emphasized*\n\\<br/> not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\&ouml; not a character entity"
+ "\\*not emphasized*\n\\<br/> not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\&ouml; not a character entity"
),
"<p>*not emphasized*\n&lt;br/&gt; not a tag\n[not a link](/foo)\n`not code`\n1. not a list\n* not a list\n# not a heading\n[foo]: /url &quot;not a reference&quot;\n&amp;ouml; not a character entity</p>",
"should escape other constructs"
diff --git a/tests/code_fenced.rs b/tests/code_fenced.rs
index 82ac088..0e19637 100644
--- a/tests/code_fenced.rs
+++ b/tests/code_fenced.rs
@@ -136,12 +136,11 @@ fn code_fenced() {
"should support interrupting paragraphs"
);
- // To do: setext.
- // assert_eq!(
- // micromark("foo\n---\n~~~\nbar\n~~~\n# baz"),
- // "<h2>foo</h2>\n<pre><code>bar\n</code></pre>\n<h1>baz</h1>",
- // "should support interrupting other content"
- // );
+ assert_eq!(
+ micromark("foo\n---\n~~~\nbar\n~~~\n# baz"),
+ "<h2>foo</h2>\n<pre><code>bar\n</code></pre>\n<h1>baz</h1>",
+ "should support interrupting other content"
+ );
assert_eq!(
micromark("```ruby\ndef foo(x)\n return 3\nend\n```"),
diff --git a/tests/code_indented.rs b/tests/code_indented.rs
index f21d761..a7afb21 100644
--- a/tests/code_indented.rs
+++ b/tests/code_indented.rs
@@ -53,12 +53,11 @@ fn code_indented() {
"should support paragraphs directly after indented code"
);
- // To do: setext.
- // assert_eq!(
- // micromark("# Heading\n foo\nHeading\n------\n foo\n----"),
- // "<h1>Heading</h1>\n<pre><code>foo\n</code></pre>\n<h2>Heading</h2>\n<pre><code>foo\n</code></pre>\n<hr />",
- // "should mix w/ other content"
- // );
+ assert_eq!(
+ micromark("# Heading\n foo\nHeading\n------\n foo\n----"),
+ "<h1>Heading</h1>\n<pre><code>foo\n</code></pre>\n<h2>Heading</h2>\n<pre><code>foo\n</code></pre>\n<hr />",
+ "should mix w/ other content"
+ );
assert_eq!(
micromark(" foo\n bar"),
diff --git a/tests/heading_setext.rs b/tests/heading_setext.rs
new file mode 100644
index 0000000..92a5b43
--- /dev/null
+++ b/tests/heading_setext.rs
@@ -0,0 +1,279 @@
+extern crate micromark;
+use micromark::micromark;
+
+#[test]
+fn heading_setext() {
+ // To do: emphasis.
+ // assert_eq!(
+ // micromark("Foo *bar*\n========="),
+ // "<h1>Foo <em>bar</em></h1>",
+ // "should support a heading w/ an equals to (rank of 1)"
+ // );
+
+ // To do: emphasis.
+ // assert_eq!(
+ // micromark("Foo *bar*\n---------"),
+ // "<h2>Foo <em>bar</em></h2>",
+ // "should support a heading w/ a dash (rank of 2)"
+ // );
+
+ // To do: emphasis.
+ // assert_eq!(
+ // micromark("Foo *bar\nbaz*\n===="),
+ // "<h1>Foo <em>bar\nbaz</em></h1>",
+ // "should support line endings in setext headings"
+ // );
+
+ // To do: emphasis, trim.
+ // assert_eq!(
+ // micromark(" Foo *bar\nbaz*\t\n===="),
+ // "<h1>Foo <em>bar\nbaz</em></h1>",
+ // "should not include initial and final whitespace around content"
+ // );
+
+ assert_eq!(
+ micromark("Foo\n-------------------------"),
+ "<h2>Foo</h2>",
+ "should support long underlines"
+ );
+
+ assert_eq!(
+ micromark("Foo\n="),
+ "<h1>Foo</h1>",
+ "should support short underlines"
+ );
+
+ assert_eq!(
+ micromark(" Foo\n ==="),
+ "<h1>Foo</h1>",
+ "should support indented content w/ 1 space"
+ );
+
+ assert_eq!(
+ micromark(" Foo\n---"),
+ "<h2>Foo</h2>",
+ "should support indented content w/ 2 spaces"
+ );
+
+ assert_eq!(
+ micromark(" Foo\n---"),
+ "<h2>Foo</h2>",
+ "should support indented content w/ 3 spaces"
+ );
+
+ assert_eq!(
+ micromark(" Foo\n ---"),
+ "<pre><code>Foo\n---\n</code></pre>",
+ "should not support too much indented content (1)"
+ );
+
+ assert_eq!(
+ micromark(" Foo\n---"),
+ "<pre><code>Foo\n</code></pre>\n<hr />",
+ "should not support too much indented content (2)"
+ );
+
+ assert_eq!(
+ micromark("Foo\n ---- "),
+ "<h2>Foo</h2>",
+ "should support initial and final whitespace around the underline"
+ );
+
+ assert_eq!(
+ micromark("Foo\n ="),
+ "<h1>Foo</h1>",
+ "should support whitespace before underline"
+ );
+
+ // To do: trim paragraphs.
+ // assert_eq!(
+ // micromark("Foo\n ="),
+ // "<p>Foo\n=</p>",
+ // "should not support too much whitespace before underline (1)"
+ // );
+
+ // To do: trim paragraphs.
+ // assert_eq!(
+ // micromark("Foo\n\t="),
+ // "<p>Foo\n=</p>",
+ // "should not support too much whitespace before underline (2)"
+ // );
+
+ assert_eq!(
+ micromark("Foo\n= ="),
+ "<p>Foo\n= =</p>",
+ "should not support whitespace in the underline (1)"
+ );
+
+ assert_eq!(
+ micromark("Foo\n--- -"),
+ "<p>Foo</p>\n<hr />",
+ "should not support whitespace in the underline (2)"
+ );
+
+ // To do: trim setext.
+ // assert_eq!(
+ // micromark("Foo \n-----"),
+ // "<h2>Foo</h2>",
+ // "should not support a hard break w/ spaces at the end"
+ // );
+
+ assert_eq!(
+ micromark("Foo\\\n-----"),
+ "<h2>Foo\\</h2>",
+ "should not support a hard break w/ backslash at the end"
+ );
+
+ assert_eq!(
+ micromark("`Foo\n----\n`"),
+ "<h2>`Foo</h2>\n<p>`</p>",
+ "should precede over inline constructs (1)"
+ );
+
+ assert_eq!(
+ micromark("<a title=\"a lot\n---\nof dashes\"/>"),
+ "<h2>&lt;a title=&quot;a lot</h2>\n<p>of dashes&quot;/&gt;</p>",
+ "should precede over inline constructs (2)"
+ );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark("> Foo\n---"),
+ // "<blockquote>\n<p>Foo</p>\n</blockquote>\n<hr />",
+ // "should not allow underline to be lazy (1)"
+ // );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark("> foo\nbar\n==="),
+ // "<blockquote>\n<p>foo\nbar\n===</p>\n</blockquote>",
+ // "should not allow underline to be lazy (2)"
+ // );
+
+ // To do: list.
+ // assert_eq!(
+ // micromark("- Foo\n---"),
+ // "<ul>\n<li>Foo</li>\n</ul>\n<hr />",
+ // "should not allow underline to be lazy (3)"
+ // );
+
+ assert_eq!(
+ micromark("Foo\nBar\n---"),
+ "<h2>Foo\nBar</h2>",
+ "should support line endings in setext headings"
+ );
+
+ assert_eq!(
+ micromark("---\nFoo\n---\nBar\n---\nBaz"),
+ "<hr />\n<h2>Foo</h2>\n<h2>Bar</h2>\n<p>Baz</p>",
+ "should support adjacent setext headings"
+ );
+
+ assert_eq!(
+ micromark("\n===="),
+ "<p>====</p>",
+ "should not support empty setext headings"
+ );
+
+ assert_eq!(
+ micromark("---\n---"),
+ "<hr />\n<hr />",
+ "should prefer other constructs over setext headings (1)"
+ );
+
+ // To do: list.
+ // assert_eq!(
+ // micromark("- foo\n-----"),
+ // "<ul>\n<li>foo</li>\n</ul>\n<hr />",
+ // "should prefer other constructs over setext headings (2)"
+ // );
+
+ assert_eq!(
+ micromark(" foo\n---"),
+ "<pre><code>foo\n</code></pre>\n<hr />",
+ "should prefer other constructs over setext headings (3)"
+ );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark("> foo\n-----"),
+ // "<blockquote>\n<p>foo</p>\n</blockquote>\n<hr />",
+ // "should prefer other constructs over setext headings (4)"
+ // );
+
+ assert_eq!(
+ micromark("\\> foo\n------"),
+ "<h2>&gt; foo</h2>",
+ "should support starting w/ character escapes"
+ );
+
+ assert_eq!(
+ micromark("Foo\nbar\n---\nbaz"),
+ "<h2>Foo\nbar</h2>\n<p>baz</p>",
+ "paragraph and heading interplay (1)"
+ );
+
+ assert_eq!(
+ micromark("Foo\n\nbar\n---\nbaz"),
+ "<p>Foo</p>\n<h2>bar</h2>\n<p>baz</p>",
+ "paragraph and heading interplay (2)"
+ );
+
+ assert_eq!(
+ micromark("Foo\nbar\n\n---\n\nbaz"),
+ "<p>Foo\nbar</p>\n<hr />\n<p>baz</p>",
+ "paragraph and heading interplay (3)"
+ );
+
+ assert_eq!(
+ micromark("Foo\nbar\n* * *\nbaz"),
+ "<p>Foo\nbar</p>\n<hr />\n<p>baz</p>",
+ "paragraph and heading interplay (4)"
+ );
+
+ assert_eq!(
+ micromark("Foo\nbar\n\\---\nbaz"),
+ "<p>Foo\nbar\n---\nbaz</p>",
+ "paragraph and heading interplay (5)"
+ );
+
+ // Extra:
+ assert_eq!(
+ micromark("Foo \nbar\n-----"),
+ "<h2>Foo<br />\nbar</h2>",
+ "should support a hard break w/ spaces in between"
+ );
+
+ assert_eq!(
+ micromark("Foo\\\nbar\n-----"),
+ "<h2>Foo<br />\nbar</h2>",
+ "should support a hard break w/ backslash in between"
+ );
+
+ assert_eq!(
+ micromark("a\n-\nb"),
+ "<h2>a</h2>\n<p>b</p>",
+ "should prefer a setext heading over an interrupting list"
+ );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark("> ===\na"),
+ // "<blockquote>\n<p>===\na</p>\n</blockquote>",
+ // "should not support lazyness (1)"
+ // );
+
+ // To do: block quote.
+ // assert_eq!(
+ // micromark("> a\n==="),
+ // "<blockquote>\n<p>a\n===</p>\n</blockquote>",
+ // "should not support lazyness (2)"
+ // );
+
+ // To do: turning things off.
+ // assert_eq!(
+ // micromark("a\n-", {extensions: [{disable: {null: ["setextUnderline"]}}]}),
+ // "<p>a\n-</p>",
+ // "should support turning off setext underlines"
+ // );
+}
diff --git a/tests/thematic_break.rs b/tests/thematic_break.rs
index 3dc7b5d..cbc84e0 100644
--- a/tests/thematic_break.rs
+++ b/tests/thematic_break.rs
@@ -144,12 +144,11 @@ fn thematic_break() {
"should support thematic breaks interrupting paragraphs"
);
- // To do: setext.
- // assert_eq!(
- // micromark("Foo\n---\nbar"),
- // "<h2>Foo</h2>\n<p>bar</p>",
- // "should not support thematic breaks w/ dashes interrupting paragraphs (setext heading)"
- // );
+ assert_eq!(
+ micromark("Foo\n---\nbar"),
+ "<h2>Foo</h2>\n<p>bar</p>",
+ "should not support thematic breaks w/ dashes interrupting paragraphs (setext heading)"
+ );
// To do: list.
// assert_eq!(