aboutsummaryrefslogtreecommitdiffstats
path: root/src/construct
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/construct/heading_atx.rs32
-rw-r--r--src/construct/heading_setext.rs301
-rw-r--r--src/construct/mod.rs3
-rw-r--r--src/construct/thematic_break.rs5
4 files changed, 323 insertions, 18 deletions
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 1a9ed03..3ff6fea 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -18,9 +18,11 @@
//! In older markdown versions, this was not required, and headings would form
//! without it.
//!
-//! In markdown, it is also possible to create headings with the setext heading
-//! construct.
-//! The benefit of setext headings is that their text can include line endings.
+//! In markdown, it is also possible to create headings with a
+//! [heading (setext)][heading_setext] construct.
+//! The benefit of setext headings is that their text can include line endings,
+//! and by extensions also hard breaks (e.g., with
+//! [hard break (escape)][hard_break_escape]).
//! However, their limit is that they cannot form `<h3>` through `<h6>`
//! headings.
//! Due to this limitation, it is recommended to use atx headings.
@@ -39,11 +41,11 @@
//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.30/#atx-headings)
//!
//! [flow]: crate::content::flow
+//! [heading_setext]: crate::construct::heading_setext
+//! [hard_break_escape]: crate::construct::hard_break_escape
//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
//! [atx]: http://www.aaronsw.com/2002/atx/
-//!
-//! <!-- To do: link `setext` -->
use crate::constant::HEADING_ATX_OPENING_FENCE_SIZE_MAX;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -55,8 +57,8 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
if Code::Char('#') == code {
- tokenizer.enter(TokenType::AtxHeading);
- tokenizer.enter(TokenType::AtxHeadingSequence);
+ tokenizer.enter(TokenType::HeadingAtx);
+ tokenizer.enter(TokenType::HeadingAtxSequence);
sequence_open(tokenizer, code, 0)
} else {
(State::Nok, None)
@@ -76,7 +78,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR
| Code::Char('\t' | '\n' | '\r' | ' ')
if rank > 0 =>
{
- tokenizer.exit(TokenType::AtxHeadingSequence);
+ tokenizer.exit(TokenType::HeadingAtxSequence);
at_break(tokenizer, code)
}
Code::Char('#') if rank < HEADING_ATX_OPENING_FENCE_SIZE_MAX => {
@@ -104,19 +106,19 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, rank: usize) -> StateFnR
fn at_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- tokenizer.exit(TokenType::AtxHeading);
+ tokenizer.exit(TokenType::HeadingAtx);
(State::Ok, Some(vec![code]))
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
- tokenizer.enter(TokenType::AtxHeadingWhitespace);
+ tokenizer.enter(TokenType::HeadingAtxWhitespace);
whitespace(tokenizer, code)
}
Code::Char('#') => {
- tokenizer.enter(TokenType::AtxHeadingSequence);
+ tokenizer.enter(TokenType::HeadingAtxSequence);
further_sequence(tokenizer, code)
}
Code::Char(_) => {
- tokenizer.enter(TokenType::AtxHeadingText);
+ tokenizer.enter(TokenType::HeadingAtxText);
tokenizer.enter(TokenType::ChunkText);
data(tokenizer, code)
}
@@ -134,7 +136,7 @@ fn further_sequence(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.consume(code);
(State::Fn(Box::new(further_sequence)), None)
} else {
- tokenizer.exit(TokenType::AtxHeadingSequence);
+ tokenizer.exit(TokenType::HeadingAtxSequence);
at_break(tokenizer, code)
}
}
@@ -151,7 +153,7 @@ fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
(State::Fn(Box::new(whitespace)), None)
}
_ => {
- tokenizer.exit(TokenType::AtxHeadingWhitespace);
+ tokenizer.exit(TokenType::HeadingAtxWhitespace);
at_break(tokenizer, code)
}
}
@@ -167,7 +169,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
// Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text.
Code::None | Code::CarriageReturnLineFeed | Code::Char('\t' | '\n' | '\r' | ' ') => {
tokenizer.exit(TokenType::ChunkText);
- tokenizer.exit(TokenType::AtxHeadingText);
+ tokenizer.exit(TokenType::HeadingAtxText);
at_break(tokenizer, code)
}
_ => {
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
new file mode 100644
index 0000000..8cc4f6d
--- /dev/null
+++ b/src/construct/heading_setext.rs
@@ -0,0 +1,301 @@
+//! Heading (setext) is a construct that occurs in the [flow] content type.
+//!
+//! They’re formed with the following BNF:
+//!
+//! ```bnf
+//! heading_setext ::= line *(eol line) eol whitespace_optional (1*'-' | 1*'=') whitespace_optional
+//!
+//! whitespace ::= 1*space_or_tab
+//! whitespace_optional ::= [ whitespace ]
+//! line ::= code - eol
+//! eol ::= '\r' | '\r\n' | '\n'
+//! ```
+//!
+//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in
+//! HTML.
+//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
+//! HTML spec][html] for more info.
+//!
+//! In markdown, it is also possible to create headings with a
+//! [heading (atx)][heading_atx] construct.
+//! The benefit of setext headings is that their text can include line endings,
+//! and by extensions also hard breaks (e.g., with
+//! [hard break (escape)][hard_break_escape]).
+//! However, their limit is that they cannot form `<h3>` through `<h6>`
+//! headings.
+//! Due to this limitation, it is recommended to use atx headings.
+//!
+//! [Thematic breaks][thematic_break] formed with dashes (without whitespace)
+//! can also form heading (setext).
+//!
+//! > 🏛 **Background**: the word *setext* originates from a small markup
+//! > language by Ian Feldman from 1991.
+//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info.
+//! > The word *atx* originates from a tiny markup language by Aaron Swartz
+//! > from 2002.
+//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
+//! > more info.
+//!
+//! ## References
+//!
+//! * [`setext-underline.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/setext-underline.js)
+//! * [*§ 4.3 Setext headings* in `CommonMark`](https://spec.commonmark.org/0.30/#setext-headings)
+//!
+//! [flow]: crate::content::flow
+//! [heading_atx]: crate::construct::heading_atx
+//! [thematic_break]: crate::construct::thematic_break
+//! [hard_break_escape]: crate::construct::hard_break_escape
+//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements
+//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext
+//! [atx]: http://www.aaronsw.com/2002/atx/
+
+use crate::constant::TAB_SIZE;
+use crate::construct::partial_whitespace::start as whitespace;
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::span::from_exit_event;
+
+/// Kind of underline.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Kind {
+ /// Grave accent (tick) code.
+ Dash,
+ /// Tilde code.
+ EqualsTo,
+}
+
+/// Start of a heading (setext).
+///
+/// ```markdown
+/// |alpha
+/// ==
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ unreachable!("expected non-eol/eof");
+ }
+ _ => {
+ tokenizer.enter(TokenType::HeadingSetext);
+ tokenizer.enter(TokenType::HeadingSetextText);
+ tokenizer.enter(TokenType::ChunkText);
+ text_inside(tokenizer, code)
+ }
+ }
+}
+
+/// Inside text.
+///
+/// ```markdown
+/// al|pha
+/// bra|vo
+/// ==
+/// ```
+pub fn text_inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::ChunkText);
+ tokenizer.exit(TokenType::HeadingSetextText);
+ tokenizer.attempt(underline_before, |ok| {
+ Box::new(if ok { after } else { text_continue })
+ })(tokenizer, code)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(text_inside)), None)
+ }
+ }
+}
+
+/// At a line ending, not at an underline.
+///
+/// ```markdown
+/// alpha
+/// |bravo
+/// ==
+/// ```
+fn text_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ // Needed to connect the text.
+ // To do: does it work?
+ tokenizer.enter(TokenType::HeadingSetextText);
+ tokenizer.events.pop();
+ tokenizer.events.pop();
+
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ let next = tokenizer.events.len();
+ let previous = next - 2;
+
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+
+ tokenizer.events[previous].next = Some(next);
+ tokenizer.events[next].previous = Some(previous);
+
+ (
+ State::Fn(Box::new(tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(text_line_start),
+ ))),
+ None,
+ )
+ }
+ _ => unreachable!("expected eol"),
+ }
+}
+
+/// At a line ending after whitespace, not at an underline.
+///
+/// ```markdown
+/// alpha
+/// |bravo
+/// ==
+/// ```
+fn text_line_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let next = tokenizer.events.len() - 2;
+ let previous = next - 2;
+
+ // Link the whitespace, if it exists.
+ if tokenizer.events[next].token_type == TokenType::Whitespace {
+ tokenizer.events[previous].next = Some(next);
+ tokenizer.events[next].previous = Some(previous);
+ }
+
+ match code {
+ // Blank lines not allowed.
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
+ _ => {
+ let next = tokenizer.events.len();
+ let previous = next - 2;
+
+ tokenizer.enter(TokenType::ChunkText);
+
+ tokenizer.events[previous].next = Some(next);
+ tokenizer.events[next].previous = Some(previous);
+
+ text_inside(tokenizer, code)
+ }
+ }
+}
+
+/// After a heading (setext).
+///
+/// ```markdown
+/// alpha
+/// ==|
+/// ```
+fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.exit(TokenType::HeadingSetext);
+ (State::Ok, Some(vec![code]))
+}
+
+/// At a line ending, presumably an underline.
+///
+/// ```markdown
+/// alpha|
+/// ==
+/// ```
+fn underline_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.enter(TokenType::LineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::LineEnding);
+ (State::Fn(Box::new(underline_start)), None)
+ }
+ _ => unreachable!("expected eol"),
+ }
+}
+
+/// After a line ending, presumably an underline.
+///
+/// ```markdown
+/// alpha
+/// |==
+/// ```
+fn underline_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(underline_sequence_start),
+ )(tokenizer, code)
+}
+
+/// After optional whitespace, presumably an underline.
+///
+/// ```markdown
+/// alpha
+/// |==
+/// ```
+fn underline_sequence_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let tail = tokenizer.events.last();
+ let mut prefix = 0;
+
+ if let Some(event) = tail {
+ if event.token_type == TokenType::Whitespace {
+ let span = from_exit_event(&tokenizer.events, tokenizer.events.len() - 1);
+ prefix = span.end_index - span.start_index;
+ }
+ }
+
+ // To do: 4+ should be okay if code (indented) is turned off!
+ if prefix >= TAB_SIZE {
+ return (State::Nok, None);
+ }
+
+ match code {
+ Code::Char(char) if char == '-' || char == '=' => {
+ let marker = if char == '-' {
+ Kind::Dash
+ } else {
+ Kind::EqualsTo
+ };
+ tokenizer.enter(TokenType::HeadingSetextUnderline);
+ underline_sequence_inside(tokenizer, code, marker)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In an underline sequence.
+///
+/// ```markdown
+/// alpha
+/// =|=
+/// ```
+fn underline_sequence_inside(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult {
+ let marker = if kind == Kind::Dash { '-' } else { '=' };
+
+ match code {
+ Code::Char(char) if char == marker => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ underline_sequence_inside(tokenizer, code, kind)
+ })),
+ None,
+ )
+ }
+ Code::VirtualSpace | Code::Char('\t' | ' ') => tokenizer.attempt(
+ |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+ |_ok| Box::new(underline_after),
+ )(tokenizer, code),
+ _ => underline_after(tokenizer, code),
+ }
+}
+
+/// After an underline sequence, after optional whitespace.
+///
+/// ```markdown
+/// alpha
+/// ==|
+/// ```
+fn underline_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ tokenizer.exit(TokenType::HeadingSetextUnderline);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => (State::Nok, None),
+ }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 880d055..ca1149f 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -29,7 +29,7 @@
//! * [hard break (escape)][hard_break_escape]
//! * [hard break (trailing)][hard_break_trailing]
//! * [heading (atx)][heading_atx]
-//! * heading (setext)
+//! * [heading (setext)][heading_setext]
//! * [html (flow)][html_flow]
//! * [html (text)][html_text]
//! * label end
@@ -64,6 +64,7 @@ pub mod code_text;
pub mod hard_break_escape;
pub mod hard_break_trailing;
pub mod heading_atx;
+pub mod heading_setext;
pub mod html_flow;
pub mod html_text;
pub mod partial_whitespace;
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 7a4f71a..bc41991 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -24,7 +24,7 @@
//! For these reasons, it is recommend to not use spaces or tabs between the
//! markers.
//! Thematic breaks formed with dashes (without whitespace) can also form
-//! setext headings.
+//! [heading (setext)][heading_setext].
//! As dashes and underscores frequently occur in natural language and URLs, it
//! is recommended to use asterisks for thematic breaks to distinguish from
//! such use.
@@ -39,9 +39,10 @@
//! * [*§ 4.1 Thematic breaks* in `CommonMark`](https://spec.commonmark.org/0.30/#thematic-breaks)
//!
//! [flow]: crate::content::flow
+//! [heading_setext]: crate::construct::heading_setext
//! [html]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-hr-element
//!
-//! <!-- To do: link `lists`, `setext heading` -->
+//! <!-- To do: link `lists` -->
use crate::constant::THEMATIC_BREAK_MARKER_COUNT_MIN;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};