diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/construct/definition.rs | 2 | ||||
-rw-r--r-- | src/construct/mod.rs | 2 | ||||
-rw-r--r-- | src/construct/partial_destination.rs | 75 | ||||
-rw-r--r-- | src/construct/partial_label.rs | 81 | ||||
-rw-r--r-- | src/construct/partial_title.rs | 127 |
5 files changed, 245 insertions, 42 deletions
diff --git a/src/construct/definition.rs b/src/construct/definition.rs index e540b44..3035a20 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -14,7 +14,7 @@ //! //! destination ::= destination_enclosed | destination_raw //! destination_enclosed ::= '<' *( destination_enclosed_text | destination_enclosed_escape ) '>' -//! destination_enclosed_text ::= code - '<' - '\\' - eol +//! destination_enclosed_text ::= code - '<' - '\\' - '>' - eol //! destination_enclosed_escape ::= '\\' [ '<' | '\\' | '>' ] //! destination_raw ::= 1*( destination_raw_text | destination_raw_escape ) //! ; Restriction: unbalanced `)` characters are not allowed. diff --git a/src/construct/mod.rs b/src/construct/mod.rs index fb79f68..a5e95bc 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -15,7 +15,7 @@ //! //! The following constructs are found in markdown: //! -//! * attention (strong, emphasis) (text) +//! * attention (strong, emphasis) //! * [autolink][] //! * [blank line][blank_line] //! * block quote diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs index 8cf5b77..a2f638b 100644 --- a/src/construct/partial_destination.rs +++ b/src/construct/partial_destination.rs @@ -1,7 +1,52 @@ +//! Destination occurs in [definition][] and label end. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! destination ::= destination_enclosed | destination_raw +//! +//! destination_enclosed ::= '<' *( destination_enclosed_text | destination_enclosed_escape ) '>' +//! destination_enclosed_text ::= code - '<' - '\\' - '>' - eol +//! destination_enclosed_escape ::= '\\' [ '<' | '\\' | '>' ] +//! destination_raw ::= 1*( destination_raw_text | destination_raw_escape ) +//! ; Restriction: unbalanced `)` characters are not allowed. +//! destination_raw_text ::= code - '\\' - ascii_control - space_or_tab - eol +//! destination_raw_escape ::= '\\' [ '(' | ')' | '\\' ] +//! ``` +//! +//! 
Balanced parens are allowed in raw destinations. +//! They are counted with a counter that starts at `0`, and is incremented +//! every time `(` occurs and decremented every time `)` occurs. +//! If `)` is found when the counter is `0`, the destination closes immediately +//! after it. +//! Escaped parens do not count. +//! +//! It is recommended to use the enclosed variant of destinations, as it allows +//! arbitrary parens, and also allows for whitespace and other characters in +//! URLs. +//! +//! The destination is interpreted as the [string][] content type. +//! That means that character escapes and character references are allowed. +//! +//! ## References +//! +//! * [`micromark-factory-destination/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-destination/dev/index.js) +//! +//! [definition]: crate::construct::definition +//! [string]: crate::content::string +//! +//! <!-- To do: link label end. --> + // To do: pass token types in. use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; +/// Before a destination. +/// +/// ```markdown +/// |<ab> +/// |ab +/// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::Char('<') => { @@ -27,7 +72,11 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// To do. +/// After `<`, before an enclosed destination. +/// +/// ```markdown +/// <|ab> +/// ``` fn enclosed_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { if let Code::Char('>') = code { tokenizer.enter(TokenType::DefinitionDestinationLiteralMarker); @@ -44,7 +93,11 @@ fn enclosed_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// To do. +/// In an enclosed destination. 
+/// +/// ```markdown +/// <u|rl> +/// ``` fn enclosed(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::Char('>') => { @@ -66,7 +119,11 @@ fn enclosed(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// To do. +/// After `\`, in an enclosed destination. +/// +/// ```markdown +/// <a\|>b> +/// ``` fn enclosed_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::Char('<' | '>' | '\\') => { @@ -77,7 +134,11 @@ fn enclosed_escape(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { } } -/// To do. +/// In a raw destination. +/// +/// ```markdown +/// a|b +/// ``` // To do: these arms can be improved? fn raw(tokenizer: &mut Tokenizer, code: Code, balance: usize) -> StateFnResult { // To do: configurable. @@ -139,7 +200,11 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, balance: usize) -> StateFnResult { } } -/// To do. +/// After `\`, in a raw destination. +/// +/// ```markdown +/// a\|)b +/// ``` fn raw_escape(tokenizer: &mut Tokenizer, code: Code, balance: usize) -> StateFnResult { match code { Code::Char('(' | ')' | '\\') => { diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs index c772c56..f7ce8d7 100644 --- a/src/construct/partial_label.rs +++ b/src/construct/partial_label.rs @@ -1,9 +1,65 @@ +//! Label occurs in [definition][] and label end. +//! +//! They’re formed with the following BNF: +//! +//! ```bnf +//! ; Restriction: maximum `999` codes allowed between brackets. +//! ; Restriction: no blank lines. +//! ; Restriction: at least 1 non-space and non-eol code must exist. +//! label ::= '[' *( label_text | label_escape ) ']' +//! label_text ::= code - '[' - '\\' - ']' +//! label_escape ::= '\\' [ '[' | '\\' | ']' ] +//! ``` +//! +//! The maximum allowed size of the label, without the brackets, is `999` +//! (inclusive), which is defined in +//! [`LINK_REFERENCE_SIZE_MAX`][link_reference_size_max]. +//! +//! 
Labels can contain line endings and whitespace, but they are not allowed to +//! contain blank lines, and they must not be blank themselves. +//! +//! The label is interpreted as the [string][] content type. +//! That means that character escapes and character references are allowed. +//! +//! > 👉 **Note**: this label relates to, but is not, the initial “label” of +//! > what is known as a reference in markdown: +//! > +//! > | Kind | Link | Image | +//! > | --------- | -------- | --------- | +//! > | Shortcut | `[x]` | `![x]` | +//! > | Collapsed | `[x][]` | `![x][]` | +//! > | Full | `[x][y]` | `![x][y]` | +//! > +//! > The 6 above things are references, in the three kinds they come in, as +//! > links and images. +//! > The label that this module focusses on is only the thing that contains +//! > `y`. +//! > +//! > The thing that contains `x` is not a single thing when parsing markdown, +//! > but instead consists of an opening (label start (image) or label start +//! > (link)) and a closing (label end), so as to allow further phrasing such +//! > as code (text) or attention. +//! +//! ## References +//! +//! * [`micromark-factory-label/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-label/dev/index.js) +//! +//! [definition]: crate::construct::definition +//! [string]: crate::content::string +//! [link_reference_size_max]: crate::constant::LINK_REFERENCE_SIZE_MAX +//! +//! <!-- To do: link label end, label starts. --> + // To do: pass token types in. use crate::constant::LINK_REFERENCE_SIZE_MAX; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; -/// To do. +/// Before a label. 
+/// +/// ```markdown +/// |[a] +/// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { match code { Code::Char('[') => { @@ -14,12 +70,16 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::DefinitionLabelData); (State::Fn(Box::new(|t, c| at_break(t, c, false, 0))), None) } - // To do: allow? - _ => unreachable!("expected `[` at start of label"), + _ => (State::Nok, None), } } -/// To do. +/// In a label, at something. +/// +/// ```markdown +/// [|a] +/// [a|] +/// ``` fn at_break(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult { match code { Code::None | Code::Char('[') => (State::Nok, None), @@ -37,6 +97,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> S tokenizer.enter(TokenType::LineEnding); tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); + // To do: limit blank lines. ( State::Fn(Box::new(move |t, c| at_break(t, c, data, size))), None, @@ -50,7 +111,11 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> S } } -/// To do. +/// In a label, in text. +/// +/// ```markdown +/// [a|b] +/// ``` fn label(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult { match code { Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '[' | ']') => { @@ -85,7 +150,11 @@ fn label(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> Stat } } -/// To do. +/// After `\` in a label. +/// +/// ```markdown +/// [a\|[b] +/// ``` fn escape(tokenizer: &mut Tokenizer, code: Code, data: bool, size: usize) -> StateFnResult { match code { Code::Char('[' | '\\' | ']') => { diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs index 4c7b527..a626c50 100644 --- a/src/construct/partial_title.rs +++ b/src/construct/partial_title.rs @@ -1,32 +1,70 @@ +//! Title occurs in [definition][] and label end. +//! +//! 
They’re formed with the following BNF: +//! +//! ```bnf +//! ; Restriction: no blank lines. +//! ; Restriction: markers must match (in case of `(` with `)`). +//! title ::= marker [ *( code - '\\' | '\\' [ marker ] ) ] marker +//! marker ::= '"' | '\'' | '(' +//! ``` +//! +//! Titles can be double quoted (`"a"`), single quoted (`'a'`), or +//! parenthesized (`(a)`). +//! +//! Titles can contain line endings and whitespace, but they are not allowed to +//! contain blank lines. +//! They are allowed to be blank themselves. +//! +//! The title is interpreted as the [string][] content type. +//! That means that character escapes and character references are allowed. +//! +//! ## References +//! +//! * [`micromark-factory-title/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-title/dev/index.js) +//! +//! [definition]: crate::construct::definition +//! [string]: crate::content::string +//! +//! <!-- To do: link label end. --> + // To do: pass token types in. use crate::construct::partial_whitespace::start as whitespace; use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer}; -/// Type of quote, if we’re in an attribure, in complete (condition 7). +/// Type of title. #[derive(Debug, Clone, PartialEq)] -enum TitleKind { - /// In a parenthesised (`(` and `)`) title. +enum Kind { + /// In a parenthesized (`(` and `)`) title. Paren, /// In a double quoted (`"`) title. Double, - /// In a single quoted (`"`) title. + /// In a single quoted (`'`) title. Single, } -fn kind_to_marker(kind: &TitleKind) -> char { +/// Display a marker. +fn kind_to_marker(kind: &Kind) -> char { match kind { - TitleKind::Double => '"', - TitleKind::Single => '\'', - TitleKind::Paren => ')', + Kind::Double => '"', + Kind::Single => '\'', + Kind::Paren => ')', } } +/// Before a title. 
+/// +/// ```markdown +/// |"a" +/// |'a' +/// |(a) +/// ``` pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { let kind = match code { - Code::Char('"') => Some(TitleKind::Double), - Code::Char('\'') => Some(TitleKind::Single), - Code::Char('(') => Some(TitleKind::Paren), + Code::Char('"') => Some(Kind::Double), + Code::Char('\'') => Some(Kind::Single), + Code::Char('(') => Some(Kind::Paren), _ => None, }; @@ -35,14 +73,22 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult { tokenizer.enter(TokenType::DefinitionTitleMarker); tokenizer.consume(code); tokenizer.exit(TokenType::DefinitionTitleMarker); - (State::Fn(Box::new(|t, c| at_first_break(t, c, kind))), None) + (State::Fn(Box::new(|t, c| begin(t, c, kind))), None) } else { (State::Nok, None) } } -/// To do. -fn at_first_break(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult { +/// After the opening marker. +/// +/// This is also used when at the closing marker. +/// +/// ```markdown +/// "|a" +/// '|a' +/// (|a) +/// ``` +fn begin(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { Code::Char(char) if char == kind_to_marker(&kind) => { tokenizer.enter(TokenType::DefinitionTitleMarker); @@ -58,12 +104,19 @@ fn at_first_break(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> Sta } } -/// To do. -fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult { +/// At something, before something else. 
+/// +/// ```markdown +/// "|a" +/// 'a|' +/// (a| +/// b) +/// ``` +fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { Code::Char(char) if char == kind_to_marker(&kind) => { tokenizer.exit(TokenType::DefinitionTitleString); - at_first_break(tokenizer, code, kind) + begin(tokenizer, code, kind) } Code::None => (State::Nok, None), Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => { @@ -71,7 +124,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnRe tokenizer.consume(code); tokenizer.exit(TokenType::LineEnding); ( - State::Fn(Box::new(|t, c| at_break_line_start(t, c, kind))), + State::Fn(Box::new(|t, c| line_start(t, c, kind))), None, ) } @@ -83,14 +136,26 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnRe } } -fn at_break_line_start(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult { +/// After a line ending. +/// +/// ```markdown +/// "a +/// |b" +/// ``` +fn line_start(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { tokenizer.attempt( |t, c| whitespace(t, c, TokenType::Whitespace), - |_ok| Box::new(|t, c| at_break_line_begin(t, c, kind)), + |_ok| Box::new(|t, c| line_begin(t, c, kind)), )(tokenizer, code) } -fn at_break_line_begin(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult { +/// After a line ending, after optional whitespace. +/// +/// ```markdown +/// "a +/// |b" +/// ``` +fn line_begin(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { // Blank line not allowed. Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None), @@ -98,8 +163,12 @@ fn at_break_line_begin(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) - } } -/// To do. -fn title(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult { +/// In title text. 
+/// +/// ```markdown +/// "a|b" +/// ``` +fn title(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { Code::Char(char) if char == kind_to_marker(&kind) => { tokenizer.exit(TokenType::ChunkString); @@ -120,17 +189,17 @@ fn title(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResul } } -/// To do. -fn escape(tokenizer: &mut Tokenizer, code: Code, kind: TitleKind) -> StateFnResult { +/// After `\`, in title text. +/// +/// ```markdown +/// "a\|"b" +/// ``` +fn escape(tokenizer: &mut Tokenizer, code: Code, kind: Kind) -> StateFnResult { match code { Code::Char(char) if char == kind_to_marker(&kind) => { tokenizer.consume(code); (State::Fn(Box::new(move |t, c| title(t, c, kind))), None) } - Code::Char('\\') => { - tokenizer.consume(code); - (State::Fn(Box::new(move |t, c| title(t, c, kind))), None) - } _ => title(tokenizer, code, kind), } } |