aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/compiler.rs42
-rw-r--r--src/construct/code_fenced.rs13
-rw-r--r--src/construct/code_indented.rs12
-rw-r--r--src/construct/code_text.rs217
-rw-r--r--src/construct/mod.rs3
-rw-r--r--src/content/text.rs10
-rw-r--r--src/tokenizer.rs27
7 files changed, 293 insertions, 31 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 6f0215c..d3d935b 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -96,8 +96,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
// let mut slurp_all_line_endings = false;
- println!("events: {:#?}", events);
-
while index < events.len() {
let event = &events[index];
let token_type = &event.token_type;
@@ -133,6 +131,10 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
ignore_encode = true;
}
}
+ TokenType::CodeText => {
+ buf_tail_mut(buffers).push("<code>".to_string());
+ buffer(buffers);
+ }
TokenType::Content
| TokenType::AtxHeading
| TokenType::AtxHeadingSequence
@@ -152,6 +154,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::CodeFencedFence
| TokenType::CodeFencedFenceSequence
| TokenType::CodeFencedFenceWhitespace
+ | TokenType::CodeTextSequence
+ | TokenType::CodeTextData
+ | TokenType::CodeTextLineEnding
| TokenType::Data
| TokenType::CharacterEscape
| TokenType::CharacterEscapeMarker
@@ -181,6 +186,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
| TokenType::Whitespace
| TokenType::CodeFencedFenceSequence
| TokenType::CodeFencedFenceWhitespace
+ | TokenType::CodeTextSequence
| TokenType::CharacterEscape
| TokenType::CharacterEscapeMarker
| TokenType::CharacterReference
@@ -264,6 +270,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
false,
)));
}
+
// `AtxHeadingWhitespace` is ignored after the opening sequence,
// before the closing sequence, and after the closing sequence.
// But it is used around intermediate sequences.
@@ -290,7 +297,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
}
}
TokenType::AtxHeadingText => {
- println!("text: {:?}", atx_heading_buffer);
if let Some(ref buf) = atx_heading_buffer {
if !buf.is_empty() {
buf_tail_mut(buffers).push(encode(buf));
@@ -301,7 +307,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
}
let slice = encode(&serialize(codes, &from_exit_event(events, index), false));
- println!("slice: {:?}", slice);
buf_tail_mut(buffers).push(slice);
}
TokenType::AtxHeading => {
@@ -340,8 +345,6 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
// } else
if slurp_one_line_ending {
slurp_one_line_ending = false;
- // } else if code_text_inside {
- // buf_tail_mut(buffers).push(" ".to_string());
} else {
buf_tail_mut(buffers).push(encode(&serialize(
codes,
@@ -378,8 +381,33 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
character_reference_kind = None;
}
+ TokenType::CodeText => {
+ let result = resume(buffers);
+ let mut chars = result.chars();
+ let mut trim = false;
+
+ if Some(' ') == chars.next() && Some(' ') == chars.next_back() {
+ let mut next = chars.next();
+ while next != None && !trim {
+ if Some(' ') != next {
+ trim = true;
+ }
+ next = chars.next();
+ }
+ }
+
+ buf_tail_mut(buffers).push(if trim {
+ result[1..(result.len() - 1)].to_string()
+ } else {
+ result
+ });
+ buf_tail_mut(buffers).push("</code>".to_string());
+ }
+ TokenType::CodeTextLineEnding => {
+ buf_tail_mut(buffers).push(" ".to_string());
+ }
// This branch below currently acts as the resulting `data` tokens.
- TokenType::Data | TokenType::CharacterEscapeValue => {
+ TokenType::CodeTextData | TokenType::Data | TokenType::CharacterEscapeValue => {
// last_was_tag = false;
buf_tail_mut(buffers).push(encode(&serialize(
codes,
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index c852e8d..12c8bd6 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -66,10 +66,10 @@
//! The `info` and `meta` parts are interpreted as the [string][] content type.
//! That means that character escapes and character reference are allowed.
//!
-//! In markdown, it is also possible to use code (text) in the [text][] content
-//! type.
+//! In markdown, it is also possible to use [code (text)][code_text] in the
+//! [text][] content type.
//! It is also possible to create code with the
-//! [code (indented)][code-indented] construct.
+//! [code (indented)][code_indented] construct.
//! That construct is less explicit, different from code (text), and has no
//! support for specifying the programming language, so it is recommended to
//! use code (fenced) instead of code (indented).
@@ -82,11 +82,10 @@
//! [flow]: crate::content::flow
//! [string]: crate::content::string
//! [text]: crate::content::text
-//! [code-indented]: crate::construct::code_indented
+//! [code_indented]: crate::construct::code_indented
+//! [code_text]: crate::construct::code_text
//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
-//!
-//! <!-- To do: link `code_text` -->
use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};
use crate::construct::partial_whitespace::start as whitespace;
@@ -251,14 +250,12 @@ fn info_inside(
) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- println!("to do: subtokenize: {:?}", codes);
tokenizer.exit(TokenType::ChunkString);
tokenizer.exit(TokenType::CodeFencedFenceInfo);
tokenizer.exit(TokenType::CodeFencedFence);
at_break(tokenizer, info, code)
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
- println!("to do: subtokenize: {:?}", codes);
tokenizer.exit(TokenType::ChunkString);
tokenizer.exit(TokenType::CodeFencedFenceInfo);
tokenizer.attempt(
diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs
index 936f174..55b8901 100644
--- a/src/construct/code_indented.rs
+++ b/src/construct/code_indented.rs
@@ -18,9 +18,9 @@
//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code`
//! element*][html-code] in the HTML spec for more info.
//!
-//! In markdown, it is also possible to use code (text) in the text content
-//! type.
-//! It is also possible to create code with the [code (fenced)][code-fenced]
+//! In markdown, it is also possible to use [code (text)][code_text] in the
+//! [text][] content type.
+//! It is also possible to create code with the [code (fenced)][code_fenced]
//! construct.
//! That construct is more explicit, more similar to code (text), and has
//! support for specifying the programming language that the code is in, so it
@@ -32,11 +32,11 @@
//! * [*§ 4.4 Indented code blocks* in `CommonMark`](https://spec.commonmark.org/0.30/#indented-code-blocks)
//!
//! [flow]: crate::content::flow
-//! [code-fenced]: crate::construct::code_fenced
+//! [text]: crate::content::text
+//! [code_text]: crate::construct::code_text
+//! [code_fenced]: crate::construct::code_fenced
//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element
//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
-//!
-//! <!-- To do: link `code_text` -->
use crate::constant::TAB_SIZE;
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
new file mode 100644
index 0000000..3c01070
--- /dev/null
+++ b/src/construct/code_text.rs
@@ -0,0 +1,217 @@
+//! Code (text) is a construct that occurs in the [text][] content type.
+//!
+//! It forms with the following BNF:
+//!
+//! ```bnf
+//! ; Restriction: the number of markers in the closing sequence must be equal
+//! ; to the number of markers in the opening sequence.
+//! code_text ::= sequence 1*code sequence
+//!
+//! sequence ::= 1*'`'
+//! ```
+//!
+//! The above grammar shows that it is not possible to create empty code.
+//! It is possible to include grave accents (ticks) in code, by wrapping it
+//! in bigger or smaller sequences:
+//!
+//! ```markdown
+//! Include more: `a``b` or include less: ``a`b``.
+//! ```
+//!
+//! When turning markdown into HTML, each line ending is turned into a space.
+//!
+//! It is also possible to include just one grave accent (tick):
+//!
+//! ```markdown
+//! Include just one: `` ` ``.
+//! ```
+//!
+//! Sequences are “greedy”, in that they cannot be preceded or succeeded by
+//! more grave accents (ticks).
+//! To illustrate:
+//!
+//! ```markdown
+//! Not code: ``x`.
+//!
+//! Not code: `x``.
+//!
+//! Escapes work, this is code: \``x`.
+//!
+//! Escapes work, this is code: `x`\`.
+//! ```
+//!
+//! Yields:
+//!
+//! ```html
+//! <p>Not code: ``x`.</p>
+//! <p>Not code: `x``.</p>
+//! <p>Escapes work, this is code: `<code>x</code>.</p>
+//! <p>Escapes work, this is code: <code>x</code>`.</p>
+//! ```
+//!
+//! That is because, when turning markdown into HTML, the first and last space,
+//! if both exist and there is also a non-space in the code, are removed.
+//! Line endings, at that stage, are considered as spaces.
+//!
+//! Code (text) relates to the `<code>` element in HTML.
+//! See [*§ 4.5.15 The `code` element*][html-code] in the HTML spec for more
+//! info.
+//!
+//! In markdown, it is possible to create code with the
+//! [code (fenced)][code_fenced] or [code (indented)][code_indented] constructs
+//! in the [flow][] content type.
+//! Compared to code (indented), fenced code is more explicit and more similar
+//! to code (text), and it has support for specifying the programming language
+//! that the code is in, so it is recommended to use that instead of indented
+//! code.
+//!
+//! ## References
+//!
+//! * [`code-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/code-text.js)
+//! * [*§ 6.1 Code spans* in `CommonMark`](https://spec.commonmark.org/0.30/#code-spans)
+//!
+//! [flow]: crate::content::flow
+//! [text]: crate::content::text
+//! [code_indented]: crate::construct::code_indented
+//! [code_fenced]: crate::construct::code_fenced
+//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Start of code (text).
+///
+/// ```markdown
+/// |`a`
+///
+/// |\``a`
+///
+/// |``a`
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let len = tokenizer.events.len();
+
+ match code {
+ Code::Char('`')
+ if tokenizer.previous != Code::Char('`')
+ || (len > 0
+ && tokenizer.events[len - 1].token_type == TokenType::CharacterEscape) =>
+ {
+ tokenizer.enter(TokenType::CodeText);
+ tokenizer.enter(TokenType::CodeTextSequence);
+ sequence_open(tokenizer, code, 0)
+ }
+ _ => (State::Nok, None),
+ }
+}
+
+/// In the opening sequence.
+///
+/// ```markdown
+/// `|`a``
+/// ```
+pub fn sequence_open(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
+ if let Code::Char('`') = code {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ sequence_open(tokenizer, code, size + 1)
+ })),
+ None,
+ )
+ } else {
+ tokenizer.exit(TokenType::CodeTextSequence);
+ between(tokenizer, code, size)
+ }
+}
+
+/// Between something and something else
+///
+/// ```markdown
+/// `|a`
+/// `a|`
+/// ```
+pub fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {
+ match code {
+ Code::None => (State::Nok, None),
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ tokenizer.enter(TokenType::CodeTextLineEnding);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CodeTextLineEnding);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ between(tokenizer, code, size_open)
+ })),
+ None,
+ )
+ }
+ Code::Char('`') => {
+ tokenizer.enter(TokenType::CodeTextSequence);
+ sequence_close(tokenizer, code, size_open, 0)
+ }
+ _ => {
+ tokenizer.enter(TokenType::CodeTextData);
+ data(tokenizer, code, size_open)
+ }
+ }
+}
+
+/// In data.
+///
+/// ```markdown
+/// `a|b`
+/// ```
+pub fn data(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {
+ match code {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '`') => {
+ tokenizer.exit(TokenType::CodeTextData);
+ between(tokenizer, code, size_open)
+ }
+ _ => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ data(tokenizer, code, size_open)
+ })),
+ None,
+ )
+ }
+ }
+}
+
+/// In the closing sequence.
+///
+/// ```markdown
+/// ``a`|`
+/// ```
+pub fn sequence_close(
+ tokenizer: &mut Tokenizer,
+ code: Code,
+ size_open: usize,
+ size: usize,
+) -> StateFnResult {
+ match code {
+ Code::Char('`') => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |tokenizer, code| {
+ sequence_close(tokenizer, code, size_open, size + 1)
+ })),
+ None,
+ )
+ }
+ _ if size_open == size => {
+ tokenizer.exit(TokenType::CodeTextSequence);
+ tokenizer.exit(TokenType::CodeText);
+ (State::Ok, Some(vec![code]))
+ }
+ _ => {
+ let tail_index = tokenizer.events.len();
+ let head_index = tokenizer.events.len() - 1;
+ tokenizer.exit(TokenType::CodeTextSequence);
+ // Change the token type.
+ tokenizer.events[head_index].token_type = TokenType::CodeTextData;
+ tokenizer.events[tail_index].token_type = TokenType::CodeTextData;
+ between(tokenizer, code, size_open)
+ }
+ }
+}
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 14f53a0..1fa57d5 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -23,7 +23,7 @@
//! * [character reference][character_reference]
//! * [code (fenced)][code_fenced]
//! * [code (indented)][code_indented]
-//! * code (text)
+//! * [code (text)][code_text]
//! * content
//! * definition
//! * hard break escape
@@ -59,6 +59,7 @@ pub mod character_escape;
pub mod character_reference;
pub mod code_fenced;
pub mod code_indented;
+pub mod code_text;
pub mod heading_atx;
pub mod html_flow;
pub mod html_text;
diff --git a/src/content/text.rs b/src/content/text.rs
index 433d030..9d510cb 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -9,7 +9,7 @@
//! * Attention
//! * [HTML (text)][crate::construct::html_text]
//! * Hard break escape
-//! * Code (text)
+//! * [Code (text)][crate::construct::code_text]
//! * Line ending
//! * Label start (image)
//! * Label start (link)
@@ -18,7 +18,8 @@
use crate::construct::{
autolink::start as autolink, character_escape::start as character_escape,
- character_reference::start as character_reference, html_text::start as html_text,
+ character_reference::start as character_reference, code_text::start as code_text,
+ html_text::start as html_text,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
@@ -34,11 +35,12 @@ use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
- _ => tokenizer.attempt_4(
+ _ => tokenizer.attempt_5(
character_reference,
character_escape,
autolink,
html_text,
+ code_text,
|ok| Box::new(if ok { start } else { before_data }),
)(tokenizer, code),
}
@@ -80,7 +82,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
(State::Ok, None)
}
// To do: somehow get these markers from constructs.
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '\\' | '<') => {
+ Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '&' | '<' | '\\' | '`') => {
tokenizer.exit(TokenType::Data);
start(tokenizer, code)
}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 486bc75..c5df42b 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -51,6 +51,11 @@ pub enum TokenType {
CodeIndented,
CodeIndentedPrefixWhitespace,
+ CodeText,
+ CodeTextSequence,
+ CodeTextLineEnding,
+ CodeTextData,
+
CodeFlowChunk,
Data,
@@ -159,6 +164,8 @@ struct InternalState {
events_len: usize,
/// Length of the stack. It’s not allowed to decrease the stack in a check or an attempt.
stack_len: usize,
+ /// Previous code.
+ previous: Code,
/// Current code.
current: Code,
/// `index` in codes of the current code.
@@ -182,6 +189,8 @@ pub struct Tokenizer {
///
/// Tracked to make sure everything’s valid.
stack: Vec<TokenType>,
+ /// Previous character code.
+ pub previous: Code,
/// Current character code.
current: Code,
/// `index` in codes of the current code.
@@ -194,6 +203,7 @@ impl Tokenizer {
/// Create a new tokenizer.
pub fn new(point: Point, index: usize) -> Tokenizer {
Tokenizer {
+ previous: Code::None,
current: Code::None,
column_start: HashMap::new(),
index,
@@ -218,7 +228,6 @@ impl Tokenizer {
}
fn account_for_potential_skip(&mut self) {
- println!("account?: {:?} {:?}", self.point, self.index);
match self.column_start.get(&self.point.line) {
None => {}
Some(next_column) => {
@@ -227,7 +236,6 @@ impl Tokenizer {
self.point.column = col;
self.point.offset += col - 1;
self.index += col - 1;
- println!("account! {:?} {:?}", self.point, self.index);
}
}
};
@@ -266,6 +274,7 @@ impl Tokenizer {
}
self.index += 1;
+ self.previous = code;
// Mark as consumed.
self.consumed = true;
}
@@ -321,6 +330,7 @@ impl Tokenizer {
fn capture(&mut self) -> InternalState {
InternalState {
index: self.index,
+ previous: self.previous,
current: self.current,
point: self.point.clone(),
events_len: self.events.len(),
@@ -331,6 +341,7 @@ impl Tokenizer {
/// Apply the internal state.
fn free(&mut self, previous: InternalState) {
self.index = previous.index;
+ self.previous = previous.previous;
self.current = previous.current;
self.point = previous.point;
assert!(
@@ -429,6 +440,7 @@ impl Tokenizer {
Some(Box::new(b)),
None,
None,
+ None,
done,
)
}
@@ -446,16 +458,19 @@ impl Tokenizer {
Some(Box::new(b)),
Some(Box::new(c)),
None,
+ None,
done,
)
}
- pub fn attempt_4(
+ #[allow(clippy::many_single_char_names)]
+ pub fn attempt_5(
&mut self,
a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
d: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+ e: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
done: impl FnOnce(bool) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
self.call_multiple(
@@ -464,10 +479,12 @@ impl Tokenizer {
Some(Box::new(b)),
Some(Box::new(c)),
Some(Box::new(d)),
+ Some(Box::new(e)),
done,
)
}
+ #[allow(clippy::too_many_arguments, clippy::many_single_char_names)]
pub fn call_multiple(
&mut self,
check: bool,
@@ -475,6 +492,7 @@ impl Tokenizer {
b: Option<Box<StateFn>>,
c: Option<Box<StateFn>>,
d: Option<Box<StateFn>>,
+ e: Option<Box<StateFn>>,
done: impl FnOnce(bool) -> Box<StateFn> + 'static,
) -> Box<StateFn> {
if let Some(head) = a {
@@ -483,7 +501,7 @@ impl Tokenizer {
done(ok)
} else {
Box::new(move |tokenizer: &mut Tokenizer, code| {
- tokenizer.call_multiple(check, b, c, d, None, done)(tokenizer, code)
+ tokenizer.call_multiple(check, b, c, d, e, None, done)(tokenizer, code)
})
}
};
@@ -640,7 +658,6 @@ pub fn as_codes(value: &str) -> Vec<Code> {
'\t' => {
// To do: is this correct?
let virtual_spaces = TAB_SIZE - (column % TAB_SIZE);
- println!("tabs, expand {:?}, {:?}", column, virtual_spaces);
codes.push(Code::Char(char));
column += 1;
let mut index = 0;