aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--readme.md2
-rw-r--r--src/construct/attention.rs24
-rw-r--r--src/construct/autolink.rs75
-rw-r--r--src/construct/character_reference.rs69
-rw-r--r--src/construct/code_fenced.rs29
-rw-r--r--src/construct/code_text.rs4
-rw-r--r--src/construct/definition.rs4
-rw-r--r--src/construct/hard_break_escape.rs2
-rw-r--r--src/construct/hard_break_trailing.rs2
-rw-r--r--src/construct/heading_atx.rs2
-rw-r--r--src/construct/heading_setext.rs4
-rw-r--r--src/construct/html_flow.rs85
-rw-r--r--src/construct/html_text.rs80
-rw-r--r--src/construct/paragraph.rs3
-rw-r--r--src/construct/partial_data.rs6
-rw-r--r--src/construct/partial_destination.rs4
-rw-r--r--src/construct/partial_label.rs4
-rw-r--r--src/construct/partial_space_or_tab.rs4
-rw-r--r--src/construct/partial_title.rs21
-rw-r--r--src/construct/partial_whitespace.rs4
-rw-r--r--src/construct/thematic_break.rs19
-rw-r--r--src/content/flow.rs4
-rw-r--r--src/content/mod.rs1
-rw-r--r--src/content/text.rs8
-rw-r--r--src/parser.rs5
-rw-r--r--src/tokenizer.rs78
-rw-r--r--src/util/codes.rs126
-rw-r--r--src/util/encode.rs39
-rw-r--r--src/util/mod.rs1
-rw-r--r--src/util/normalize_identifier.rs2
-rw-r--r--src/util/span.rs57
31 files changed, 396 insertions, 372 deletions
diff --git a/readme.md b/readme.md
index 3df036e..9a5867c 100644
--- a/readme.md
+++ b/readme.md
@@ -154,7 +154,6 @@ cargo doc --document-private-items
- [ ] (3) Check subtokenizer unraveling is ok
- [ ] (3) Remove splicing and cloning in subtokenizer
- [ ] (3) Pass more references around
-- [ ] (1) Remove todos in `span.rs` if not needed
- [ ] (1) Get markers from constructs (`string`, `text`)
- [ ] (3) Read through rust docs to figure out what useful functions there are,
and fix stuff I’m doing manually now
@@ -276,3 +275,4 @@ important.
- [x] (3) Unicode punctuation
- [x] (1) Use rust to crawl unicode
- [x] (1) Document attention
+- [x] (1) Remove todos in `span.rs` if not needed
diff --git a/src/construct/attention.rs b/src/construct/attention.rs
index dff8633..2144864 100644
--- a/src/construct/attention.rs
+++ b/src/construct/attention.rs
@@ -109,6 +109,13 @@ enum MarkerKind {
}
impl MarkerKind {
+ /// Turn the kind into a [char].
+ fn as_char(&self) -> char {
+ match self {
+ MarkerKind::Asterisk => '*',
+ MarkerKind::Underscore => '_',
+ }
+ }
/// Turn [char] into a kind.
///
/// ## Panics
@@ -137,14 +144,23 @@ impl MarkerKind {
/// Attentention sequence that we can take markers from.
#[derive(Debug)]
struct Sequence {
+ /// Marker used in this sequence.
marker: MarkerKind,
+ /// The index into events where this sequence’s `Enter` currently resides.
event_index: usize,
+ /// The (shifted) point where this sequence starts.
start_point: Point,
+ /// The (shifted) index where this sequence starts.
start_index: usize,
+    /// The (shifted) point where this sequence ends.
end_point: Point,
+    /// The (shifted) index where this sequence ends.
end_index: usize,
+ /// The number of markers we can still use.
size: usize,
+ /// Whether this sequence can open attention.
open: bool,
+ /// Whether this sequence can close attention.
close: bool,
}
@@ -155,9 +171,9 @@ struct Sequence {
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::Char(char) if char == '*' || char == '_' => {
+ Code::Char('*' | '_') => {
tokenizer.enter(TokenType::AttentionSequence);
- inside(tokenizer, code, char)
+ inside(tokenizer, code, MarkerKind::from_code(code))
}
_ => (State::Nok, None),
}
@@ -168,9 +184,9 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```markdown
/// *|*
/// ```
-fn inside(tokenizer: &mut Tokenizer, code: Code, marker: char) -> StateFnResult {
+fn inside(tokenizer: &mut Tokenizer, code: Code, marker: MarkerKind) -> StateFnResult {
match code {
- Code::Char(char) if char == marker => {
+ Code::Char(char) if char == marker.as_char() => {
tokenizer.consume(code);
(State::Fn(Box::new(move |t, c| inside(t, c, marker))), None)
}
diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs
index 6486a2d..e29bf8b 100644
--- a/src/construct/autolink.rs
+++ b/src/construct/autolink.rs
@@ -148,17 +148,11 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// a<u|ser@example.com>b
/// ```
fn scheme_or_email_atext(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
- // Whether this character can be both a protocol and email atext.
- let unknown = match code {
- Code::Char('+' | '-' | '.') => true,
- Code::Char(char) if char.is_ascii_alphanumeric() => true,
- _ => false,
- };
-
- if unknown {
- scheme_inside_or_email_atext(tokenizer, code, 1)
- } else {
- email_atext(tokenizer, code)
+ match code {
+ Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
+ scheme_inside_or_email_atext(tokenizer, code, 1)
+ }
+ _ => email_atext(tokenizer, code),
}
}
@@ -173,20 +167,14 @@ fn scheme_inside_or_email_atext(
code: Code,
size: usize,
) -> StateFnResult {
- if let Code::Char(':') = code {
- tokenizer.consume(code);
- (State::Fn(Box::new(url_inside)), None)
- } else {
- // Whether this character can be both a protocol and email atext.
- let unknown = match code {
- Code::Char('+' | '-' | '.') if size < AUTOLINK_SCHEME_SIZE_MAX => true,
- Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_SCHEME_SIZE_MAX => {
- true
- }
- _ => false,
- };
-
- if unknown {
+ match code {
+ Code::Char(':') => {
+ tokenizer.consume(code);
+ (State::Fn(Box::new(url_inside)), None)
+ }
+ Code::Char('+' | '-' | '.' | '0'..='9' | 'A'..='Z' | 'a'..='z')
+ if size < AUTOLINK_SCHEME_SIZE_MAX =>
+ {
tokenizer.consume(code);
(
State::Fn(Box::new(move |t, c| {
@@ -194,9 +182,8 @@ fn scheme_inside_or_email_atext(
})),
None,
)
- } else {
- email_atext(tokenizer, code)
}
+ _ => email_atext(tokenizer, code),
}
}
@@ -291,22 +278,22 @@ fn email_label(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnRes
/// a<user.name@ex-|ample.com>b
/// ```
fn email_value(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
- let ok = match code {
- Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => true,
- Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => true,
- _ => false,
- };
-
- if ok {
- tokenizer.consume(code);
- let func = if let Code::Char('-') = code {
- email_value
- } else {
- email_label
- };
- (State::Fn(Box::new(move |t, c| func(t, c, size + 1))), None)
- } else {
- (State::Nok, None)
+ match code {
+ Code::Char('-') if size < AUTOLINK_DOMAIN_SIZE_MAX => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| email_value(t, c, size + 1))),
+ None,
+ )
+ }
+ Code::Char(char) if char.is_ascii_alphanumeric() && size < AUTOLINK_DOMAIN_SIZE_MAX => {
+ tokenizer.consume(code);
+ (
+ State::Fn(Box::new(move |t, c| email_label(t, c, size + 1))),
+ None,
+ )
+ }
+ _ => (State::Nok, None),
}
}
@@ -325,7 +312,7 @@ fn end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.exit(TokenType::Autolink);
(State::Ok, None)
}
- _ => unreachable!("expected `>` at `end`"),
+ _ => unreachable!("expected `>`"),
}
}
diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs
index bc42d21..65e49ca 100644
--- a/src/construct/character_reference.rs
+++ b/src/construct/character_reference.rs
@@ -138,21 +138,18 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// a&|#x9;b
/// ```
fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+ let info = Info {
+ buffer: vec![],
+ kind: Kind::Named,
+ };
if let Code::Char('#') = code {
tokenizer.enter(TokenType::CharacterReferenceMarkerNumeric);
tokenizer.consume(code);
tokenizer.exit(TokenType::CharacterReferenceMarkerNumeric);
- (State::Fn(Box::new(numeric)), None)
+ (State::Fn(Box::new(|t, c| numeric(t, c, info))), None)
} else {
tokenizer.enter(TokenType::CharacterReferenceValue);
- value(
- tokenizer,
- code,
- Info {
- buffer: vec![],
- kind: Kind::Named,
- },
- )
+ value(tokenizer, code, info)
}
}
@@ -163,37 +160,18 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// a&#|123;b
/// a&#|x9;b
/// ```
-fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn numeric(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
if let Code::Char('x' | 'X') = code {
tokenizer.enter(TokenType::CharacterReferenceMarkerHexadecimal);
tokenizer.consume(code);
tokenizer.exit(TokenType::CharacterReferenceMarkerHexadecimal);
tokenizer.enter(TokenType::CharacterReferenceValue);
-
- (
- State::Fn(Box::new(|t, c| {
- value(
- t,
- c,
- Info {
- buffer: vec![],
- kind: Kind::Hexadecimal,
- },
- )
- })),
- None,
- )
+ info.kind = Kind::Hexadecimal;
+ (State::Fn(Box::new(|t, c| value(t, c, info))), None)
} else {
tokenizer.enter(TokenType::CharacterReferenceValue);
-
- value(
- tokenizer,
- code,
- Info {
- buffer: vec![],
- kind: Kind::Decimal,
- },
- )
+ info.kind = Kind::Decimal;
+ value(tokenizer, code, info)
}
}
@@ -210,20 +188,19 @@ fn numeric(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
fn value(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
match code {
Code::Char(';') if !info.buffer.is_empty() => {
- tokenizer.exit(TokenType::CharacterReferenceValue);
- let value = info.buffer.iter().collect::<String>();
-
- if let Kind::Named = info.kind {
- if !CHARACTER_REFERENCE_NAMES.contains(&value.as_str()) {
- return (State::Nok, None);
- }
+ if Kind::Named == info.kind
+ && !CHARACTER_REFERENCE_NAMES
+ .contains(&info.buffer.iter().collect::<String>().as_str())
+ {
+ (State::Nok, None)
+ } else {
+ tokenizer.exit(TokenType::CharacterReferenceValue);
+ tokenizer.enter(TokenType::CharacterReferenceMarkerSemi);
+ tokenizer.consume(code);
+ tokenizer.exit(TokenType::CharacterReferenceMarkerSemi);
+ tokenizer.exit(TokenType::CharacterReference);
+ (State::Ok, None)
}
-
- tokenizer.enter(TokenType::CharacterReferenceMarkerSemi);
- tokenizer.consume(code);
- tokenizer.exit(TokenType::CharacterReferenceMarkerSemi);
- tokenizer.exit(TokenType::CharacterReference);
- (State::Ok, None)
}
Code::Char(char) => {
if info.buffer.len() < info.kind.max() && info.kind.allowed(char) {
diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs
index f2d243a..05266ba 100644
--- a/src/construct/code_fenced.rs
+++ b/src/construct/code_fenced.rs
@@ -151,6 +151,17 @@ impl Kind {
_ => unreachable!("invalid char"),
}
}
+ /// Turn [Code] into a kind.
+ ///
+ /// ## Panics
+ ///
+ /// Panics if `code` is not ``Code::Char('~' | '`')``.
+ fn from_code(code: Code) -> Kind {
+ match code {
+ Code::Char(char) => Kind::from_char(char),
+ _ => unreachable!("invalid code"),
+ }
+ }
}
/// State needed to parse code (fenced).
@@ -172,10 +183,6 @@ struct Info {
/// console.log(1);
/// ~~~
/// ```
-///
-/// Parsing note: normally, the prefix is already stripped.
-/// `flow.rs` makes sure that that doesn’t happen for code (fenced), as we need
-/// it.
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.enter(TokenType::CodeFenced);
tokenizer.enter(TokenType::CodeFencedFence);
@@ -202,7 +209,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult
}
match code {
- Code::Char(char) if char == '`' || char == '~' => {
+ Code::Char('`' | '~') => {
tokenizer.enter(TokenType::CodeFencedFenceSequence);
sequence_open(
tokenizer,
@@ -210,7 +217,7 @@ fn before_sequence_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult
Info {
prefix,
size: 0,
- kind: Kind::from_char(char),
+ kind: Kind::from_code(code),
},
)
}
@@ -237,11 +244,11 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> State
None,
)
}
- _ if info.size < CODE_FENCED_SEQUENCE_SIZE_MIN => (State::Nok, None),
- _ => {
+ _ if info.size >= CODE_FENCED_SEQUENCE_SIZE_MIN => {
tokenizer.exit(TokenType::CodeFencedFenceSequence);
tokenizer.attempt_opt(space_or_tab(), |t, c| info_before(t, c, info))(tokenizer, code)
}
+ _ => (State::Nok, None),
}
}
@@ -291,7 +298,7 @@ fn info_inside(
tokenizer.exit(TokenType::CodeFencedFenceInfo);
tokenizer.attempt_opt(space_or_tab(), |t, c| meta_before(t, c, info))(tokenizer, code)
}
- Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
+ Code::Char('`') if info.kind == Kind::GraveAccent => (State::Nok, None),
Code::Char(_) => {
codes.push(code);
tokenizer.consume(code);
@@ -339,7 +346,7 @@ fn meta(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
tokenizer.exit(TokenType::CodeFencedFence);
at_break(tokenizer, code, info)
}
- Code::Char(char) if char == '`' && info.kind == Kind::GraveAccent => (State::Nok, None),
+ Code::Char('`') if info.kind == Kind::GraveAccent => (State::Nok, None),
_ => {
tokenizer.consume(code);
(State::Fn(Box::new(|t, c| meta(t, c, info))), None)
@@ -369,7 +376,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult
}
},
)(tokenizer, code),
- _ => unreachable!("unexpected non-eol/eof after `at_break` `{:?}`", code),
+ _ => unreachable!("expected eof/eol"),
}
}
diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs
index c595c75..a6dc7eb 100644
--- a/src/construct/code_text.rs
+++ b/src/construct/code_text.rs
@@ -138,7 +138,7 @@ fn sequence_open(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnR
fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {
match code {
Code::None => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.enter(TokenType::CodeTextLineEnding);
tokenizer.consume(code);
tokenizer.exit(TokenType::CodeTextLineEnding);
@@ -165,7 +165,7 @@ fn between(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnRe
/// ```
fn data(tokenizer: &mut Tokenizer, code: Code, size_open: usize) -> StateFnResult {
match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '`') => {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '`') => {
tokenizer.exit(TokenType::CodeTextData);
between(tokenizer, code, size_open)
}
diff --git a/src/construct/definition.rs b/src/construct/definition.rs
index e1afd03..db4a009 100644
--- a/src/construct/definition.rs
+++ b/src/construct/definition.rs
@@ -227,7 +227,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn after_whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::Definition);
// You’d be interrupting.
tokenizer.interrupt = true;
@@ -293,7 +293,7 @@ fn title_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn title_after_after_optional_whitespace(_tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
(State::Ok, Some(vec![code]))
}
_ => (State::Nok, None),
diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs
index 1e755a3..212d276 100644
--- a/src/construct/hard_break_escape.rs
+++ b/src/construct/hard_break_escape.rs
@@ -67,7 +67,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn inside(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::HardBreakEscape);
(State::Ok, Some(vec![code]))
}
diff --git a/src/construct/hard_break_trailing.rs b/src/construct/hard_break_trailing.rs
index 6709e51..35a7cab 100644
--- a/src/construct/hard_break_trailing.rs
+++ b/src/construct/hard_break_trailing.rs
@@ -76,7 +76,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, size: usize) -> StateFnResult {
None,
)
}
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
if size >= HARD_BREAK_PREFIX_SIZE_MIN =>
{
tokenizer.exit(TokenType::HardBreakTrailingSpace);
diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs
index 3ce7052..9fa2ace 100644
--- a/src/construct/heading_atx.rs
+++ b/src/construct/heading_atx.rs
@@ -181,7 +181,7 @@ fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
-/// To do.
+/// Resolve heading (atx).
pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
let mut edit_map = EditMap::new();
let mut index = 0;
diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs
index df20aa7..211434f 100644
--- a/src/construct/heading_setext.rs
+++ b/src/construct/heading_setext.rs
@@ -179,7 +179,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
}
-/// To do.
+/// Resolve heading (setext).
pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
let mut edit_map = EditMap::new();
let mut index = 0;
@@ -207,7 +207,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
tokenizer.events[enter].token_type = TokenType::HeadingSetextText;
tokenizer.events[exit].token_type = TokenType::HeadingSetextText;
- // Add of Enter:HeadingSetext, Exit:HeadingSetext.
+ // Add Enter:HeadingSetext, Exit:HeadingSetext.
let mut heading_enter = tokenizer.events[enter].clone();
heading_enter.token_type = TokenType::HeadingSetext;
let mut heading_exit = tokenizer.events[index].clone();
diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs
index a1bddad..229b0ef 100644
--- a/src/construct/html_flow.rs
+++ b/src/construct/html_flow.rs
@@ -103,6 +103,7 @@ use crate::construct::{
blank_line::start as blank_line, partial_space_or_tab::space_or_tab_min_max,
};
use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+use crate::util::codes::{parse, serialize};
/// Kind of HTML (flow).
#[derive(Debug, PartialEq)]
@@ -164,6 +165,17 @@ impl QuoteKind {
_ => unreachable!("invalid char"),
}
}
+ /// Turn [Code] into a kind.
+ ///
+ /// ## Panics
+ ///
+ /// Panics if `code` is not `Code::Char('"' | '\'')`.
+ fn from_code(code: Code) -> QuoteKind {
+ match code {
+ Code::Char(char) => QuoteKind::from_char(char),
+ _ => unreachable!("invalid code"),
+ }
+ }
}
/// State needed to parse HTML (flow).
@@ -175,7 +187,7 @@ struct Info {
start_tag: bool,
/// Used depending on `kind` to either collect all parsed characters, or to
/// store expected characters.
- buffer: Vec<char>,
+ buffer: Vec<Code>,
/// `index` into `buffer` when expecting certain characters.
index: usize,
/// Current quote, when in a double or single quoted attribute value.
@@ -254,7 +266,7 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
None,
)
}
- Code::Char(char) if char.is_ascii_alphabetic() => {
+ Code::Char('A'..='Z' | 'a'..='z') => {
info.start_tag = true;
tag_name(tokenizer, code, info)
}
@@ -282,14 +294,14 @@ fn declaration_open(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> St
Code::Char('[') => {
tokenizer.consume(code);
info.kind = Kind::Cdata;
- info.buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
+ info.buffer = parse("CDATA[");
info.index = 0;
(
State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))),
None,
)
}
- Code::Char(char) if char.is_ascii_alphabetic() => {
+ Code::Char('A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
info.kind = Kind::Declaration;
(
@@ -329,22 +341,21 @@ fn comment_open_inside(tokenizer: &mut Tokenizer, code: Code, info: Info) -> Sta
/// <![CDATA|[>&<]]>
/// ```
fn cdata_open_inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
- match code {
- Code::Char(char) if char == info.buffer[info.index] => {
- info.index += 1;
- tokenizer.consume(code);
+ if code == info.buffer[info.index] {
+ info.index += 1;
+ tokenizer.consume(code);
- if info.index == info.buffer.len() {
- info.buffer.clear();
- (State::Fn(Box::new(|t, c| continuation(t, c, info))), None)
- } else {
- (
- State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))),
- None,
- )
- }
+ if info.index == info.buffer.len() {
+ info.buffer.clear();
+ (State::Fn(Box::new(|t, c| continuation(t, c, info))), None)
+ } else {
+ (
+ State::Fn(Box::new(|t, c| cdata_open_inside(t, c, info))),
+ None,
+ )
}
- _ => (State::Nok, None),
+ } else {
+ (State::Nok, None)
}
}
@@ -355,9 +366,9 @@ fn cdata_open_inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> S
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
match code {
- Code::Char(char) if char.is_ascii_alphabetic() => {
+ Code::Char('A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
- info.buffer.push(char);
+ info.buffer.push(code);
(State::Fn(Box::new(|t, c| tag_name(t, c, info))), None)
}
_ => (State::Nok, None),
@@ -376,13 +387,9 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes
| Code::CarriageReturnLineFeed
| Code::VirtualSpace
| Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => {
- let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
+ let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
let name = tag_name_buffer.as_str();
- let slash = if let Code::Char(char) = code {
- char == '/'
- } else {
- false
- };
+ let slash = matches!(code, Code::Char('/'));
info.buffer.clear();
@@ -413,9 +420,9 @@ fn tag_name(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes
}
}
}
- Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+ Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
- info.buffer.push(char);
+ info.buffer.push(code);
(State::Fn(Box::new(|t, c| tag_name(t, c, info))), None)
}
Code::Char(_) => (State::Nok, None),
@@ -481,7 +488,7 @@ fn complete_attribute_name_before(
tokenizer.consume(code);
(State::Fn(Box::new(|t, c| complete_end(t, c, info))), None)
}
- Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => {
+ Code::Char('0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume(code);
(
State::Fn(Box::new(|t, c| complete_attribute_name(t, c, info))),
@@ -508,13 +515,7 @@ fn complete_attribute_name_before(
/// ```
fn complete_attribute_name(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
match code {
- Code::Char(char)
- if char == '-'
- || char == '.'
- || char == ':'
- || char == '_'
- || char.is_ascii_alphanumeric() =>
- {
+ Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume(code);
(
State::Fn(Box::new(|t, c| complete_attribute_name(t, c, info))),
@@ -571,9 +572,9 @@ fn complete_attribute_value_before(
) -> StateFnResult {
match code {
Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
- Code::Char(char) if char == '"' || char == '\'' => {
+ Code::Char('"' | '\'') => {
tokenizer.consume(code);
- info.quote = Some(QuoteKind::from_char(char));
+ info.quote = Some(QuoteKind::from_code(code));
(
State::Fn(Box::new(|t, c| complete_attribute_value_quoted(t, c, info))),
None,
@@ -602,7 +603,7 @@ fn complete_attribute_value_quoted(
info: Info,
) -> StateFnResult {
match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => (State::Nok, None),
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
Code::Char(char) if char == info.quote.as_ref().unwrap().as_char() => {
tokenizer.consume(code);
(
@@ -860,7 +861,7 @@ fn continuation_raw_end_tag(
) -> StateFnResult {
match code {
Code::Char('>') => {
- let tag_name_buffer = info.buffer.iter().collect::<String>().to_lowercase();
+ let tag_name_buffer = serialize(&info.buffer, false).to_lowercase();
info.buffer.clear();
if HTML_RAW_NAMES.contains(&tag_name_buffer.as_str()) {
@@ -873,9 +874,9 @@ fn continuation_raw_end_tag(
continuation(tokenizer, code, info)
}
}
- Code::Char(char) if char.is_ascii_alphabetic() && info.buffer.len() < HTML_RAW_SIZE_MAX => {
+ Code::Char('A'..='Z' | 'a'..='z') if info.buffer.len() < HTML_RAW_SIZE_MAX => {
tokenizer.consume(code);
- info.buffer.push(char);
+ info.buffer.push(code);
(
State::Fn(Box::new(|t, c| continuation_raw_end_tag(t, c, info))),
None,
diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs
index 2ac0ccd..0926f48 100644
--- a/src/construct/html_text.rs
+++ b/src/construct/html_text.rs
@@ -56,6 +56,7 @@
use crate::construct::partial_space_or_tab::space_or_tab;
use crate::tokenizer::{Code, State, StateFn, StateFnResult, TokenType, Tokenizer};
+use crate::util::codes::parse;
/// Start of HTML (text)
///
@@ -94,7 +95,7 @@ fn open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.consume(code);
(State::Fn(Box::new(instruction)), None)
}
- Code::Char(char) if char.is_ascii_alphabetic() => {
+ Code::Char('A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_open)), None)
}
@@ -117,13 +118,13 @@ fn declaration_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
}
Code::Char('[') => {
tokenizer.consume(code);
- let buffer = vec!['C', 'D', 'A', 'T', 'A', '['];
+ let buffer = parse("CDATA[");
(
State::Fn(Box::new(|t, c| cdata_open_inside(t, c, buffer, 0))),
None,
)
}
- Code::Char(char) if char.is_ascii_alphabetic() => {
+ Code::Char('A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
(State::Fn(Box::new(declaration)), None)
}
@@ -197,7 +198,7 @@ fn comment_start_dash(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
fn comment(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(comment))
}
Code::Char('-') => {
@@ -239,25 +240,24 @@ fn comment_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
fn cdata_open_inside(
tokenizer: &mut Tokenizer,
code: Code,
- buffer: Vec<char>,
+ buffer: Vec<Code>,
index: usize,
) -> StateFnResult {
- match code {
- Code::Char(char) if char == buffer[index] => {
- tokenizer.consume(code);
+ if code == buffer[index] {
+ tokenizer.consume(code);
- if index + 1 == buffer.len() {
- (State::Fn(Box::new(cdata)), None)
- } else {
- (
- State::Fn(Box::new(move |t, c| {
- cdata_open_inside(t, c, buffer, index + 1)
- })),
- None,
- )
- }
+ if index + 1 == buffer.len() {
+ (State::Fn(Box::new(cdata)), None)
+ } else {
+ (
+ State::Fn(Box::new(move |t, c| {
+ cdata_open_inside(t, c, buffer, index + 1)
+ })),
+ None,
+ )
}
- _ => (State::Nok, None),
+ } else {
+ (State::Nok, None)
}
}
@@ -269,7 +269,7 @@ fn cdata_open_inside(
fn cdata(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(cdata))
}
Code::Char(']') => {
@@ -319,7 +319,7 @@ fn cdata_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::Char('>') => end(tokenizer, code),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(declaration))
}
_ => {
@@ -338,7 +338,7 @@ fn declaration(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
fn instruction(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(instruction))
}
Code::Char('?') => {
@@ -372,7 +372,7 @@ fn instruction_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::Char(char) if char.is_ascii_alphabetic() => {
+ Code::Char('A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_close)), None)
}
@@ -388,7 +388,7 @@ fn tag_close_start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+ Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_close)), None)
}
@@ -404,7 +404,7 @@ fn tag_close(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(tag_close_between))
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
@@ -422,13 +422,13 @@ fn tag_close_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::Char(char) if char == '-' || char.is_ascii_alphanumeric() => {
+ Code::Char('-' | '0'..='9' | 'A'..='Z' | 'a'..='z') => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_open)), None)
}
Code::CarriageReturnLineFeed
| Code::VirtualSpace
- | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
+ | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
_ => (State::Nok, None),
}
}
@@ -442,7 +442,7 @@ fn tag_open(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(tag_open_between))
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
@@ -453,7 +453,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.consume(code);
(State::Fn(Box::new(end)), None)
}
- Code::Char(char) if char == ':' || char == '_' || char.is_ascii_alphabetic() => {
+ Code::Char(':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_open_attribute_name)), None)
}
@@ -470,13 +470,7 @@ fn tag_open_between(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::Char(char)
- if char == '-'
- || char == '.'
- || char == ':'
- || char == '_'
- || char.is_ascii_alphanumeric() =>
- {
+ Code::Char('-' | '.' | '0'..='9' | ':' | 'A'..='Z' | '_' | 'a'..='z') => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_open_attribute_name)), None)
}
@@ -494,7 +488,7 @@ fn tag_open_attribute_name(tokenizer: &mut Tokenizer, code: Code) -> StateFnResu
/// ```
fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(tag_open_attribute_name_after))
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
@@ -519,7 +513,7 @@ fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer, code: Code) -> State
fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::Char('<' | '=' | '>' | '`') => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
at_line_ending(tokenizer, code, Box::new(tag_open_attribute_value_before))
}
Code::VirtualSpace | Code::Char('\t' | ' ') => {
@@ -555,7 +549,7 @@ fn tag_open_attribute_value_quoted(
) -> StateFnResult {
match code {
Code::None => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => at_line_ending(
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => at_line_ending(
tokenizer,
code,
Box::new(move |t, c| tag_open_attribute_value_quoted(t, c, marker)),
@@ -589,7 +583,7 @@ fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer, code: Code) -> S
Code::None | Code::Char('"' | '\'' | '<' | '=' | '`') => (State::Nok, None),
Code::CarriageReturnLineFeed
| Code::VirtualSpace
- | Code::Char('\r' | '\n' | '\t' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
+ | Code::Char('\t' | '\n' | '\r' | ' ' | '/' | '>') => tag_open_between(tokenizer, code),
Code::Char(_) => {
tokenizer.consume(code);
(State::Fn(Box::new(tag_open_attribute_value_unquoted)), None)
@@ -607,7 +601,7 @@ fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer, code: Code)
match code {
Code::CarriageReturnLineFeed
| Code::VirtualSpace
- | Code::Char('\r' | '\n' | '\t' | ' ' | '>' | '/') => tag_open_between(tokenizer, code),
+ | Code::Char('\t' | '\n' | '\r' | ' ' | '>' | '/') => tag_open_between(tokenizer, code),
_ => (State::Nok, None),
}
}
@@ -646,7 +640,7 @@ fn at_line_ending(
return_state: Box<StateFn>,
) -> StateFnResult {
match code {
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::HtmlTextData);
tokenizer.enter(TokenType::LineEnding);
tokenizer.consume(code);
@@ -656,7 +650,7 @@ fn at_line_ending(
None,
)
}
- _ => unreachable!("expected line ending"),
+ _ => unreachable!("expected eol"),
}
}
diff --git a/src/construct/paragraph.rs b/src/construct/paragraph.rs
index ae2f4de..5ec278e 100644
--- a/src/construct/paragraph.rs
+++ b/src/construct/paragraph.rs
@@ -45,7 +45,7 @@ use crate::util::edit_map::EditMap;
pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
- unreachable!("unexpected eol/eof at start of paragraph")
+ unreachable!("unexpected eol/eof")
}
_ => {
tokenizer.enter(TokenType::Paragraph);
@@ -99,7 +99,6 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Vec<Event> {
{
// Remove Exit:Paragraph, Enter:LineEnding, Exit:LineEnding, Enter:Paragraph.
edit_map.add(exit_index, 4, vec![]);
- println!("rm {:?} {:?}", exit_index, exit_index + 4);
// Add Exit:LineEnding position info to Exit:Data.
let line_ending_exit = &tokenizer.events[enter_next_index - 1];
diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs
index 9f99570..555ccaf 100644
--- a/src/construct/partial_data.rs
+++ b/src/construct/partial_data.rs
@@ -6,8 +6,6 @@
//! [string]: crate::content::string
//! [text]: crate::content::text
-// To do: pass token types in?
-
use crate::tokenizer::{Code, Event, EventType, State, StateFnResult, TokenType, Tokenizer};
use crate::util::edit_map::EditMap;
@@ -34,7 +32,7 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnR
fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult {
match code {
Code::None => (State::Ok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.enter(TokenType::LineEnding);
tokenizer.consume(code);
tokenizer.exit(TokenType::LineEnding);
@@ -58,7 +56,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnRe
/// ```
fn data(tokenizer: &mut Tokenizer, code: Code, stop: Vec<Code>) -> StateFnResult {
let done = match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => true,
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => true,
_ if stop.contains(&code) => true,
_ => false,
};
diff --git a/src/construct/partial_destination.rs b/src/construct/partial_destination.rs
index 8b281c7..31c13ec 100644
--- a/src/construct/partial_destination.rs
+++ b/src/construct/partial_destination.rs
@@ -171,7 +171,7 @@ fn enclosed(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult
tokenizer.exit(info.options.string.clone());
enclosed_before(tokenizer, code, info)
}
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '<') => {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '<') => {
(State::Nok, None)
}
Code::Char('\\') => {
@@ -235,7 +235,7 @@ fn raw(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
Code::None
| Code::CarriageReturnLineFeed
| Code::VirtualSpace
- | Code::Char('\t' | '\r' | '\n' | ' ') => {
+ | Code::Char('\t' | '\n' | '\r' | ' ') => {
if info.balance > 0 {
(State::Nok, None)
} else {
diff --git a/src/construct/partial_label.rs b/src/construct/partial_label.rs
index 32182d6..f201f60 100644
--- a/src/construct/partial_label.rs
+++ b/src/construct/partial_label.rs
@@ -133,7 +133,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes
tokenizer.exit(info.options.label);
(State::Ok, None)
}
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go(
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go(
space_or_tab_eol_with_options(EolOptions {
content_type: Some(ContentType::String),
connect: info.connect,
@@ -165,7 +165,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes
/// ```
fn label(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResult {
match code {
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n' | '[' | ']') => {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r' | '[' | ']') => {
tokenizer.exit(TokenType::Data);
at_break(tokenizer, code, info)
}
diff --git a/src/construct/partial_space_or_tab.rs b/src/construct/partial_space_or_tab.rs
index d2934b3..5b1ec5e 100644
--- a/src/construct/partial_space_or_tab.rs
+++ b/src/construct/partial_space_or_tab.rs
@@ -195,7 +195,7 @@ fn inside(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnResul
/// ```
fn after_space_or_tab(tokenizer: &mut Tokenizer, code: Code, mut info: EolInfo) -> StateFnResult {
match code {
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.enter_with_content(TokenType::LineEnding, info.options.content_type);
if info.connect {
@@ -254,7 +254,7 @@ fn after_more_space_or_tab(_tokenizer: &mut Tokenizer, code: Code) -> StateFnRes
// Blank line not allowed.
if matches!(
code,
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
) {
(State::Nok, None)
} else {
diff --git a/src/construct/partial_title.rs b/src/construct/partial_title.rs
index caacb0d..010f554 100644
--- a/src/construct/partial_title.rs
+++ b/src/construct/partial_title.rs
@@ -102,6 +102,19 @@ impl Kind {
_ => unreachable!("invalid char"),
}
}
+ /// Turn [Code] into a kind.
+ ///
+ /// > 👉 **Note**: an opening paren must be used for `Kind::Paren`.
+ ///
+ /// ## Panics
+ ///
+ /// Panics if `code` is not `Code::Char('(' | '"' | '\'')`.
+ fn from_code(code: Code) -> Kind {
+ match code {
+ Code::Char(char) => Kind::from_char(char),
+ _ => unreachable!("invalid code"),
+ }
+ }
}
/// State needed to parse titles.
@@ -124,10 +137,10 @@ struct Info {
/// ```
pub fn start(tokenizer: &mut Tokenizer, code: Code, options: Options) -> StateFnResult {
match code {
- Code::Char(char) if char == '"' || char == '\'' || char == '(' => {
+ Code::Char('"' | '\'' | '(') => {
let info = Info {
connect: false,
- kind: Kind::from_char(char),
+ kind: Kind::from_code(code),
options,
};
tokenizer.enter(info.options.title.clone());
@@ -180,7 +193,7 @@ fn at_break(tokenizer: &mut Tokenizer, code: Code, mut info: Info) -> StateFnRes
begin(tokenizer, code, info)
}
Code::None => (State::Nok, None),
- Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => tokenizer.go(
+ Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => tokenizer.go(
space_or_tab_eol_with_options(EolOptions {
content_type: Some(ContentType::String),
connect: info.connect,
@@ -216,7 +229,7 @@ fn title(tokenizer: &mut Tokenizer, code: Code, info: Info) -> StateFnResult {
tokenizer.exit(TokenType::Data);
at_break(tokenizer, code, info)
}
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n') => {
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
tokenizer.exit(TokenType::Data);
at_break(tokenizer, code, info)
}
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index 62b1205..c9ec564 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -33,7 +33,7 @@ pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
space_or_tab(),
if matches!(
tokenizer.previous,
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
) {
// If there’s whitespace, and we were at an eol/eof, `ok`
ok
@@ -48,7 +48,7 @@ pub fn whitespace(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
fn at_eol(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
if matches!(
code,
- Code::None | Code::CarriageReturnLineFeed | Code::Char('\r' | '\n')
+ Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')
) {
ok(tokenizer, code)
} else {
diff --git a/src/construct/thematic_break.rs b/src/construct/thematic_break.rs
index 8d29157..28aca34 100644
--- a/src/construct/thematic_break.rs
+++ b/src/construct/thematic_break.rs
@@ -95,7 +95,7 @@ impl Kind {
///
/// ## Panics
///
- /// Panics if `char` is not `*`, `_`, or `_`.
+ /// Panics if `char` is not `*`, `-`, or `_`.
fn from_char(char: char) -> Kind {
match char {
'*' => Kind::Asterisk,
@@ -104,6 +104,19 @@ impl Kind {
_ => unreachable!("invalid char"),
}
}
+ /// Turn [Code] into a kind.
+ ///
+ /// > 👉 **Note**: see [`Kind::from_char`] for the char equivalent.
+ ///
+ /// ## Panics
+ ///
+ /// Panics if `code` is not `Code::Char('*' | '-' | '_')`.
+ fn from_code(code: Code) -> Kind {
+ match code {
+ Code::Char(char) => Kind::from_char(char),
+ _ => unreachable!("invalid code"),
+ }
+ }
}
/// State needed to parse thematic breaks.
@@ -133,11 +146,11 @@ pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
/// ```
fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
match code {
- Code::Char(char) if char == '*' || char == '-' || char == '_' => at_break(
+ Code::Char('*' | '-' | '_') => at_break(
tokenizer,
code,
Info {
- kind: Kind::from_char(char),
+ kind: Kind::from_code(code),
size: 0,
},
),
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 3ff948d..74c6a62 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -140,7 +140,7 @@ fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.interrupt = false;
(State::Fn(Box::new(start)), None)
}
- _ => unreachable!("expected eol/eof after blank line `{:?}`", code),
+ _ => unreachable!("expected eol/eof"),
}
}
@@ -162,7 +162,7 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
tokenizer.exit(TokenType::LineEnding);
(State::Fn(Box::new(start)), None)
}
- _ => unreachable!("unexpected non-eol/eof after flow `{:?}`", code),
+ _ => unreachable!("expected eol/eof"),
}
}
diff --git a/src/content/mod.rs b/src/content/mod.rs
index 395e41b..ae8ad83 100644
--- a/src/content/mod.rs
+++ b/src/content/mod.rs
@@ -1,6 +1,5 @@
//! Content types found in markdown.
-#[allow(clippy::module_inception)]
pub mod flow;
pub mod string;
pub mod text;
diff --git a/src/content/text.rs b/src/content/text.rs
index ecb6ae1..cf630f1 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -8,15 +8,15 @@
//!
//! * [Attention][crate::construct::attention]
//! * [Autolink][crate::construct::autolink]
-//! * [HTML (text)][crate::construct::html_text]
+//! * [Character escape][crate::construct::character_escape]
+//! * [Character reference][crate::construct::character_reference]
+//! * [Code (text)][crate::construct::code_text]
//! * [Hard break (escape)][crate::construct::hard_break_escape]
//! * [Hard break (trailing)][crate::construct::hard_break_trailing]
-//! * [Code (text)][crate::construct::code_text]
+//! * [HTML (text)][crate::construct::html_text]
//! * [Label start (image)][crate::construct::label_start_image]
//! * [Label start (link)][crate::construct::label_start_link]
//! * [Label end][crate::construct::label_end]
-//! * [Character escape][crate::construct::character_escape]
-//! * [Character reference][crate::construct::character_reference]
use crate::construct::{
attention::start as attention, autolink::start as autolink,
diff --git a/src/parser.rs b/src/parser.rs
index 89a0de1..32689d6 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -3,7 +3,8 @@
use std::collections::HashSet;
// To do: this should start with `containers`, when they’re done.
use crate::content::flow::flow;
-use crate::tokenizer::{as_codes, Code, Event, Point};
+use crate::tokenizer::{Code, Event, Point};
+use crate::util::codes::parse as parse_codes;
/// Information needed, in all content types, when parsing markdown.
///
@@ -22,7 +23,7 @@ pub struct ParseState {
/// Passes the codes back so the compiler can access the source.
pub fn parse(value: &str) -> (Vec<Event>, Vec<Code>) {
let mut parse_state = ParseState {
- codes: as_codes(value),
+ codes: parse_codes(value),
definitions: HashSet::new(),
};
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 1fa94d7..f0f9ff0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -11,7 +11,6 @@
//! [`attempt`]: Tokenizer::attempt
//! [`check`]: Tokenizer::check
-use crate::constant::TAB_SIZE;
use crate::parser::ParseState;
use std::collections::HashMap;
@@ -2224,83 +2223,6 @@ fn feed_impl(
check_statefn_result((state, None))
}
-/// Turn a string into codes.
-pub fn as_codes(value: &str) -> Vec<Code> {
- let mut codes: Vec<Code> = vec![];
- let mut at_start = true;
- let mut at_carriage_return = false;
- let mut column = 1;
-
- for char in value.chars() {
- if at_start {
- if char == '\u{feff}' {
- // Ignore.
- continue;
- }
-
- at_start = false;
- }
-
- // Send a CRLF.
- if at_carriage_return && '\n' == char {
- at_carriage_return = false;
- codes.push(Code::CarriageReturnLineFeed);
- } else {
- // Send the previous CR: we’re not at a next `\n`.
- if at_carriage_return {
- at_carriage_return = false;
- codes.push(Code::Char('\r'));
- }
-
- match char {
- // Send a replacement character.
- '\0' => {
- column += 1;
- codes.push(Code::Char('�'));
- }
- // Send a tab and virtual spaces.
- '\t' => {
- let remainder = column % TAB_SIZE;
- let mut virtual_spaces = if remainder == 0 {
- 0
- } else {
- TAB_SIZE - remainder
- };
- codes.push(Code::Char(char));
- column += 1;
- while virtual_spaces > 0 {
- codes.push(Code::VirtualSpace);
- column += 1;
- virtual_spaces -= 1;
- }
- }
- // Send an LF.
- '\n' => {
- column = 1;
- codes.push(Code::Char(char));
- }
- // Don’t send anything yet.
- '\r' => {
- column = 1;
- at_carriage_return = true;
- }
- // Send the char.
- _ => {
- column += 1;
- codes.push(Code::Char(char));
- }
- }
- };
- }
-
- // Send the last CR: we’re not at a next `\n`.
- if at_carriage_return {
- codes.push(Code::Char('\r'));
- }
-
- codes
-}
-
/// Check a [`StateFnResult`][], make sure its valid (that there are no bugs),
/// and clean a final eof passed back in `remainder`.
fn check_statefn_result(result: StateFnResult) -> StateFnResult {
diff --git a/src/util/codes.rs b/src/util/codes.rs
new file mode 100644
index 0000000..8a46d02
--- /dev/null
+++ b/src/util/codes.rs
@@ -0,0 +1,126 @@
+//! Utilities to deal with character codes.
+
+use crate::constant::TAB_SIZE;
+use crate::tokenizer::Code;
+
+/// Turn a string into codes.
+pub fn parse(value: &str) -> Vec<Code> {
+ let mut codes: Vec<Code> = vec![];
+ let mut at_start = true;
+ let mut at_carriage_return = false;
+ let mut column = 1;
+
+ for char in value.chars() {
+ if at_start {
+ if char == '\u{feff}' {
+ // Ignore.
+ continue;
+ }
+
+ at_start = false;
+ }
+
+ // Send a CRLF.
+ if at_carriage_return && '\n' == char {
+ at_carriage_return = false;
+ codes.push(Code::CarriageReturnLineFeed);
+ } else {
+ // Send the previous CR: we’re not at a next `\n`.
+ if at_carriage_return {
+ at_carriage_return = false;
+ codes.push(Code::Char('\r'));
+ }
+
+ match char {
+ // Send a replacement character.
+ '\0' => {
+ column += 1;
+ codes.push(Code::Char('�'));
+ }
+ // Send a tab and virtual spaces.
+ '\t' => {
+ let remainder = column % TAB_SIZE;
+ let mut virtual_spaces = if remainder == 0 {
+ 0
+ } else {
+ TAB_SIZE - remainder
+ };
+ codes.push(Code::Char(char));
+ column += 1;
+ while virtual_spaces > 0 {
+ codes.push(Code::VirtualSpace);
+ column += 1;
+ virtual_spaces -= 1;
+ }
+ }
+ // Send an LF.
+ '\n' => {
+ column = 1;
+ codes.push(Code::Char(char));
+ }
+ // Don’t send anything yet.
+ '\r' => {
+ column = 1;
+ at_carriage_return = true;
+ }
+ // Send the char.
+ _ => {
+ column += 1;
+ codes.push(Code::Char(char));
+ }
+ }
+ };
+ }
+
+ // Send the last CR: we’re not at a next `\n`.
+ if at_carriage_return {
+ codes.push(Code::Char('\r'));
+ }
+
+ codes
+}
+
+/// Serialize codes, optionally expanding tabs.
+pub fn serialize(codes: &[Code], expand_tabs: bool) -> String {
+ let mut at_tab = false;
+ let mut index = 0;
+ let mut value: Vec<char> = vec![];
+
+ while index < codes.len() {
+ let code = codes[index];
+ let mut at_tab_next = false;
+
+ match code {
+ Code::CarriageReturnLineFeed => {
+ value.push('\r');
+ value.push('\n');
+ }
+ Code::Char(char) if char == '\n' || char == '\r' => {
+ value.push(char);
+ }
+ Code::Char(char) if char == '\t' => {
+ at_tab_next = true;
+ value.push(if expand_tabs { ' ' } else { char });
+ }
+ Code::VirtualSpace => {
+ if !expand_tabs && at_tab {
+ index += 1;
+ continue;
+ }
+ value.push(' ');
+ }
+ Code::Char(char) => {
+ value.push(char);
+ }
+ Code::None => {
+ unreachable!("unexpected EOF code in codes");
+ }
+ }
+
+ at_tab = at_tab_next;
+
+ index += 1;
+ }
+
+ value.into_iter().collect()
+}
diff --git a/src/util/encode.rs b/src/util/encode.rs
index 5762c22..a3bd589 100644
--- a/src/util/encode.rs
+++ b/src/util/encode.rs
@@ -21,11 +21,36 @@
///
/// * [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)
pub fn encode(value: &str) -> String {
- // To do: replacing 4 times might just be slow.
- // Perhaps we can walk the chars.
- value
- .replace('&', "&amp;")
- .replace('"', "&quot;")
- .replace('<', "&lt;")
- .replace('>', "&gt;")
+ let mut result: Vec<&str> = vec![];
+ let mut start = 0;
+ let mut index = 0;
+
+ for byte in value.bytes() {
+ if let Some(replacement) = match byte {
+ b'&' => Some("&amp;"),
+ b'"' => Some("&quot;"),
+ b'<' => Some("&lt;"),
+ b'>' => Some("&gt;"),
+ _ => None,
+ } {
+ if start != index {
+ result.push(&value[start..index]);
+ }
+
+ result.push(replacement);
+ start = index + 1;
+ }
+
+ index += 1;
+ }
+
+ if start == 0 {
+ value.to_string()
+ } else {
+ if start < index {
+ result.push(&value[start..index]);
+ }
+
+ result.join("")
+ }
}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index 68ef275..d1a0e01 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -1,5 +1,6 @@
//! Utilities used when compiling markdown.
+pub mod codes;
pub mod decode_character_reference;
pub mod edit_map;
pub mod encode;
diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs
index 4753f7b..123a3a9 100644
--- a/src/util/normalize_identifier.rs
+++ b/src/util/normalize_identifier.rs
@@ -39,7 +39,7 @@ pub fn normalize_identifier(value: &str) -> String {
// Collapse markdown whitespace and trim it.
for char in value.chars() {
match char {
- '\t' | '\r' | '\n' | ' ' => {
+ '\t' | '\n' | '\r' | ' ' => {
at_whitespace = true;
}
_ => {
diff --git a/src/util/span.rs b/src/util/span.rs
index 02811cc..32dd00f 100644
--- a/src/util/span.rs
+++ b/src/util/span.rs
@@ -1,20 +1,15 @@
//! Utilities to deal with semantic labels.
use crate::tokenizer::{Code, Event, EventType};
+use crate::util::codes::serialize as serialize_codes;
/// A struct representing the span of an opening and closing event of a token.
#[derive(Debug)]
pub struct Span {
- // To do: probably needed in the future.
- // start: Point,
/// Absolute offset (and `index` in `codes`) of where this span starts.
pub start_index: usize,
- // To do: probably needed in the future.
- // end: Point,
/// Absolute offset (and `index` in `codes`) of where this span ends.
pub end_index: usize,
- // To do: probably needed in the future.
- // token_type: TokenType,
}
/// Get a span from an event.
@@ -29,10 +24,8 @@ pub struct Span {
/// When `micromark` is used, this function never panics.
pub fn from_exit_event(events: &[Event], index: usize) -> Span {
let exit = &events[index];
- // let end = exit.point.clone();
let end_index = exit.index;
let token_type = exit.token_type.clone();
- // To do: support `enter` events if needed and walk forwards?
assert_eq!(
exit.event_type,
EventType::Exit,
@@ -44,11 +37,8 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span {
let enter = &events[enter_index];
if enter.event_type == EventType::Enter && enter.token_type == token_type {
return Span {
- // start: enter.point.clone(),
start_index: enter.index,
- // end,
end_index,
- // token_type,
};
}
@@ -65,48 +55,3 @@ pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {
pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {
&codes[span.start_index..span.end_index]
}
-
-/// Serialize a slice of codes, optionally expanding tabs.
-fn serialize_codes(codes: &[Code], expand_tabs: bool) -> String {
- let mut at_tab = false;
- let mut index = 0;
- let mut value: Vec<char> = vec![];
-
- while index < codes.len() {
- let code = codes[index];
- let mut at_tab_next = false;
-
- match code {
- Code::CarriageReturnLineFeed => {
- value.push('\r');
- value.push('\n');
- }
- Code::Char(char) if char == '\n' || char == '\r' => {
- value.push(char);
- }
- Code::Char(char) if char == '\t' => {
- at_tab_next = true;
- value.push(if expand_tabs { ' ' } else { char });
- }
- Code::VirtualSpace => {
- if !expand_tabs && at_tab {
- index += 1;
- continue;
- }
- value.push(' ');
- }
- Code::Char(char) => {
- value.push(char);
- }
- Code::None => {
- unreachable!("unexpected EOF code in codes");
- }
- }
-
- at_tab = at_tab_next;
-
- index += 1;
- }
-
- value.into_iter().collect()
-}