about summary refs log tree commit diff stats
path: root/src/content
diff options
context:
space:
mode:
author Libravatar Titus Wormer <tituswormer@gmail.com> 2022-07-28 16:48:00 +0200
committer Libravatar Titus Wormer <tituswormer@gmail.com> 2022-07-28 16:48:00 +0200
commitf7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 (patch)
treec1ac3f22473bd79566d835b2474d2ae9e00d6c55 /src/content
parentd729b07712ca9cc91e68af1776dac9d7008a90cb (diff)
downloadmarkdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.gz
markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.tar.bz2
markdown-rs-f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456.zip
Refactor to work on `char`s
Previously, a custom char implementation was used. This was easier to work with, as sometimes “virtual” characters are injected, or characters are ignored. This replaces that with working on actual `char`s. In the hope of in the future working on `u8`s, even. This simplifies the state machine somewhat, as only `\n` is fed, regardless of whether it was a CRLF, CR, or LF. It also feeds `' '` instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event.
Diffstat (limited to 'src/content')
-rw-r--r--src/content/document.rs39
-rw-r--r--src/content/flow.rs14
-rw-r--r--src/content/string.rs6
-rw-r--r--src/content/text.rs24
4 files changed, 52 insertions, 31 deletions
diff --git a/src/content/document.rs b/src/content/document.rs
index 32b32ba..2924f6c 100644
--- a/src/content/document.rs
+++ b/src/content/document.rs
@@ -17,12 +17,12 @@ use crate::parser::ParseState;
use crate::subtokenize::subtokenize;
use crate::token::Token;
use crate::tokenizer::{
- Code, Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer,
+ Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer,
};
use crate::util::{
normalize_identifier::normalize_identifier,
skip,
- span::{from_exit_event, serialize},
+ slice::{Position, Slice},
};
/// Phases where we can exit containers.
@@ -78,7 +78,7 @@ struct DocumentInfo {
pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
let mut tokenizer = Tokenizer::new(point, parse_state);
- let state = tokenizer.push(0, parse_state.codes.len(), Box::new(start));
+ let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before));
tokenizer.flush(state, true);
let mut index = 0;
@@ -88,13 +88,14 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
let event = &tokenizer.events[index];
if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString {
+ // To do: when we operate on u8, we can use a `to_str` here as we
+ // don't need virtual spaces.
let id = normalize_identifier(
- serialize(
- &parse_state.codes,
- &from_exit_event(&tokenizer.events, index),
- false,
+ &Slice::from_position(
+ &tokenizer.parse_state.chars,
+ &Position::from_exit_event(&tokenizer.events, index),
)
- .as_str(),
+ .serialize(),
);
if !definitions.contains(&id) {
@@ -114,6 +115,26 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec<Event> {
events
}
+/// At the beginning.
+///
+/// Perhaps a BOM?
+///
+/// ```markdown
+/// > | a
+/// ^
+/// ```
+fn before(tokenizer: &mut Tokenizer) -> State {
+ match tokenizer.current {
+ Some('\u{FEFF}') => {
+ tokenizer.enter(Token::ByteOrderMark);
+ tokenizer.consume();
+ tokenizer.exit(Token::ByteOrderMark);
+ State::Fn(Box::new(start))
+ }
+ _ => start(tokenizer),
+ }
+}
+
/// Before document.
//
/// ```markdown
@@ -337,7 +358,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State
// Parse flow, pausing after eols.
tokenizer.go_until(
state,
- |code| matches!(code, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')),
+ |code| matches!(code, Some('\n')),
move |state| Box::new(move |t| flow_end(t, info, state)),
)(tokenizer)
}
diff --git a/src/content/flow.rs b/src/content/flow.rs
index ea09cd9..09c4e2c 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -27,7 +27,7 @@ use crate::construct::{
thematic_break::start as thematic_break,
};
use crate::token::Token;
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
/// Before flow.
///
@@ -41,7 +41,7 @@ use crate::tokenizer::{Code, State, Tokenizer};
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt(blank_line, |ok| {
Box::new(if ok { blank_line_after } else { initial_before })
})(tokenizer),
@@ -62,7 +62,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// ```
fn initial_before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt_n(
vec![
Box::new(code_indented),
@@ -87,8 +87,8 @@ fn initial_before(tokenizer: &mut Tokenizer) -> State {
/// ```
fn blank_line_after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None => State::Ok,
+ Some('\n') => {
tokenizer.enter(Token::BlankLineEnding);
tokenizer.consume();
tokenizer.exit(Token::BlankLineEnding);
@@ -111,8 +111,8 @@ fn blank_line_after(tokenizer: &mut Tokenizer) -> State {
/// ```
fn after(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
- Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+ None => State::Ok,
+ Some('\n') => {
tokenizer.enter(Token::LineEnding);
tokenizer.consume();
tokenizer.exit(Token::LineEnding);
diff --git a/src/content/string.rs b/src/content/string.rs
index c6c0094..8bc2b91 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -16,9 +16,9 @@ use crate::construct::{
character_escape::start as character_escape, character_reference::start as character_reference,
partial_data::start as data, partial_whitespace::create_resolve_whitespace,
};
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
-const MARKERS: [Code; 2] = [Code::Char('&'), Code::Char('\\')];
+const MARKERS: [char; 2] = ['&', '\\'];
/// Start of string.
pub fn start(tokenizer: &mut Tokenizer) -> State {
@@ -32,7 +32,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// Before string.
fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt_n(
vec![Box::new(character_reference), Box::new(character_escape)],
|ok| Box::new(if ok { before } else { before_data }),
diff --git a/src/content/text.rs b/src/content/text.rs
index 4248053..ebdf888 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -28,18 +28,18 @@ use crate::construct::{
label_start_image::start as label_start_image, label_start_link::start as label_start_link,
partial_data::start as data, partial_whitespace::create_resolve_whitespace,
};
-use crate::tokenizer::{Code, State, Tokenizer};
+use crate::tokenizer::{State, Tokenizer};
-const MARKERS: [Code; 9] = [
- Code::Char('!'), // `label_start_image`
- Code::Char('&'), // `character_reference`
- Code::Char('*'), // `attention`
- Code::Char('<'), // `autolink`, `html_text`
- Code::Char('['), // `label_start_link`
- Code::Char('\\'), // `character_escape`, `hard_break_escape`
- Code::Char(']'), // `label_end`
- Code::Char('_'), // `attention`
- Code::Char('`'), // `code_text`
+const MARKERS: [char; 9] = [
+ '!', // `label_start_image`
+ '&', // `character_reference`
+ '*', // `attention`
+ '<', // `autolink`, `html_text`
+ '[', // `label_start_link`
+ '\\', // `character_escape`, `hard_break_escape`
+ ']', // `label_end`
+ '_', // `attention`
+ '`', // `code_text`
];
/// Start of text.
@@ -57,7 +57,7 @@ pub fn start(tokenizer: &mut Tokenizer) -> State {
/// Before text.
pub fn before(tokenizer: &mut Tokenizer) -> State {
match tokenizer.current {
- Code::None => State::Ok,
+ None => State::Ok,
_ => tokenizer.attempt_n(
vec![
Box::new(attention),