From f7e5fb852dc9c416b9eeb1f0d4f2d51ba5b68456 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Thu, 28 Jul 2022 16:48:00 +0200 Subject: Refactor to work on `char`s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, a custom char implementation was used. This was easier to work with, as sometimes “virtual” characters are injected, or characters are ignored. This replaces that with working on actual `char`s. In the hope of in the future working on `u8`s, even. This simplifies the state machine somewhat, as only `\n` is fed, regardless of whether it was a CRLF, CR, or LF. It also feeds `' '` instead of virtual spaces. The BOM, if present, is now available as a `ByteOrderMark` event. --- src/content/document.rs | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'src/content/document.rs') diff --git a/src/content/document.rs b/src/content/document.rs index 32b32ba..2924f6c 100644 --- a/src/content/document.rs +++ b/src/content/document.rs @@ -17,12 +17,12 @@ use crate::parser::ParseState; use crate::subtokenize::subtokenize; use crate::token::Token; use crate::tokenizer::{ - Code, Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, + Container, ContainerState, Event, EventType, Point, State, StateFn, Tokenizer, }; use crate::util::{ normalize_identifier::normalize_identifier, skip, - span::{from_exit_event, serialize}, + slice::{Position, Slice}, }; /// Phases where we can exit containers. 
@@ -78,7 +78,7 @@ struct DocumentInfo { pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { let mut tokenizer = Tokenizer::new(point, parse_state); - let state = tokenizer.push(0, parse_state.codes.len(), Box::new(start)); + let state = tokenizer.push(0, parse_state.chars.len(), Box::new(before)); tokenizer.flush(state, true); let mut index = 0; @@ -88,13 +88,14 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { let event = &tokenizer.events[index]; if event.event_type == EventType::Exit && event.token_type == Token::DefinitionLabelString { + // To do: when we operate on u8, we can use a `to_str` here as we + // don't need virtual spaces. let id = normalize_identifier( - serialize( - &parse_state.codes, - &from_exit_event(&tokenizer.events, index), - false, + &Slice::from_position( + &tokenizer.parse_state.chars, + &Position::from_exit_event(&tokenizer.events, index), ) - .as_str(), + .serialize(), ); if !definitions.contains(&id) { @@ -114,6 +115,26 @@ pub fn document(parse_state: &mut ParseState, point: Point) -> Vec { events } +/// At the beginning. +/// +/// Perhaps a BOM? +/// +/// ```markdown +/// > | a +/// ^ +/// ``` +fn before(tokenizer: &mut Tokenizer) -> State { + match tokenizer.current { + Some('\u{FEFF}') => { + tokenizer.enter(Token::ByteOrderMark); + tokenizer.consume(); + tokenizer.exit(Token::ByteOrderMark); + State::Fn(Box::new(start)) + } + _ => start(tokenizer), + } +} + /// Before document. // /// ```markdown @@ -337,7 +358,7 @@ fn containers_after(tokenizer: &mut Tokenizer, mut info: DocumentInfo) -> State // Parse flow, pausing after eols. tokenizer.go_until( state, - |code| matches!(code, Code::CarriageReturnLineFeed | Code::Char('\n' | '\r')), + |code| matches!(code, Some('\n')), move |state| Box::new(move |t| flow_end(t, info, state)), )(tokenizer) } -- cgit