Diffstat (limited to 'src')
-rw-r--r-- | src/construct/mod.rs                |   1
-rw-r--r-- | src/construct/partial_bom.rs        |  12
-rw-r--r-- | src/construct/partial_whitespace.rs |  16
-rw-r--r-- | src/tokenizer.rs                    | 100
-rw-r--r-- | src/util/slice.rs                   |  39
5 files changed, 90 insertions, 78 deletions
diff --git a/src/construct/mod.rs b/src/construct/mod.rs
index 7b50957..cfaca0a 100644
--- a/src/construct/mod.rs
+++ b/src/construct/mod.rs
@@ -41,6 +41,7 @@
 //!
 //! There are also several routines used in different places:
 //!
+//! * [bom][partial_bom]
 //! * [data][partial_data]
 //! * [destination][partial_destination]
 //! * [label][partial_label]
diff --git a/src/construct/partial_bom.rs b/src/construct/partial_bom.rs
index 155a1a3..d92c9c1 100644
--- a/src/construct/partial_bom.rs
+++ b/src/construct/partial_bom.rs
@@ -1,4 +1,14 @@
-//! To do.
+//! Byte order mark occurs at the start of the document.
+//!
+//! It’s the three bytes 0xEF, 0xBB, and 0xBF.
+//!
+//! ## Tokens
+//!
+//! * [`ByteOrderMark`][Token::ByteOrderMark]
+//!
+//! ## References
+//!
+//! * [`micromark/lib/preprocess.js` in `micromark`](https://github.com/micromark/micromark/blob/ed23453/packages/micromark/dev/lib/preprocess.js#L54-L60)
 
 use crate::token::Token;
 use crate::tokenizer::{State, Tokenizer};
diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs
index 4f872ba..bf3bd4d 100644
--- a/src/construct/partial_whitespace.rs
+++ b/src/construct/partial_whitespace.rs
@@ -47,15 +47,18 @@
 
 use crate::constant::HARD_BREAK_PREFIX_SIZE_MIN;
 use crate::token::Token;
-use crate::tokenizer::{Event, EventType, Tokenizer};
+use crate::tokenizer::{Event, EventType, Resolver, Tokenizer};
 use crate::util::slice::{Position, Slice};
 
-/// To do.
-pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> impl Fn(&mut Tokenizer) {
-    move |t| resolve_whitespace(t, hard_break, trim_whole)
+/// Create a resolver to handle trailing whitespace in events.
+///
+/// Performing this as a resolver instead of a tokenizer improves performance
+/// *a lot*.
+pub fn create_resolve_whitespace(hard_break: bool, trim_whole: bool) -> Box<Resolver> {
+    Box::new(move |t| resolve_whitespace(t, hard_break, trim_whole))
 }
 
-/// To do.
+/// Resolve whitespace.
 pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) {
     let mut index = 0;
 
@@ -76,8 +79,7 @@ pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whol
     }
 }
 
-/// To do.
-#[allow(clippy::too_many_lines)]
+/// Trim a [`Data`][Token::Data] token.
 fn trim_data(
     tokenizer: &mut Tokenizer,
     exit_index: usize,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 9c5e9f6..9ab4309 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -26,7 +26,7 @@ pub enum ContentType {
 }
 
 #[derive(Debug, PartialEq)]
-pub enum CharAction {
+pub enum ByteAction {
     Normal(u8),
     Insert(u8),
     Ignore,
@@ -47,10 +47,9 @@ pub struct Point {
     pub column: usize,
     /// 0-indexed position in the document.
     ///
-    /// Also an `index` into `codes`.
-    // To do: call it `offset`?
+    /// Also an `index` into `bytes`.
     pub index: usize,
-    /// To do.
+    /// Virtual step on the same `index`.
     pub vs: usize,
 }
 
@@ -171,7 +170,7 @@ pub struct Tokenizer<'a> {
     column_start: Vec<(usize, usize)>,
     // First line.
     first_line: usize,
-    /// To do.
+    /// First point after the last line ending.
     line_start: Point,
     /// Track whether the current byte is already consumed (`true`) or expected
     /// to be consumed (`false`).
@@ -192,7 +191,7 @@ pub struct Tokenizer<'a> {
     ///
     /// Tracked to make sure everything’s valid.
     pub stack: Vec<Token>,
-    /// To do.
+    /// Edit map, to batch changes.
     pub map: EditMap,
     /// List of attached resolvers, which will be called when done feeding,
     /// to clean events.
@@ -323,15 +322,15 @@ impl<'a> Tokenizer<'a> {
     /// Move to the next (virtual) byte.
     pub fn move_one(&mut self) {
         match byte_action(self.parse_state.bytes, &self.point) {
-            CharAction::Ignore => {
+            ByteAction::Ignore => {
                 self.point.index += 1;
             }
-            CharAction::Insert(byte) => {
+            ByteAction::Insert(byte) => {
                 self.previous = Some(byte);
                 self.point.column += 1;
                 self.point.vs += 1;
             }
-            CharAction::Normal(byte) => {
+            ByteAction::Normal(byte) => {
                 self.previous = Some(byte);
                 self.point.vs = 0;
                 self.point.index += 1;
@@ -386,7 +385,7 @@ impl<'a> Tokenizer<'a> {
         while point.index > 0 {
             point.index -= 1;
             let action = byte_action(self.parse_state.bytes, &point);
-            if !matches!(action, CharAction::Ignore) {
+            if !matches!(action, ByteAction::Ignore) {
                 point.index += 1;
                 break;
             }
@@ -439,7 +438,7 @@ impl<'a> Tokenizer<'a> {
         while point.index > 0 {
             point.index -= 1;
             let action = byte_action(self.parse_state.bytes, &point);
-            if !matches!(action, CharAction::Ignore) {
+            if !matches!(action, ByteAction::Ignore) {
                 point.index += 1;
                 break;
             }
@@ -636,6 +635,7 @@ impl<'a> Tokenizer<'a> {
     ///
     /// This is set up to support repeatedly calling `feed`, and thus streaming
     /// markdown into the state machine, and normally pauses after feeding.
+    // Note: if needed: accept `vs`?
     pub fn push(
         &mut self,
         min: usize,
@@ -644,8 +644,6 @@ impl<'a> Tokenizer<'a> {
     ) -> State {
         debug_assert!(!self.resolved, "cannot feed after drain");
         debug_assert!(min >= self.point.index, "cannot move backwards");
-
-        // To do: accept `vs`?
         self.move_to((min, 0));
 
         let mut state = State::Fn(Box::new(start));
@@ -654,16 +652,11 @@ impl<'a> Tokenizer<'a> {
             match state {
                 State::Ok | State::Nok => break,
                 State::Fn(func) => match byte_action(self.parse_state.bytes, &self.point) {
-                    CharAction::Ignore => {
+                    ByteAction::Ignore => {
                         state = State::Fn(Box::new(func));
                         self.move_one();
                     }
-                    CharAction::Insert(byte) => {
-                        log::debug!("main: passing (fake): `{:?}` ({:?})", byte, self.point);
-                        self.expect(Some(byte));
-                        state = func(self);
-                    }
-                    CharAction::Normal(byte) => {
+                    ByteAction::Insert(byte) | ByteAction::Normal(byte) => {
                         log::debug!("main: passing: `{:?}` ({:?})", byte, self.point);
                         self.expect(Some(byte));
                         state = func(self);
@@ -685,35 +678,30 @@ impl<'a> Tokenizer<'a> {
             match state {
                 State::Ok | State::Nok => break,
                 State::Fn(func) => {
-                    // To do: clean this?
                     // We sometimes move back when flushing, so then we use those codes.
-                    if self.point.index == max {
-                        let byte = None;
-                        log::debug!("main: flushing eof: `{:?}` ({:?})", byte, self.point);
-                        self.expect(byte);
-                        state = func(self);
+                    let action = if self.point.index == max {
+                        None
                     } else {
-                        match byte_action(self.parse_state.bytes, &self.point) {
-                            CharAction::Ignore => {
-                                state = State::Fn(Box::new(func));
-                                self.move_one();
-                            }
-                            CharAction::Insert(byte) => {
-                                log::debug!(
-                                    "main: flushing (fake): `{:?}` ({:?})",
-                                    byte,
-                                    self.point
-                                );
-                                self.expect(Some(byte));
-                                state = func(self);
-                            }
-                            CharAction::Normal(byte) => {
-                                log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
-                                self.expect(Some(byte));
-                                state = func(self);
-                            }
-                        }
+                        Some(byte_action(self.parse_state.bytes, &self.point))
                     };
+
+                    if let Some(ByteAction::Ignore) = action {
+                        state = State::Fn(Box::new(func));
+                        self.move_one();
+                    } else {
+                        let byte =
+                            if let Some(ByteAction::Insert(byte) | ByteAction::Normal(byte)) =
+                                action
+                            {
+                                Some(byte)
+                            } else {
+                                None
+                            };
+
+                        log::debug!("main: flushing: `{:?}` ({:?})", byte, self.point);
+                        self.expect(byte);
+                        state = func(self);
+                    }
                 }
             }
         }
@@ -733,18 +721,18 @@ impl<'a> Tokenizer<'a> {
     }
 }
 
-fn byte_action(bytes: &[u8], point: &Point) -> CharAction {
+fn byte_action(bytes: &[u8], point: &Point) -> ByteAction {
     if point.index < bytes.len() {
         let byte = bytes[point.index];
 
         if byte == b'\r' {
             // CRLF.
             if point.index < bytes.len() - 1 && bytes[point.index + 1] == b'\n' {
-                CharAction::Ignore
+                ByteAction::Ignore
             }
             // CR.
             else {
-                CharAction::Normal(b'\n')
+                ByteAction::Normal(b'\n')
             }
         } else if byte == b'\t' {
             let remainder = point.column % TAB_SIZE;
@@ -757,19 +745,17 @@
             // On the tab itself, first send it.
             if point.vs == 0 {
                 if vs == 0 {
-                    CharAction::Normal(byte)
+                    ByteAction::Normal(byte)
                 } else {
-                    CharAction::Insert(byte)
+                    ByteAction::Insert(byte)
                 }
             } else if vs == 0 {
-                CharAction::Normal(b' ')
+                ByteAction::Normal(b' ')
             } else {
-                CharAction::Insert(b' ')
+                ByteAction::Insert(b' ')
             }
-        }
-        // VS?
-        else {
-            CharAction::Normal(byte)
+        } else {
+            ByteAction::Normal(byte)
         }
     } else {
         unreachable!("out of bounds")
diff --git a/src/util/slice.rs b/src/util/slice.rs
index 13b664d..f287978 100644
--- a/src/util/slice.rs
+++ b/src/util/slice.rs
@@ -43,7 +43,11 @@ impl<'a> Position<'a> {
         }
     }
 
-    /// To do.
+    /// Turn a position into indices.
+    ///
+    /// Indices are places in `bytes` where this position starts and ends.
+    ///
+    /// > 👉 **Note**: indices cannot represent virtual spaces.
     pub fn to_indices(&self) -> (usize, usize) {
         (self.start.index, self.end.index)
     }
@@ -60,7 +64,7 @@ pub struct Slice<'a> {
 }
 
 impl<'a> Slice<'a> {
-    /// Get the slice belonging to a position.
+    /// Get the slice belonging to a point.
     pub fn from_point(bytes: &'a [u8], point: &Point) -> Slice<'a> {
         let mut before = point.vs;
         let mut start = point.index;
@@ -84,13 +88,13 @@ impl<'a> Slice<'a> {
         }
     }
 
-    /// To do.
+    /// Create a slice from one index.
+    ///
+    /// Indices are places in `bytes`.
+    ///
+    /// > 👉 **Note**: indices cannot represent virtual spaces.
     pub fn from_index(bytes: &'a [u8], index: usize) -> Slice<'a> {
-        Slice {
-            bytes: &bytes[index..=index],
-            before: 0,
-            after: 0,
-        }
+        Slice::from_indices(bytes, index, index + 1)
     }
 
     /// Get the slice belonging to a position.
@@ -121,7 +125,11 @@ impl<'a> Slice<'a> {
         }
     }
 
-    /// To do.
+    /// Create a slice from two indices.
+    ///
+    /// Indices are places in `bytes`.
+    ///
+    /// > 👉 **Note**: indices cannot represent virtual spaces.
     pub fn from_indices(bytes: &'a [u8], start: usize, end: usize) -> Slice<'a> {
         Slice {
             bytes: &bytes[start..end],
@@ -130,12 +138,13 @@ impl<'a> Slice<'a> {
         }
     }
 
-    /// To do.
+    /// Get the size of this slice, including virtual spaces.
     pub fn len(&self) -> usize {
         self.bytes.len() + self.before + self.after
     }
 
-    /// To do.
+    /// Get the first byte in this slice, representing a virtual space as a
+    /// space.
     pub fn head(&self) -> Option<u8> {
         if self.before > 0 {
             Some(b' ')
@@ -146,12 +155,16 @@ impl<'a> Slice<'a> {
         }
     }
 
-    // To do:
+    /// Turn the slice into a `&str`.
+    ///
+    /// Does not support virtual spaces.
    pub fn as_str(&self) -> &str {
         str::from_utf8(self.bytes).unwrap()
     }
 
-    /// To do.
+    /// Turn the slice into a `String`.
+    ///
+    /// Supports virtual spaces.
     pub fn serialize(&self) -> String {
         let mut string = String::with_capacity(self.len());
         let mut index = self.before;
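A few standalone sketches follow to ground the documentation and API changes above; none of them is the crate's actual code. First, the new `partial_bom` module docs pin the byte order mark down to the bytes 0xEF, 0xBB, and 0xBF. A minimal illustration of that check (the real construct instead consumes the bytes one at a time through its state machine and emits a `ByteOrderMark` token):

```rust
/// Sketch: recognize and skip a UTF-8 byte order mark at the start of a
/// document. Illustrative only; the crate tokenizes byte by byte.
const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

fn skip_bom(bytes: &[u8]) -> &[u8] {
    if bytes.starts_with(&BOM) {
        // Drop the three BOM bytes; the rest is the document.
        &bytes[BOM.len()..]
    } else {
        bytes
    }
}

fn main() {
    assert_eq!(skip_bom(b"\xEF\xBB\xBF# hi"), b"# hi");
    assert_eq!(skip_bom(b"# hi"), b"# hi");
}
```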
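In `partial_whitespace`, returning `Box<Resolver>` instead of `impl Fn(&mut Tokenizer)` gives every configured resolver the same concrete type, so they can live together in the tokenizer's `resolvers` list. A sketch of that pattern, assuming simplified stand-ins for `Tokenizer` and `Resolver` (these are not the crate's definitions):

```rust
/// Stand-in for the tokenizer: just a list of event labels.
struct Tokenizer {
    events: Vec<String>,
}

/// Stand-in resolver type: any closure over a mutable tokenizer.
type Resolver = dyn Fn(&mut Tokenizer);

/// Build a resolver whose behavior is baked in via a captured flag,
/// mirroring how `create_resolve_whitespace` captures its options.
fn create_trim_resolver(trim_whole: bool) -> Box<Resolver> {
    Box::new(move |t: &mut Tokenizer| {
        for event in &mut t.events {
            *event = if trim_whole {
                event.trim().to_string()
            } else {
                event.trim_end().to_string()
            };
        }
    })
}

fn main() {
    // Differently configured closures share one type: `Box<Resolver>`.
    let resolvers: Vec<Box<Resolver>> =
        vec![create_trim_resolver(true), create_trim_resolver(false)];
    let mut tokenizer = Tokenizer { events: vec!["  text  ".into()] };
    for resolver in &resolvers {
        resolver(&mut tokenizer);
    }
    assert_eq!(tokenizer.events, vec!["text".to_string()]);
}
```

Boxing costs one allocation per resolver, but the uniform type is what allows storing them in a `Vec` and running them after feeding, which is where the performance win over doing this work in the tokenizer comes from.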
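The `CharAction` to `ByteAction` rename in `tokenizer.rs` matches what the classifier actually does: it inspects raw bytes and tells the state machine to pass one through (`Normal`), fake one without advancing the index (`Insert`, used to expand tabs into virtual spaces), or skip one (`Ignore`). A compact sketch of the line-ending half of `byte_action`; the tab logic, which depends on `column` and `vs`, is left out:

```rust
/// Sketch of the `ByteAction` idea: classify bytes in place rather than
/// copying the input into a normalized buffer.
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum ByteAction {
    /// Pass this byte through, possibly normalized.
    Normal(u8),
    /// A virtual byte that does not advance the index (tab expansion).
    Insert(u8),
    /// Skip this byte entirely (the `\r` of a CRLF pair).
    Ignore,
}

fn classify(bytes: &[u8], index: usize) -> ByteAction {
    match bytes[index] {
        // The `\r` of CRLF is dropped; the `\n` that follows is the ending.
        b'\r' if bytes.get(index + 1) == Some(&b'\n') => ByteAction::Ignore,
        // A lone CR is normalized to LF.
        b'\r' => ByteAction::Normal(b'\n'),
        byte => ByteAction::Normal(byte),
    }
}

fn main() {
    let doc = b"a\r\nb\rc";
    assert_eq!(classify(doc, 1), ByteAction::Ignore); // `\r` of CRLF
    assert_eq!(classify(doc, 2), ByteAction::Normal(b'\n'));
    assert_eq!(classify(doc, 4), ByteAction::Normal(b'\n')); // lone CR
}
```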
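Finally, the new `slice.rs` docs draw a consistent line through what can and cannot see virtual spaces: `len()` and `serialize()` account for them, while `as_str()` and plain byte indices cannot represent them. A simplified stand-in (not the crate's implementation) showing that split:

```rust
/// Stand-in slice with virtual spaces around the real bytes.
struct Slice<'a> {
    bytes: &'a [u8],
    before: usize, // virtual spaces before the first real byte
    after: usize,  // virtual spaces after the last real byte
}

impl<'a> Slice<'a> {
    /// Size including virtual spaces.
    fn len(&self) -> usize {
        self.bytes.len() + self.before + self.after
    }

    /// Borrowed view: only the real bytes, no virtual spaces.
    fn as_str(&self) -> &str {
        std::str::from_utf8(self.bytes).unwrap()
    }

    /// Owned view: virtual spaces materialize as real spaces.
    fn serialize(&self) -> String {
        let mut string = String::with_capacity(self.len());
        string.extend(std::iter::repeat(' ').take(self.before));
        string.push_str(self.as_str());
        string.extend(std::iter::repeat(' ').take(self.after));
        string
    }
}

fn main() {
    // Two virtual spaces left over from a partially consumed tab.
    let slice = Slice { bytes: b"abc", before: 2, after: 0 };
    assert_eq!(slice.len(), 5);
    assert_eq!(slice.as_str(), "abc"); // virtual spaces invisible here
    assert_eq!(slice.serialize(), "  abc"); // ...but materialized here
}
```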