| | |
|---|---|
| author | 2022-06-10 16:29:56 +0200 |
| committer | 2022-06-10 16:29:56 +0200 |
| commit | 5133042973f31a3992f216e591d840bb491bfd45 (patch) |
| tree | 810a44ac1d98f65dd2eedd0d9e8387eac0753e25 /src |
| parent | 021d5f989ae41ae39a9b937b498141d9dc70d894 (diff) |
| download | markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.gz markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.tar.bz2 markdown-rs-5133042973f31a3992f216e591d840bb491bfd45.zip |
Add proper support for subtokenization
- Add “content” content type
- Add paragraph
- Add skips
- Add linked tokens
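
For readers skimming the patch below: the “linked tokens” idea is that one logical run of content is stored as several chunk events (one per line), each pointing at the previous and next chunk, so a subtokenizer can later re-parse the whole chain as a single stream. The following is a minimal sketch of that linking; the `previous`/`next` fields mirror the `Event` struct in the diff, but `collect_chunks` and the sample data are purely illustrative and not part of the patch.

```rust
// Illustrative sketch only: `previous`/`next` mirror the fields added to
// `Event` in `src/tokenizer.rs`; `collect_chunks` and the sample data are
// hypothetical, written here just to show how a chain of chunks is walked.
#[derive(Debug)]
struct Event {
    index: usize,            // offset into the character codes
    previous: Option<usize>, // earlier chunk of the same content, if any
    next: Option<usize>,     // later chunk of the same content, if any
}

/// Start at a head chunk (one with `previous == None`) and follow `next`
/// links, returning the event indices of the whole chain in order.
fn collect_chunks(events: &[Event], head: usize) -> Vec<usize> {
    assert!(events[head].previous.is_none(), "expected a head chunk");
    let mut chain = vec![head];
    let mut current = head;
    while let Some(next) = events[current].next {
        chain.push(next);
        current = next;
    }
    chain
}

fn main() {
    // Three chunk enters at indices 0, 2, and 4, linked the way
    // `content_continue` links them in the patch; indices 1 and 3 stand in
    // for unrelated events in between.
    let events = vec![
        Event { index: 0, previous: None, next: Some(2) },
        Event { index: 5, previous: None, next: None },
        Event { index: 6, previous: Some(0), next: Some(4) },
        Event { index: 11, previous: None, next: None },
        Event { index: 12, previous: Some(2), next: None },
    ];
    assert_eq!(collect_chunks(&events, 0), vec![0, 2, 4]);
}
```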
Diffstat (limited to '')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/compiler.rs | 23 |
| -rw-r--r-- | src/content/content.rs | 84 |
| -rw-r--r-- | src/content/flow.rs | 45 |
| -rw-r--r-- | src/content/mod.rs | 2 |
| -rw-r--r-- | src/content/string.rs | 42 |
| -rw-r--r-- | src/subtokenize.rs | 166 |
| -rw-r--r-- | src/tokenizer.rs | 40 |

7 files changed, 279 insertions, 123 deletions
```diff
diff --git a/src/compiler.rs b/src/compiler.rs
index 3632d29..05a56e1 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -38,7 +38,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
         match event.event_type {
             EventType::Enter => match token_type {
-                TokenType::Content => {
+                TokenType::Paragraph => {
                     buf_tail_mut(buffers).push("<p>".to_string());
                 }
                 TokenType::CodeIndented => {
@@ -62,7 +62,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                         ignore_encode = true;
                     }
                 }
-                TokenType::ContentChunk
+                TokenType::Content
                 | TokenType::AtxHeading
                 | TokenType::AtxHeadingSequence
                 | TokenType::AtxHeadingWhitespace
@@ -79,7 +79,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 | TokenType::HtmlFlowData
                 | TokenType::CodeFencedFence
                 | TokenType::CodeFencedFenceSequence
-                | TokenType::ChunkString
+                | TokenType::ChunkText
                 | TokenType::CodeFencedFenceWhitespace
                 | TokenType::Data
                 | TokenType::CharacterEscape
@@ -97,7 +97,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 }
             },
             EventType::Exit => match token_type {
-                TokenType::ThematicBreakSequence
+                TokenType::Content
+                | TokenType::ThematicBreakSequence
                 | TokenType::ThematicBreakWhitespace
                 | TokenType::CodeIndentedPrefixWhitespace
                 | TokenType::BlankLineEnding
@@ -120,7 +121,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     // last_was_tag = false;
                     buf_tail_mut(buffers).push(res);
                 }
-                TokenType::Content => {
+                TokenType::Paragraph => {
                     buf_tail_mut(buffers).push("</p>".to_string());
                 }
                 TokenType::CodeIndented | TokenType::CodeFenced => {
@@ -278,17 +279,9 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     character_reference_kind = None;
                 }
-                // To do: `ContentPhrasing` should be parsed as phrasing first.
                 // This branch below currently acts as the resulting `data` tokens.
-                // To do: initial and final whitespace should be handled in `text`.
-                TokenType::ContentChunk => {
-                    // last_was_tag = false;
-                    buf_tail_mut(buffers).push(encode(
-                        slice_serialize(codes, &get_span(events, index), false).trim(),
-                    ));
-                }
-                // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported.
-                TokenType::ChunkString | TokenType::Data | TokenType::CharacterEscapeValue => {
+                // To do: `ChunkText` does not belong here. Remove it when subtokenization is supported.
+                TokenType::ChunkText | TokenType::Data | TokenType::CharacterEscapeValue => {
                     // last_was_tag = false;
                     buf_tail_mut(buffers).push(encode(&slice_serialize(
                         codes,
diff --git a/src/content/content.rs b/src/content/content.rs
new file mode 100644
index 0000000..7bf692f
--- /dev/null
+++ b/src/content/content.rs
@@ -0,0 +1,84 @@
+//! The `content`, ahum, content type.
+//!
+//! **Content** is zero or more definitions, and then zero or one paragraph.
+//! It’s a weird one, and needed to make certain edge cases around definitions
+//! spec compliant.
+//! Definitions are unlike other things in markdown, in that they behave like
+//! **text** in that they can contain arbitrary line endings, but *have* to end
+//! at a line ending.
+//! If they end in something else, the whole definition instead is seen as a
+//! paragraph.
+//!
+//! The constructs found in content are:
+//!
+//! *   Definition
+//! *   Paragraph
+
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
+
+/// Before content.
+///
+/// ```markdown
+/// |[x]: y
+/// |asd
+/// ```
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            unreachable!("expected non-eol/eof");
+        }
+        _ => paragraph_initial(tokenizer, code)
+        // To do: definition.
+        // _ => tokenizer.attempt(definition, |ok| {
+        //     Box::new(if ok {
+        //         a
+        //     } else {
+        //         b
+        //     })
+        // })(tokenizer, code),
+    }
+}
+
+/// Before a paragraph.
+///
+/// ```markdown
+/// |asd
+/// ```
+fn paragraph_initial(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            unreachable!("expected non-eol/eof");
+        }
+        _ => {
+            tokenizer.enter(TokenType::Paragraph);
+            tokenizer.enter(TokenType::ChunkText);
+            data(tokenizer, code)
+        }
+    }
+}
+
+/// In a line in a paragraph.
+///
+/// ```markdown
+/// |\&
+/// |qwe
+/// ```
+fn data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::None => {
+            tokenizer.exit(TokenType::ChunkText);
+            tokenizer.exit(TokenType::Paragraph);
+            (State::Ok, None)
+        }
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::ChunkText);
+            tokenizer.enter(TokenType::ChunkText);
+            (State::Fn(Box::new(data)), None)
+        }
+        _ => {
+            tokenizer.consume(code);
+            (State::Fn(Box::new(data)), None)
+        }
+    }
+}
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 6f94424..0d1bd22 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -31,8 +31,6 @@ use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Toke
 use crate::util::get_span;
 
 /// Turn `codes` as the flow content type into events.
-// To do: remove this `allow` when all the content types are glued together.
-#[allow(dead_code)]
 pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
     let mut tokenizer = Tokenizer::new(point, index);
     tokenizer.feed(codes, Box::new(start), true);
@@ -49,7 +47,7 @@ pub fn flow(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
 /// |    bravo
 /// |***
 /// ```
-fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
         _ => tokenizer.attempt(blank_line, |ok| {
@@ -168,7 +166,7 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         _ => {
             tokenizer.enter(TokenType::Content);
             tokenizer.enter(TokenType::ContentChunk);
-            content(tokenizer, code)
+            content(tokenizer, code, tokenizer.events.len() - 1)
         }
     }
 }
@@ -178,21 +176,26 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// al|pha
 /// ```
 // To do: lift limitations as documented above.
-fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+fn content(tokenizer: &mut Tokenizer, code: Code, previous: usize) -> StateFnResult {
     match code {
-        Code::None => {
-            tokenizer.exit(TokenType::ContentChunk);
-            content_end(tokenizer, code)
-        }
+        Code::None => content_end(tokenizer, code),
         Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
-            tokenizer.exit(TokenType::ContentChunk);
-            tokenizer.check(continuation_construct, |ok| {
-                Box::new(if ok { content_continue } else { content_end })
+            tokenizer.check(continuation_construct, move |ok| {
+                Box::new(move |t, c| {
+                    if ok {
+                        content_continue(t, c, previous)
+                    } else {
+                        content_end(t, c)
+                    }
+                })
             })(tokenizer, code)
         }
         _ => {
             tokenizer.consume(code);
-            (State::Fn(Box::new(content)), None)
+            (
+                State::Fn(Box::new(move |t, c| content(t, c, previous))),
+                None,
+            )
         }
     }
 }
@@ -254,17 +257,21 @@ fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) ->
     }
 }
 
-fn content_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    // To do: should this be part of the content chunk?
-    // That’s what `micromark-js` does.
-    tokenizer.enter(TokenType::LineEnding);
+fn content_continue(tokenizer: &mut Tokenizer, code: Code, previous_index: usize) -> StateFnResult {
     tokenizer.consume(code);
-    tokenizer.exit(TokenType::LineEnding);
+    tokenizer.exit(TokenType::ContentChunk);
     tokenizer.enter(TokenType::ContentChunk);
-    (State::Fn(Box::new(content)), None)
+    let next_index = tokenizer.events.len() - 1;
+    tokenizer.events[previous_index].next = Some(next_index);
+    tokenizer.events[next_index].previous = Some(previous_index);
+    (
+        State::Fn(Box::new(move |t, c| content(t, c, next_index))),
+        None,
+    )
 }
 
 fn content_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::ContentChunk);
     tokenizer.exit(TokenType::Content);
     after(tokenizer, code)
 }
diff --git a/src/content/mod.rs b/src/content/mod.rs
index d5771a3..4c0a7f4 100644
--- a/src/content/mod.rs
+++ b/src/content/mod.rs
@@ -1,4 +1,6 @@
 //! Content types found in markdown.
 
+#[allow(clippy::module_inception)]
+pub mod content;
 pub mod flow;
 pub mod string;
diff --git a/src/content/string.rs b/src/content/string.rs
index 64f544b..ff9e3fc 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -5,7 +5,7 @@
 //! It exists in things such as identifiers (media references, definitions),
 //! titles, URLs, code (fenced) info and meta parts.
 //!
-//! The constructs found in strin are:
+//! The constructs found in string are:
 //!
 //! *   [Character escape][crate::construct::character_escape]
 //! *   [Character reference][crate::construct::character_reference]
@@ -13,16 +13,7 @@
 use crate::construct::{
     character_escape::start as character_escape, character_reference::start as character_reference,
 };
-use crate::tokenizer::{Code, Event, Point, State, StateFnResult, TokenType, Tokenizer};
-
-/// Turn `codes` as the string content type into events.
-// To do: remove this `allow` when all the content types are glued together.
-#[allow(dead_code)]
-pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
-    let mut tokenizer = Tokenizer::new(point, index);
-    tokenizer.feed(codes, Box::new(before), true);
-    tokenizer.events
-}
+use crate::tokenizer::{Code, State, StateFnResult, TokenType, Tokenizer};
 
 /// Before string.
 ///
@@ -33,33 +24,12 @@ pub fn string(codes: &[Code], point: Point, index: usize) -> Vec<Event> {
 /// |\&
 /// |qwe
 /// ```
-fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    match code {
-        Code::None => (State::Ok, None),
-        _ => tokenizer.attempt(character_reference, |ok| {
-            Box::new(if ok {
-                before
-            } else {
-                before_not_character_reference
-            })
-        })(tokenizer, code),
-    }
-}
-
-/// Before string, not at a character reference.
-///
-/// Assume character escape.
-///
-/// ```markdown
-/// |\&
-/// |qwe
-/// ```
-fn before_not_character_reference(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+pub fn start(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
-        _ => tokenizer.attempt(character_escape, |ok| {
+        _ => tokenizer.attempt_2(character_reference, character_escape, |ok| {
             Box::new(if ok {
-                before
+                start
             } else {
                 before_not_character_escape
             })
@@ -98,7 +68,7 @@ fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         // To do: somehow get these markers from constructs.
         Code::Char('&' | '\\') => {
             tokenizer.exit(TokenType::Data);
-            before(tokenizer, code)
+            start(tokenizer, code)
         }
         _ => {
             tokenizer.consume(code);
diff --git a/src/subtokenize.rs b/src/subtokenize.rs
index c1a8435..adf843f 100644
--- a/src/subtokenize.rs
+++ b/src/subtokenize.rs
@@ -1,66 +1,132 @@
-use crate::content::string::string;
-use crate::tokenizer::{Code, Event, EventType, TokenType};
+use crate::content::content::start as content;
+use crate::content::string::start as string;
+use crate::tokenizer::{
+    Code, Event, EventType, State, StateFn, StateFnResult, TokenType, Tokenizer,
+};
 use crate::util::{slice_codes, Span};
+use std::collections::HashMap;
 
 pub fn subtokenize(events: Vec<Event>, codes: &[Code]) -> Vec<Event> {
     let mut events = events;
     let mut index = 0;
-
-    // println!("before");
-    // while index < events.len() {
-    //     let event = &events[index];
-    //     println!(
-    //         "ev1: {:?} {:?} {:?}",
-    //         event.event_type, event.token_type, index
-    //     );
-    //     index += 1;
-    // }
-    //
-    // index = 0;
-    //
-    // println!("change");
+    // Map of first chunks its tokenizer.
+    let mut head_to_tokenizer: HashMap<usize, Tokenizer> = HashMap::new();
+    // Map of chunks to their head and corresponding range of events.
+    let mut link_to_info: HashMap<usize, (usize, usize, usize)> = HashMap::new();
 
     while index < events.len() {
         let event = &events[index];
 
-        // println!(
-        //     "ev2: {:?} {:?} {:?}",
-        //     event.event_type, event.token_type, index
-        // );
+        // Find each first opening chunk.
+        if (event.token_type == TokenType::ChunkString
+                || event.token_type == TokenType::ContentChunk) &&
+            event.event_type == EventType::Enter &&
+            // No need to enter linked events again.
+            event.previous == None
+        {
+            // Index into `events` pointing to a chunk.
+            let mut index_opt: Option<usize> = Some(index);
+            // Subtokenizer.
+            let mut tokenizer = Tokenizer::new(event.point.clone(), event.index);
+            // Substate.
+            let mut result: StateFnResult = (
+                State::Fn(Box::new(if event.token_type == TokenType::ContentChunk {
+                    content
+                } else {
+                    string
+                })),
+                None,
+            );
+            // Indices into `codes` of each end of chunk.
+            let mut ends: Vec<usize> = vec![];
 
-        if event.event_type == EventType::Enter && event.token_type == TokenType::ChunkString {
-            let exit = &events[index + 1];
+            // Loop through chunks to pass them in order to the subtokenizer.
+            while let Some(index_ptr) = index_opt {
+                let enter = &events[index_ptr];
+                let span = Span {
+                    start_index: enter.index,
+                    end_index: events[index_ptr + 1].index,
+                };
+                ends.push(span.end_index);
 
-            assert_eq!(
-                exit.event_type,
-                EventType::Exit,
-                "expected `enter` of `{:?}` to be follow by an `exit` event",
-                event.token_type
-            );
-            assert_eq!(
-                exit.token_type, event.token_type,
-                "expected `exit` of `{:?}` to follow its `enter` event",
-                event.token_type
-            );
+                if enter.previous != None {
+                    tokenizer.define_skip(&enter.point, span.start_index);
+                }
 
-            let subevents = string(
-                slice_codes(
-                    codes,
-                    &Span {
-                        start_index: event.index,
-                        end_index: exit.index,
-                    },
-                ),
-                event.point.clone(),
-                event.index,
-            );
-            let len = subevents.len();
-            // To do: recursion needed?
-            events.splice(index..(index + 2), subevents);
-            index += len;
-        } else {
-            index += 1;
+                let func: Box<StateFn> = match result.0 {
+                    State::Fn(func) => func,
+                    _ => unreachable!("cannot be ok/nok"),
+                };
+
+                result = tokenizer.feed(slice_codes(codes, &span), func, enter.next == None);
+
+                if let Some(ref x) = result.1 {
+                    if !x.is_empty() {
+                        // To do: handle?
+                        unreachable!("subtokenize:remainder {:?}", x);
+                    }
+                }
+
+                index_opt = enter.next;
+            }
+
+            // Now, loop through all subevents (and `ends`), to figure out
+            // which parts belong where.
+            // Current index.
+            let mut subindex = 0;
+            // Index into subevents that starts the current slice.
+            let mut last_start = 0;
+            // Counter into `ends`.
+            let mut end_index = 0;
+            let mut index_opt: Option<usize> = Some(index);
+
+            while subindex < tokenizer.events.len() {
+                let subevent = &tokenizer.events[subindex];
+
+                // Find the first event that starts after the end we’re looking
+                // for.
+                // To do: is this logic correct?
+                if subevent.event_type == EventType::Enter && subevent.index >= ends[end_index] {
+                    let link = index_opt.unwrap();
+                    link_to_info.insert(link, (index, last_start, subindex));
+
+                    last_start = subindex;
+                    end_index += 1;
+                    index_opt = events[link].next;
+                }
+
+                subindex += 1;
+            }
+
+            let link = index_opt.unwrap();
+            link_to_info.insert(link, (index, last_start, subindex));
+            head_to_tokenizer.insert(index, tokenizer);
         }
+
+        index += 1;
+    }
+
+    // Now that we fed everything into a tokenizer, and we know which parts
+    // belong where, the final task is to splice the events from each
+    // tokenizer into the current events.
+    // To do: instead of splicing, it might be possible to create a new `events`
+    // from each slice and slices from events?
+    let mut index = events.len() - 1;
+
+    while index > 0 {
+        let slice_opt = link_to_info.get(&index);
+
+        if let Some(slice) = slice_opt {
+            let (head, start, end) = *slice;
+            // If there’s a slice at this index, it must also point to a head,
+            // and that head must have a tokenizer.
+            let tokenizer = head_to_tokenizer.get(&head).unwrap();
+
+            // To do: figure out a way that moves instead of clones?
+            events.splice(index..(index + 2), tokenizer.events[start..end].to_vec());
+        }
+
+        index -= 1;
     }
 
     events
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 35e768e..1746a19 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -12,6 +12,7 @@
 //! [`check`]: Tokenizer::check
 
 use crate::constant::TAB_SIZE;
+use std::collections::HashMap;
 
 /// Semantic label of a span.
 // To do: figure out how to share this so extensions can add their own stuff,
@@ -64,7 +65,10 @@ pub enum TokenType {
     Content,
     ContentChunk,
 
+    Paragraph,
+
     ChunkString,
+    ChunkText,
 }
 
 /// Enum representing a character code.
@@ -101,7 +105,7 @@ pub struct Point {
 }
 
 /// Possible event types.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 pub enum EventType {
     /// The start of something.
     Enter,
@@ -110,12 +114,14 @@
 }
 
 /// Something semantic happening somewhere.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct Event {
     pub event_type: EventType,
     pub token_type: TokenType,
     pub point: Point,
     pub index: usize,
+    pub previous: Option<usize>,
+    pub next: Option<usize>,
 }
 
 /// The essence of the state machine are functions: `StateFn`.
@@ -156,6 +162,7 @@ struct InternalState {
 /// A tokenizer itself.
 #[derive(Debug)]
 pub struct Tokenizer {
+    column_start: HashMap<usize, usize>,
     /// Track whether a character is expected to be consumed, and whether it’s
     /// actually consumed
     ///
@@ -180,6 +187,7 @@ impl Tokenizer {
     pub fn new(point: Point, index: usize) -> Tokenizer {
         Tokenizer {
             current: Code::None,
+            column_start: HashMap::new(),
             index,
             consumed: true,
             point,
@@ -195,6 +203,28 @@ impl Tokenizer {
         self.current = code;
     }
 
+    pub fn define_skip(&mut self, point: &Point, index: usize) {
+        self.column_start.insert(point.line, point.column);
+        self.account_for_potential_skip();
+        log::debug!("position: define skip: `{:?}` ({:?})", point, index);
+    }
+
+    fn account_for_potential_skip(&mut self) {
+        println!("account?: {:?} {:?}", self.point, self.index);
+        match self.column_start.get(&self.point.line) {
+            None => {}
+            Some(next_column) => {
+                if self.point.column == 1 {
+                    let col = *next_column;
+                    self.point.column = col;
+                    self.point.offset += col - 1;
+                    self.index += col - 1;
+                    println!("account! {:?} {:?}", self.point, self.index);
+                }
+            }
+        };
+    }
+
     /// Consume the current character.
     /// Each [`StateFn`][] is expected to call this to signal that this code is
     /// used, or call a next `StateFn`.
@@ -215,7 +245,7 @@ impl Tokenizer {
                 } else {
                     1
                 };
-                // To do: accountForPotentialSkip()
+                self.account_for_potential_skip();
                 log::debug!("position: after eol: `{:?}`", self.point);
             }
             Code::VirtualSpace => {
@@ -240,6 +270,8 @@ impl Tokenizer {
             token_type: token_type.clone(),
             point: self.point.clone(),
             index: self.index,
+            previous: None,
+            next: None,
         };
 
         self.events.push(event);
@@ -270,6 +302,8 @@ impl Tokenizer {
             token_type,
             point,
             index: self.index,
+            previous: None,
+            next: None,
         };
 
         self.events.push(event);
```
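
The skip bookkeeping added to `Tokenizer` above (`define_skip` and `account_for_potential_skip`) exists because linked chunks on later lines do not start at column 1: the outer flow tokenizer has already consumed a prefix, so after each line ending the subtokenizer has to jump its point and code index forward to the recorded column. The following is a rough standalone sketch of that adjustment, with simplified stand-in types; `Position` replaces the crate's `Point`, and the concrete numbers are invented for the example.

```rust
// Standalone sketch of the column-skip idea; `Position` is a stand-in for the
// crate's `Point`, and all concrete numbers are invented for the example.
use std::collections::HashMap;

#[derive(Debug)]
struct Position {
    line: usize,
    column: usize,
    offset: usize,
}

struct Skips {
    /// Map of line number to the column where parsing resumes on that line.
    column_start: HashMap<usize, usize>,
}

impl Skips {
    /// Record that `line` effectively starts at `column` (a prefix before it
    /// was already consumed by the outer tokenizer).
    fn define(&mut self, line: usize, column: usize) {
        self.column_start.insert(line, column);
    }

    /// After crossing a line ending: if we are at column 1 of a line with a
    /// recorded skip, jump the point and the code index past the prefix.
    fn apply(&self, point: &mut Position, index: &mut usize) {
        if point.column == 1 {
            if let Some(&col) = self.column_start.get(&point.line) {
                point.column = col;
                point.offset += col - 1;
                *index += col - 1;
            }
        }
    }
}

fn main() {
    let mut skips = Skips { column_start: HashMap::new() };
    // Say line 2 of the document really starts at column 3 for this content.
    skips.define(2, 3);

    let mut point = Position { line: 2, column: 1, offset: 10 };
    let mut index = 10;
    skips.apply(&mut point, &mut index);

    assert_eq!((point.column, point.offset, index), (3, 12, 12));
}
```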
