| -rw-r--r-- | src/compiler.rs     |   4 |
| -rw-r--r-- | src/content/flow.rs | 164 |
| -rw-r--r-- | src/tokenizer.rs    |  58 |
3 files changed, 154 insertions, 72 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 166950e..4f362b8 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -62,7 +62,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                         ignore_encode = true;
                     }
                 }
-                TokenType::ContentPhrasing
+                TokenType::ContentChunk
                 | TokenType::AtxHeading
                 | TokenType::AtxHeadingSequence
                 | TokenType::AtxHeadingWhitespace
@@ -280,7 +280,7 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 }
                 // To do: `ContentPhrasing` should be parsed as phrasing first.
                 // This branch below currently acts as the resulting `data` tokens.
-                TokenType::ContentPhrasing
+                TokenType::ContentChunk
                 // To do: `ChunkString` does not belong here. Remove it when subtokenization is supported.
                 | TokenType::ChunkString
                 | TokenType::Data
diff --git a/src/content/flow.rs b/src/content/flow.rs
index 21c5721..6c47a10 100644
--- a/src/content/flow.rs
+++ b/src/content/flow.rs
@@ -96,16 +96,14 @@ fn blank_line_after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// ```markdown
 /// |qwe
 /// |    asd
+/// |~~~js
+/// |<div>
 /// ```
 fn initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
         Code::None => (State::Ok, None),
-        _ => tokenizer.attempt(code_indented, |ok| {
-            Box::new(if ok {
-                after
-            } else {
-                initial_before_not_code_indented
-            })
+        _ => tokenizer.attempt_3(code_indented, code_fenced, html_flow, |ok| {
+            Box::new(if ok { after } else { before })
         })(tokenizer, code),
     }
 }
@@ -132,38 +130,6 @@ fn after(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     }
 }
 
-/// Before flow (initial), but not at code (indented).
-///
-/// ```markdown
-/// |qwe
-/// ```
-fn initial_before_not_code_indented(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    match code {
-        Code::None => (State::Ok, None),
-        _ => tokenizer.attempt(code_fenced, |ok| {
-            Box::new(if ok {
-                after
-            } else {
-                initial_before_not_code_fenced
-            })
-        })(tokenizer, code),
-    }
-}
-
-/// Before flow (initial), but not at code (fenced).
-///
-/// ```markdown
-/// |qwe
-/// ```
-fn initial_before_not_code_fenced(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    match code {
-        Code::None => (State::Ok, None),
-        _ => tokenizer.attempt(html_flow, |ok| Box::new(if ok { after } else { before }))(
-            tokenizer, code,
-        ),
-    }
-}
-
 /// Before flow, but not at code (indented) or code (fenced).
 ///
 /// Compared to flow (initial), normal flow can be arbitrarily prefixed.
@@ -181,32 +147,11 @@ pub fn before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// Before flow, after potential whitespace.
 ///
 /// ```markdown
-/// |qwe
+/// |# asd
+/// |***
 /// ```
 pub fn before_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    tokenizer.attempt(heading_atx, |ok| {
-        Box::new(if ok { after } else { before_not_heading_atx })
-    })(tokenizer, code)
-}
-
-/// Before flow, but not before a heading (atx)
-///
-/// ```markdown
-/// |qwe
-/// ```
-pub fn before_not_heading_atx(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    tokenizer.attempt(thematic_break, |ok| {
-        Box::new(if ok { after } else { before_not_thematic_break })
-    })(tokenizer, code)
-}
-
-/// Before flow, but not before a heading (atx) or thematic break.
-///
-/// ```markdown
-/// |qwe
-/// ```
-pub fn before_not_thematic_break(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
-    tokenizer.attempt(html_flow, |ok| {
+    tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
         Box::new(if ok { after } else { content_before })
     })(tokenizer, code)
 }
@@ -231,9 +176,8 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         }
         _ => {
             tokenizer.enter(TokenType::Content);
-            tokenizer.enter(TokenType::ContentPhrasing);
-            tokenizer.consume(code);
-            (State::Fn(Box::new(content)), None)
+            tokenizer.enter(TokenType::ContentChunk);
+            content(tokenizer, code)
         }
     }
 }
@@ -245,10 +189,15 @@ fn content_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 // To do: lift limitations as documented above.
 fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
-        Code::None | Code::Char('\n' | '\r') => {
-            tokenizer.exit(TokenType::ContentPhrasing);
-            tokenizer.exit(TokenType::Content);
-            after(tokenizer, code)
+        Code::None => {
+            tokenizer.exit(TokenType::ContentChunk);
+            content_end(tokenizer, code)
+        }
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.exit(TokenType::ContentChunk);
+            tokenizer.check(continuation_construct, |ok| {
+                Box::new(if ok { content_continue } else { content_end })
+            })(tokenizer, code)
         }
         _ => {
             tokenizer.consume(code);
@@ -256,3 +205,80 @@ fn content(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
         }
     }
 }
+
+fn continuation_construct(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    match code {
+        Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => {
+            tokenizer.enter(TokenType::LineEnding);
+            tokenizer.consume(code);
+            tokenizer.exit(TokenType::LineEnding);
+            (
+                State::Fn(Box::new(continuation_construct_initial_before)),
+                None,
+            )
+        }
+        _ => unreachable!("expected eol"),
+    }
+}
+
+fn continuation_construct_initial_before(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.attempt_2(code_fenced, html_flow, |ok| {
+        if ok {
+            Box::new(|_tokenizer, _code| (State::Nok, None))
+        } else {
+            Box::new(|tokenizer, code| {
+                tokenizer.attempt(
+                    |tokenizer, code| whitespace(tokenizer, code, TokenType::Whitespace),
+                    |_ok| Box::new(continuation_construct_after_prefix),
+                )(tokenizer, code)
+            })
+        }
+    })(tokenizer, code)
+}
+
+fn continuation_construct_after_prefix(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // let tail = tokenizer.events.last();
+    // let mut prefix = 0;
+
+    // if let Some(event) = tail {
+    //     if event.token_type == TokenType::Whitespace {
+    //         let span = get_span(&tokenizer.events, tokenizer.events.len() - 1);
+    //         prefix = span.end_index - span.start_index;
+    //     }
+    // }
+
+    match code {
+        // Blank lines are not allowed in content.
+        Code::None | Code::CarriageReturnLineFeed | Code::Char('\n' | '\r') => (State::Nok, None),
+        // If code is disabled, indented lines are part of the content.
+        // _ if prefix >= TAB_SIZE => {
+        //     (State::Ok, None)
+        // }
+        _ => {
+            println!("to do: check if flow interrupts, assuming it can’t");
+            tokenizer.attempt_2(heading_atx, thematic_break, |ok| {
+                let result = if ok {
+                    (State::Nok, None)
+                } else {
+                    (State::Ok, None)
+                };
+                Box::new(|_t, _c| result)
+            })(tokenizer, code)
+        }
+    }
+}
+
+fn content_continue(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    // To do: should this be part of the content chunk?
+    // That’s what `micromark-js` does.
+    tokenizer.enter(TokenType::LineEnding);
+    tokenizer.consume(code);
+    tokenizer.exit(TokenType::LineEnding);
+    tokenizer.enter(TokenType::ContentChunk);
+    (State::Fn(Box::new(content)), None)
+}
+
+fn content_end(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
+    tokenizer.exit(TokenType::Content);
+    after(tokenizer, code)
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index c8b1440..4239520 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -62,7 +62,8 @@ pub enum TokenType {
     BlankLineWhitespace,
 
     Content,
-    ContentPhrasing,
+    ContentChunk,
+
     ChunkString,
 }
 
@@ -377,6 +378,61 @@ impl Tokenizer {
         )
     }
 
+    // To do: lifetimes, boxes, lmao.
+    pub fn attempt_2(
+        &mut self,
+        a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+    ) -> Box<StateFn> {
+        self.call_multiple(false, Some(Box::new(a)), Some(Box::new(b)), None, done)
+    }
+
+    pub fn attempt_3(
+        &mut self,
+        a: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        b: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        c: impl FnOnce(&mut Tokenizer, Code) -> StateFnResult + 'static,
+        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+    ) -> Box<StateFn> {
+        self.call_multiple(
+            false,
+            Some(Box::new(a)),
+            Some(Box::new(b)),
+            Some(Box::new(c)),
+            done,
+        )
+    }
+
+    pub fn call_multiple(
+        &mut self,
+        check: bool,
+        a: Option<Box<StateFn>>,
+        b: Option<Box<StateFn>>,
+        c: Option<Box<StateFn>>,
+        done: impl FnOnce(bool) -> Box<StateFn> + 'static,
+    ) -> Box<StateFn> {
+        if let Some(head) = a {
+            let callback = move |ok| {
+                if ok {
+                    done(ok)
+                } else {
+                    Box::new(move |tokenizer: &mut Tokenizer, code| {
+                        tokenizer.call_multiple(check, b, c, None, done)(tokenizer, code)
+                    })
+                }
+            };
+
+            if check {
+                self.check(head, callback)
+            } else {
+                self.attempt(head, callback)
+            }
+        } else {
+            done(false)
+        }
+    }
+
     /// Feed a list of `codes` into `start`.
     ///
     /// This is set up to support repeatedly calling `feed`, and thus streaming
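The mechanism this commit introduces is `attempt_2`/`attempt_3`/`call_multiple`: instead of a dedicated `*_not_*` state function per construct, the tokenizer walks an ordered list of constructs and falls through to the next one only when the previous attempt fails. The standalone sketch below illustrates that ordered fall-through outside the tokenizer; the `Parser` alias, `try_constructs`, and the `looks_like_*` predicates are made-up stand-ins for this note, not part of the crate.

```rust
// Hypothetical stand-in for a construct: it reports whether a line matches.
// The real crate threads `Tokenizer`/`Code` state through boxed state
// functions instead of plain predicates.
type Parser = fn(&str) -> bool;

fn looks_like_heading_atx(line: &str) -> bool {
    line.starts_with('#')
}

fn looks_like_thematic_break(line: &str) -> bool {
    line.chars().filter(|c| *c == '*').count() >= 3
}

// Try each construct in order and stop at the first match, the same shape as
// `attempt_2`/`attempt_3` delegating to `call_multiple` in the diff above.
fn try_constructs(constructs: &[Parser], line: &str) -> bool {
    constructs.iter().any(|construct| construct(line))
}

fn main() {
    let constructs: &[Parser] = &[looks_like_heading_atx, looks_like_thematic_break];
    assert!(try_constructs(constructs, "# asd"));
    assert!(try_constructs(constructs, "***"));
    assert!(!try_constructs(constructs, "qwe"));
}
```

In the diff itself the fall-through runs through `attempt` (or `check` when the `check` flag is set), presumably so a failed construct can be unwound before the next one runs, and `continuation_construct_after_prefix` reuses the same chain to decide whether a heading (atx) or thematic break interrupts a content chunk.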
