-rw-r--r--   src/compiler.rs        | 335
-rw-r--r--   src/content/string.rs  |   1
-rw-r--r--   src/content/text.rs    |   1
-rw-r--r--   src/tokenizer.rs       |  28
4 files changed, 176 insertions, 189 deletions
diff --git a/src/compiler.rs b/src/compiler.rs
index 9f84a38..50c06e1 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -102,8 +102,51 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
         match event.event_type {
             EventType::Enter => match token_type {
-                TokenType::Paragraph => {
-                    buf_tail_mut(buffers).push("<p>".to_string());
+                TokenType::AtxHeading
+                | TokenType::AtxHeadingSequence
+                | TokenType::AtxHeadingWhitespace
+                | TokenType::Autolink
+                | TokenType::AutolinkEmail
+                | TokenType::AutolinkMarker
+                | TokenType::AutolinkProtocol
+                | TokenType::BlankLineEnding
+                | TokenType::BlankLineWhitespace
+                | TokenType::CharacterEscape
+                | TokenType::CharacterEscapeMarker
+                | TokenType::CharacterEscapeValue
+                | TokenType::CharacterReference
+                | TokenType::CharacterReferenceMarker
+                | TokenType::CharacterReferenceMarkerHexadecimal
+                | TokenType::CharacterReferenceMarkerNumeric
+                | TokenType::CharacterReferenceMarkerSemi
+                | TokenType::CharacterReferenceValue
+                | TokenType::CodeIndentedPrefixWhitespace
+                | TokenType::CodeFencedFence
+                | TokenType::CodeFencedFenceSequence
+                | TokenType::CodeFencedFenceWhitespace
+                | TokenType::CodeFlowChunk
+                | TokenType::CodeTextData
+                | TokenType::CodeTextLineEnding
+                | TokenType::CodeTextSequence
+                | TokenType::Content
+                | TokenType::Data
+                | TokenType::HardBreakEscape
+                | TokenType::HardBreakEscapeMarker
+                | TokenType::HardBreakTrailing
+                | TokenType::HardBreakTrailingSpace
+                | TokenType::HtmlFlowData
+                | TokenType::HtmlTextData
+                | TokenType::LineEnding
+                | TokenType::ThematicBreak
+                | TokenType::ThematicBreakSequence
+                | TokenType::ThematicBreakWhitespace
+                | TokenType::Whitespace => {
+                    // Ignore.
+                }
+                TokenType::AtxHeadingText
+                | TokenType::CodeFencedFenceInfo
+                | TokenType::CodeFencedFenceMeta => {
+                    buffer(buffers);
                 }
                 TokenType::CodeIndented => {
                     code_flow_seen_data = Some(false);
@@ -117,9 +160,8 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     buf_tail_mut(buffers).push("<pre><code".to_string());
                     code_fenced_fences_count = Some(0);
                 }
-                TokenType::AtxHeadingText
-                | TokenType::CodeFencedFenceInfo
-                | TokenType::CodeFencedFenceMeta => {
+                TokenType::CodeText => {
+                    buf_tail_mut(buffers).push("<code>".to_string());
                     buffer(buffers);
                 }
                 TokenType::HtmlFlow => {
@@ -133,161 +175,58 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                         ignore_encode = true;
                     }
                 }
-                TokenType::CodeText => {
-                    buf_tail_mut(buffers).push("<code>".to_string());
-                    buffer(buffers);
+                TokenType::Paragraph => {
+                    buf_tail_mut(buffers).push("<p>".to_string());
                 }
-                TokenType::Content
-                | TokenType::AtxHeading
-                | TokenType::AtxHeadingSequence
-                | TokenType::AtxHeadingWhitespace
-                | TokenType::LineEnding
-                | TokenType::ThematicBreak
-                | TokenType::ThematicBreakSequence
-                | TokenType::ThematicBreakWhitespace
-                | TokenType::CodeIndentedPrefixWhitespace
-                | TokenType::CodeFlowChunk
-                | TokenType::BlankLineEnding
-                | TokenType::BlankLineWhitespace
-                | TokenType::Whitespace
-                | TokenType::HardBreakEscape
-                | TokenType::HardBreakEscapeMarker
-                | TokenType::HardBreakTrailing
-                | TokenType::HardBreakTrailingSpace
-                | TokenType::HtmlFlowData
-                | TokenType::HtmlTextData
-                | TokenType::CodeFencedFence
-                | TokenType::CodeFencedFenceSequence
-                | TokenType::CodeFencedFenceWhitespace
-                | TokenType::CodeTextSequence
-                | TokenType::CodeTextData
-                | TokenType::CodeTextLineEnding
-                | TokenType::Data
-                | TokenType::CharacterEscape
-                | TokenType::CharacterEscapeMarker
-                | TokenType::CharacterEscapeValue
-                | TokenType::CharacterReference
-                | TokenType::CharacterReferenceMarker
-                | TokenType::CharacterReferenceMarkerNumeric
-                | TokenType::CharacterReferenceMarkerHexadecimal
-                | TokenType::CharacterReferenceMarkerSemi
-                | TokenType::CharacterReferenceValue
-                | TokenType::Autolink
-                | TokenType::AutolinkMarker
-                | TokenType::AutolinkProtocol
-                | TokenType::AutolinkEmail => {}
                 #[allow(unreachable_patterns)]
                 _ => {
                     unreachable!("unhandled `enter` of TokenType {:?}", token_type)
                 }
             },
             EventType::Exit => match token_type {
-                TokenType::Content
-                | TokenType::ThematicBreakSequence
-                | TokenType::ThematicBreakWhitespace
-                | TokenType::CodeIndentedPrefixWhitespace
+                TokenType::Autolink
+                | TokenType::AutolinkMarker
                 | TokenType::BlankLineEnding
                 | TokenType::BlankLineWhitespace
-                | TokenType::Whitespace
-                | TokenType::CodeFencedFenceSequence
-                | TokenType::CodeFencedFenceWhitespace
-                | TokenType::CodeTextSequence
                 | TokenType::CharacterEscape
                 | TokenType::CharacterEscapeMarker
                 | TokenType::CharacterReference
                 | TokenType::CharacterReferenceMarkerSemi
+                | TokenType::CodeFencedFenceSequence
+                | TokenType::CodeFencedFenceWhitespace
+                | TokenType::CodeIndentedPrefixWhitespace
+                | TokenType::CodeTextSequence
+                | TokenType::Content
                 | TokenType::HardBreakEscapeMarker
                 | TokenType::HardBreakTrailingSpace
-                | TokenType::Autolink
-                | TokenType::AutolinkMarker => {}
-                TokenType::HtmlFlow | TokenType::HtmlText => {
-                    ignore_encode = false;
+                | TokenType::ThematicBreakSequence
+                | TokenType::ThematicBreakWhitespace
+                | TokenType::Whitespace => {
+                    // Ignore.
                 }
-                TokenType::HtmlFlowData | TokenType::HtmlTextData => {
-                    let slice = serialize(codes, &from_exit_event(events, index), false);
-
-                    let res = if ignore_encode { slice } else { encode(&slice) };
-
+                // Just output it.
+                TokenType::CodeTextData | TokenType::Data | TokenType::CharacterEscapeValue => {
                    // last_was_tag = false;
-                    buf_tail_mut(buffers).push(res);
-                }
-                TokenType::Paragraph => {
-                    buf_tail_mut(buffers).push("</p>".to_string());
-                }
-                TokenType::HardBreakEscape | TokenType::HardBreakTrailing => {
-                    buf_tail_mut(buffers).push("<br />".to_string());
-                }
-                TokenType::CodeIndented | TokenType::CodeFenced => {
-                    let seen_data =
-                        code_flow_seen_data.expect("`code_flow_seen_data` must be defined");
-
-                    // To do: containers.
-                    // One special case is if we are inside a container, and the fenced code was
-                    // not closed (meaning it runs to the end).
-                    // In that case, the following line ending, is considered *outside* the
-                    // fenced code and block quote by micromark, but CM wants to treat that
-                    // ending as part of the code.
-                    // if fenced_count != None && fenced_count < 2 && tightStack.length > 0 && !last_was_tag {
-                    //   line_ending();
-                    // }
-
-                    // But in most cases, it’s simpler: when we’ve seen some data, emit an extra
-                    // line ending when needed.
-                    if seen_data {
-                        line_ending_if_needed(buffers);
-                    }
-
-                    buf_tail_mut(buffers).push("</code></pre>".to_string());
-
-                    if let Some(count) = code_fenced_fences_count {
-                        if count < 2 {
-                            line_ending_if_needed(buffers);
-                        }
-                    }
-
-                    code_flow_seen_data = None;
-                    code_fenced_fences_count = None;
-                    slurp_one_line_ending = false;
-                }
-                TokenType::CodeFencedFence => {
-                    let count = if let Some(count) = code_fenced_fences_count {
-                        count
-                    } else {
-                        0
-                    };
-
-                    if count == 0 {
-                        buf_tail_mut(buffers).push(">".to_string());
-                        // tag = true;
-                        slurp_one_line_ending = true;
-                    }
-
-                    code_fenced_fences_count = Some(count + 1);
-                }
-                TokenType::CodeFencedFenceInfo => {
-                    let value = resume(buffers);
-                    buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value));
-                    // tag = true;
-                }
-                TokenType::CodeFencedFenceMeta => {
-                    resume(buffers);
-                }
-                TokenType::CodeFlowChunk => {
-                    code_flow_seen_data = Some(true);
                     buf_tail_mut(buffers).push(encode(&serialize(
                         codes,
                         &from_exit_event(events, index),
                         false,
                     )));
                 }
-
+                TokenType::AtxHeading => {
+                    let rank = atx_opening_sequence_size
+                        .expect("`atx_opening_sequence_size` must be set in headings");
+                    buf_tail_mut(buffers).push(format!("</h{}>", rank));
+                    atx_opening_sequence_size = None;
+                    atx_heading_buffer = None;
+                }
                 // `AtxHeadingWhitespace` is ignored after the opening sequence,
                 // before the closing sequence, and after the closing sequence.
                 // But it is used around intermediate sequences.
                 // `atx_heading_buffer` is set to `Some` by the first `AtxHeadingText`.
                 // `AtxHeadingSequence` is ignored as the opening and closing sequence,
                 // but not when intermediate.
-                TokenType::AtxHeadingWhitespace | TokenType::AtxHeadingSequence => {
+                TokenType::AtxHeadingSequence | TokenType::AtxHeadingWhitespace => {
                     if let Some(buf) = atx_heading_buffer {
                         atx_heading_buffer = Some(
                             buf.to_string()
@@ -320,50 +259,26 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     buf_tail_mut(buffers).push(encode(&result));
                 }
-                TokenType::AtxHeading => {
-                    let rank = atx_opening_sequence_size
-                        .expect("`atx_opening_sequence_size` must be set in headings");
-                    buf_tail_mut(buffers).push(format!("</h{}>", rank));
-                    atx_opening_sequence_size = None;
-                    atx_heading_buffer = None;
-                }
-                TokenType::AutolinkProtocol => {
+                TokenType::AutolinkEmail => {
                     let slice = serialize(codes, &from_exit_event(events, index), false);
                     let buf = buf_tail_mut(buffers);
                     buf.push(format!(
-                        "<a href=\"{}\">",
+                        "<a href=\"mailto:{}\">",
                         sanitize_uri(slice.as_str(), &protocol_href)
                     ));
                     buf.push(encode(&slice));
                     buf.push("</a>".to_string());
                 }
-                TokenType::AutolinkEmail => {
+                TokenType::AutolinkProtocol => {
                     let slice = serialize(codes, &from_exit_event(events, index), false);
                     let buf = buf_tail_mut(buffers);
                     buf.push(format!(
-                        "<a href=\"mailto:{}\">",
+                        "<a href=\"{}\">",
                         sanitize_uri(slice.as_str(), &protocol_href)
                     ));
                     buf.push(encode(&slice));
                     buf.push("</a>".to_string());
                 }
-                TokenType::ThematicBreak => {
-                    buf_tail_mut(buffers).push("<hr />".to_string());
-                }
-                TokenType::LineEnding => {
-                    // if slurp_all_line_endings {
-                    //     // Empty.
-                    // } else
-                    if slurp_one_line_ending {
-                        slurp_one_line_ending = false;
-                    } else {
-                        buf_tail_mut(buffers).push(encode(&serialize(
-                            codes,
-                            &from_exit_event(events, index),
-                            false,
-                        )));
-                    }
-                }
                 TokenType::CharacterReferenceMarker => {
                     character_reference_kind = Some(CharacterReferenceKind::Named);
                 }
@@ -389,9 +304,71 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                     };

                     buf_tail_mut(buffers).push(encode(&value));
-
                     character_reference_kind = None;
                 }
+                TokenType::CodeFenced | TokenType::CodeIndented => {
+                    let seen_data =
+                        code_flow_seen_data.expect("`code_flow_seen_data` must be defined");
+
+                    // To do: containers.
+                    // One special case is if we are inside a container, and the fenced code was
+                    // not closed (meaning it runs to the end).
+                    // In that case, the following line ending, is considered *outside* the
+                    // fenced code and block quote by micromark, but CM wants to treat that
+                    // ending as part of the code.
+                    // if fenced_count != None && fenced_count < 2 && tightStack.length > 0 && !last_was_tag {
+                    //   line_ending();
+                    // }
+
+                    // But in most cases, it’s simpler: when we’ve seen some data, emit an extra
+                    // line ending when needed.
+                    if seen_data {
+                        line_ending_if_needed(buffers);
+                    }
+
+                    buf_tail_mut(buffers).push("</code></pre>".to_string());
+
+                    if let Some(count) = code_fenced_fences_count {
+                        if count < 2 {
+                            line_ending_if_needed(buffers);
+                        }
+                    }
+
+                    code_flow_seen_data = None;
+                    code_fenced_fences_count = None;
+                    slurp_one_line_ending = false;
+                }
+                TokenType::CodeFencedFence => {
+                    let count = if let Some(count) = code_fenced_fences_count {
+                        count
+                    } else {
+                        0
+                    };
+
+                    if count == 0 {
+                        buf_tail_mut(buffers).push(">".to_string());
+                        // tag = true;
+                        slurp_one_line_ending = true;
+                    }
+
+                    code_fenced_fences_count = Some(count + 1);
+                }
+                TokenType::CodeFencedFenceInfo => {
+                    let value = resume(buffers);
+                    buf_tail_mut(buffers).push(format!(" class=\"language-{}\"", value));
+                    // tag = true;
+                }
+                TokenType::CodeFencedFenceMeta => {
+                    resume(buffers);
+                }
+                TokenType::CodeFlowChunk => {
+                    code_flow_seen_data = Some(true);
+                    buf_tail_mut(buffers).push(encode(&serialize(
+                        codes,
+                        &from_exit_event(events, index),
+                        false,
+                    )));
+                }
                 TokenType::CodeText => {
                     let result = resume(buffers);
                     let mut chars = result.chars();
@@ -417,14 +394,38 @@ pub fn compile(events: &[Event], codes: &[Code], options: &CompileOptions) -> St
                 TokenType::CodeTextLineEnding => {
                     buf_tail_mut(buffers).push(" ".to_string());
                 }
-                // This branch below currently acts as the resulting `data` tokens.
-                TokenType::CodeTextData | TokenType::Data | TokenType::CharacterEscapeValue => {
+
+                TokenType::HardBreakEscape | TokenType::HardBreakTrailing => {
+                    buf_tail_mut(buffers).push("<br />".to_string());
+                }
+
+                TokenType::HtmlFlow | TokenType::HtmlText => {
+                    ignore_encode = false;
+                }
+                TokenType::HtmlFlowData | TokenType::HtmlTextData => {
+                    let slice = serialize(codes, &from_exit_event(events, index), false);
                     // last_was_tag = false;
-                    buf_tail_mut(buffers).push(encode(&serialize(
-                        codes,
-                        &from_exit_event(events, index),
-                        false,
-                    )));
+                    buf_tail_mut(buffers).push(if ignore_encode { slice } else { encode(&slice) });
+                }
+                TokenType::LineEnding => {
+                    // if slurp_all_line_endings {
+                    //     // Empty.
+                    // } else
+                    if slurp_one_line_ending {
+                        slurp_one_line_ending = false;
+                    } else {
+                        buf_tail_mut(buffers).push(encode(&serialize(
+                            codes,
+                            &from_exit_event(events, index),
+                            false,
+                        )));
+                    }
+                }
+                TokenType::Paragraph => {
+                    buf_tail_mut(buffers).push("</p>".to_string());
+                }
+                TokenType::ThematicBreak => {
+                    buf_tail_mut(buffers).push("<hr />".to_string());
                 }
                 #[allow(unreachable_patterns)]
                 _ => {
diff --git a/src/content/string.rs b/src/content/string.rs
index 2723785..25d8582 100644
--- a/src/content/string.rs
+++ b/src/content/string.rs
@@ -55,6 +55,7 @@ fn before_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
 /// ```
 fn in_data(tokenizer: &mut Tokenizer, code: Code) -> StateFnResult {
     match code {
+        // To do: line endings.
         Code::None => {
             tokenizer.exit(TokenType::Data);
             (State::Ok, None)
diff --git a/src/content/text.rs b/src/content/text.rs
index f61b390..6a30d4c 100644
--- a/src/content/text.rs
+++ b/src/content/text.rs
@@ -11,7 +11,6 @@
 //! * [Hard break (escape)][crate::construct::hard_break_escape]
 //! * [Hard break (trailing)][crate::construct::hard_break_trailing]
 //! * [Code (text)][crate::construct::code_text]
-//! * Line ending
 //! * Label start (image)
 //! * Label start (link)
 //! * [Character escape][crate::construct::character_escape]
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index da45ee5..0aae480 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -24,66 +24,52 @@ pub enum TokenType {
     AutolinkMarker,
     AutolinkProtocol,
     AutolinkEmail,
-
     AtxHeading,
     AtxHeadingSequence,
     AtxHeadingWhitespace,
     AtxHeadingText,
-
+    BlankLineEnding,
+    BlankLineWhitespace,
     CharacterEscape,
     CharacterEscapeMarker,
     CharacterEscapeValue,
-
     CharacterReference,
     CharacterReferenceMarker,
     CharacterReferenceMarkerNumeric,
     CharacterReferenceMarkerHexadecimal,
     CharacterReferenceMarkerSemi,
     CharacterReferenceValue,
-
     CodeFenced,
     CodeFencedFence,
     CodeFencedFenceSequence,
     CodeFencedFenceWhitespace,
     CodeFencedFenceInfo,
     CodeFencedFenceMeta,
-
+    CodeFlowChunk,
     CodeIndented,
     CodeIndentedPrefixWhitespace,
-
     CodeText,
     CodeTextSequence,
     CodeTextLineEnding,
     CodeTextData,
-
-    CodeFlowChunk,
-
+    Content,
     Data,
-
     HardBreakEscape,
     HardBreakEscapeMarker,
     HardBreakTrailing,
     HardBreakTrailingSpace,
-
     HtmlFlow,
     HtmlFlowData,
-
     HtmlText,
     HtmlTextData,
-
+    LineEnding,
+    Paragraph,
     ThematicBreak,
     ThematicBreakSequence,
     ThematicBreakWhitespace,
-
     Whitespace,
-    LineEnding,
-    BlankLineEnding,
-    BlankLineWhitespace,
-
-    Content,
-
-    Paragraph,
+    // Chunks are tokenizer, but unraveled by `subtokenize`.
     ChunkContent,
     ChunkString,
     ChunkText,
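Aside (not part of the commit): the compiler being refactored above walks a flat list of Enter/Exit events and pushes HTML onto a buffer, one match arm per token type; the commit mostly reorders those arms alphabetically. The following is a minimal, self-contained sketch of that pattern, with simplified stand-in names (Event, EventType, TokenType, compile, slice) rather than the crate's actual API.

// Sketch only: a tiny event-driven HTML compiler in the style of compiler.rs.
#[derive(Clone, Copy)]
enum TokenType {
    Paragraph,
    ThematicBreak,
    Data,
}

#[derive(Clone, Copy)]
enum EventType {
    Enter,
    Exit,
}

struct Event {
    event_type: EventType,
    token_type: TokenType,
    // The real compiler stores positions into the source and serializes them;
    // a plain string slice stands in for that here.
    slice: &'static str,
}

// Walk the flat event list once, pushing HTML fragments onto a single buffer.
fn compile(events: &[Event]) -> String {
    let mut buf = String::new();
    for event in events {
        match event.event_type {
            EventType::Enter => match event.token_type {
                TokenType::Paragraph => buf.push_str("<p>"),
                // Most token types produce no output on enter.
                _ => {}
            },
            EventType::Exit => match event.token_type {
                TokenType::Paragraph => buf.push_str("</p>"),
                TokenType::ThematicBreak => buf.push_str("<hr />"),
                TokenType::Data => buf.push_str(event.slice),
            },
        }
    }
    buf
}

fn main() {
    let events = [
        Event { event_type: EventType::Enter, token_type: TokenType::Paragraph, slice: "" },
        Event { event_type: EventType::Enter, token_type: TokenType::Data, slice: "hi" },
        Event { event_type: EventType::Exit, token_type: TokenType::Data, slice: "hi" },
        Event { event_type: EventType::Exit, token_type: TokenType::Paragraph, slice: "" },
    ];
    assert_eq!(compile(&events), "<p>hi</p>");
}

Because every token type gets its own arm in two large matches, keeping the arms in a fixed (alphabetical) order, as this commit does, makes it easier to spot missing or duplicated handlers.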