Diffstat
38 files changed, 687 insertions, 424 deletions
| diff --git a/src/compiler.rs b/src/compiler.rs index f2af8f1..db0df9b 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -19,26 +19,29 @@ struct Media {      /// Whether this represents an image (`true`) or a link or definition      /// (`false`).      image: bool, -    /// The text between the brackets (`x` in `![x]()` and `[x]()`), as an -    /// identifier, meaning that the original source characters are used -    /// instead of interpreting them. +    /// The text between the brackets (`x` in `![x]()` and `[x]()`). +    ///      /// Not interpreted.      label_id: Option<(usize, usize)>, -    /// The text between the brackets (`x` in `![x]()` and `[x]()`), as -    /// interpreted content. -    /// When this is a link, it can contain further text content and thus HTML +    /// The result of interpreting the text between the brackets +    /// (`x` in `![x]()` and `[x]()`). +    /// +    /// When this is a link, it contains further text content and thus HTML      /// tags.      /// Otherwise, when an image, text content is also allowed, but resulting      /// tags are ignored.      label: Option<String>, -    /// The text between the explicit brackets of the reference (`y` in +    /// The string between the explicit brackets of the reference (`y` in      /// `[x][y]`), as content. +    ///      /// Not interpreted.      reference_id: Option<(usize, usize)>,      /// The destination (url). +    ///      /// Interpreted string content.      destination: Option<String>,      /// The destination (url). +    ///      /// Interpreted string content.      title: Option<String>,  } @@ -46,10 +49,14 @@ struct Media {  /// Representation of a definition.  #[derive(Debug)]  struct Definition { +    /// Identifier. +    id: String,      /// The destination (url). +    ///      /// Interpreted string content.      destination: Option<String>,      /// The title. +    ///      /// Interpreted string content.      title: Option<String>,  } @@ -58,32 +65,55 @@ struct Definition {  #[allow(clippy::struct_excessive_bools)]  #[derive(Debug)]  struct CompileContext<'a> { -    /// Static info. +    // Static info. +    /// List of events.      pub events: &'a [Event], +    /// List of bytes.      pub bytes: &'a [u8], -    /// Fields used by handlers to track the things they need to track to -    /// compile markdown. -    pub atx_opening_sequence_size: Option<usize>, +    // Fields used by handlers to track the things they need to track to +    // compile markdown. +    /// Rank of heading (atx). +    pub heading_atx_rank: Option<usize>, +    /// Buffer of heading (setext) text.      pub heading_setext_buffer: Option<String>, +    /// Whether code (flow) contains data.      pub code_flow_seen_data: Option<bool>, +    /// Number of code (fenced) fenced.      pub code_fenced_fences_count: Option<usize>, +    /// Whether we are in code (text).      pub code_text_inside: bool, +    /// Whether we are in image text. +    pub image_alt_inside: bool, +    /// Marker of character reference.      pub character_reference_marker: Option<u8>, -    pub expect_first_item: Option<bool>, +    /// Whether we are expecting the first list item marker. +    pub list_expect_first_marker: Option<bool>, +    /// Stack of media (link, image).      pub media_stack: Vec<Media>, -    pub definitions: Vec<(String, Definition)>, +    /// Stack of containers.      pub tight_stack: Vec<bool>, -    /// Fields used to influance the current compilation. +    /// List of definitions. 
+    pub definitions: Vec<Definition>, +    // Fields used to influance the current compilation. +    /// Ignore the next line ending.      pub slurp_one_line_ending: bool, -    pub in_image_alt: bool, +    /// Whether to encode HTML.      pub encode_html: bool, -    /// Configuration +    // Configuration +    /// Whether to sanitize `href`s, and in which case, which protocols to +    /// allow.      pub protocol_href: Option<Vec<&'static str>>, +    /// Whether to sanitize `src`s, and in which case, which protocols to +    /// allow.      pub protocol_src: Option<Vec<&'static str>>, +    /// Line ending to use.      pub line_ending_default: LineEnding, +    /// Whether to allow HTML.      pub allow_dangerous_html: bool, -    /// Intermediate results. +    // Intermediate results. +    /// Stack of buffers.      pub buffers: Vec<String>, +    /// Current event index.      pub index: usize,  } @@ -98,18 +128,18 @@ impl<'a> CompileContext<'a> {          CompileContext {              events,              bytes, -            atx_opening_sequence_size: None, +            heading_atx_rank: None,              heading_setext_buffer: None,              code_flow_seen_data: None,              code_fenced_fences_count: None,              code_text_inside: false,              character_reference_marker: None, -            expect_first_item: None, +            list_expect_first_marker: None,              media_stack: vec![],              definitions: vec![],              tight_stack: vec![],              slurp_one_line_ending: false, -            in_image_alt: false, +            image_alt_inside: false,              encode_html: true,              protocol_href: if options.allow_dangerous_protocol {                  None @@ -258,7 +288,7 @@ pub fn compile(events: &[Event], bytes: &[u8], options: &Options) -> String {          .to_string()  } -// Handle the event at `index`. +/// Handle the event at `index`.  fn handle(context: &mut CompileContext, index: usize) {      context.index = index; @@ -389,7 +419,7 @@ fn on_enter_code_fenced(context: &mut CompileContext) {  /// Handle [`Enter`][Kind::Enter]:[`CodeText`][Name::CodeText].  fn on_enter_code_text(context: &mut CompileContext) {      context.code_text_inside = true; -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("<code>");      }      context.buffer(); @@ -416,7 +446,7 @@ fn on_enter_definition_destination_string(context: &mut CompileContext) {  /// Handle [`Enter`][Kind::Enter]:[`Emphasis`][Name::Emphasis].  fn on_enter_emphasis(context: &mut CompileContext) { -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("<em>");      }  } @@ -446,7 +476,7 @@ fn on_enter_image(context: &mut CompileContext) {          destination: None,          title: None,      }); -    context.in_image_alt = true; // Disallow tags. +    context.image_alt_inside = true; // Disallow tags.  }  /// Handle [`Enter`][Kind::Enter]:[`Link`][Name::Link]. @@ -556,21 +586,19 @@ fn on_enter_list(context: &mut CompileContext) {      } else {          "<ul"      }); -    context.expect_first_item = Some(true); +    context.list_expect_first_marker = Some(true);  }  /// Handle [`Enter`][Kind::Enter]:[`ListItemMarker`][Name::ListItemMarker].  
fn on_enter_list_item_marker(context: &mut CompileContext) { -    let expect_first_item = context.expect_first_item.take().unwrap(); - -    if expect_first_item { +    if context.list_expect_first_marker.take().unwrap() {          context.push(">");      }      context.line_ending_if_needed();      context.push("<li>"); -    context.expect_first_item = Some(false); +    context.list_expect_first_marker = Some(false);  }  /// Handle [`Enter`][Kind::Enter]:[`Paragraph`][Name::Paragraph]. @@ -599,7 +627,7 @@ fn on_enter_resource_destination_string(context: &mut CompileContext) {  /// Handle [`Enter`][Kind::Enter]:[`Strong`][Name::Strong].  fn on_enter_strong(context: &mut CompileContext) { -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("<strong>");      }  } @@ -612,7 +640,7 @@ fn on_exit_autolink_email(context: &mut CompileContext) {      );      let value = slice.as_str(); -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("<a href=\"");          context.push(&sanitize_uri(              &format!("mailto:{}", value), @@ -623,7 +651,7 @@ fn on_exit_autolink_email(context: &mut CompileContext) {      context.push(&encode(value, context.encode_html)); -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("</a>");      }  } @@ -636,7 +664,7 @@ fn on_exit_autolink_protocol(context: &mut CompileContext) {      );      let value = slice.as_str(); -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("<a href=\"");          context.push(&sanitize_uri(value, &context.protocol_href));          context.push("\">"); @@ -644,14 +672,14 @@ fn on_exit_autolink_protocol(context: &mut CompileContext) {      context.push(&encode(value, context.encode_html)); -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("</a>");      }  }  /// Handle [`Exit`][Kind::Exit]:{[`HardBreakEscape`][Name::HardBreakEscape],[`HardBreakTrailing`][Name::HardBreakTrailing]}.  fn on_exit_break(context: &mut CompileContext) { -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("<br />");      }  } @@ -748,11 +776,6 @@ fn on_exit_code_fenced_fence_info(context: &mut CompileContext) {  /// Handle [`Exit`][Kind::Exit]:{[`CodeFenced`][Name::CodeFenced],[`CodeIndented`][Name::CodeIndented]}.  fn on_exit_code_flow(context: &mut CompileContext) { -    let seen_data = context -        .code_flow_seen_data -        .take() -        .expect("`code_flow_seen_data` must be defined"); -      // One special case is if we are inside a container, and the fenced code was      // not closed (meaning it runs to the end).      // In that case, the following line ending, is considered *outside* the @@ -772,7 +795,11 @@ fn on_exit_code_flow(context: &mut CompileContext) {      // But in most cases, it’s simpler: when we’ve seen some data, emit an extra      // line ending when needed. 
-    if seen_data { +    if context +        .code_flow_seen_data +        .take() +        .expect("`code_flow_seen_data` must be defined") +    {          context.line_ending_if_needed();      } @@ -814,7 +841,7 @@ fn on_exit_code_text(context: &mut CompileContext) {      context.code_text_inside = false;      context.push(str::from_utf8(bytes).unwrap()); -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("</code>");      }  } @@ -846,13 +873,11 @@ fn on_exit_definition(context: &mut CompileContext) {      let id =          normalize_identifier(Slice::from_indices(context.bytes, indices.0, indices.1).as_str()); -    context.definitions.push(( +    context.definitions.push(Definition {          id, -        Definition { -            destination: media.destination, -            title: media.title, -        }, -    )); +        destination: media.destination, +        title: media.title, +    });  }  /// Handle [`Exit`][Kind::Exit]:[`DefinitionDestinationString`][Name::DefinitionDestinationString]. @@ -878,7 +903,7 @@ fn on_exit_definition_title_string(context: &mut CompileContext) {  /// Handle [`Exit`][Kind::Exit]:[`Strong`][Name::Emphasis].  fn on_exit_emphasis(context: &mut CompileContext) { -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("</em>");      }  } @@ -886,9 +911,9 @@ fn on_exit_emphasis(context: &mut CompileContext) {  /// Handle [`Exit`][Kind::Exit]:[`HeadingAtx`][Name::HeadingAtx].  fn on_exit_heading_atx(context: &mut CompileContext) {      let rank = context -        .atx_opening_sequence_size +        .heading_atx_rank          .take() -        .expect("`atx_opening_sequence_size` must be set in headings"); +        .expect("`heading_atx_rank` must be set in headings");      context.push("</h");      context.push(&rank.to_string()); @@ -898,14 +923,14 @@ fn on_exit_heading_atx(context: &mut CompileContext) {  /// Handle [`Exit`][Kind::Exit]:[`HeadingAtxSequence`][Name::HeadingAtxSequence].  fn on_exit_heading_atx_sequence(context: &mut CompileContext) {      // First fence we see. -    if context.atx_opening_sequence_size.is_none() { +    if context.heading_atx_rank.is_none() {          let rank = Slice::from_position(              context.bytes,              &Position::from_exit_event(context.events, context.index),          )          .len();          context.line_ending_if_needed(); -        context.atx_opening_sequence_size = Some(rank); +        context.heading_atx_rank = Some(rank);          context.push("<h");          context.push(&rank.to_string());          context.push(">"); @@ -930,7 +955,7 @@ fn on_exit_heading_setext_underline(context: &mut CompileContext) {      let text = context          .heading_setext_buffer          .take() -        .expect("`atx_opening_sequence_size` must be set in headings"); +        .expect("`heading_atx_rank` must be set in headings");      let head = Slice::from_position(          context.bytes,          &Position::from_exit_event(context.events, context.index), @@ -1034,9 +1059,7 @@ fn on_exit_list_item(context: &mut CompileContext) {  /// Handle [`Exit`][Kind::Exit]:[`ListItemValue`][Name::ListItemValue].  
fn on_exit_list_item_value(context: &mut CompileContext) { -    let expect_first_item = context.expect_first_item.unwrap(); - -    if expect_first_item { +    if context.list_expect_first_marker.unwrap() {          let slice = Slice::from_position(              context.bytes,              &Position::from_exit_event(context.events, context.index), @@ -1066,11 +1089,11 @@ fn on_exit_media(context: &mut CompileContext) {          index += 1;      } -    context.in_image_alt = is_in_image; +    context.image_alt_inside = is_in_image;      let media = context.media_stack.pop().unwrap();      let label = media.label.unwrap(); -    let in_image_alt = context.in_image_alt; +    let image_alt_inside = context.image_alt_inside;      let id = media.reference_id.or(media.label_id).map(|indices| {          normalize_identifier(Slice::from_indices(context.bytes, indices.0, indices.1).as_str())      }); @@ -1080,7 +1103,7 @@ fn on_exit_media(context: &mut CompileContext) {              let mut index = 0;              while index < context.definitions.len() { -                if context.definitions[index].0 == id { +                if context.definitions[index].id == id {                      return Some(index);                  } @@ -1093,7 +1116,7 @@ fn on_exit_media(context: &mut CompileContext) {          None      }; -    if !in_image_alt { +    if !image_alt_inside {          if media.image {              context.push("<img src=\"");          } else { @@ -1101,7 +1124,7 @@ fn on_exit_media(context: &mut CompileContext) {          };          let destination = if let Some(index) = definition_index { -            context.definitions[index].1.destination.as_ref() +            context.definitions[index].destination.as_ref()          } else {              media.destination.as_ref()          }; @@ -1126,11 +1149,11 @@ fn on_exit_media(context: &mut CompileContext) {          context.push(&label);      } -    if !in_image_alt { +    if !image_alt_inside {          context.push("\"");          let title = if let Some(index) = definition_index { -            context.definitions[index].1.title.clone() +            context.definitions[index].title.clone()          } else {              media.title          }; @@ -1151,7 +1174,7 @@ fn on_exit_media(context: &mut CompileContext) {      if !media.image {          context.push(&label); -        if !in_image_alt { +        if !image_alt_inside {              context.push("</a>");          }      } @@ -1192,7 +1215,7 @@ fn on_exit_resource_title_string(context: &mut CompileContext) {  /// Handle [`Exit`][Kind::Exit]:[`Strong`][Name::Strong].  fn on_exit_strong(context: &mut CompileContext) { -    if !context.in_image_alt { +    if !context.image_alt_inside {          context.push("</strong>");      }  } diff --git a/src/constant.rs b/src/constant.rs index 47cb50c..b856fd0 100644 --- a/src/constant.rs +++ b/src/constant.rs @@ -254,8 +254,7 @@ pub const THEMATIC_BREAK_MARKER_COUNT_MIN: usize = 3;  // Important: please touch the below lists as few times as possible to keep Git small. -/// List of names that can form named [character reference][character_reference]s -/// and corresponding values. +/// List of names and values that form named [character reference][character_reference]s.  ///  /// This list is sensitive to casing.  /// diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 1dc8868..21407b7 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -1,22 +1,29 @@ -//! 
Attention is a construct that occurs in the [text][] content type. +//! Attention (emphasis and strong) occurs in the [text][] content type.  //! -//! How attention parses is too complex to explain in BNF. -//! Essentially, one or more of `*` or `_` form attention sequences. -//! Depending on the code before and after a sequence, it can open or close -//! attention. -//! When everything is parsed, we find each sequence that can close, and a -//! corresponding sequence that can open which uses the same marker. -//! If both sequences have two or more markers, strong is formed. -//! Otherwise emphasis is formed. +//! ## Grammar  //! -//! Attention sequences do not, on their own, relate to anything in HTML. -//! When matched with another sequence, and two markers can be “taken” from -//! them, they together relate to the `<strong>` element in HTML. +//! Attention sequences form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>): +//! +//! ```bnf +//! attention_sequence ::= 1*'*' | 1*'_' +//! ``` +//! +//! Sequences are matched together to form attention based on which character +//! they contain, and what character occurs before and after each sequence. +//! Otherwise they are turned into data. +//! +//! ## HTML +//! +//! When sequences match, and two markers can be “taken” from them, they +//! together relate to the `<strong>` element in HTML.  //! When one marker can be taken, they relate to the `<em>` element.  //! See [*§ 4.5.2 The `em` element*][html-em] and  //! [*§ 4.5.3 The `strong` element*][html-strong] in the HTML spec for more  //! info.  //! +//! ## Recommendation +//!  //! It is recommended to use asterisks for attention when writing markdown.  //!  //! There are some small differences in whether sequences can open and/or close diff --git a/src/construct/autolink.rs b/src/construct/autolink.rs index 37e21d9..9890aaf 100644 --- a/src/construct/autolink.rs +++ b/src/construct/autolink.rs @@ -1,22 +1,24 @@ -//! Autolinks are a construct that occurs in the [text][] content type. +//! Autolinks occur in the [text][] content type.  //! -//! It forms with the following BNF: +//! ## Grammar +//! +//! Autolinks form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf -//! autolink ::= '<' ( url | email ) '>' +//! autolink ::= '<' (url | email) '>' +//! +//! url ::= protocol *url_byte +//! protocol ::= ascii_alphabetic 0*31(protocol_byte) ':' +//! protocol_byte ::= '+' '-' '.' ascii_alphanumeric +//! url_byte ::= byte - ascii_control - ' '  //! -//! url ::= ascii_alphabetic 0*31( '+' '-' '.' ascii_alphanumeric ) ':' *( code - ascii_control - '\r' - '\n' - ' ') -//! email ::= 1*ascii_atext '@' domain *('.' domain) +//! email ::= 1*ascii_atext '@' email_domain *('.' email_domain)  //! ; Restriction: up to (including) 63 character are allowed in each domain. -//! domain ::= ascii_alphanumeric *( ascii_alphanumeric | '-' ascii_alphanumeric ) -//! ascii_atext ::= ascii_alphanumeric | '#' .. '\'' | '*' | '+' | '-' | '/' | '=' | '?' | '^' .. '`' | '{' .. '~' -//! ``` +//! email_domain ::= ascii_alphanumeric *(ascii_alphanumeric | '-' ascii_alphanumeric)  //! -//! Autolinks relate to the `<a>` element in HTML. -//! See [*§ 4.5.1 The `a` element*][html-a] in the HTML spec for more info. -//! When an email autolink is used (so, without a protocol), the string -//! `mailto:` is prepended before the email, when generating the `href` -//! attribute of the hyperlink. +//! 
ascii_atext ::= ascii_alphanumeric | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '*' | '+' | '-' | '/' | '=' | '?' | '^' | '_' | '`' | '{' | '|' | '}' | '~' +//! ```  //!  //! The maximum allowed size of a scheme is `31` (inclusive), which is defined  //! in [`AUTOLINK_SCHEME_SIZE_MAX`][autolink_scheme_size_max]. @@ -41,7 +43,7 @@  //! There are several cases where incorrect encoding of URLs would, in other  //! languages, result in a parse error.  //! In markdown, there are no errors, and URLs are normalized. -//! In addition, unicode characters are percent encoded +//! In addition, many characters are percent encoded  //! ([`sanitize_uri`][sanitize_uri]).  //! For example:  //! @@ -82,6 +84,22 @@  //! <p><a href="#"></a><a href="https://example.com">https://example.com</a></p>  //! ```  //! +//! ## HTML +//! +//! Autolinks relate to the `<a>` element in HTML. +//! See [*§ 4.5.1 The `a` element*][html_a] in the HTML spec for more info. +//! When an email autolink is used (so, without a protocol), the string +//! `mailto:` is prepended before the email, when generating the `href` +//! attribute of the hyperlink. +//! +//! ## Recommendation +//! +//! It is recommended to use labels ([label start link][label_start_link], +//! [label end][label_end]), either with a resource or a definition +//! ([definition][]), instead of autolinks, as those allow more characters in +//! URLs, and allow relative URLs and `www.` URLs. +//! They also allow for descriptive text to explain the URL in prose. +//!  //! ## Tokens  //!  //! *   [`Autolink`][Name::Autolink] @@ -95,11 +113,13 @@  //! *   [*§ 6.4 Autolinks* in `CommonMark`](https://spec.commonmark.org/0.30/#autolinks)  //!  //! [text]: crate::construct::text +//! [definition]: crate::construct::definition +//! [label_start_link]: crate::construct::label_start_link  //! [label_end]: crate::construct::label_end  //! [autolink_scheme_size_max]: crate::constant::AUTOLINK_SCHEME_SIZE_MAX  //! [autolink_domain_size_max]: crate::constant::AUTOLINK_DOMAIN_SIZE_MAX  //! [sanitize_uri]: crate::util::sanitize_uri -//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element +//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element  use crate::constant::{AUTOLINK_DOMAIN_SIZE_MAX, AUTOLINK_SCHEME_SIZE_MAX};  use crate::event::Name; @@ -293,7 +313,7 @@ pub fn email_label(tokenizer: &mut Tokenizer) -> State {              tokenizer.tokenize_state.size = 0;              let index = tokenizer.events.len();              tokenizer.exit(Name::AutolinkProtocol); -            // Change the token type. +            // Change the event name.              tokenizer.events[index - 1].name = Name::AutolinkEmail;              tokenizer.events[index].name = Name::AutolinkEmail;              tokenizer.enter(Name::AutolinkMarker); diff --git a/src/construct/blank_line.rs b/src/construct/blank_line.rs index 928b8cc..5be406d 100644 --- a/src/construct/blank_line.rs +++ b/src/construct/blank_line.rs @@ -1,22 +1,39 @@ -//! Blank lines are a construct that occurs in the [flow][] content type. +//! Blank lines occur in the [flow][] content type.  //! -//! They’re formed with the following BNF: +//! ## Grammar +//! +//! Blank lines form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf -//! blank_line ::= *(' ' '\t') +//! blank_line ::= *space_or_tab  //! ```  //! +//! As this construct occurs in flow, like all flow constructs, it must be +//! 
followed by an eol (line ending) or eof (end of file). +//!  //! Blank lines are sometimes needed, such as to differentiate a [paragraph][]  //! from another paragraph.  //! In several cases, blank lines are not needed between flow constructs, -//! such as between two [heading (atx)][heading-atx]s. +//! such as between two [heading (atx)][heading_atx]s.  //! Sometimes, whether blank lines are present, changes the behavior of how -//! HTML is rendered, such as whether blank lines are present between list -//! items in a [list][list-item]. +//! HTML is rendered, such as whether blank lines are present inside or between +//! [list items][list_item].  //! More than one blank line is never needed in `CommonMark`.  //!  //! Because blank lines can be empty (line endings are not considered part of -//! it), and events cannot be empty, blank lines are not present as a token. +//! it), and events cannot be empty, blank lines are not present as an event. +//! +//! ## HTML +//! +//! Blank lines do not relate an element in HTML, except for the role they play +//! when inside or between [list items][list_item]. +//! +//! ## Recommendation +//! +//! It is recommended to always use a blank line between every flow construct, +//! to use blank lines (consistently) between list items as desired, and to +//! never use more than one blank line.  //!  //! ## Tokens  //! @@ -27,8 +44,8 @@  //! *   [`blank-line.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/blank-line.js)  //! *   [*§ 4.9 Blank lines* in `CommonMark`](https://spec.commonmark.org/0.30/#blank-lines)  //! -//! [heading-atx]: crate::construct::heading_atx -//! [list-item]: crate::construct::list_item +//! [heading_atx]: crate::construct::heading_atx +//! [list_item]: crate::construct::list_item  //! [paragraph]: crate::construct::paragraph  //! [flow]: crate::construct::flow diff --git a/src/construct/block_quote.rs b/src/construct/block_quote.rs index 37726c5..8d7e227 100644 --- a/src/construct/block_quote.rs +++ b/src/construct/block_quote.rs @@ -1,6 +1,9 @@ -//! Block quote is a construct that occurs in the [document][] content type. +//! Block quotes occur in the [document][] content type.  //! -//! It forms with the following BNF: +//! ## Grammar +//! +//! Block quotes form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf  //! block_quote_start ::= '>' [ space_or_tab ] @@ -9,14 +12,24 @@  //!  //! Further lines that are not prefixed with `block_quote_cont` cause the block  //! quote to be exited, except when those lines are lazy continuation. -//! Like so many things in markdown, block quotes too, are very complex. -//! See [*§ Phase 1: block structure*][commonmark-block] for more on parsing -//! details. +//! Like so many things in markdown, block quotes too, are complex. +//! See [*§ Phase 1: block structure* in `CommonMark`][commonmark-block] for +//! more on parsing details. +//! +//! As block quote is a container, it takes several bytes from the start of the +//! line, while the rest of the line includes more containers or flow. +//! +//! ## HTML  //!  //! Block quote relates to the `<blockquote>` element in HTML.  //! See [*§ 4.4.4 The `blockquote` element*][html-blockquote] in the HTML spec  //! for more info.  //! +//! ## Recommendation +//! +//! Always use a single space after a block quote marker (`>`). +//! Never use lazy continuation. +//!  //! ## Tokens  //!  //! 
*   [`BlockQuote`][Name::BlockQuote] diff --git a/src/construct/character_escape.rs b/src/construct/character_escape.rs index 6dac458..438092e 100644 --- a/src/construct/character_escape.rs +++ b/src/construct/character_escape.rs @@ -1,7 +1,9 @@ -//! Character escapes are a construct that occurs in the [string][] and -//! [text][] content types. +//! Character escapes occur in the [string][] and [text][] content types.  //! -//! They’re formed with the following BNF: +//! ## Grammar +//! +//! Character escapes form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf  //! character_escape ::= '\\' ascii_punctuation @@ -10,13 +12,20 @@  //! Like much of markdown, there are no “invalid” character escapes: just a  //! slash, or a slash followed by anything other than an ASCII punctuation  //! character, is exactly that: just a slash. -//! To escape (most) arbitrary characters, use a -//! [character reference][character_reference] instead +//! +//! To escape (almost all) arbitrary characters instead of only ASCII +//! punctuation, use a [character reference][character_reference] instead  //! (as in, `&`, `{`, or say `	`). +//!  //! It is also possible to escape a line ending in text with a similar  //! construct: a [hard break (escape)][hard_break_escape] is a backslash followed  //! by a line ending (that is part of the construct instead of ending it).  //! +//! ## Recommendation +//! +//! If possible, use a character escape. +//! Otherwise, use a character reference. +//!  //! ## Tokens  //!  //! *   [`CharacterEscape`][Name::CharacterEscape] diff --git a/src/construct/character_reference.rs b/src/construct/character_reference.rs index 7935109..3bdc636 100644 --- a/src/construct/character_reference.rs +++ b/src/construct/character_reference.rs @@ -1,25 +1,27 @@ -//! Character references are a construct that occurs in the [string][] and -//! [text][] content types. +//! Character references occur in the [string][] and [text][] content types.  //! -//! They’re formed with the following BNF: +//! ## Grammar +//! +//! Character references form with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf  //! character_reference ::= '&' (numeric | named) ';'  //!  //! numeric ::= '#' (hexadecimal | decimal) -//! ; Note: Limit of `6` imposed as all bigger numbers are invalid: +//! ; Note: Limit of `6` imposed, as all bigger numbers are invalid.  //! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit) -//! ; Note: Limit of `7` imposed as all bigger numbers are invalid: +//! ; Note: Limit of `7` imposed, as all bigger numbers are invalid.  //! decimal ::= 1*7(ascii_digit) -//! ; Note: Limit of `31` imposed by `CounterClockwiseContourIntegral`: +//! ; Note: Limit of `31` imposed, for `CounterClockwiseContourIntegral`.  //! ; Note: Limited to any known named character reference (see `constants.rs`)  //! named ::= 1*31(ascii_alphanumeric)  //! ```  //!  //! Like much of markdown, there are no “invalid” character references.  //! However, for security reasons, several numeric character references parse -//! fine but are not rendered as their corresponding character and they are -//! instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`). +//! fine but are not rendered as their corresponding character. +//! They are instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`).  //! See [`decode_numeric`][decode_numeric] for more info.  //!  //! 
To escape ASCII punctuation characters, use the terser @@ -33,13 +35,18 @@  //!  //! Character references are parsed insensitive to casing.  //! The casing of hexadecimal numeric character references has no effect. -//! The casing of named character references does not matter when parsing them, -//! but does affect whether they match. +//! The casing of named character references does not matter when parsing, but +//! does affect whether they match.  //! Depending on the name, one or more cases are allowed, such as that `AMP`  //! and `amp` are both allowed but other cases are not.  //! See [`CHARACTER_REFERENCES`][character_references] for which  //! names match.  //! +//! ## Recommendation +//! +//! If possible, use a character escape. +//! Otherwise, use a character reference. +//!  //! ## Tokens  //!  //! *   [`CharacterReference`][Name::CharacterReference] diff --git a/src/construct/code_fenced.rs b/src/construct/code_fenced.rs index 3812d44..748e38f 100644 --- a/src/construct/code_fenced.rs +++ b/src/construct/code_fenced.rs @@ -1,9 +1,12 @@ -//! Code (fenced) is a construct that occurs in the [flow][] content type. +//! Code (fenced) occurs in the [flow][] content type.  //! -//! It forms with the following BNF: +//! ## Grammar +//! +//! Code (fenced) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf -//! code_fenced ::= fence_open *( eol *code ) [ eol fence_close ] +//! code_fenced ::= fence_open *( eol *byte ) [ eol fence_close ]  //!  //! fence_open ::= sequence [ 1*space_or_tab info [ 1*space_or_tab meta ] ] *space_or_tab  //! ; Restriction: the number of markers in the closing fence sequence must be @@ -13,41 +16,53 @@  //! ; marker in the opening fence sequence  //! fence_close ::= sequence *space_or_tab  //! sequence ::= 3*'`' | 3*'~' +//! ; Restriction: the `` ` `` character cannot occur in `info` if it is the marker.  //! info ::= 1*text +//! ; Restriction: the `` ` `` character cannot occur in `meta` if it is the marker.  //! meta ::= 1*text *( *space_or_tab 1*text ) -//! -//! ; Restriction: the `` ` `` character cannot occur in `text` if it is the -//! ; marker of the opening fence sequence. -//! text ::= code - eol - space_or_tab -//! eol ::= '\r' | '\r\n' | '\n' -//! space_or_tab ::= ' ' | '\t' -//! code ::= . ; any unicode code point (other than line endings).  //! ```  //! -//! The above grammar does not show how whitespace is handled. -//! To parse code (fenced), let `X` be the number of whitespace characters +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file). +//! +//! The above grammar does not show how indentation (with `space_or_tab`) of +//! each line is handled. +//! To parse code (fenced), let `x` be the number of `space_or_tab` characters  //! before the opening fence sequence.  //! Each line of text is then allowed (not required) to be indented with up -//! to `X` spaces or tabs, which are then ignored as an indent instead of being +//! to `x` spaces or tabs, which are then ignored as an indent instead of being  //! considered as part of the code.  //! This indent does not affect the closing fence.  //! It can be indented up to a separate 3 spaces or tabs.  //! A bigger indent makes it part of the code instead of a fence.  //! -//! Code (fenced) relates to both the `<pre>` and the `<code>` elements in -//! HTML. -//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` -//! 
element*][html-code] in the HTML spec for more info. +//! The `info` and `meta` parts are interpreted as the [string][] content type. +//! That means that [character escapes][character_escape] and +//! [character references][character_reference] are allowed.  //!  //! The optional `meta` part is ignored: it is not used when parsing or  //! rendering. +//!  //! The optional `info` part is used and is expected to specify the programming  //! language that the code is in.  //! Which value it holds depends on what your syntax highlighter supports, if  //! one is used. +//! +//! In markdown, it is also possible to use [code (text)][code_text] in the +//! [text][] content type. +//! It is also possible to create code with the +//! [code (indented)][code_indented] construct. +//! +//! ## HTML +//! +//! Code (fenced) relates to both the `<pre>` and the `<code>` elements in +//! HTML. +//! See [*§ 4.4.3 The `pre` element*][html_pre] and the [*§ 4.5.15 The `code` +//! element*][html_code] in the HTML spec for more info. +//!  //! The `info` is, when rendering to HTML, typically exposed as a class.  //! This behavior stems from the HTML spec ([*§ 4.5.15 The `code` -//! element*][html-code]). +//! element*][html_code]).  //! For example:  //!  //! ```markdown @@ -63,17 +78,11 @@  //! </code></pre>  //! ```  //! -//! The `info` and `meta` parts are interpreted as the [string][] content type. -//! That means that [character escapes][character_escape] and -//! [character references][character_reference] are allowed. +//! ## Recommendation  //! -//! In markdown, it is also possible to use [code (text)][code_text] in the -//! [text][] content type. -//! It is also possible to create code with the -//! [code (indented)][code_indented] construct. -//! That construct is less explicit, different from code (text), and has no -//! support for specifying the programming language, so it is recommended to -//! use code (fenced) instead of code (indented). +//! It is recommended to use code (fenced) instead of code (indented). +//! Code (fenced) is more explicit, similar to code (text), and has support +//! for specifying the programming language.  //!  //! ## Tokens  //! @@ -94,12 +103,12 @@  //! [flow]: crate::construct::flow  //! [string]: crate::construct::string  //! [text]: crate::construct::text -//! [code_indented]: crate::construct::code_indented -//! [code_text]: crate::construct::code_text  //! [character_escape]: crate::construct::character_escape  //! [character_reference]: crate::construct::character_reference -//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element -//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! [code_indented]: crate::construct::code_indented +//! [code_text]: crate::construct::code_text +//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! [html_pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element  use crate::constant::{CODE_FENCED_SEQUENCE_SIZE_MIN, TAB_SIZE};  use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; diff --git a/src/construct/code_indented.rs b/src/construct/code_indented.rs index e3a5333..89c5652 100644 --- a/src/construct/code_indented.rs +++ b/src/construct/code_indented.rs @@ -1,30 +1,38 @@ -//! Code (indented) is a construct that occurs in the [flow][] content type. +//! Code (indented) occurs in the [flow][] content type.  //! -//! 
It forms with the following BNF: +//! ## Grammar +//! +//! Code (indented) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf -//! code_indented ::= indented_filled_line *( eol *( blank_line eol ) indented_filled_line ) +//! code_indented ::= filled_line *( eol *( blank_line eol ) filled_line )  //! -//! ; Restriction: at least one `code` must not be whitespace. -//! indented_filled_line ::= 4space_or_tab *code +//! ; Restriction: at least one `line` byte must be `text`. +//! filled_line ::= 4(space_or_tab) *line  //! blank_line ::= *space_or_tab -//! eol ::= '\r' | '\r\n' | '\n' -//! code ::= . ; any unicode code point (other than line endings). -//! space_or_tab ::= ' ' | '\t'  //! ```  //! -//! Code (indented) relates to both the `<pre>` and the `<code>` elements in -//! HTML. -//! See [*§ 4.4.3 The `pre` element*][html-pre] and the [*§ 4.5.15 The `code` -//! element*][html-code] in the HTML spec for more info. +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file).  //!  //! In markdown, it is also possible to use [code (text)][code_text] in the  //! [text][] content type.  //! It is also possible to create code with the [code (fenced)][code_fenced]  //! construct. -//! That construct is more explicit, more similar to code (text), and has -//! support for specifying the programming language that the code is in, so it -//! is recommended to use that instead of indented code. +//! +//! ## HTML +//! +//! Code (indented) relates to both the `<pre>` and the `<code>` elements in +//! HTML. +//! See [*§ 4.4.3 The `pre` element*][html_pre] and the [*§ 4.5.15 The `code` +//! element*][html_code] in the HTML spec for more info. +//! +//! ## Recommendation +//! +//! It is recommended to use code (fenced) instead of code (indented). +//! Code (fenced) is more explicit, similar to code (text), and has support +//! for specifying the programming language.  //!  //! ## Tokens  //! @@ -40,10 +48,10 @@  //!  //! [flow]: crate::construct::flow  //! [text]: crate::construct::text -//! [code_text]: crate::construct::code_text  //! [code_fenced]: crate::construct::code_fenced -//! [html-pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element -//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! [code_text]: crate::construct::code_text +//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! [html_pre]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-pre-element  use super::partial_space_or_tab::{space_or_tab, space_or_tab_min_max};  use crate::constant::TAB_SIZE; diff --git a/src/construct/code_text.rs b/src/construct/code_text.rs index 7ebee96..413b5ee 100644 --- a/src/construct/code_text.rs +++ b/src/construct/code_text.rs @@ -1,12 +1,16 @@ -//! Code (text) is a construct that occurs in the [text][] content type. +//! Code (text) occurs in the [text][] content type.  //! -//! It forms with the following BNF: +//! ## Grammar +//! +//! Code (text) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf  //! ; Restriction: the number of markers in the closing sequence must be equal  //! ; to the number of markers in the opening sequence. -//! code_text ::= sequence 1*code sequence +//! code_text ::= sequence 1*byte sequence  //! +//! 
; Restriction: not preceded or followed by `` ` ``.  //! sequence ::= 1*'`'  //! ```  //! @@ -18,15 +22,13 @@  //! Include more: `a``b` or include less: ``a`b``.  //! ```  //! -//! When turning markdown into HTML, each line ending is turned into a space. -//!  //! It is also possible to include just one grave accent (tick):  //!  //! ```markdown  //! Include just one: `` ` ``.  //! ```  //! -//! Sequences are “gready”, in that they cannot be preceded or succeeded by +//! Sequences are “gready”, in that they cannot be preceded or followed by  //! more grave accents (ticks).  //! To illustrate:  //! @@ -53,17 +55,17 @@  //! if both exist and there is also a non-space in the code, are removed.  //! Line endings, at that stage, are considered as spaces.  //! -//! Code (text) relates to the `<code>` element in HTML. -//! See [*§ 4.5.15 The `code` element*][html-code] in the HTML spec for more -//! info. -//!  //! In markdown, it is possible to create code with the  //! [code (fenced)][code_fenced] or [code (indented)][code_indented] constructs  //! in the [flow][] content type. -//! Compared to code (indented), fenced code is more explicit and more similar -//! to code (text), and it has support for specifying the programming language -//! that the code is in, so it is recommended to use that instead of indented -//! code. +//! +//! ## HTML +//! +//! Code (text) relates to the `<code>` element in HTML. +//! See [*§ 4.5.15 The `code` element*][html_code] in the HTML spec for more +//! info. +//! +//! When turning markdown into HTML, each line ending is turned into a space.  //!  //! ## Tokens  //! @@ -81,7 +83,7 @@  //! [text]: crate::construct::text  //! [code_indented]: crate::construct::code_indented  //! [code_fenced]: crate::construct::code_fenced -//! [html-code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element +//! [html_code]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-code-element  use crate::event::Name;  use crate::state::{Name as StateName, State}; diff --git a/src/construct/definition.rs b/src/construct/definition.rs index 8f274ee..071e595 100644 --- a/src/construct/definition.rs +++ b/src/construct/definition.rs @@ -1,31 +1,29 @@ -//! Definition is a construct that occurs in the [flow] content type. +//! Definition occurs in the [flow] content type.  //! -//! They’re formed with the following BNF: +//! ## Grammar +//! +//! Definition forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf -//! definition ::= label ':' [ whitespace ] destination [ whitespace title ] [ space_or_tab ] +//! definition ::= label ':' [ space_or_tab_eol ] destination [ space_or_tab_eol title ] [ space_or_tab ]  //!  //! ; See the `destination`, `title`, and `label` constructs for the BNF of  //! ; those parts.  //! ```  //! -//! See [`destination`][destination], [`label`][label], and [`title`][title] -//! for grammar, notes, and recommendations. +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file).  //! -//! Definitions in markdown do not, on their own, relate to anything in HTML. -//! When matched with a [label end (reference)][label_end], they together -//! relate to the `<a>` or `<img>` elements in HTML. -//! The definition forms its `href` or `src`, and optionally `title`, -//! attributes. -//! See [*§ 4.5.1 The `a` element*][html-a] and -//! 
[*§ 4.8.3 The `img` element*][html-img] in the HTML spec for more info. +//! See [`destination`][destination], [`label`][label], and [`title`][title] +//! for grammar, notes, and recommendations on each part.  //!  //! The `destination`, `label`, and `title` parts are interpreted as the  //! [string][] content type.  //! That means that [character escapes][character_escape] and  //! [character references][character_reference] are allowed.  //! -//! Definitions match to references through their label. +//! Definitions match to references through identifiers.  //! To match, both labels must be equal after normalizing with  //! [`normalize_identifier`][normalize_identifier].  //! One definition can match to multiple references. @@ -57,6 +55,16 @@  //! `<img>` when compiling, see  //! [`sanitize_uri`][sanitize_uri].  //! +//! ## HTML +//! +//! Definitions in markdown do not, on their own, relate to anything in HTML. +//! When matched with a [label end (reference)][label_end], they together +//! relate to the `<a>` or `<img>` elements in HTML. +//! The definition forms its `href` or `src`, and optionally `title`, +//! attributes. +//! See [*§ 4.5.1 The `a` element*][html_a] and +//! [*§ 4.8.3 The `img` element*][html_img] in the HTML spec for more info. +//!  //! ## Tokens  //!  //! *   [`Definition`][Name::Definition] @@ -84,14 +92,14 @@  //! [string]: crate::construct::string  //! [character_escape]: crate::construct::character_escape  //! [character_reference]: crate::construct::character_reference -//! [label_end]: crate::construct::label_end  //! [destination]: crate::construct::partial_destination -//! [title]: crate::construct::partial_title  //! [label]: crate::construct::partial_label +//! [label_end]: crate::construct::label_end +//! [title]: crate::construct::partial_title  //! [sanitize_uri]: crate::util::sanitize_uri::sanitize_uri  //! [normalize_identifier]: crate::util::normalize_identifier -//! [html-a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element -//! [html-img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element +//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element +//! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element  use crate::construct::partial_space_or_tab::space_or_tab;  use crate::construct::partial_space_or_tab_eol::space_or_tab_eol; diff --git a/src/construct/document.rs b/src/construct/document.rs index 9def6c5..0cda368 100644 --- a/src/construct/document.rs +++ b/src/construct/document.rs @@ -6,7 +6,7 @@  //! The constructs found in flow are:  //!  //! *   [Block quote][crate::construct::block_quote] -//! *   [List][crate::construct::list_item] +//! *   [List item][crate::construct::list_item]  use crate::event::{Content, Event, Kind, Link, Name};  use crate::state::{Name as StateName, State}; @@ -409,7 +409,7 @@ fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) {                  }              } -            debug_assert!(found, "expected to find container token to exit"); +            debug_assert!(found, "expected to find container event to exit");          }          if let Some(ref mut list) = tokenizer.tokenize_state.document_exits[index] { diff --git a/src/construct/hard_break_escape.rs b/src/construct/hard_break_escape.rs index 1fafa0b..64c909a 100644 --- a/src/construct/hard_break_escape.rs +++ b/src/construct/hard_break_escape.rs @@ -1,28 +1,33 @@ -//! 
Hard break (escape) is a construct that occurs in the  [text][] content -//! type. +//! Hard break (escape) occurs in the  [text][] content type.  //! -//! They’re formed with the following BNF: +//! ## Grammar +//! +//! Hard break (escape) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf -//! ; Restriction: followed by a line ending  (that is part of the construct +//! ; Restriction: followed by a line ending  (that is part of the content  //! ; instead of ending it).  //! hard_break_escape ::= '\\'  //! ```  //! -//! Hard breaks in markdown relate to the HTML element `<br>`. -//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info. -//!  //! It is also possible to create a hard break with a  //! [hard break (trailing)][hard_break_trailing]. -//! That construct is not recommended because trailing spaces are typically -//! invisible in editors, or even automatically removed, making them hard to -//! use.  //! -//! It is also possible to escape punctuation characters with a similar +//! Punctuation characters can be escaped with a similar  //! construct: a [character escape][character_escape] is a backslash followed  //! by an ASCII punctuation character.  //! Arbitrary characters can be escaped with -//! [character reference][character_reference]s. +//! [character references][character_reference]. +//! +//! ## HTML +//! +//! Hard breaks in markdown relate to the HTML element `<br>`. +//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info. +//! +//! ## Recommendation +//! +//! Always use hard break (escape), never hard break (trailing).  //!  //! ## Tokens  //! diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index 3bcff54..960ae32 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -1,17 +1,16 @@ -//! Heading (atx) is a construct that occurs in the [flow] content type. +//! Heading (atx) occurs in the [flow][] content type.  //! -//! They’re formed with the following BNF: +//! ## Grammar  //! -//! ```bnf -//! heading_atx ::= 1*6'#' [ 1*space_or_tab text [ 1*space_or_tab 1*'#' ] ] *space_or_tab +//! Heading (atx) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //! -//! text ::= code - eol -//! space_or_tab ::= ' ' | '\t' +//! ```bnf +//! heading_atx ::= 1*6'#' [ 1*space_or_tab line [ 1*space_or_tab 1*'#' ] ] *space_or_tab  //! ```  //! -//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML. -//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the -//! HTML spec][html] for more info. +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file).  //!  //! `CommonMark` introduced the requirement on whitespace existing after the  //! opening sequence and before text. @@ -25,16 +24,25 @@  //! [hard break (escape)][hard_break_escape]).  //! However, their limit is that they cannot form `<h3>` through `<h6>`  //! headings. -//! Due to this limitation, it is recommended to use atx headings.  //!  //! > 🏛 **Background**: the word *setext* originates from a small markup  //! > language by Ian Feldman from 1991. -//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > See [*§ Setext* on Wikipedia][wiki_setext] for more info.  //! > The word *atx* originates from a tiny markup language by Aaron Swartz  //! > from 2002.  //! 
> See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for  //! > more info.  //! +//! ## HTML +//! +//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! ## Recommendation +//! +//! Always use heading (atx), never heading (setext). +//!  //! ## Tokens  //!  //! *   [`HeadingAtx`][Name::HeadingAtx] @@ -51,7 +59,7 @@  //! [heading_setext]: crate::construct::heading_setext  //! [hard_break_escape]: crate::construct::hard_break_escape  //! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements -//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [wiki_setext]: https://en.wikipedia.org/wiki/Setext  //! [atx]: http://www.aaronsw.com/2002/atx/  use crate::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 043104a..bad781c 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -1,20 +1,21 @@ -//! Heading (setext) is a construct that occurs in the [flow] content type. +//! Heading (setext) occurs in the [flow][] content type.  //! -//! They’re formed with the following BNF: +//! ## Grammar +//! +//! Heading (setext) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf -//! heading_setext ::= line *(eol line) eol whitespace_optional (1*'-' | 1*'=') whitespace_optional +//! heading_setext ::= paragraph eol *space_or_tab (1*'-' | 1*'=')  *space_or_tab  //! -//! whitespace ::= 1*space_or_tab -//! whitespace_optional ::= [ whitespace ] -//! line ::= code - eol -//! eol ::= '\r' | '\r\n' | '\n' +//! ; See the `paragraph` construct for the BNF of that part.  //! ```  //! -//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in -//! HTML. -//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the -//! HTML spec][html] for more info. +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file). +//! +//! See [`paragraph`][paragraph] for grammar, notes, and recommendations on +//! that part.  //!  //! In markdown, it is also possible to create headings with a  //! [heading (atx)][heading_atx] construct. @@ -23,7 +24,6 @@  //! [hard break (escape)][hard_break_escape]).  //! However, their limit is that they cannot form `<h3>` through `<h6>`  //! headings. -//! Due to this limitation, it is recommended to use atx headings.  //!  //! [Thematic breaks][thematic_break] formed with dashes and without whitespace  //! could be interpreted as a heading (setext). @@ -32,12 +32,23 @@  //!  //! > 🏛 **Background**: the word *setext* originates from a small markup  //! > language by Ian Feldman from 1991. -//! > See [*§ Setext* on Wikipedia][wiki-setext] for more info. +//! > See [*§ Setext* on Wikipedia][wiki_setext] for more info.  //! > The word *atx* originates from a tiny markup language by Aaron Swartz  //! > from 2002.  //! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for  //! > more info.  //! +//! ## HTML +//! +//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in +//! HTML. +//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the +//! HTML spec][html] for more info. +//! +//! ## Recommendation +//! +//! 
Always use heading (atx), never heading (setext). +//!  //! ## Tokens  //!  //! *   [`HeadingSetext`][Name::HeadingSetext] @@ -50,11 +61,12 @@  //! *   [*§ 4.3 Setext headings* in `CommonMark`](https://spec.commonmark.org/0.30/#setext-headings)  //!  //! [flow]: crate::construct::flow +//! [paragraph]: crate::construct::paragraph  //! [heading_atx]: crate::construct::heading_atx  //! [thematic_break]: crate::construct::thematic_break  //! [hard_break_escape]: crate::construct::hard_break_escape  //! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements -//! [wiki-setext]: https://en.wikipedia.org/wiki/Setext +//! [wiki_setext]: https://en.wikipedia.org/wiki/Setext  //! [atx]: http://www.aaronsw.com/2002/atx/  use crate::constant::TAB_SIZE; diff --git a/src/construct/html_flow.rs b/src/construct/html_flow.rs index 38e33f8..bd41aa9 100644 --- a/src/construct/html_flow.rs +++ b/src/construct/html_flow.rs @@ -1,38 +1,38 @@ -//! HTML (flow) is a construct that occurs in the [flow][] cont&ent type. +//! HTML (flow) occurs in the [flow][] content type.  //! -//! It forms with the following BNF: +//! ## Grammar +//! +//! HTML (flow) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf  //! html_flow ::= raw | comment | instruction | declaration | cdata | basic | complete  //! -//! ; Note: closing tag name need to match opening tag name. -//! raw ::= '<' raw_tag_name [ [ ( whitespace | '>' ) *line ] *( eol *line ) ] [ '</' raw_tag_name *line ] -//! comment ::= '<!--' [ *'-' '>' *line | *line *( eol *line ) [ '-->' *line ] ] -//! instruction ::= '<?' [ '>' *line | *line *( eol *line ) [ '?>' *line ] ] -//! declaration ::= '<!' ascii_alphabetic *line *( eol *line ) [ '>' *line ] -//! cdata ::= '<![CDATA[' *line *( eol *line ) [ ']]>' *line ] -//! basic ::= '< [ '/' ] basic_tag_name [ [ '/' ] '>' *line *( eol 1*line ) ] -//! complete ::= ( opening_tag | closing_tag ) ( whitespace_optional *( eol 1*line ) | whitespace_optional ) +//! ; Note: closing tag name does not need to match opening tag name. +//! raw ::= '<' raw_tag_name [[space_or_tab *line | '>' *line] eol] *(*line eol) ['</' raw_tag_name *line] +//! comment ::= '<!--' [*'-' '>' *line | *line *(eol *line) ['-->' *line]] +//! instruction ::= '<?' ['>' *line | *line *(eol *line) ['?>' *line]] +//! declaration ::= '<!' ascii_alphabetic *line *(eol *line) ['>' *line] +//! cdata ::= '<![CDATA[' *line *(eol *line) [']]>' *line] +//! basic ::= '< ['/'] basic_tag_name [['/'] '>' *line *(eol 1*line)] +//! complete ::= (opening_tag | closing_tag) [*space_or_tab *(eol 1*line)]  //!  //! raw_tag_name ::= 'pre' | 'script' | 'style' | 'textarea' ; Note: case-insensitive.  //! basic_tag_name ::= 'address' | 'article' | 'aside' | ... ; See `constants.rs`, and note: case-insensitive. -//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>' -//! closing_tag ::= '</' tag_name whitespace_optional '>' -//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) -//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] -//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) -//! attribute_value ::= '"' *( line - '"' ) '"' | "'" *( line - "'" )  "'" | 1*( line - space_or_tab - '"' - "'" - '/' - '<' - '=' - '>' - '`') -//! -//! whitespace ::= 1*space_or_tab -//! whitespace_optional ::= [ whitespace ] -//! 
line ::= code - eol -//! eol ::= '\r' | '\r\n' | '\n' -//! space_or_tab ::= ' ' | '\t' +//! opening_tag ::= '<' tag_name *(space_or_tab_eol attribute) [[space_or_tab_eol] '/'] [space_or_tab_eol] '>' +//! closing_tag ::= '</' tag_name [space_or_tab_eol] '>' +//! tag_name ::= ascii_alphabetic *('-' | ascii_alphanumeric) +//! attribute ::= attribute_name [[space_or_tab_eol] '=' [space_or_tab_eol] attribute_value] +//! attribute_name ::= (':' | '_' | ascii_alphabetic) *('-' | '.' | ':' | '_' | ascii_alphanumeric) +//! attribute_value ::= '"' *(line - '"') '"' | "'" *(line - "'")  "'" | 1*(text - '"' - "'" - '/' - '<' - '=' - '>' - '`')  //! ```  //! +//! As this construct occurs in flow, like all flow constructs, it must be +//! followed by an eol (line ending) or eof (end of file). +//!  //! The grammar for HTML in markdown does not resemble the rules of parsing  //! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML -//! spec][html-parsing]. +//! spec][html_parsing].  //! As such, HTML in markdown *resembles* HTML, but is instead a (naïve?)  //! attempt to parse an XML-like language.  //! By extension, another notable property of the grammar is that it can @@ -96,7 +96,7 @@  //! [paragraph]: crate::construct::paragraph  //! [html_raw_names]: crate::constant::HTML_RAW_NAMES  //! [html_block_names]: crate::constant::HTML_BLOCK_NAMES -//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +//! [html_parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing  use crate::constant::{      HTML_BLOCK_NAMES, HTML_CDATA_PREFIX, HTML_RAW_NAMES, HTML_RAW_SIZE_MAX, TAB_SIZE, diff --git a/src/construct/html_text.rs b/src/construct/html_text.rs index fde0847..26eded9 100644 --- a/src/construct/html_text.rs +++ b/src/construct/html_text.rs @@ -1,34 +1,31 @@ -//! HTML (text) is a construct that occurs in the [text][] content type. +//! HTML (text) occurs in the [text][] content type.  //! -//! It forms with the following BNF: +//! ## Grammar +//! +//! HTML (text) forms with the following BNF +//! (<small>see [construct][crate::construct] for character groups</small>):  //!  //! ```bnf  //! html_text ::= comment | instruction | declaration | cdata | tag_close | tag_open  //!  //! ; Restriction: the text is not allowed to start with `>`, `->`, or to contain `--`. -//! comment ::= '<!--' *code '-->' -//! instruction ::= '<?' *code '?>' -//! declaration ::= '<!' ascii_alphabetic *code '>' +//! comment ::= '<!--' *byte '-->' +//! instruction ::= '<?' *byte '?>' +//! declaration ::= '<!' ascii_alphabetic *byte '>'  //! ; Restriction: the text is not allowed to contain `]]`. -//! cdata ::= '<![CDATA[' *code ']]>' -//! tag_close ::= '</' tag_name whitespace_optional '>' -//! opening_tag ::= '<' tag_name *( whitespace attribute ) [ whitespace_optional '/' ] whitespace_optional '>' +//! cdata ::= '<![CDATA[' *byte ']]>' +//! tag_close ::= '</' tag_name [space_or_tab_eol] '>' +//! opening_tag ::= '<' tag_name *(space_or_tab_eol attribute) [[space_or_tab_eol] '/'] [space_or_tab_eol] '>'  //!  //! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric ) -//! attribute ::= attribute_name [ whitespace_optional '=' whitespace_optional attribute_value ] -//! attribute_name ::= ( ':' | '_' | ascii_alphabetic ) *( '-' | '.' | ':' | '_' | ascii_alphanumeric ) -//! attribute_value ::= '"' *( code - '"' ) '"' | "'" *( code - "'" )  "'" | 1*( code - space_or_tab - eol - '"' - "'" - '/' - '<' - '=' - '>' - '`') -//! -//! ; Note: blank lines can never occur in `text`. -//! 
whitespace ::= 1*space_or_tab | [ *space_or_tab eol *space_or_tab ] -//! whitespace_optional ::= [ whitespace ] -//! eol ::= '\r' | '\r\n' | '\n' -//! space_or_tab ::= ' ' | '\t' +//! attribute ::= attribute_name [[space_or_tab_eol] '=' [space_or_tab_eol] attribute_value] +//! attribute_name ::= (':' | '_' | ascii_alphabetic) *('-' | '.' | ':' | '_' | ascii_alphanumeric) +//! attribute_value ::= '"' *(byte - '"') '"' | "'" *(byte - "'")  "'" | 1*(text - '"' - "'" - '/' - '<' - '=' - '>' - '`')  //! ```  //!  //! The grammar for HTML in markdown does not resemble the rules of parsing  //! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML -//! spec][html-parsing]. +//! spec][html_parsing].  //! See the related flow construct [HTML (flow)][html_flow] for more info.  //!  //! Because the **tag open** and **tag close** productions in the grammar form @@ -52,7 +49,7 @@  //!  //! [text]: crate::construct::text  //! [html_flow]: crate::construct::html_flow -//! [html-parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing +//! [html_parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing  use crate::constant::HTML_CDATA_PREFIX;  use crate::construct::partial_space_or_tab::space_or_tab; diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 09716b7..4752639 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -1,4 +1,4 @@ -//! Label end is a construct that occurs in the [text][] conten&t type. +//! Label end is a construct that occurs in the [text][] content type.  //!  //! It forms with the following BNF:  //! diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs index a70906a..09678dd 100644 --- a/src/construct/list_item.rs +++ b/src/construct/list_item.rs @@ -289,7 +289,7 @@ pub fn after(tokenizer: &mut Tokenizer) -> State {          container.size = prefix;          tokenizer.exit(Name::ListItemPrefix); -        tokenizer.register_resolver_before(ResolveName::List); +        tokenizer.register_resolver_before(ResolveName::ListItem);          State::Ok      }  } diff --git a/src/construct/mod.rs b/src/construct/mod.rs index 49868e9..da2f5e8 100644 --- a/src/construct/mod.rs +++ b/src/construct/mod.rs @@ -1,17 +1,33 @@  //! Constructs found in markdown.  //! -//! There are several *things* found when parsing markdown, such as, say, a -//! thematic break. -//! These things are called constructs here. -//! Sometimes, there are several constructs that result in an equivalent thing. -//! For example, [code (fenced)][code_fenced] and -//! [code (indented)][code_indented] are considered different constructs +//! Constructs are grouped by content type. +//! Which content type is allowed somewhere, defines which constructs are +//! allowed there. +//! +//! ## Content type +//! +//! The following content types are found in markdown: +//! +//! *   [document][] +//! *   [flow][] +//! *   [string][] +//! *   [text][]  //!  //! Content types also have a *rest* thing: after all things are parsed,  //! there’s something left. +//! In document, that is [flow][].  //! In flow, that is a [paragraph][].  //! In string and text, that is [data][partial_data].  //! +//! ## Construct +//! +//! There are several *things* found when parsing markdown, such as, say, a +//! thematic break. +//! These things are called constructs here. +//! Sometimes, there are several constructs that result in an equivalent thing. +//! For example, [code (fenced)][code_fenced] and +//! 
[code (indented)][code_indented] are considered different constructs. +//!  //! The following constructs are found in markdown:  //!  //! *   [attention (strong, emphasis)][attention] @@ -39,7 +55,7 @@  //! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by  //! > [whitespace][partial_whitespace].  //! -//! There are also several routines used in different places: +//! There are also several small subroutines typically used in different places:  //!  //! *   [bom][partial_bom]  //! *   [data][partial_data] @@ -51,20 +67,60 @@  //! *   [title][partial_title]  //! *   [whitespace][partial_whitespace]  //! +//! ## Grammar +//!  //! Each construct maintained here is explained with a BNF diagram. +//! +//! Such diagrams are considered to be *non-normative*. +//! That is to say, they form illustrative, imperfect, but useful, examples. +//! The code, in Rust, is considered to be normative. +//!  //! For example, the docs for [character escape][character_escape] contain:  //!  //! ```bnf  //! character_escape ::= '\\' ascii_punctuation  //! ```  //! -//! Such diagrams are considered to be *non-normative*. -//! That is to say, they form illustrative, imperfect, but useful, examples. -//! The code, in Rust, is considered to be normative. +//! These diagrams contain references to character group as defined by Rust on +//! for example [char][], but also often on [u8][], which is what `micromark-rs` +//! typically works on. +//! So, for example, `ascii_punctuation` refers to +//! [`u8::is_ascii_punctuation`][u8::is_ascii_punctuation].  //! -//! They also contain references to character as defined by [char][], so for -//! example `ascii_punctuation` refers to -//! [`char::is_ascii_punctuation`][char::is_ascii_punctuation]. +//! For clarity, the productions used throughout are: +//! +//! ```bnf +//! ; Rust / ASCII groups: +//! ; 'a'..='z' +//! ascii_lowercase ::= 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' | 'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' +//! ; 'A'..='Z' +//! ascii_uppercase ::= 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' | 'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' +//! ; 'A'..='Z', 'a'..='z' +//! ascii_alphabetic ::= ascii_lowercase | ascii_uppercase +//! ; '0'..='9' +//! ascii_digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' +//! ; '0'..='9'; 'A'..='F', 'a'..='f' +//! ascii_hexdigit ::= ascii_digit | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' +//! ; '0'..='9'; 'A'..='Z', 'a'..='z' +//! ascii_alphanumeric ::= ascii_digit | ascii_alphabetic +//! ; '!'..='/'; ':'..='@'; '['..='`'; '{'..='~' +//! ascii_punctuation ::= '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | '\' | ']' | '^' | '_' | '`' | '{' | '|' | '}' | '~' +//! ; 0x00..=0x1F; 0x7F +//! ascii_control ::= 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 | 0x08 | 0x09 | 0x0A | 0x0B | 0x0C | 0x0D | 0x0E | 0x0F | 0x10 | 0x12 | 0x13 | 0x14 | 0x15 | 0x16 | 0x17 | 0x18 | 0x19 | 0x1A | 0x1B | 0x1C | 0x1D | 0x1E | 0x1F | 0x7F +//! +//! ; Markdown groups: +//! ; Any byte (u8) +//! byte ::= 0x00..=0xFFFF +//! space_or_tab ::= '\t' | ' ' +//! eol ::= '\n' | '\r' | '\r\n' +//! line ::= byte - eol +//! text ::= line - space_or_tab +//! space_or_tab_eol ::= 1*space_or_tab | 0*space_or_tab eol 0*space_or_tab +//! +//! ; Unicode groups: +//! 
unicode_whitespace ::= ? ; See `char::is_whitespace`. +//! unicode_punctuation ::= ? ; See `src/unicode.rs`. +//! ```  pub mod attention;  pub mod autolink; diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index 3ffa646..adbfae1 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -1,6 +1,6 @@  //! Data occurs in [text][] and [string][].  //! -//! It can include anything (including line endings), and stops at certain +//! It can include anything (except for line endings) and stops at certain  //! characters.  //!  //! [string]: crate::construct::string diff --git a/src/construct/partial_whitespace.rs b/src/construct/partial_whitespace.rs index bf06df9..04016cb 100644 --- a/src/construct/partial_whitespace.rs +++ b/src/construct/partial_whitespace.rs @@ -71,7 +71,7 @@ pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whol      }  } -/// Trim a [`Data`][Name::Data] token. +/// Trim a [`Data`][Name::Data] event.  fn trim_data(      tokenizer: &mut Tokenizer,      exit_index: usize, @@ -109,7 +109,7 @@ fn trim_data(          };          // The whole data is whitespace. -        // We can be very fast: we only change the token types. +        // We can be very fast: we only change the event names.          if index == 0 {              tokenizer.events[exit_index - 1].name = name.clone();              tokenizer.events[exit_index].name = name; @@ -157,7 +157,7 @@ fn trim_data(          }          // The whole data is whitespace. -        // We can be very fast: we only change the token types. +        // We can be very fast: we only change the event names.          if index == slice.bytes.len() {              tokenizer.events[exit_index - 1].name = Name::SpaceOrTab;              tokenizer.events[exit_index].name = Name::SpaceOrTab; diff --git a/src/event.rs b/src/event.rs index 7f81571..8058d64 100644 --- a/src/event.rs +++ b/src/event.rs @@ -1,3 +1,5 @@ +//! Semantic labels of things happening. +  /// Semantic label of a span.  #[derive(Clone, Debug, Eq, Hash, PartialEq)]  pub enum Name { @@ -1832,7 +1834,7 @@ pub enum Name {      ThematicBreakSequence,  } -/// List of void tokens, used to make sure everything is working well. +/// List of void events, used to make sure everything is working well.  pub const VOID_EVENTS: [Name; 41] = [      Name::AttentionSequence,      Name::AutolinkEmail, @@ -1891,21 +1893,25 @@ pub enum Content {  /// Link to another event.  #[derive(Clone, Debug)]  pub struct Link { +    /// Previous event.      pub previous: Option<usize>, +    /// Next event.      pub next: Option<usize>, +    /// Content type.      pub content: Content,  }  /// Place in the document.  /// -/// The interface for the location in the document comes from unist `Point`: -/// <https://github.com/syntax-tree/unist#point>. +/// The interface for the location in the document comes from unist +/// [`Point`](https://github.com/syntax-tree/unist#point).  #[derive(Clone, Debug)]  pub struct Point {      /// 1-indexed line number.      pub line: usize,      /// 1-indexed column number. -    /// This is increases up to a tab stop for tabs. +    /// +    /// This is increased up to a tab stop for tabs.      /// Some editors count tabs as 1 character, so this position is not the      /// same as editors.      pub column: usize, diff --git a/src/parser.rs b/src/parser.rs index cc93021..8b13d45 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,4 +1,4 @@ -//! Turn a string of markdown into events. +//! 
Turn bytes of markdown into events.  use crate::event::{Event, Point};  use crate::state::{Name as StateName, State}; diff --git a/src/resolve.rs b/src/resolve.rs index 1106880..a62d382 100644 --- a/src/resolve.rs +++ b/src/resolve.rs @@ -1,17 +1,57 @@ +//! Resolve events. +  use crate::construct;  use crate::tokenizer::Tokenizer; -/// Names of functions that resolve. +/// Names of resolvers.  #[derive(Clone, Copy, Debug, Eq, PartialEq)]  pub enum Name { +    /// Resolve labels. +    /// +    /// Labels are parsed as starts and ends, and when they match, merged +    /// together to form media (links and images), and otherwise turned into +    /// data.      Label, +    /// Resolve attention. +    /// +    /// Attention sequences are parsed and finally matched together to form +    /// attention (emphasis and strong) based on which characters they contain, +    /// and what occurs before and after each sequence. +    /// Otherwise they are turned into data.      Attention, +    /// Resolve heading (atx). +    /// +    /// Heading (atx) contains further sequences and data. +    /// At the end, a final sequence is kept that way, while the rest is merged +    /// with the data.      HeadingAtx, +    /// Resolve heading (setext). +    /// +    /// Heading (setext) is parsed as an underline that is preceded by a +    /// paragraph, both will form the whole construct.      HeadingSetext, -    List, +    /// Resolve list item. +    /// +    /// List items are parsed on their own. +    /// They are wrapped into ordered or unordered lists based on whether items +    /// with the same marker occur next to each other. +    ListItem, +    /// Resolve paragraphs. +    /// +    /// Paragraphs are parsed as single line paragraphs, as what remains if +    /// other flow constructs don’t match. +    /// But, when they occur next to each other, they need to be merged.      Paragraph, +    /// Resolve data. +    /// +    /// Data is parsed as many small bits, due to many punctuation characters +    /// potentially starting something in particularly text content. +    /// It helps performance to merge them together if those markers did not +    /// match anything and hence they occur next to each other.      Data, +    /// Resolve whitespace in `string`.      String, +    /// Resolve whitespace in `text`.      Text,  } @@ -22,7 +62,7 @@ pub fn call(tokenizer: &mut Tokenizer, name: Name) {          Name::Attention => construct::attention::resolve,          Name::HeadingAtx => construct::heading_atx::resolve,          Name::HeadingSetext => construct::heading_setext::resolve, -        Name::List => construct::list_item::resolve, +        Name::ListItem => construct::list_item::resolve,          Name::Paragraph => construct::paragraph::resolve,          Name::Data => construct::partial_data::resolve,          Name::String => construct::string::resolve, diff --git a/src/state.rs b/src/state.rs index aae153f..f9cc39a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -1,7 +1,9 @@ +//! States of the state machine. +  use crate::construct;  use crate::tokenizer::Tokenizer; -/// The result of a state. +/// Result of a state.  #[derive(Clone, Copy, Debug, Eq, PartialEq)]  pub enum State {      /// Move to [`Name`][] next. @@ -14,7 +16,7 @@ pub enum State {      Nok,  } -/// Names of functions to move to. +/// Names of states to move to.  
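The `call` function in `resolve.rs` above (and the one in `state.rs` that follows) dispatches from an enum name to the matching function. A minimal, self-contained sketch of that name-to-function pattern, using a hypothetical `Doc` type and toy resolvers in place of `Tokenizer` and the real constructs:

```rust
/// Hypothetical stand-ins for the real resolver names.
#[derive(Clone, Copy, Debug)]
enum Name {
    ListItem,
    Paragraph,
}

/// Hypothetical stand-in for `Tokenizer`.
struct Doc {
    events: Vec<&'static str>,
}

fn resolve_list_item(doc: &mut Doc) {
    doc.events.push("list item resolved");
}

fn resolve_paragraph(doc: &mut Doc) {
    doc.events.push("paragraph resolved");
}

/// Map a name to its function, then call it: the same shape as
/// `resolve::call` and `state::call` in the diff above.
fn call(doc: &mut Doc, name: Name) {
    let func = match name {
        Name::ListItem => resolve_list_item,
        Name::Paragraph => resolve_paragraph,
    };
    func(doc);
}

fn main() {
    let mut doc = Doc { events: Vec::new() };
    call(&mut doc, Name::ListItem);
    call(&mut doc, Name::Paragraph);
    assert_eq!(doc.events, ["list item resolved", "paragraph resolved"]);
}
```

Keeping the dispatch in a single `match` is what makes renames such as `List` to `ListItem` a one-place change, as this diff shows.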
#[derive(Clone, Copy, Debug, Eq, PartialEq)]  #[allow(clippy::enum_variant_names)]  pub enum Name { @@ -296,7 +298,7 @@ pub enum Name {  }  #[allow(clippy::too_many_lines)] -/// Call the corresponding function for a state name. +/// Call the corresponding state for a state name.  pub fn call(tokenizer: &mut Tokenizer, name: Name) -> State {      let func = match name {          Name::AttentionStart => construct::attention::start, diff --git a/src/subtokenize.rs b/src/subtokenize.rs index f55c790..a031e35 100644 --- a/src/subtokenize.rs +++ b/src/subtokenize.rs @@ -1,27 +1,23 @@  //! Deal with content in other content.  //!  //! To deal with content in content, *you* (a `micromark-rs` contributor) add -//! information on events. -//! Events are a flat list, but they can be connected to each other by setting -//! `previous` and `next` links. -//! These links: -//! -//! *   …must occur on [`Enter`][Kind::Enter] events only -//! *   …must occur on void events (they are followed by their corresponding -//!     [`Exit`][Kind::Exit] event) -//! *   …must have `link` field +//! info on events. +//! Events are a flat list, but they can be connected to each other with a +//! [`Link`][crate::event::Link]. +//! Links must occur on [`Enter`][Kind::Enter] events only, which are void +//! (they are followed by their corresponding [`Exit`][Kind::Exit] event).  //!  //! Links will then be passed through a tokenizer for the corresponding content  //! type by `subtokenize`. -//! The subevents they result in are split up into slots for each linked token +//! The subevents they result in are split up into slots for each linked event  //! and replace those links.  //! -//! Subevents are not immediately subtokenized again because markdown prevents -//! us from doing so due to definitions, which can occur after references, and -//! thus the whole document needs to be parsed up to the level of definitions, -//! before any level that can include references can be parsed. +//! Subevents are not immediately subtokenized as markdown prevents us from +//! doing so due to definitions, which can occur after references, and thus the +//! whole document needs to be parsed up to the level of definitions, before +//! any level that can include references can be parsed. -use crate::event::{Content, Event, Kind}; +use crate::event::{Content, Event, Kind, VOID_EVENTS};  use crate::parser::ParseState;  use crate::state::{Name as StateName, State};  use crate::tokenizer::Tokenizer; @@ -30,31 +26,42 @@ use crate::util::edit_map::EditMap;  /// Link two [`Event`][]s.  ///  /// Arbitrary (void) events can be linked together. -/// This optimizes for the common case where the token at `index` is connected -/// to the previous void token. +/// This optimizes for the common case where the event at `index` is connected +/// to the previous void event.  pub fn link(events: &mut [Event], index: usize) {      link_to(events, index - 2, index);  }  /// Link two arbitrary [`Event`][]s together. 
-pub fn link_to(events: &mut [Event], pevious: usize, next: usize) { -    debug_assert_eq!(events[pevious].kind, Kind::Enter); -    debug_assert_eq!(events[pevious + 1].kind, Kind::Exit); -    debug_assert_eq!(events[pevious + 1].name, events[pevious].name); +pub fn link_to(events: &mut [Event], previous: usize, next: usize) { +    debug_assert_eq!(events[previous].kind, Kind::Enter); +    debug_assert!( +        VOID_EVENTS.iter().any(|d| d == &events[previous].name), +        "expected `{:?}` to be void", +        events[previous].name +    ); +    debug_assert_eq!(events[previous + 1].kind, Kind::Exit); +    debug_assert_eq!(events[previous].name, events[previous + 1].name);      debug_assert_eq!(events[next].kind, Kind::Enter); +    debug_assert!( +        VOID_EVENTS.iter().any(|d| d == &events[next].name), +        "expected `{:?}` to be void", +        events[next].name +    );      // Note: the exit of this event may not exist, so don’t check for that. -    let link_previous = events[pevious] +    let link_previous = events[previous]          .link          .as_mut()          .expect("expected `link` on previous");      link_previous.next = Some(next);      let link_next = events[next].link.as_mut().expect("expected `link` on next"); -    link_next.previous = Some(pevious); +    link_next.previous = Some(previous);      debug_assert_eq!( -        events[pevious].link.as_ref().unwrap().content, -        events[next].link.as_ref().unwrap().content +        events[previous].link.as_ref().unwrap().content, +        events[next].link.as_ref().unwrap().content, +        "expected `content` to match"      );  } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d66e8f6..7eba194 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,15 +1,12 @@ -//! The tokenizer glues states from the state machine together. +//! A tokenizer glues states from the state machine together.  //! -//! It facilitates everything needed to turn codes into tokens and  with -//! a state machine. -//! It also enables logic needed for parsing markdown, such as an [`attempt`][] -//! to parse something, which can succeed or, when unsuccessful, revert the -//! attempt. -//! Similarly, a [`check`][] exists, which does the same as an `attempt` but -//! reverts even if successful. +//! It facilitates everything needed to turn bytes into events with a state +//! machine. +//! It also enables the logic needed for parsing markdown, such as an +//! [`attempt`][] to try and parse something, which can succeed or, when +//! unsuccessful, revert the attempt.  //!  //! [`attempt`]: Tokenizer::attempt -//! [`check`]: Tokenizer::check  use crate::constant::TAB_SIZE;  use crate::event::{Content, Event, Kind, Link, Name, Point, VOID_EVENTS}; @@ -18,17 +15,31 @@ use crate::resolve::{call as call_resolve, Name as ResolveName};  use crate::state::{call, State};  use crate::util::edit_map::EditMap; -/// Info used to tokenize the current container. +/// Containers.  /// -/// This info is shared between the initial construct and its continuation. -/// It’s only used for list items. +/// Containers are found when tokenizing +/// [document content][crate::construct::document]. +/// They parse a portion at the start of one or more lines. +/// The rest of those lines is a different content type (specifically, flow), +/// which they “contain”. +#[derive(Debug, Eq, PartialEq)] +pub enum Container { +    /// [Block quote][crate::construct::block_quote]. +    BlockQuote, +    /// [List item][crate::construct::list_item]. 
+    ListItem, +} + +/// Info used to tokenize a container. +/// +/// Practically, these fields are only used for list items.  #[derive(Debug)]  pub struct ContainerState {      /// Kind.      pub kind: Container,      /// Whether the first line was blank.      pub blank_initial: bool, -    /// The size of the initial construct. +    /// Size.      pub size: usize,  } @@ -39,26 +50,19 @@ enum ByteAction {      ///      /// Includes replaced bytes.      Normal(u8), -    /// This is a new byte. -    Insert(u8),      /// This byte must be ignored.      Ignore, +    /// This is a new byte. +    Insert(u8),  } -/// Supported containers. -#[derive(Debug, PartialEq)] -pub enum Container { -    BlockQuote, -    ListItem, -} - -/// Loose label starts we found. +/// Label start, looking for an end.  #[derive(Debug)]  pub struct LabelStart {      /// Indices of where the label starts and ends in `events`.      pub start: (usize, usize), -    /// A boolean used internally to figure out if a (link) label start link -    /// can’t be used anymore (because it would contain another link). +    /// A boolean used internally to figure out if a (link) label start can’t +    /// be used anymore (because it would contain another link).      /// That link start is still looking for a balanced closing bracket though,      /// so we can’t remove it just yet.      pub inactive: bool, @@ -99,9 +103,10 @@ struct Attempt {      progress: Option<Progress>,  } -/// The internal state of a tokenizer, not to be confused with states from the -/// state machine, this instead is all the information about where we currently -/// are and what’s going on. +/// The internal state of a tokenizer. +/// +/// Not to be confused with states from the state machine, this instead is all +/// the information on where we currently are and what’s going on.  #[derive(Clone, Debug)]  struct Progress {      /// Length of `events`. @@ -168,7 +173,7 @@ pub struct TokenizeState<'a> {      /// List of defined identifiers.      pub definitions: Vec<String>, -    /// Whether to connect tokens. +    /// Whether to connect events.      pub connect: bool,      /// Marker.      pub marker: u8, @@ -188,15 +193,15 @@ pub struct TokenizeState<'a> {      pub start: usize,      /// Index.      pub end: usize, -    /// Slot for a token type. +    /// Slot for an event name.      pub token_1: Name, -    /// Slot for a token type. +    /// Slot for an event name.      pub token_2: Name, -    /// Slot for a token type. +    /// Slot for an event name.      pub token_3: Name, -    /// Slot for a token type. +    /// Slot for an event name.      pub token_4: Name, -    /// Slot for a token type. +    /// Slot for an event name.      pub token_5: Name,  } @@ -433,28 +438,25 @@ impl<'a> Tokenizer<'a> {      /// Mark the end of a semantic label.      
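The tokenizer documented above tracks open constructs on a stack: entering pushes an event name and exiting must close the most recently opened one. A toy illustration of that invariant; `Toy` and the names here are made up for the example, and the real tokenizer additionally records points and emits `Enter`/`Exit` events:

```rust
/// Hypothetical event names.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Name {
    Paragraph,
    Data,
}

/// Toy stand-in for the tokenizer's open-event bookkeeping.
#[derive(Default)]
struct Toy {
    stack: Vec<Name>,
}

impl Toy {
    /// Mark the start of a semantic label.
    fn enter(&mut self, name: Name) {
        self.stack.push(name);
    }

    /// Mark the end of a semantic label; it must match the latest `enter`.
    fn exit(&mut self, name: Name) {
        let current = self.stack.pop().expect("cannot close w/o open events");
        debug_assert_eq!(current, name, "expected exit event to match current event");
    }
}

fn main() {
    let mut toy = Toy::default();
    toy.enter(Name::Paragraph);
    toy.enter(Name::Data);
    toy.exit(Name::Data);
    toy.exit(Name::Paragraph);
    assert!(toy.stack.is_empty());
}
```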
pub fn exit(&mut self, name: Name) { -        let current_token = self.stack.pop().expect("cannot close w/o open tokens"); +        let current = self.stack.pop().expect("cannot close w/o open tokens"); -        debug_assert_eq!( -            current_token, name, -            "expected exit token to match current token" -        ); +        debug_assert_eq!(current, name, "expected exit event to match current event");          let previous = self.events.last().expect("cannot close w/o open event");          let mut point = self.point.clone();          debug_assert!( -            current_token != previous.name +            current != previous.name                  || previous.point.index != point.index                  || previous.point.vs != point.vs, -            "expected non-empty token" +            "expected non-empty event"          );          if VOID_EVENTS.iter().any(|d| d == &name) {              debug_assert!( -                current_token == previous.name, -                "expected token to be void (`{:?}`), instead of including `{:?}`", -                current_token, +                current == previous.name, +                "expected event to be void (`{:?}`), instead of including `{:?}`", +                current,                  previous.name              );          } diff --git a/src/unicode.rs b/src/unicode.rs index 764d4c7..2b79a88 100644 --- a/src/unicode.rs +++ b/src/unicode.rs @@ -1,6 +1,6 @@ -//! Information on Unicode. +//! Info on Unicode. -/// List of characters that are considered punctuation according to Unicode. +/// List of characters that are considered punctuation.  ///  /// > 👉 **Important**: this module is generated by `build.rs`.  /// > It is generate from the latest Unicode data. diff --git a/src/util/decode_character_reference.rs b/src/util/decode_character_reference.rs index f8fd18f..8ed32f4 100644 --- a/src/util/decode_character_reference.rs +++ b/src/util/decode_character_reference.rs @@ -1,4 +1,4 @@ -//! Utilities to decode character references. +//! Decode character references.  use crate::constant::CHARACTER_REFERENCES; @@ -43,11 +43,11 @@ pub fn decode_named(value: &str) -> String {  /// Decode numeric character references.  ///  /// Turn the number (in string form as either hexadecimal or decimal) coming -/// from a numeric character reference into a character. -/// Whether the base of the string form is `10` (decimal) or `16` (hexadecimal) -/// must be passed as the `radix` parameter. +/// from a numeric character reference into a string. +/// The base of the string form must be passed as the `radix` parameter, as +/// `10` (decimal) or `16` (hexadecimal).  /// -/// This returns the `char` associated with that number or a replacement +/// This returns a `String` form of the associated character or a replacement  /// character for C0 control characters (except for ASCII whitespace), C1  /// control characters, lone surrogates, noncharacters, and out of range  /// characters. diff --git a/src/util/edit_map.rs b/src/util/edit_map.rs index 11ac486..33c5706 100644 --- a/src/util/edit_map.rs +++ b/src/util/edit_map.rs @@ -1,6 +1,6 @@ -//! Helpers to deal with several changes in events, batching them together. +//! Deal with several changes in events, batching them together.  //! -//! Preferably, changes should be kept to a minumum. +//! Preferably, changes should be kept to a minimum.  //! Sometimes, it’s needed to change the list of events, because parsing can be  //! 
messy, and it helps to expose a cleaner interface of events to the compiler  //! and other users. diff --git a/src/util/encode.rs b/src/util/encode.rs index d37a2de..6530011 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -1,10 +1,11 @@ -//! Utilities to encode HTML. +//! Encode HTML.  /// Encode dangerous html characters.  ///  /// This ensures that certain characters which have special meaning in HTML are  /// dealt with. -/// Technically, we can skip `>` and `"` in many cases, but CM includes them. +/// Technically, we can skip `>` and `"` in many cases, but `CommonMark` +/// includes them.  ///  /// This behavior is not explained in prose in `CommonMark` but can be inferred  /// from the input/output test cases. diff --git a/src/util/mod.rs b/src/util/mod.rs index a01f31e..f51845c 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,4 +1,4 @@ -//! Utilities used when compiling markdown. +//! Utilities used when processing markdown.  pub mod decode_character_reference;  pub mod edit_map; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index f5b12d0..ddc51f8 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -1,17 +1,25 @@ -//! Utility to normalize identifiers. +//! Normalize identifiers.  /// Normalize an identifier, as found in [references][label_end] and  /// [definitions][definition], so it can be compared when matching.  ///  /// This collapsed whitespace found in markdown (`\t`, `\r`, `\n`, and ` `) -/// into one space, trims it (as in, dropping the first and last space), -/// and then performs unicode case folding twice: first by uppercasing -/// lowercase characters, and then lowercasing uppercase characters. +/// into one space, trims it (as in, dropping the first and last space), and +/// then performs unicode case folding twice: first by lowercasing uppercase +/// characters, and then uppercasing lowercase characters.  ///  /// Some characters are considered “uppercase”, such as U+03F4 (`ϴ`), but if  /// their lowercase counterpart (U+03B8 (`θ`)) is uppercased will result in a  /// different uppercase character (U+0398 (`Θ`)). -/// Hence, to get that form, we perform both upper- and lowercase. +/// Hence, to get that form, we perform both lower- and uppercase. +/// +/// Performing these steps in that order works, but the inverse does not work. +/// To illustrate, say the source markdown containes two identifiers +/// `SS` (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to +/// `ss` (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both +/// uppercase to `SS` (U+0053 U+0053). +/// If we’d inverse the steps, for `ẞ`, we’d first uppercase without a +/// change, and then lowercase to `ß`, which would not match `ss`.  ///  /// ## Examples  /// @@ -64,17 +72,5 @@ pub fn normalize_identifier(value: &str) -> String {          result.push_str(&value[start..]);      } -    // Some characters are considered “uppercase”, but if their lowercase -    // counterpart is uppercased will result in a different uppercase -    // character. -    // Hence, to get that form, we perform both lower- and uppercase. -    // Performing these steps in that order works, but the inverse does not -    // work. -    // To illustrate, say the source markdown containes two identifiers `SS` -    // (U+0053 U+0053) and `ẞ` (U+1E9E), which would be lowercased to `ss` -    // (U+0073 U+0073) and `ß` (U+00DF), and those in turn would both uppercase -    // to `SS` (U+0053 U+0053). 
-    // If we’d inverse the steps, for `ẞ`, we’d first uppercase without a -    // change, and then lowercase to `ß`, which would not match `ss`.      result.to_lowercase().to_uppercase()  } diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 051e1e1..593a70e 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -1,4 +1,4 @@ -//! Utilities to make urls safe. +//! Make urls safe.  use crate::util::encode::encode; @@ -60,9 +60,10 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {      value  } -/// Normalize a URL (such as used in definitions). +/// Normalize a URL (such as used in [definitions][definition], +/// [references][label_end]).  /// -/// Encode unsafe characters with percent-encoding, skipping already encoded +/// It encodes unsafe characters with percent-encoding, skipping already encoded  /// sequences.  ///  /// ## Examples @@ -77,6 +78,9 @@ pub fn sanitize_uri(value: &str, protocols: &Option<Vec<&str>>) -> String {  /// ## References  ///  /// *   [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) +/// +/// [definition]: crate::construct::definition +/// [label_end]: crate::construct::label_end  fn normalize_uri(value: &str) -> String {      let chars = value.chars().collect::<Vec<_>>();      // Note: it’ll grow bigger for each non-ascii or non-safe character. diff --git a/src/util/skip.rs b/src/util/skip.rs index 46cbb4a..a7de408 100644 --- a/src/util/skip.rs +++ b/src/util/skip.rs @@ -1,4 +1,4 @@ -//! Utilities to deal with lists of events. +//! Move across lists of events.  use crate::event::{Event, Kind, Name}; diff --git a/src/util/slice.rs b/src/util/slice.rs index e70078a..be2a381 100644 --- a/src/util/slice.rs +++ b/src/util/slice.rs @@ -1,4 +1,4 @@ -//! Utilities to deal with characters. +//! Deal with bytes.  use crate::constant::TAB_SIZE;  use crate::event::{Event, Kind, Point}; @@ -7,7 +7,9 @@ use std::str;  /// A range between two points.  #[derive(Debug)]  pub struct Position<'a> { +    /// Start point.      pub start: &'a Point, +    /// End point.      pub end: &'a Point,  } @@ -55,11 +57,14 @@ impl<'a> Position<'a> {  /// Bytes belonging to a range.  /// -/// Includes information on virtual spaces before and after the bytes. +/// Includes info on virtual spaces before and after the bytes.  #[derive(Debug)]  pub struct Slice<'a> { +    /// Bytes.      pub bytes: &'a [u8], +    /// Number of virtual spaces before the bytes.      pub before: usize, +    /// Number of virtual spaces after the bytes.      pub after: usize,  } | 
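The identifier normalization explained in `normalize_identifier.rs` above boils down to two steps: collapse and trim markdown whitespace, then case fold by lowercasing and uppercasing (matching the `result.to_lowercase().to_uppercase()` shown in the diff). A standalone sketch of those steps, illustrative only and not the crate's exact implementation:

```rust
/// Sketch of the normalization described above: collapse runs of markdown
/// whitespace (`\t`, `\r`, `\n`, ` `) into one space, trim, then lowercase
/// and uppercase for a stable case fold.
fn normalize_identifier(value: &str) -> String {
    let collapsed = value
        .split(|c: char| matches!(c, '\t' | '\r' | '\n' | ' '))
        .filter(|part| !part.is_empty())
        .collect::<Vec<_>>()
        .join(" ");

    collapsed.to_lowercase().to_uppercase()
}

fn main() {
    assert_eq!(normalize_identifier("  Pineapple\t  Juice "), "PINEAPPLE JUICE");
    // `ẞ` (U+1E9E) and `ss` end up equal, as explained above: lowercasing
    // first maps `ẞ` to `ß`, and uppercasing then maps both to `SS`.
    assert_eq!(normalize_identifier("ẞ"), normalize_identifier("ss"));
}
```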

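Finally, `normalize_uri` in `sanitize_uri.rs` is described above as percent-encoding unsafe characters while skipping sequences that are already encoded. A rough sketch of that idea; the safe-character set used here is a guess for illustration, not the crate's actual list:

```rust
/// Rough sketch: percent-encode bytes outside a (hypothetical) safe set,
/// passing through sequences that already look like `%XX`.
fn normalize_uri(value: &str) -> String {
    let bytes = value.as_bytes();
    let mut result = String::with_capacity(bytes.len());
    let mut index = 0;

    while index < bytes.len() {
        let byte = bytes[index];

        if byte == b'%'
            && index + 2 < bytes.len()
            && bytes[index + 1].is_ascii_hexdigit()
            && bytes[index + 2].is_ascii_hexdigit()
        {
            // Already percent-encoded: copy as-is.
            result.push_str(&value[index..index + 3]);
            index += 3;
        } else if byte.is_ascii_alphanumeric()
            || matches!(byte, b'-' | b'.' | b'_' | b'~' | b'/' | b':' | b'?' | b'#' | b'&' | b'=')
        {
            // Assumed safe: keep the byte.
            result.push(byte as char);
            index += 1;
        } else {
            // Everything else (including non-ASCII bytes) gets encoded.
            result.push_str(&format!("%{:02X}", byte));
            index += 1;
        }
    }

    result
}

fn main() {
    assert_eq!(normalize_uri("a b"), "a%20b");
    assert_eq!(normalize_uri("a%20b"), "a%20b");
    assert_eq!(normalize_uri("ä"), "%C3%A4");
}
```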