From b33a81e40620b8b3eaeeec9d0e0b34ca5958dead Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Wed, 28 Sep 2022 17:54:39 +0200 Subject: Add support for turning mdast to hast --- src/construct/attention.rs | 5 +-- src/construct/gfm_table.rs | 6 +-- src/construct/heading_atx.rs | 6 +-- src/construct/heading_setext.rs | 6 +-- src/construct/label_end.rs | 4 +- src/construct/list_item.rs | 6 +-- src/construct/partial_data.rs | 6 +-- src/construct/partial_mdx_expression.rs | 2 +- src/construct/string.rs | 6 +-- src/construct/text.rs | 5 +-- src/lib.rs | 12 ++++- src/mdast.rs | 77 +++------------------------------ src/resolve.rs | 24 +++++----- src/to_mdast.rs | 6 +-- src/unist.rs | 75 ++++++++++++++++++++++++++++++++ src/util/sanitize_uri.rs | 1 + 16 files changed, 130 insertions(+), 117 deletions(-) create mode 100644 src/unist.rs (limited to 'src') diff --git a/src/construct/attention.rs b/src/construct/attention.rs index 4d58610..d99a52c 100644 --- a/src/construct/attention.rs +++ b/src/construct/attention.rs @@ -88,7 +88,6 @@ use crate::util::{ }, slice::Slice, }; -use alloc::string::String; use alloc::{vec, vec::Vec}; /// Attentention sequence that we can take markers from. @@ -152,7 +151,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Resolve sequences. -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { // Find all sequences, gather info about them. let mut sequences = get_sequences(tokenizer); @@ -224,7 +223,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { tokenizer.map.consume(&mut tokenizer.events); - Ok(None) + None } /// Get sequences. diff --git a/src/construct/gfm_table.rs b/src/construct/gfm_table.rs index 63772c4..547358f 100644 --- a/src/construct/gfm_table.rs +++ b/src/construct/gfm_table.rs @@ -232,7 +232,7 @@ use crate::state::{Name as StateName, State}; use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{constant::TAB_SIZE, skip::opt_back as skip_opt_back}; -use alloc::{string::String, vec}; +use alloc::vec; /// Start of a GFM table. /// @@ -772,7 +772,7 @@ pub fn body_row_escape(tokenizer: &mut Tokenizer) -> State { } /// Resolve GFM table. -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { let mut index = 0; let mut in_first_cell_awaiting_pipe = true; let mut in_row = false; @@ -887,7 +887,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { flush_table_end(tokenizer, last_table_end, last_table_has_body); } - Ok(None) + None } /// Generate a cell. diff --git a/src/construct/heading_atx.rs b/src/construct/heading_atx.rs index b76e455..c867117 100644 --- a/src/construct/heading_atx.rs +++ b/src/construct/heading_atx.rs @@ -69,7 +69,7 @@ use crate::state::{Name as StateName, State}; use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; -use alloc::{string::String, vec}; +use alloc::vec; /// Start of a heading (atx). /// @@ -223,7 +223,7 @@ pub fn data(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (atx). -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { let mut index = 0; let mut heading_inside = false; let mut data_start: Option = None; @@ -283,5 +283,5 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { index += 1; } - Ok(None) + None } diff --git a/src/construct/heading_setext.rs b/src/construct/heading_setext.rs index 3a484e1..1e6fd00 100644 --- a/src/construct/heading_setext.rs +++ b/src/construct/heading_setext.rs @@ -77,7 +77,7 @@ use crate::state::{Name as StateName, State}; use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; use crate::util::{constant::TAB_SIZE, skip}; -use alloc::{string::String, vec}; +use alloc::vec; /// At start of heading (setext) underline. /// @@ -184,7 +184,7 @@ pub fn after(tokenizer: &mut Tokenizer) -> State { } /// Resolve heading (setext). -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { tokenizer.map.consume(&mut tokenizer.events); let mut enter = skip::to(&tokenizer.events, 0, &[Name::HeadingSetextUnderline]); @@ -281,5 +281,5 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { tokenizer.map.consume(&mut tokenizer.events); - Ok(None) + None } diff --git a/src/construct/label_end.rs b/src/construct/label_end.rs index 95b9a27..ca71245 100644 --- a/src/construct/label_end.rs +++ b/src/construct/label_end.rs @@ -661,7 +661,7 @@ pub fn reference_collapsed_open(tokenizer: &mut Tokenizer) -> State { /// /// This turns matching label starts and label ends into links, images, and /// footnotes, and turns unmatched label starts back into data. -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { // Inject labels. let labels = tokenizer.tokenize_state.labels.split_off(0); inject_labels(tokenizer, &labels); @@ -673,7 +673,7 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { tokenizer.map.consume(&mut tokenizer.events); - Ok(None) + None } /// Inject links/images/footnotes. diff --git a/src/construct/list_item.rs b/src/construct/list_item.rs index 13b740b..a4f166d 100644 --- a/src/construct/list_item.rs +++ b/src/construct/list_item.rs @@ -69,7 +69,7 @@ use crate::util::{ skip, slice::{Position, Slice}, }; -use alloc::{string::String, vec, vec::Vec}; +use alloc::{vec, vec::Vec}; /// Start of list item. /// @@ -371,7 +371,7 @@ pub fn cont_filled(tokenizer: &mut Tokenizer) -> State { } /// Find adjacent list items with the same marker. -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![]; let mut lists: Vec<(u8, usize, usize, usize)> = vec![]; let mut index = 0; @@ -474,5 +474,5 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { index += 1; } - Ok(None) + None } diff --git a/src/construct/partial_data.rs b/src/construct/partial_data.rs index b36d9f0..a27730c 100644 --- a/src/construct/partial_data.rs +++ b/src/construct/partial_data.rs @@ -10,7 +10,7 @@ use crate::event::{Kind, Name}; use crate::state::{Name as StateName, State}; use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use alloc::{string::String, vec}; +use alloc::vec; /// At beginning of data. /// @@ -73,7 +73,7 @@ pub fn inside(tokenizer: &mut Tokenizer) -> State { } /// Merge adjacent data events. -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { let mut index = 0; // Loop through events and merge adjacent data events. @@ -105,5 +105,5 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { index += 1; } - Ok(None) + None } diff --git a/src/construct/partial_mdx_expression.rs b/src/construct/partial_mdx_expression.rs index 3ebd0f0..789443e 100644 --- a/src/construct/partial_mdx_expression.rs +++ b/src/construct/partial_mdx_expression.rs @@ -219,7 +219,7 @@ fn parse_expression(tokenizer: &mut Tokenizer, parse: &MdxExpressionParse) -> St }; // Parse and handle what was signaled back. - match parse(&result.value, kind) { + match parse(&result.value, &kind) { MdxSignal::Ok => State::Ok, MdxSignal::Error(message, place) => { let point = place_to_point(&result, place); diff --git a/src/construct/string.rs b/src/construct/string.rs index cf2f222..cad570d 100644 --- a/src/construct/string.rs +++ b/src/construct/string.rs @@ -17,7 +17,6 @@ use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use alloc::string::String; /// Characters that can start something in string. const MARKERS: [u8; 2] = [b'&', b'\\']; @@ -76,8 +75,7 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace in string. -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { resolve_whitespace(tokenizer, false, false); - - Ok(None) + None } diff --git a/src/construct/text.rs b/src/construct/text.rs index 2648531..0ea0913 100644 --- a/src/construct/text.rs +++ b/src/construct/text.rs @@ -30,7 +30,6 @@ use crate::resolve::Name as ResolveName; use crate::state::{Name as StateName, State}; use crate::subtokenize::Subresult; use crate::tokenizer::Tokenizer; -use alloc::string::String; /// Characters that can start something in text. const MARKERS: [u8; 16] = [ @@ -244,7 +243,7 @@ pub fn before_data(tokenizer: &mut Tokenizer) -> State { } /// Resolve whitespace. -pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { +pub fn resolve(tokenizer: &mut Tokenizer) -> Option { resolve_whitespace( tokenizer, tokenizer.parse_state.options.constructs.hard_break_trailing, @@ -260,5 +259,5 @@ pub fn resolve(tokenizer: &mut Tokenizer) -> Result, String> { resolve_gfm_autolink_literal(tokenizer); } - Ok(None) + None } diff --git a/src/lib.rs b/src/lib.rs index fcdab10..e552327 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,7 +17,7 @@ extern crate alloc; mod construct; mod event; -pub mod mdast; +pub mod mdast; // To do: externalize? mod parser; mod resolve; mod state; @@ -25,6 +25,7 @@ mod subtokenize; mod to_html; mod to_mdast; mod tokenizer; +pub mod unist; // To do: externalize. mod util; use alloc::{boxed::Box, fmt, string::String}; @@ -32,6 +33,7 @@ use mdast::Node; use parser::parse; use to_html::compile as to_html; use to_mdast::compile as to_mdast; +use util::sanitize_uri::sanitize; /// Type of line endings in markdown. #[derive(Clone, Debug, Default, Eq, PartialEq)] @@ -146,7 +148,7 @@ pub enum MdxExpressionKind { /// Can be passed as `mdx_expression_parse` in [`Options`][] to support /// expressions according to a certain grammar (typically, a programming /// language). -pub type MdxExpressionParse = dyn Fn(&str, MdxExpressionKind) -> MdxSignal; +pub type MdxExpressionParse = dyn Fn(&str, &MdxExpressionKind) -> MdxSignal; /// Signature of a function that parses ESM. /// @@ -1187,3 +1189,9 @@ pub fn micromark_to_mdast(value: &str, options: &Options) -> Result String { + sanitize(value) +} diff --git a/src/mdast.rs b/src/mdast.rs index 79a39dd..8b5b74d 100644 --- a/src/mdast.rs +++ b/src/mdast.rs @@ -1,83 +1,14 @@ -//! [mdast][] syntax tree. +//! markdown syntax tree: [mdast][]. //! //! [mdast]: https://github.com/syntax-tree/mdast +use crate::unist::Position; use alloc::{ fmt, string::{String, ToString}, vec::Vec, }; -/// One place in a source file. -#[derive(Clone, Eq, PartialEq)] -pub struct Point { - /// 1-indexed integer representing a line in a source file. - pub line: usize, - /// 1-indexed integer representing a column in a source file. - pub column: usize, - /// 0-indexed integer representing a character in a source file. - pub offset: usize, -} - -impl Point { - #[must_use] - pub fn new(line: usize, column: usize, offset: usize) -> Point { - Point { - line, - column, - offset, - } - } -} - -impl fmt::Debug for Point { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}:{} ({})", self.line, self.column, self.offset) - } -} - -/// Location of a node in a source file. -#[derive(Clone, Eq, PartialEq)] -pub struct Position { - /// Represents the place of the first character of the parsed source region. - pub start: Point, - /// Represents the place of the first character after the parsed source - /// region, whether it exists or not. - pub end: Point, -} - -impl Position { - #[must_use] - pub fn new( - start_line: usize, - start_column: usize, - start_offset: usize, - end_line: usize, - end_column: usize, - end_offset: usize, - ) -> Position { - Position { - start: Point::new(start_line, start_column, start_offset), - end: Point::new(end_line, end_column, end_offset), - } - } -} - -impl fmt::Debug for Position { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "{}:{}-{}:{} ({}-{})", - self.start.line, - self.start.column, - self.end.line, - self.end.column, - self.start.offset, - self.end.offset - ) - } -} - /// Explicitness of a reference. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum ReferenceKind { @@ -370,7 +301,8 @@ impl Node { } } - pub fn position(&mut self) -> Option<&Position> { + #[must_use] + pub fn position(&self) -> Option<&Position> { match self { Node::Root(x) => x.position.as_ref(), Node::BlockQuote(x) => x.position.as_ref(), @@ -1204,6 +1136,7 @@ pub struct MdxJsxAttribute { #[cfg(test)] mod tests { use super::*; + use crate::unist::{Point, Position}; use alloc::{string::ToString, vec}; #[test] diff --git a/src/resolve.rs b/src/resolve.rs index 2586676..813ce52 100644 --- a/src/resolve.rs +++ b/src/resolve.rs @@ -64,18 +64,18 @@ pub enum Name { /// Call the corresponding resolver. pub fn call(tokenizer: &mut Tokenizer, name: Name) -> Result, String> { - let func = match name { - Name::Label => construct::label_end::resolve, - Name::Attention => construct::attention::resolve, - Name::GfmTable => construct::gfm_table::resolve, - Name::HeadingAtx => construct::heading_atx::resolve, - Name::HeadingSetext => construct::heading_setext::resolve, - Name::ListItem => construct::list_item::resolve, - Name::Content => construct::content::resolve, - Name::Data => construct::partial_data::resolve, - Name::String => construct::string::resolve, - Name::Text => construct::text::resolve, + let result = match name { + Name::Label => construct::label_end::resolve(tokenizer), + Name::Attention => construct::attention::resolve(tokenizer), + Name::GfmTable => construct::gfm_table::resolve(tokenizer), + Name::HeadingAtx => construct::heading_atx::resolve(tokenizer), + Name::HeadingSetext => construct::heading_setext::resolve(tokenizer), + Name::ListItem => construct::list_item::resolve(tokenizer), + Name::Content => construct::content::resolve(tokenizer)?, + Name::Data => construct::partial_data::resolve(tokenizer), + Name::String => construct::string::resolve(tokenizer), + Name::Text => construct::text::resolve(tokenizer), }; - func(tokenizer) + Ok(result) } diff --git a/src/to_mdast.rs b/src/to_mdast.rs index 9f03a03..42f68a0 100644 --- a/src/to_mdast.rs +++ b/src/to_mdast.rs @@ -5,10 +5,10 @@ use crate::mdast::{ AttributeContent, AttributeValue, BlockQuote, Break, Code, Definition, Delete, Emphasis, FootnoteDefinition, FootnoteReference, Heading, Html, Image, ImageReference, InlineCode, InlineMath, Link, LinkReference, List, ListItem, Math, MdxFlowExpression, MdxJsxAttribute, - MdxJsxFlowElement, MdxJsxTextElement, MdxTextExpression, MdxjsEsm, Node, Paragraph, Point, - Position, ReferenceKind, Root, Strong, Table, TableCell, TableRow, Text, ThematicBreak, Toml, - Yaml, + MdxJsxFlowElement, MdxJsxTextElement, MdxTextExpression, MdxjsEsm, Node, Paragraph, + ReferenceKind, Root, Strong, Table, TableCell, TableRow, Text, ThematicBreak, Toml, Yaml, }; +use crate::unist::{Point, Position}; use crate::util::{ decode_character_reference::{decode_named, decode_numeric}, infer::{gfm_table_align, list_item_loose, list_loose}, diff --git a/src/unist.rs b/src/unist.rs new file mode 100644 index 0000000..75ef359 --- /dev/null +++ b/src/unist.rs @@ -0,0 +1,75 @@ +//! abstract syntax trees: [unist][]. +//! +//! [unist]: https://github.com/syntax-tree/unist + +use alloc::fmt; + +/// One place in a source file. +#[derive(Clone, Eq, PartialEq)] +pub struct Point { + /// 1-indexed integer representing a line in a source file. + pub line: usize, + /// 1-indexed integer representing a column in a source file. + pub column: usize, + /// 0-indexed integer representing a character in a source file. + pub offset: usize, +} + +impl Point { + #[must_use] + pub fn new(line: usize, column: usize, offset: usize) -> Point { + Point { + line, + column, + offset, + } + } +} + +impl fmt::Debug for Point { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}:{} ({})", self.line, self.column, self.offset) + } +} + +/// Location of a node in a source file. +#[derive(Clone, Eq, PartialEq)] +pub struct Position { + /// Represents the place of the first character of the parsed source region. + pub start: Point, + /// Represents the place of the first character after the parsed source + /// region, whether it exists or not. + pub end: Point, +} + +impl Position { + #[must_use] + pub fn new( + start_line: usize, + start_column: usize, + start_offset: usize, + end_line: usize, + end_column: usize, + end_offset: usize, + ) -> Position { + Position { + start: Point::new(start_line, start_column, start_offset), + end: Point::new(end_line, end_column, end_offset), + } + } +} + +impl fmt::Debug for Position { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}:{}-{}:{} ({}-{})", + self.start.line, + self.start.column, + self.end.line, + self.end.column, + self.start.offset, + self.end.offset + ) + } +} diff --git a/src/util/sanitize_uri.rs b/src/util/sanitize_uri.rs index 0099347..8e44758 100644 --- a/src/util/sanitize_uri.rs +++ b/src/util/sanitize_uri.rs @@ -26,6 +26,7 @@ use alloc::{ /// ## References /// /// * [`micromark-util-sanitize-uri` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-sanitize-uri) +#[must_use] pub fn sanitize(value: &str) -> String { encode(&*normalize(value), true) } -- cgit