From e40e93796f45d34d038c6a20c4b034eb3b384b12 Mon Sep 17 00:00:00 2001 From: Dirkjan Ochtman Date: Sat, 1 Jul 2023 16:00:06 +0200 Subject: Extract askama_parser crate --- askama_parser/src/lib.rs | 384 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 384 insertions(+) create mode 100644 askama_parser/src/lib.rs (limited to 'askama_parser/src/lib.rs') diff --git a/askama_parser/src/lib.rs b/askama_parser/src/lib.rs new file mode 100644 index 0000000..336004e --- /dev/null +++ b/askama_parser/src/lib.rs @@ -0,0 +1,384 @@ +#![deny(unreachable_pub)] +#![deny(elided_lifetimes_in_paths)] + +use std::cell::Cell; +use std::{fmt, str}; + +use nom::branch::alt; +use nom::bytes::complete::{escaped, is_not, tag, take_till}; +use nom::character::complete::char; +use nom::character::complete::{anychar, digit1}; +use nom::combinator::{eof, map, not, opt, recognize, value}; +use nom::error::ErrorKind; +use nom::multi::separated_list1; +use nom::sequence::{delimited, pair, tuple}; +use nom::{error_position, AsChar, IResult, InputTakeAtPosition}; + +pub use self::expr::Expr; +pub use self::node::{Cond, CondTest, Loop, Macro, Node, Target, When, Whitespace, Ws}; + +mod expr; +mod node; +#[cfg(test)] +mod tests; + +struct State<'a> { + syntax: &'a Syntax<'a>, + loop_depth: Cell, +} + +impl<'a> State<'a> { + fn new(syntax: &'a Syntax<'a>) -> State<'a> { + State { + syntax, + loop_depth: Cell::new(0), + } + } + + fn enter_loop(&self) { + self.loop_depth.set(self.loop_depth.get() + 1); + } + + fn leave_loop(&self) { + self.loop_depth.set(self.loop_depth.get() - 1); + } + + fn is_in_loop(&self) -> bool { + self.loop_depth.get() > 0 + } +} + +impl From for Whitespace { + fn from(c: char) -> Self { + match c { + '+' => Self::Preserve, + '-' => Self::Suppress, + '~' => Self::Minimize, + _ => panic!("unsupported `Whitespace` conversion"), + } + } +} + +mod _parsed { + use std::mem; + + use super::{parse, Node, ParseError, Syntax}; + + pub struct Parsed { + #[allow(dead_code)] + source: String, + nodes: Vec>, + } + + impl Parsed { + pub fn new(source: String, syntax: &Syntax<'_>) -> Result { + // Self-referential borrowing: `self` will keep the source alive as `String`, + // internally we will transmute it to `&'static str` to satisfy the compiler. + // However, we only expose the nodes with a lifetime limited to `self`. + let src = unsafe { mem::transmute::<&str, &'static str>(source.as_str()) }; + let nodes = match parse(src, syntax) { + Ok(nodes) => nodes, + Err(e) => return Err(e), + }; + + Ok(Self { source, nodes }) + } + + // The return value's lifetime must be limited to `self` to uphold the unsafe invariant. + pub fn nodes(&self) -> &[Node<'_>] { + &self.nodes + } + } +} + +pub use _parsed::Parsed; + +pub fn parse<'a>(src: &'a str, syntax: &Syntax<'_>) -> Result>, ParseError> { + match Node::parse(src, &State::new(syntax)) { + Ok((left, res)) => { + if !left.is_empty() { + Err(ParseError(format!("unable to parse template:\n\n{left:?}"))) + } else { + Ok(res) + } + } + + Err(nom::Err::Error(err)) | Err(nom::Err::Failure(err)) => { + let nom::error::Error { input, .. } = err; + let offset = src.len() - input.len(); + let (source_before, source_after) = src.split_at(offset); + + let source_after = match source_after.char_indices().enumerate().take(41).last() { + Some((40, (i, _))) => format!("{:?}...", &source_after[..i]), + _ => format!("{source_after:?}"), + }; + + let (row, last_line) = source_before.lines().enumerate().last().unwrap(); + let column = last_line.chars().count(); + + let msg = format!( + "problems parsing template source at row {}, column {} near:\n{}", + row + 1, + column, + source_after, + ); + + Err(ParseError(msg)) + } + + Err(nom::Err::Incomplete(_)) => Err(ParseError("parsing incomplete".into())), + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ParseError(String); + +impl std::error::Error for ParseError {} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +fn is_ws(c: char) -> bool { + matches!(c, ' ' | '\t' | '\r' | '\n') +} + +fn not_ws(c: char) -> bool { + !is_ws(c) +} + +fn ws<'a, O>( + inner: impl FnMut(&'a str) -> IResult<&'a str, O>, +) -> impl FnMut(&'a str) -> IResult<&'a str, O> { + delimited(take_till(not_ws), inner, take_till(not_ws)) +} + +fn split_ws_parts(s: &str) -> Node<'_> { + let trimmed_start = s.trim_start_matches(is_ws); + let len_start = s.len() - trimmed_start.len(); + let trimmed = trimmed_start.trim_end_matches(is_ws); + Node::Lit(&s[..len_start], trimmed, &trimmed_start[trimmed.len()..]) +} + +/// Skips input until `end` was found, but does not consume it. +/// Returns tuple that would be returned when parsing `end`. +fn skip_till<'a, O>( + end: impl FnMut(&'a str) -> IResult<&'a str, O>, +) -> impl FnMut(&'a str) -> IResult<&'a str, (&'a str, O)> { + enum Next { + IsEnd(O), + NotEnd(char), + } + let mut next = alt((map(end, Next::IsEnd), map(anychar, Next::NotEnd))); + move |start: &'a str| { + let mut i = start; + loop { + let (j, is_end) = next(i)?; + match is_end { + Next::IsEnd(lookahead) => return Ok((i, (j, lookahead))), + Next::NotEnd(_) => i = j, + } + } + } +} + +fn keyword<'a>(k: &'a str) -> impl FnMut(&'a str) -> IResult<&'a str, &'a str> { + move |i: &'a str| -> IResult<&'a str, &'a str> { + let (j, v) = identifier(i)?; + if k == v { + Ok((j, v)) + } else { + Err(nom::Err::Error(error_position!(i, ErrorKind::Tag))) + } + } +} + +fn identifier(input: &str) -> IResult<&str, &str> { + recognize(pair(identifier_start, opt(identifier_tail)))(input) +} + +fn identifier_start(s: &str) -> IResult<&str, &str> { + s.split_at_position1_complete( + |c| !(c.is_alpha() || c == '_' || c >= '\u{0080}'), + nom::error::ErrorKind::Alpha, + ) +} + +fn identifier_tail(s: &str) -> IResult<&str, &str> { + s.split_at_position1_complete( + |c| !(c.is_alphanum() || c == '_' || c >= '\u{0080}'), + nom::error::ErrorKind::Alpha, + ) +} + +fn bool_lit(i: &str) -> IResult<&str, &str> { + alt((keyword("false"), keyword("true")))(i) +} + +fn num_lit(i: &str) -> IResult<&str, &str> { + recognize(pair(digit1, opt(pair(char('.'), digit1))))(i) +} + +fn str_lit(i: &str) -> IResult<&str, &str> { + let (i, s) = delimited( + char('"'), + opt(escaped(is_not("\\\""), '\\', anychar)), + char('"'), + )(i)?; + Ok((i, s.unwrap_or_default())) +} + +fn char_lit(i: &str) -> IResult<&str, &str> { + let (i, s) = delimited( + char('\''), + opt(escaped(is_not("\\\'"), '\\', anychar)), + char('\''), + )(i)?; + Ok((i, s.unwrap_or_default())) +} + +fn nested_parenthesis(i: &str) -> IResult<&str, ()> { + let mut nested = 0; + let mut last = 0; + let mut in_str = false; + let mut escaped = false; + + for (i, b) in i.chars().enumerate() { + if !(b == '(' || b == ')') || !in_str { + match b { + '(' => nested += 1, + ')' => { + if nested == 0 { + last = i; + break; + } + nested -= 1; + } + '"' => { + if in_str { + if !escaped { + in_str = false; + } + } else { + in_str = true; + } + } + '\\' => { + escaped = !escaped; + } + _ => (), + } + } + + if escaped && b != '\\' { + escaped = false; + } + } + + if nested == 0 { + Ok((&i[last..], ())) + } else { + Err(nom::Err::Error(error_position!( + i, + ErrorKind::SeparatedNonEmptyList + ))) + } +} + +fn path(i: &str) -> IResult<&str, Vec<&str>> { + let root = opt(value("", ws(tag("::")))); + let tail = separated_list1(ws(tag("::")), identifier); + + match tuple((root, identifier, ws(tag("::")), tail))(i) { + Ok((i, (root, start, _, rest))) => { + let mut path = Vec::new(); + path.extend(root); + path.push(start); + path.extend(rest); + Ok((i, path)) + } + Err(err) => { + if let Ok((i, name)) = identifier(i) { + // The returned identifier can be assumed to be path if: + // - Contains both a lowercase and uppercase character, i.e. a type name like `None` + // - Doesn't contain any lowercase characters, i.e. it's a constant + // In short, if it contains any uppercase characters it's a path. + if name.contains(char::is_uppercase) { + return Ok((i, vec![name])); + } + } + + // If `identifier()` fails then just return the original error + Err(err) + } + } +} + +fn take_content<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, Node<'a>> { + let p_start = alt(( + tag(s.syntax.block_start), + tag(s.syntax.comment_start), + tag(s.syntax.expr_start), + )); + + let (i, _) = not(eof)(i)?; + let (i, content) = opt(recognize(skip_till(p_start)))(i)?; + let (i, content) = match content { + Some("") => { + // {block,comment,expr}_start follows immediately. + return Err(nom::Err::Error(error_position!(i, ErrorKind::TakeUntil))); + } + Some(content) => (i, content), + None => ("", i), // there is no {block,comment,expr}_start: take everything + }; + Ok((i, split_ws_parts(content))) +} + +fn tag_block_start<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.block_start)(i) +} + +fn tag_block_end<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.block_end)(i) +} + +fn tag_comment_start<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.comment_start)(i) +} + +fn tag_comment_end<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.comment_end)(i) +} + +fn tag_expr_start<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.expr_start)(i) +} + +fn tag_expr_end<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.expr_end)(i) +} + +#[derive(Debug)] +pub struct Syntax<'a> { + pub block_start: &'a str, + pub block_end: &'a str, + pub expr_start: &'a str, + pub expr_end: &'a str, + pub comment_start: &'a str, + pub comment_end: &'a str, +} + +impl Default for Syntax<'static> { + fn default() -> Self { + Self { + block_start: "{%", + block_end: "%}", + expr_start: "{{", + expr_end: "}}", + comment_start: "{#", + comment_end: "#}", + } + } +} -- cgit