diff options
author | René Kijewski <rene.kijewski@fu-berlin.de> | 2023-01-30 12:52:50 +0100 |
---|---|---|
committer | Dirkjan Ochtman <dirkjan@ochtman.nl> | 2023-01-30 14:19:46 +0100 |
commit | 7b6f1df433a7f11612608644342b898cd6be8ff5 (patch) | |
tree | 478aafb88f9814fe235ee983213da5aa74a13e0a /askama_derive/src/parser/mod.rs | |
parent | 63b98ec7d379768d771966c6aa44de20862e0994 (diff) | |
download | askama-7b6f1df433a7f11612608644342b898cd6be8ff5.tar.gz askama-7b6f1df433a7f11612608644342b898cd6be8ff5.tar.bz2 askama-7b6f1df433a7f11612608644342b898cd6be8ff5.zip |
derive: refactor parser
`parser.rs` was a single file containing almost 2000 lines.
This PR splits the file into multiple, smaller files. `Expr`, `Node`, and
`Target` each get their own file. Each struct gets a `parse()` method that
returns `Result<Self>`, and every other detail is private to the file.
This PR should make this essential part of Askama easier to
understand, and make future modifications easier.
Diffstat (limited to 'askama_derive/src/parser/mod.rs')
-rw-r--r-- | askama_derive/src/parser/mod.rs | 314 |
1 files changed, 314 insertions, 0 deletions
diff --git a/askama_derive/src/parser/mod.rs b/askama_derive/src/parser/mod.rs new file mode 100644 index 0000000..d345a81 --- /dev/null +++ b/askama_derive/src/parser/mod.rs @@ -0,0 +1,314 @@ +use std::cell::Cell; +use std::str; + +use nom::branch::alt; +use nom::bytes::complete::{escaped, is_not, tag, take_till}; +use nom::character::complete::char; +use nom::character::complete::{anychar, digit1}; +use nom::combinator::{eof, map, not, opt, recognize, value}; +use nom::error::ErrorKind; +use nom::multi::separated_list1; +use nom::sequence::{delimited, pair, tuple}; +use nom::{error_position, AsChar, IResult, InputTakeAtPosition}; + +pub(crate) use self::expr::Expr; +pub(crate) use self::node::{Cond, CondTest, Loop, Macro, Node, Target, When, Whitespace, Ws}; +use crate::config::Syntax; +use crate::CompileError; + +mod expr; +mod node; +#[cfg(test)] +mod tests; + +struct State<'a> { + syntax: &'a Syntax, + loop_depth: Cell<usize>, +} + +impl State<'_> { + fn new(syntax: &Syntax) -> State<'_> { + State { + syntax, + loop_depth: Cell::new(0), + } + } + + fn enter_loop(&self) { + self.loop_depth.set(self.loop_depth.get() + 1); + } + + fn leave_loop(&self) { + self.loop_depth.set(self.loop_depth.get() - 1); + } + + fn is_in_loop(&self) -> bool { + self.loop_depth.get() > 0 + } +} + +impl From<char> for Whitespace { + fn from(c: char) -> Self { + match c { + '+' => Self::Preserve, + '-' => Self::Suppress, + '~' => Self::Minimize, + _ => panic!("unsupported `Whitespace` conversion"), + } + } +} + +pub(crate) fn parse<'a>(src: &'a str, syntax: &'a Syntax) -> Result<Vec<Node<'a>>, CompileError> { + match Node::parse(src, &State::new(syntax)) { + Ok((left, res)) => { + if !left.is_empty() { + Err(format!("unable to parse template:\n\n{left:?}").into()) + } else { + Ok(res) + } + } + + Err(nom::Err::Error(err)) | Err(nom::Err::Failure(err)) => { + let nom::error::Error { input, .. 
} = err; + let offset = src.len() - input.len(); + let (source_before, source_after) = src.split_at(offset); + + let source_after = match source_after.char_indices().enumerate().take(41).last() { + Some((40, (i, _))) => format!("{:?}...", &source_after[..i]), + _ => format!("{source_after:?}"), + }; + + let (row, last_line) = source_before.lines().enumerate().last().unwrap(); + let column = last_line.chars().count(); + + let msg = format!( + "problems parsing template source at row {}, column {} near:\n{}", + row + 1, + column, + source_after, + ); + Err(msg.into()) + } + + Err(nom::Err::Incomplete(_)) => Err("parsing incomplete".into()), + } +} + +fn is_ws(c: char) -> bool { + matches!(c, ' ' | '\t' | '\r' | '\n') +} + +fn not_ws(c: char) -> bool { + !is_ws(c) +} + +fn ws<'a, O>( + inner: impl FnMut(&'a str) -> IResult<&'a str, O>, +) -> impl FnMut(&'a str) -> IResult<&'a str, O> { + delimited(take_till(not_ws), inner, take_till(not_ws)) +} + +fn split_ws_parts(s: &str) -> Node<'_> { + let trimmed_start = s.trim_start_matches(is_ws); + let len_start = s.len() - trimmed_start.len(); + let trimmed = trimmed_start.trim_end_matches(is_ws); + Node::Lit(&s[..len_start], trimmed, &trimmed_start[trimmed.len()..]) +} + +/// Skips input until `end` was found, but does not consume it. +/// Returns tuple that would be returned when parsing `end`. 
+fn skip_till<'a, O>( + end: impl FnMut(&'a str) -> IResult<&'a str, O>, +) -> impl FnMut(&'a str) -> IResult<&'a str, (&'a str, O)> { + enum Next<O> { + IsEnd(O), + NotEnd(char), + } + let mut next = alt((map(end, Next::IsEnd), map(anychar, Next::NotEnd))); + move |start: &'a str| { + let mut i = start; + loop { + let (j, is_end) = next(i)?; + match is_end { + Next::IsEnd(lookahead) => return Ok((i, (j, lookahead))), + Next::NotEnd(_) => i = j, + } + } + } +} + +fn keyword<'a>(k: &'a str) -> impl FnMut(&'a str) -> IResult<&'a str, &'a str> { + move |i: &'a str| -> IResult<&'a str, &'a str> { + let (j, v) = identifier(i)?; + if k == v { + Ok((j, v)) + } else { + Err(nom::Err::Error(error_position!(i, ErrorKind::Tag))) + } + } +} + +fn identifier(input: &str) -> IResult<&str, &str> { + recognize(pair(identifier_start, opt(identifier_tail)))(input) +} + +fn identifier_start(s: &str) -> IResult<&str, &str> { + s.split_at_position1_complete( + |c| !(c.is_alpha() || c == '_' || c >= '\u{0080}'), + nom::error::ErrorKind::Alpha, + ) +} + +fn identifier_tail(s: &str) -> IResult<&str, &str> { + s.split_at_position1_complete( + |c| !(c.is_alphanum() || c == '_' || c >= '\u{0080}'), + nom::error::ErrorKind::Alpha, + ) +} + +fn bool_lit(i: &str) -> IResult<&str, &str> { + alt((keyword("false"), keyword("true")))(i) +} + +fn num_lit(i: &str) -> IResult<&str, &str> { + recognize(pair(digit1, opt(pair(char('.'), digit1))))(i) +} + +fn str_lit(i: &str) -> IResult<&str, &str> { + let (i, s) = delimited( + char('"'), + opt(escaped(is_not("\\\""), '\\', anychar)), + char('"'), + )(i)?; + Ok((i, s.unwrap_or_default())) +} + +fn char_lit(i: &str) -> IResult<&str, &str> { + let (i, s) = delimited( + char('\''), + opt(escaped(is_not("\\\'"), '\\', anychar)), + char('\''), + )(i)?; + Ok((i, s.unwrap_or_default())) +} + +fn nested_parenthesis(i: &str) -> IResult<&str, ()> { + let mut nested = 0; + let mut last = 0; + let mut in_str = false; + let mut escaped = false; + + for (i, b) in 
i.chars().enumerate() { + if !(b == '(' || b == ')') || !in_str { + match b { + '(' => nested += 1, + ')' => { + if nested == 0 { + last = i; + break; + } + nested -= 1; + } + '"' => { + if in_str { + if !escaped { + in_str = false; + } + } else { + in_str = true; + } + } + '\\' => { + escaped = !escaped; + } + _ => (), + } + } + + if escaped && b != '\\' { + escaped = false; + } + } + + if nested == 0 { + Ok((&i[last..], ())) + } else { + Err(nom::Err::Error(error_position!( + i, + ErrorKind::SeparatedNonEmptyList + ))) + } +} + +fn path(i: &str) -> IResult<&str, Vec<&str>> { + let root = opt(value("", ws(tag("::")))); + let tail = separated_list1(ws(tag("::")), identifier); + + match tuple((root, identifier, ws(tag("::")), tail))(i) { + Ok((i, (root, start, _, rest))) => { + let mut path = Vec::new(); + path.extend(root); + path.push(start); + path.extend(rest); + Ok((i, path)) + } + Err(err) => { + if let Ok((i, name)) = identifier(i) { + // The returned identifier can be assumed to be path if: + // - Contains both a lowercase and uppercase character, i.e. a type name like `None` + // - Doesn't contain any lowercase characters, i.e. it's a constant + // In short, if it contains any uppercase characters it's a path. + if name.contains(char::is_uppercase) { + return Ok((i, vec![name])); + } + } + + // If `identifier()` fails then just return the original error + Err(err) + } + } +} + +fn take_content<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, Node<'a>> { + let p_start = alt(( + tag(s.syntax.block_start.as_str()), + tag(s.syntax.comment_start.as_str()), + tag(s.syntax.expr_start.as_str()), + )); + + let (i, _) = not(eof)(i)?; + let (i, content) = opt(recognize(skip_till(p_start)))(i)?; + let (i, content) = match content { + Some("") => { + // {block,comment,expr}_start follows immediately. 
+ return Err(nom::Err::Error(error_position!(i, ErrorKind::TakeUntil))); + } + Some(content) => (i, content), + None => ("", i), // there is no {block,comment,expr}_start: take everything + }; + Ok((i, split_ws_parts(content))) +} + +fn tag_block_start<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.block_start.as_str())(i) +} + +fn tag_block_end<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.block_end.as_str())(i) +} + +fn tag_comment_start<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.comment_start.as_str())(i) +} + +fn tag_comment_end<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.comment_end.as_str())(i) +} + +fn tag_expr_start<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.expr_start.as_str())(i) +} + +fn tag_expr_end<'a>(i: &'a str, s: &State<'_>) -> IResult<&'a str, &'a str> { + tag(s.syntax.expr_end.as_str())(i) +} |