use std::char; use nom::{ branch::{alt, permutation}, bytes::streaming::{is_a, is_not, tag, take, take_till, take_until}, character::{ complete::one_of, streaming::{alpha1, char, digit1, none_of, satisfy}, }, combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify}, error::{Error, ErrorKind}, multi::{many0, many1, many_till}, sequence::{delimited, pair, preceded, tuple}, Err, IResult, Parser, }; // parser: parses tokens from lexer into events type Comment<'s> = &'s str; struct PI<'s> { target: &'s str, instruction: Option<&'s str>, } enum ContentItem<'s> { CharData(&'s str), Element(Element<'s>), // Reference(Reference<'s>), // CDSect(CDSect<'s>), } type Content<'s> = Option>>; struct Attribute<'s> { key: &'s str, value: &'s str, } /// Contains only latin characters or dash after first char type EncName<'s> = &'s str; struct DoctypeDecl<'s> { name: &'s str, // TODO: doctype declaration parsing } /// pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { todo!() } struct Element<'s> { name: &'s str, attributes: Vec>, content: Content<'s>, } /// Element pub fn element(input: &str) -> IResult<&str, Element> { todo!() } enum Misc<'s> { Comment(Comment<'s>), PI(PI<'s>), } /// Misc pub fn misc(input: &str) -> IResult<&str, Misc> { todo!() } type Document<'s> = (Prolog<'s>, Element<'s>, Vec>); /// [1] document ::= prolog element Misc* pub fn document(input: &str) -> IResult<&str, Document> { tuple((prolog, element, many0(misc)))(input) } type Char = char; /// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ pub fn xmlchar(input: &str) -> IResult<&str, Char> { satisfy( |c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'), )(input) } type S<'s> = &'s str; /// [3] S ::= (#x20 | #x9 | #xD | #xA)+ pub fn s(input: &str) -> IResult<&str, S> { is_a("\u{20}\u{9}\u{D}\u{A}")(input) } type NameStartChar = char; /// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> { satisfy( |c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'), )(input) } type NameChar = char; /// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] pub fn name_char(input: &str) -> IResult<&str, NameChar> { alt(( name_start_char, satisfy( |c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'), ), ))(input) } type Name<'s> = &'s str; /// [5] Name ::= NameStartChar (NameChar)* pub fn name(input: &str) -> IResult<&str, Name> { recognize(pair(name_start_char, many0(name_char)))(input) } type Names<'s> = &'s str; /// [6] Names ::= Name (#x20 Name)* pub fn names(input: &str) -> IResult<&str, Names> { recognize(pair(name, many0(pair(char('\u{20}'), name))))(input) } type Nmtoken<'s> = &'s str; /// [7] Nmtoken ::= (NameChar)+ pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> { recognize(many1(name_char))(input) } type Nmtokens<'s> = &'s str; /// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> { recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input) } type EntityValue<'s> = &'s str; /// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' /// | "'" ([^%&'] | PEReference | Reference)* "'" pub fn entity_value(input: &str) -> IResult<&str, EntityValue> { alt(( delimited( char('"'), recognize(many0(alt((none_of("%&\""), pe_reference, reference)))), char('"'), ), delimited( char('\''), recognize(many0(alt((none_of("%&'"), pe_reference, reference)))), char('\''), ), ))(input) } type AttValue<'s> = &'s str; /// [10] AttValue ::= '"' ([^<&"] | Reference)* '"' /// | "'" ([^<&'] | Reference)* "'" pub fn att_value(input: &str) -> IResult<&str, AttValue> { alt(( delimited( char('"'), recognize(many0(alt((none_of("<&\""), reference)))), char('"'), ), delimited( char('\''), recognize(many0(alt((none_of("<&'"), reference)))), char('\''), ), ))(input) } type SystemLiteral<'s> = &'s str; /// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> { alt(( delimited(char('"'), recognize(many0(none_of("\""))), char('"')), delimited(char('\''), recognize(many0(none_of("'"))), char('\'')), ))(input) } type PubidLiteral<'s> = &'s str; /// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> { alt(( delimited(char('"'), recognize(many0(pubid_char)), char('"')), delimited( char('\''), recognize(many0(recognize(not(char('\''))).and_then(pubid_char))), char('\''), ), ))(input) } type PubidChar<'s> = char; /// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> { satisfy(|c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'))( input, ) } // TODO: wtf why doesn't this work how do i do thisjj type CharData<'s> = &'s str; /// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) pub fn char_data(input: &str) -> IResult<&str, CharData> { // tag(map( // peek(alt(( // map_parser( // peek(take_until("]]>")), // nom::bytes::complete::take_till(|c| c == '<' || c == '&'), // ), // map_parser( // peek(take_till(|c| c == '<' || c == '&')), // nom::bytes::complete::take_until("]]>"), // ), // ))), // |(first, _)| first, // ))(input) // map( // tuple((is_not("<&]"), peek(alt((tag("<"), tag("&"), tag("]]>")))))), // |(first, _)| first, // )(input) // map( // tuple((recognize(many0(none_of("<&"))), opt(peek(tag("]]>"))))), // |(first, _)| first, // )(input) // alt((recognize(many0(none_of("<&"))), take_until("]]>")))(input) let tagg: &str; if let Ok((_, tagg1)) = peek(take_until::<&str, &str, Error<&str>>("]]>"))(input) { if let Ok((_, tagg2)) = peek::<&str, &str, Error<&str>, _>(take_till(|c: char| c == '<' || c == '&'))(input) { if tagg1.len() < tagg2.len() { tagg = tagg1 } else { tagg = tagg2 } } else { tagg = tagg1; } } else { (_, tagg) = peek(take_till(|c| c == '<' || c == '&'))(input)? } tag(tagg)(input) // let mut len = 0; // let ch = input.chars().collect::>(); // for (idx, char) in ch.as_ref().into_iter().enumerate() { // match char { // '<' | '&' => break, // ']' => { // if idx <= ch.len() - 3 {} // }, // _ => todo!(), // } // } // while let Some(char) = chars.next() { // if char == '<' || char == '&' { // break; // } else if char == ']' { // if let Some(next) = chars.peek() { // if next == ']' { // if let Some(next) = chars.next_if_eq() {} // } // } // } // len += 1; // } // todo!() // recognize(many0(permutation((none_of("<&"), not(tag("]]>"))))))(input) // recognize(many0(not(alt((tag("<"), tag("&"), tag("]]>"))))))(input) // take_till(|c| c == '<' || c == '&').and_then(take_until("]]>"))(input) } type Prolog<'s> = ( Option, Vec>, Option<(DoctypeDecl<'s>, Vec>)>, ); /// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? pub fn prolog(input: &str) -> IResult<&str, Prolog> { tuple(( opt(xml_decl), many0(misc), opt(tuple((doctypedecl, many0(misc)))), ))(input) } struct XMLDecl { version_info: VersionInfo, // encoding_decl: Option, // sd_decl: Option, } /// [23] XMLDecl ::= '' pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { // (VersionInfo, Option, Option) let (leftover, (version_info /* encoding_decl, sd_decl */,)) = delimited( tag(""), )(input)?; Ok(( leftover, XMLDecl { version_info, // encoding_decl, // sd_decl, }, )) } type VersionInfo = VersionNum; /// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') pub fn version_info(input: &str) -> IResult<&str, VersionInfo> { preceded( tuple((s, tag("version"), eq)), alt(( delimited(char('\''), version_num, char('\'')), delimited(char('"'), version_num, char('"')), )), )(input) } /// [25] Eq ::= S? '=' S? pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> { tuple((opt(s), char('='), opt(s)))(input) } #[derive(Clone)] enum VersionNum { One, OneDotOne, } /// [26] VersionNum ::= '1.' [0-9]+ pub fn version_num(input: &str) -> IResult<&str, VersionNum> { preceded( tag("1."), alt(( value(VersionNum::One, char('0')), value(VersionNum::OneDotOne, char('1')), )), )(input) } pub fn reference(input: &str) -> IResult<&str, char> { todo!() } pub fn pe_reference(input: &str) -> IResult<&str, char> { todo!() } #[cfg(test)] mod tests { use std::num::NonZero; use super::*; #[test] fn test_char_data() { assert_eq!(Ok(("&def]]>ghi", "abc")), char_data("abc&def]]>ghi")); assert_eq!(Ok(("]]>ghi", "abcdef")), char_data("abcdef]]>ghi")); assert_eq!(Ok(("&defghi", "abc")), char_data("abc&defghi")); assert_eq!(Ok(("]]>def&ghi", "abc")), char_data("abc]]>def&ghi")); assert_eq!(Ok(("&ghi", "abc]>def")), char_data("abc]>def&ghi")); assert_eq!( Err(Err::Incomplete(nom::Needed::Size( NonZero::new(1usize).unwrap() ))), char_data("abcdefghi") ); } }