use std::char; use nom::{ branch::alt, bytes::streaming::{is_a, tag, take, take_while}, character::{ complete::one_of, streaming::{char, none_of, satisfy}, }, combinator::{map, not, opt, peek, recognize, value}, error::{Error, ErrorKind}, multi::{many0, many1, many_till}, sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}, Err, IResult, Parser, }; // parser: parses tokens from lexer into events // no well formedness, validity, or data model, simple translation of input into rust types // output is a rust representation of the input xml // types could be used for xml production too? pub type Document<'s> = (Prolog<'s>, Element<'s>, Vec>); /// [1] document ::= prolog element Misc* pub fn document(input: &str) -> IResult<&str, Document> { tuple((prolog, element, many0(misc)))(input) } pub type Char = char; /// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ pub fn xmlchar(input: &str) -> IResult<&str, Char> { satisfy( |c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'), )(input) } pub type S<'s> = &'s str; /// [3] S ::= (#x20 | #x9 | #xD | #xA)+ pub fn s(input: &str) -> IResult<&str, S> { is_a("\u{20}\u{9}\u{D}\u{A}")(input) } pub type NameStartChar = char; /// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> { satisfy( |c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | 
'\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'), )(input) } pub type NameChar = char; /// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] pub fn name_char(input: &str) -> IResult<&str, NameChar> { alt(( name_start_char, satisfy( |c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'), ), ))(input) } pub type Name<'s> = &'s str; /// [5] Name ::= NameStartChar (NameChar)* pub fn name(input: &str) -> IResult<&str, Name> { recognize(pair(name_start_char, many0(name_char)))(input) } pub type Names<'s> = &'s str; /// [6] Names ::= Name (#x20 Name)* pub fn names(input: &str) -> IResult<&str, Names> { recognize(pair(name, many0(pair(char('\u{20}'), name))))(input) } pub type Nmtoken<'s> = &'s str; /// [7] Nmtoken ::= (NameChar)+ pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> { recognize(many1(name_char))(input) } pub type Nmtokens<'s> = &'s str; /// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> { recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input) } #[derive(Clone, Debug)] pub enum LiteralData<'s> { String(&'s str), PEReference(PEReference<'s>), Reference(Reference<'s>), } pub type EntityValue<'s> = Vec>; /// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' /// | "'" ([^%&'] | PEReference | Reference)* "'" pub fn entity_value(input: &str) -> IResult<&str, EntityValue> { alt(( delimited( char('"'), many0(alt(( map( recognize(many_till(take(1usize), peek(one_of("%&\"")))), |string| LiteralData::String(string), ), map(pe_reference, |pe_reference| { LiteralData::PEReference(pe_reference) }), map(reference, |reference| LiteralData::Reference(reference)), ))), char('"'), ), delimited( char('\''), many0(alt(( map( recognize(many_till(take(1usize), peek(one_of("%&'")))), |string| LiteralData::String(string), ), map(pe_reference, |pe_reference| { 
LiteralData::PEReference(pe_reference) }), map(reference, |reference| LiteralData::Reference(reference)), ))), char('\''), ), ))(input) } pub type AttValue<'s> = Vec>; /// [10] AttValue ::= '"' ([^<&"] | Reference)* '"' /// | "'" ([^<&'] | Reference)* "'" pub fn att_value(input: &str) -> IResult<&str, AttValue> { alt(( delimited( char('"'), many0(alt(( map( recognize(many_till(take(1usize), peek(one_of("%&\"")))), |string| LiteralData::String(string), ), map(reference, |reference| LiteralData::Reference(reference)), ))), char('"'), ), delimited( char('\''), many0(alt(( map( recognize(many_till(take(1usize), peek(one_of("%&'")))), |string| LiteralData::String(string), ), map(reference, |reference| LiteralData::Reference(reference)), ))), char('\''), ), ))(input) } pub type SystemLiteral<'s> = &'s str; /// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> { alt(( delimited(char('"'), recognize(many0(none_of("\""))), char('"')), delimited(char('\''), recognize(many0(none_of("'"))), char('\'')), ))(input) } pub type PubidLiteral<'s> = &'s str; /// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> { alt(( delimited(char('"'), recognize(many0(pubid_char)), char('"')), delimited( char('\''), recognize(many0(recognize(not(char('\''))).and_then(pubid_char))), char('\''), ), ))(input) } pub type PubidChar<'s> = char; /// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> { satisfy(|c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'))( input, ) } pub type CharData<'s> = &'s str; /// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) pub fn char_data(input: &str) -> IResult<&str, CharData> { recognize(many_till( none_of("<&"), peek(alt((recognize(one_of("<&")), tag("]]>")))), ))(input) // let tagg: &str; // if let 
Ok((_, tagg1)) = peek(take_until::<&str, &str, Error<&str>>("]]>"))(input) { // if let Ok((_, tagg2)) = // peek::<&str, &str, Error<&str>, _>(take_till(|c: char| c == '<' || c == '&'))(input) // { // if tagg1.len() < tagg2.len() { // tagg = tagg1 // } else { // tagg = tagg2 // } // } else { // tagg = tagg1; // } // } else { // (_, tagg) = peek(take_till(|c| c == '<' || c == '&'))(input)? // } // tag(tagg)(input) // recognize(many0(permutation((none_of("<&"), not(tag("]]>"))))))(input) // recognize(many0(not(alt((tag("<"), tag("&"), tag("]]>"))))))(input) // take_till(|c| c == '<' || c == '&').and_then(take_until("]]>"))(input) } pub type Comment<'s> = &'s str; /// Comment ::= '' pub fn comment(input: &str) -> IResult<&str, Comment> { delimited( tag(""), )(input) } #[derive(Clone, Debug)] pub struct PI<'s> { target: &'s str, instruction: Option<&'s str>, } /// [16] PI ::= '' Char*)))? '?>' pub fn pi(input: &str) -> IResult<&str, PI> { map( delimited( tag("")))))), ), tag("?>"), ), |(target, instruction)| PI { target, instruction, }, )(input) } pub type PITarget<'s> = &'s str; /// [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) pub fn pi_target(input: &str) -> IResult<&str, PITarget> { let (rest, name) = name(input)?; if name.to_lowercase() == "xml" { return Err(Err::Error(Error { input, // TODO: check if better error to return code: ErrorKind::Tag, })); } else { return Ok((rest, name)); } } pub type CDSect<'s> = (CDStart<'s>, CData<'s>, CDEnd<'s>); /// [18] CDSect ::= CDStart CData CDEnd pub fn cd_sect(input: &str) -> IResult<&str, CDSect> { tuple((cd_start, cdata, cd_end))(input) } pub type CDStart<'s> = &'s str; /// [19] CDStart ::= ' IResult<&str, CDStart> { tag(" = &'s str; /// [20] CData ::= (Char* - (Char* ']]>' Char*)) pub fn cdata(input: &str) -> IResult<&str, CData> { recognize(many_till(xmlchar, peek(tag("]]>"))))(input) } pub type CDEnd<'s> = &'s str; /// [21] CDEnd ::= ']]>' pub fn cd_end(input: &str) -> IResult<&str, CDEnd> { 
tag("]]>")(input) } pub type Prolog<'s> = ( Option>, Vec>, Option<(DoctypeDecl<'s>, Vec>)>, ); /// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? pub fn prolog(input: &str) -> IResult<&str, Prolog> { tuple(( opt(xml_decl), many0(misc), opt(tuple((doctypedecl, many0(misc)))), ))(input) } #[derive(Debug)] pub struct XMLDecl<'s> { version_info: VersionInfo, encoding_decl: Option>, sd_decl: Option, } /// [23] XMLDecl ::= '' pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { map( delimited( tag("")), ), |(version_info, encoding_decl, sd_decl)| XMLDecl { version_info, encoding_decl, sd_decl, }, )(input) } pub type VersionInfo = VersionNum; /// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') pub fn version_info(input: &str) -> IResult<&str, VersionInfo> { preceded( tuple((s, tag("version"), eq)), alt(( delimited(char('\''), version_num, char('\'')), delimited(char('"'), version_num, char('"')), )), )(input) } /// [25] Eq ::= S? '=' S? pub fn eq(input: &str) -> IResult<&str, &str> { recognize(tuple((opt(s), char('='), opt(s))))(input) } #[derive(Clone, Debug)] pub enum VersionNum { One, OneDotOne, } /// [26] VersionNum ::= '1.' 
[0-9]+ pub fn version_num(input: &str) -> IResult<&str, VersionNum> { preceded( tag("1."), alt(( value(VersionNum::One, char('0')), value(VersionNum::OneDotOne, char('1')), )), )(input) } #[derive(Clone, Debug)] pub enum Misc<'s> { Comment(Comment<'s>), PI(PI<'s>), // TODO: how to deal with whitespace S, } /// [27] Misc ::= Comment | PI | S pub fn misc(input: &str) -> IResult<&str, Misc> { alt(( map(comment, |comment| Misc::Comment(comment)), map(pi, |pi| Misc::PI(pi)), value(Misc::S, s), ))(input) } #[derive(Debug)] pub struct DoctypeDecl<'s> { name: &'s str, external_id: Option>, int_subset: Option>, } /// [28] doctypedecl ::= '' pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { map( delimited( pair(tag(""), ), |(name, external_id, int_subset)| DoctypeDecl { name, external_id, int_subset, }, )(input) } #[derive(Clone, Debug)] pub enum DeclSep<'s> { PEReference(PEReference<'s>), // TODO: tackle whitespace S, } /// [28a] DeclSep ::= PEReference | S pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> { alt(( map(pe_reference, |pe_reference| { DeclSep::PEReference(pe_reference) }), value(DeclSep::S, s), ))(input) } #[derive(Debug)] pub enum IntSubsetDeclaration<'s> { MarkupDecl(MarkupDecl<'s>), DeclSep(DeclSep<'s>), } type IntSubset<'s> = Vec>; /// [28b] intSubset ::= (markupdecl | DeclSep)* pub fn int_subset(input: &str) -> IResult<&str, IntSubset> { many0(alt(( map(markup_decl, |markup_decl| { IntSubsetDeclaration::MarkupDecl(markup_decl) }), map(decl_sep, |decl_sep| IntSubsetDeclaration::DeclSep(decl_sep)), )))(input) } #[derive(Debug)] pub enum MarkupDecl<'s> { Elementdecl(Elementdecl<'s>), AttlistDecl(AttlistDecl<'s>), EntityDecl(EntityDecl<'s>), NotationDecl(NotationDecl<'s>), PI(PI<'s>), Comment(Comment<'s>), } /// [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> { alt(( map(elementdecl, |elementdecl| { MarkupDecl::Elementdecl(elementdecl) }), 
map(attlist_decl, |attlist_decl| { MarkupDecl::AttlistDecl(attlist_decl) }), map(entity_decl, |entity_decl| { MarkupDecl::EntityDecl(entity_decl) }), map(notation_decl, |notation_decl| { MarkupDecl::NotationDecl(notation_decl) }), map(pi, |pi| MarkupDecl::PI(pi)), map(comment, |comment| MarkupDecl::Comment(comment)), ))(input) } pub struct ExtSubset<'s> { text_decl: Option>, ext_subset_decl: ExtSubsetDecl<'s>, } /// [30] extSubset ::= TextDecl? extSubsetDecl pub fn ext_subset(input: &str) -> IResult<&str, ExtSubset> { map( pair(opt(text_decl), ext_subset_decl), |(text_decl, ext_subset_decl)| ExtSubset { text_decl, ext_subset_decl, }, )(input) } pub enum ExtSubsetDeclaration<'s> { MarkupDecl(MarkupDecl<'s>), ConditionalSect(ConditionalSect<'s>), DeclSep(DeclSep<'s>), } type ExtSubsetDecl<'s> = Vec>; /// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* pub fn ext_subset_decl(input: &str) -> IResult<&str, ExtSubsetDecl> { many0(alt(( map(markup_decl, |markup_decl| { ExtSubsetDeclaration::MarkupDecl(markup_decl) }), map(conditional_sect, |conditional_sect| { ExtSubsetDeclaration::ConditionalSect(conditional_sect) }), map(decl_sep, |decl_sep| ExtSubsetDeclaration::DeclSep(decl_sep)), )))(input) } pub type SDDecl = bool; /// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> { preceded( tuple((s, tag("standalone"), eq)), alt(( delimited( char('\''), alt((value(true, tag("yes")), value(false, tag("no")))), char('\''), ), delimited( char('"'), alt((value(true, tag("yes")), value(false, tag("no")))), char('"'), ), )), )(input) } // (Productions 33 through 38 have been removed.) 
/// An element: either a self-closing tag, or a start tag, content and end tag.
#[derive(Debug)]
pub enum Element<'s> {
    Empty(EmptyElemTag<'s>),
    NotEmpty(STag<'s>, Content<'s>, ETag<'s>),
}

/// [39] element ::= EmptyElemTag | STag content ETag
pub fn element(input: &str) -> IResult<&str, Element> {
    alt((
        map(empty_elem_tag, Element::Empty),
        map(tuple((s_tag, content, e_tag)), |(s_tag, content, e_tag)| {
            Element::NotEmpty(s_tag, content, e_tag)
        }),
    ))(input)
}

/// A start tag with its attributes.
#[derive(Debug)]
pub struct STag<'s> {
    name: Name<'s>,
    attributes: Vec<Attribute<'s>>,
}

/// [40] STag ::= '<' Name (S Attribute)* S? '>'
pub fn s_tag(input: &str) -> IResult<&str, STag> {
    map(
        delimited(
            tag("<"),
            pair(name, many0(preceded(s, attribute))),
            pair(opt(s), tag(">")),
        ),
        |(name, attributes)| STag { name, attributes },
    )(input)
}

pub type Attribute<'s> = (Name<'s>, AttValue<'s>);
/// [41] Attribute ::= Name Eq AttValue
pub fn attribute(input: &str) -> IResult<&str, Attribute> {
    separated_pair(name, eq, att_value)(input)
}

/// An end tag.
#[derive(Debug)]
pub struct ETag<'s> {
    name: Name<'s>,
}

/// [42] ETag ::= '</' Name S? '>'
pub fn e_tag(input: &str) -> IResult<&str, ETag> {
    map(
        delimited(tag("</"), name, pair(opt(s), tag(">"))),
        |name| ETag { name },
    )(input)
}

/// A non-text item that can appear inside element content.
#[derive(Debug)]
pub enum ContentItem<'s> {
    // CharData(&'s str),
    Element(Element<'s>),
    Reference(Reference<'s>),
    CDSect(CDSect<'s>),
    PI(PI<'s>),
    Comment(Comment<'s>),
}

/// Element content: optional leading text, then items each optionally followed by text.
#[derive(Debug)]
pub struct Content<'s> {
    char_data: Option<CharData<'s>>,
    content: Vec<(ContentItem<'s>, Option<CharData<'s>>)>,
}

/// [43] content ::= CharData?
((element | Reference | CDSect | PI | Comment) CharData?)* pub fn content(input: &str) -> IResult<&str, Content> { map( pair( opt(char_data), many0(pair( alt(( map(element, |element| ContentItem::Element(element)), map(reference, |reference| ContentItem::Reference(reference)), map(cd_sect, |cd_sect| ContentItem::CDSect(cd_sect)), map(pi, |pi| ContentItem::PI(pi)), map(comment, |comment| ContentItem::Comment(comment)), )), opt(char_data), )), ), |(char_data, content)| Content { char_data, content }, )(input) } #[derive(Debug)] pub struct EmptyElemTag<'s> { name: Name<'s>, attributes: Vec>, } /// [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec] pub fn empty_elem_tag(input: &str) -> IResult<&str, EmptyElemTag> { map( delimited( tag("<"), pair(name, many0(preceded(s, attribute))), pair(opt(s), tag("/>")), ), |(name, attributes)| EmptyElemTag { name, attributes }, )(input) } #[derive(Debug)] pub struct Elementdecl<'s> { name: Name<'s>, contentspec: Contentspec<'s>, } /// [45] elementdecl ::= '' pub fn elementdecl(input: &str) -> IResult<&str, Elementdecl> { map( delimited( pair(tag("")), ), |(name, contentspec)| Elementdecl { name, contentspec }, )(input) } // TODO: casings??? #[derive(Clone, Debug)] pub enum Contentspec<'s> { Empty, Any, Mixed(Mixed<'s>), Children(Children<'s>), } /// [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children pub fn contentspec(input: &str) -> IResult<&str, Contentspec> { alt(( value(Contentspec::Empty, tag("EMPTY")), value(Contentspec::Any, tag("ANY")), map(mixed, |mixed| Contentspec::Mixed(mixed)), map(children, |children| Contentspec::Children(children)), ))(input) } #[derive(Clone, Debug)] pub enum Occurence { Once, Optional, Many0, Many1, } /// Occurence ::= ('?' | '*' | '+')? 
/// Maps `?`, `*` and `+` to their variants; a missing indicator yields `Occurence::Once`.
pub fn occurence(input: &str) -> IResult<&str, Occurence> {
    map(
        opt(alt((
            value(Occurence::Optional, tag("?")),
            value(Occurence::Many0, tag("*")),
            value(Occurence::Many1, tag("+")),
        ))),
        |occurence| occurence.unwrap_or(Occurence::Once),
    )(input)
}

/// The group form used by an element-content model.
#[derive(Clone, Debug)]
pub enum ChildrenKind<'s> {
    Choice(Choice<'s>),
    Seq(Seq<'s>),
}

/// An element-content model: a group plus its occurrence indicator.
#[derive(Clone, Debug)]
pub struct Children<'s> {
    kind: ChildrenKind<'s>,
    occurence: Occurence,
}

/// [47] children ::= (choice | seq) ('?' | '*' | '+')?
pub fn children(input: &str) -> IResult<&str, Children> {
    map(
        pair(
            alt((
                map(choice, ChildrenKind::Choice),
                map(seq, ChildrenKind::Seq),
            )),
            occurence,
        ),
        |(kind, occurence)| Children { kind, occurence },
    )(input)
}

/// What a content particle consists of.
#[derive(Clone, Debug)]
pub enum CpKind<'s> {
    Name(Name<'s>),
    Choice(Choice<'s>),
    Seq(Seq<'s>),
}

/// A content particle plus its occurrence indicator.
#[derive(Clone, Debug)]
pub struct Cp<'s> {
    kind: CpKind<'s>,
    occurence: Occurence,
}

/// [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
pub fn cp(input: &str) -> IResult<&str, Cp> {
    map(
        pair(
            alt((
                map(name, CpKind::Name),
                map(choice, CpKind::Choice),
                map(seq, CpKind::Seq),
            )),
            occurence,
        ),
        |(kind, occurence)| Cp { kind, occurence },
    )(input)
}

/// The alternatives of a choice group, in source order.
#[derive(Clone, Debug)]
pub struct Choice<'s>(Vec<Cp<'s>>);

/// [49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
pub fn choice(input: &str) -> IResult<&str, Choice> {
    map(
        delimited(
            pair(tag("("), opt(s)),
            pair(cp, many1(preceded(tuple((opt(s), tag("|"), opt(s))), cp))),
            pair(opt(s), tag(")")),
        ),
        // Prepend the first particle to the rest without cloning either.
        |(head, tail)| Choice(std::iter::once(head).chain(tail).collect()),
    )(input)
}

/// The members of a sequence group, in source order.
#[derive(Clone, Debug)]
pub struct Seq<'s>(Vec<Cp<'s>>);

/// [50] seq ::= '(' S? cp ( S? ',' S? cp )* S?
')' pub fn seq(input: &str) -> IResult<&str, Seq> { map( delimited( pair(tag("("), opt(s)), pair(cp, many0(preceded(tuple((opt(s), tag(","), opt(s))), cp))), pair(opt(s), tag(")")), ), |(head, tail)| { let seq = vec![vec![head], tail].concat(); Seq(seq) }, )(input) } // always contains #PCDATA #[derive(Clone, Debug)] pub struct Mixed<'s>(Vec>); /// [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' pub fn mixed(input: &str) -> IResult<&str, Mixed> { alt(( map( delimited( tuple((tag("("), s, tag("#PCDATA"))), many0(preceded(tuple((opt(s), tag("|"), opt(s))), name)), pair(opt(s), tag(")*")), ), |names| Mixed(names), ), value( Mixed(Vec::new()), tuple((tag("("), opt(s), tag("#PCDATA"), opt(s), tag(")"))), ), ))(input) } #[derive(Debug)] pub struct AttlistDecl<'s> { element_type: Name<'s>, att_defs: Vec>, } /// [52] AttlistDecl ::= '' pub fn attlist_decl(input: &str) -> IResult<&str, AttlistDecl> { map( delimited( pair(tag("")), ), |(element_type, att_defs)| AttlistDecl { element_type, att_defs, }, )(input) } #[derive(Debug)] pub struct AttDef<'s> { name: Name<'s>, att_type: AttType<'s>, default_decl: DefaultDecl<'s>, } /// [53] AttDef ::= S Name S AttType S DefaultDecl pub fn att_def(input: &str) -> IResult<&str, AttDef> { map( tuple(( preceded(s, name), preceded(s, att_type), preceded(s, default_decl), )), |(name, att_type, default_decl)| AttDef { name, att_type, default_decl, }, )(input) } #[derive(Clone, Debug)] pub enum AttType<'s> { StringType, TokenizedType(TokenizedType), EnumeratedType(EnumeratedType<'s>), } /// [54] AttType ::= StringType | TokenizedType | EnumeratedType pub fn att_type(input: &str) -> IResult<&str, AttType> { alt(( value(AttType::StringType, string_type), map(tokenized_type, |tokenized_type| { AttType::TokenizedType(tokenized_type) }), map(enumerated_type, |enumerated_type| { AttType::EnumeratedType(enumerated_type) }), ))(input) } pub type StringType<'s> = &'s str; /// [55] StringType ::= 'CDATA' pub fn 
string_type(input: &str) -> IResult<&str, StringType> { tag("CDATA")(input) } #[derive(Clone, Debug)] pub enum TokenizedType { ID, IDRef, IDRefs, Entity, Entities, NMToken, NMTokens, } /// [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' pub fn tokenized_type(input: &str) -> IResult<&str, TokenizedType> { alt(( value(TokenizedType::ID, tag("ID")), // TODO: check if this is required // try idrefs first to avoid losing 'S' value(TokenizedType::IDRefs, tag("IDREFS")), value(TokenizedType::IDRef, tag("IDREF")), value(TokenizedType::Entity, tag("ENTITY")), value(TokenizedType::Entities, tag("ENTITIES")), // same here value(TokenizedType::NMTokens, tag("NMTOKENS")), value(TokenizedType::NMToken, tag("NMTOKEN")), ))(input) } #[derive(Debug, Clone)] pub enum EnumeratedType<'s> { NotationType(NotationType<'s>), Enumeration(Enumeration<'s>), } /// [57] EnumeratedType ::= NotationType | Enumeration pub fn enumerated_type(input: &str) -> IResult<&str, EnumeratedType> { alt(( map(notation_type, |notation_type| { EnumeratedType::NotationType(notation_type) }), map(enumeration, |enumeration| { EnumeratedType::Enumeration(enumeration) }), ))(input) } #[derive(Debug, Clone)] pub struct NotationType<'s>(Vec>); /// [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' pub fn notation_type(input: &str) -> IResult<&str, NotationType> { map( delimited( tuple((tag("NOTATION"), s, tag("("), opt(s))), pair( name, many0(preceded(tuple((opt(s), tag("|"), opt(s))), name)), ), pair(opt(s), tag(")")), ), |(head, tail)| { let notation_type = vec![vec![head], tail].concat(); NotationType(notation_type) }, )(input) } #[derive(Debug, Clone)] pub struct Enumeration<'s>(Vec>); /// [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? 
')' pub fn enumeration(input: &str) -> IResult<&str, Enumeration> { map( delimited( pair(tag("("), opt(s)), pair( nmtoken, many0(preceded(tuple((opt(s), tag("|"), opt(s))), nmtoken)), ), pair(opt(s), tag(")")), ), |(head, tail)| { let enumeration = vec![vec![head], tail].concat(); Enumeration(enumeration) }, )(input) } #[derive(Debug, Clone)] pub enum DefaultDecl<'s> { Required, Implied, Fixed(AttValue<'s>), } /// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) pub fn default_decl(input: &str) -> IResult<&str, DefaultDecl> { alt(( value(DefaultDecl::Required, tag("#REQUIRED")), value(DefaultDecl::Implied, tag("#IMPLIED")), map( preceded(opt(pair(tag("#FIXED"), s)), att_value), |att_value| DefaultDecl::Fixed(att_value), ), ))(input) } pub enum ConditionalSect<'s> { IncludeSect(IncludeSect<'s>), IgnoreSect(IgnoreSect<'s>), } /// [61] conditionalSect ::= includeSect | ignoreSect pub fn conditional_sect(input: &str) -> IResult<&str, ConditionalSect> { alt(( map(include_sect, |include_sect| { ConditionalSect::IncludeSect(include_sect) }), map(ignore_sect, |ignore_sect| { ConditionalSect::IgnoreSect(ignore_sect) }), ))(input) } pub struct IncludeSect<'s>(ExtSubsetDecl<'s>); /// [62] includeSect ::= '' pub fn include_sect(input: &str) -> IResult<&str, IncludeSect> { map( delimited( tuple((tag(""), ), |ext_subset_decl| IncludeSect(ext_subset_decl), )(input) } pub struct IgnoreSect<'s>(Vec>); /// [63] ignoreSect ::= '' pub fn ignore_sect(input: &str) -> IResult<&str, IgnoreSect> { map( delimited( tuple((tag(""), ), |ignore_sect_contents| IgnoreSect(ignore_sect_contents), )(input) } pub struct IgnoreSectContents<'s> { // TODO: what the fuck does this mean ignore: Ignore<'s>, ignore_list: Vec<(IgnoreSectContents<'s>, Ignore<'s>)>, } /// [64] ignoreSectContents ::= Ignore ('' Ignore)* pub fn ignore_sect_contents(input: &str) -> IResult<&str, IgnoreSectContents> { map( pair( ignore, many0(tuple(( delimited(tag("")), ignore, ))), ), |(ignore, 
ignore_list)| IgnoreSectContents { ignore, ignore_list, }, )(input) } pub type Ignore<'s> = &'s str; /// [65] Ignore ::= Char* - (Char* ('') Char*) pub fn ignore(input: &str) -> IResult<&str, Ignore> { recognize(many_till(xmlchar, peek(alt((tag(""))))))(input) } #[derive(Clone, Debug)] pub enum CharRef<'s> { Decimal(&'s str), Hexadecimal(&'s str), } /// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' pub fn char_ref(input: &str) -> IResult<&str, CharRef> { alt(( delimited( tag("&#"), map(take_while(|c| matches!(c, '0'..='9')), |decimal| { CharRef::Decimal(decimal) }), tag(";"), ), delimited( tag("&#x"), map( take_while(|c| matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F' )), |hexadecimal| CharRef::Hexadecimal(hexadecimal), ), tag(";"), ), ))(input) } #[derive(Clone, Debug)] pub enum Reference<'s> { EntityRef(EntityRef<'s>), CharRef(CharRef<'s>), } /// [67] Reference ::= EntityRef | CharRef pub fn reference(input: &str) -> IResult<&str, Reference> { alt(( map(entity_ref, |entity_ref| Reference::EntityRef(entity_ref)), map(char_ref, |char_ref| Reference::CharRef(char_ref)), ))(input) } pub type EntityRef<'s> = &'s str; /// [68] EntityRef ::= '&' Name ';' pub fn entity_ref(input: &str) -> IResult<&str, EntityRef> { delimited(tag("&"), name, tag(";"))(input) } pub type PEReference<'s> = &'s str; /// [69] PEReference ::= '%' Name ';' pub fn pe_reference(input: &str) -> IResult<&str, PEReference> { delimited(tag("%"), name, tag(";"))(input) } #[derive(Debug)] pub enum EntityDecl<'s> { GEDecl(GEDecl<'s>), PEDecl(PEDecl<'s>), } /// [70] EntityDecl ::= GEDecl | PEDecl pub fn entity_decl(input: &str) -> IResult<&str, EntityDecl> { alt(( map(ge_decl, |ge_decl| EntityDecl::GEDecl(ge_decl)), map(pe_decl, |pe_decl| EntityDecl::PEDecl(pe_decl)), ))(input) } #[derive(Debug)] pub struct GEDecl<'s> { name: Name<'s>, entity_def: EntityDef<'s>, } /// [71] GEDecl ::= '' pub fn ge_decl(input: &str) -> IResult<&str, GEDecl> { map( delimited( pair(tag("")), ), |(name, entity_def)| 
GEDecl { name, entity_def }, )(input) } #[derive(Debug)] pub struct PEDecl<'s> { name: Name<'s>, pe_def: PEDef<'s>, } /// [72] PEDecl ::= '' pub fn pe_decl(input: &str) -> IResult<&str, PEDecl> { map( delimited( tuple((tag("")), ), |(name, pe_def)| PEDecl { name, pe_def }, )(input) } #[derive(Debug)] pub enum EntityDef<'s> { EntityValue(EntityValue<'s>), ExternalID { external_id: ExternalID<'s>, ndata_decl: Option>, }, } /// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) pub fn entity_def(input: &str) -> IResult<&str, EntityDef> { alt(( map(entity_value, |entity_value| { EntityDef::EntityValue(entity_value) }), map( pair(external_id, opt(ndata_decl)), |(external_id, ndata_decl)| EntityDef::ExternalID { external_id, ndata_decl, }, ), ))(input) } #[derive(Debug)] pub enum PEDef<'s> { EntityValue(EntityValue<'s>), ExternalID(ExternalID<'s>), } /// [74] PEDef ::= EntityValue | ExternalID pub fn pe_def(input: &str) -> IResult<&str, PEDef> { alt(( map(entity_value, |entity_value| { PEDef::EntityValue(entity_value) }), map(external_id, |external_id| PEDef::ExternalID(external_id)), ))(input) } #[derive(Debug)] pub enum ExternalID<'s> { SYSTEM { system_identifier: &'s str, }, PUBLIC { public_identifier: &'s str, system_identifier: &'s str, }, } /// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral // pub fn external_id(input: &str) -> IResult<&str, ExternalID> { pub fn external_id(input: &str) -> IResult<&str, ExternalID> { alt(( map( preceded(pair(tag("SYSTEM"), s), system_literal), |system_identifier| ExternalID::SYSTEM { system_identifier }, ), map( preceded( pair(tag("PUBLIC"), s), separated_pair(pubid_literal, s, system_literal), ), |(public_identifier, system_identifier)| ExternalID::PUBLIC { public_identifier, system_identifier, }, ), ))(input) } pub type NDataDecl<'s> = &'s str; /// [76] NDataDecl ::= S 'NDATA' S Name pub fn ndata_decl(input: &str) -> IResult<&str, NDataDecl> { preceded(tuple((s, tag("NDATA"), s)), 
name)(input) } pub struct TextDecl<'s> { version_info: Option, encoding_decl: EncodingDecl<'s>, } /// [77] TextDecl ::= '' pub fn text_decl(input: &str) -> IResult<&str, TextDecl> { map( delimited( tag(""), ), |(version_info, encoding_decl)| TextDecl { version_info, encoding_decl, }, )(input) } pub struct ExtParsedEnt<'s> { text_decl: Option>, content: Content<'s>, } /// [78] extParsedEnt ::= TextDecl? content pub fn ext_parsed_ent(input: &str) -> IResult<&str, ExtParsedEnt> { map(pair(opt(text_decl), content), |(text_decl, content)| { ExtParsedEnt { text_decl, content } })(input) } pub type EncodingDecl<'s> = EncName<'s>; /// [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName pub fn encoding_decl(input: &str) -> IResult<&str, EncodingDecl> { preceded( tuple((s, tag("encoding"), eq)), alt(( delimited(char('"'), enc_name, char('"')), delimited(char('\''), enc_name, char('\'')), )), )(input) } pub type EncName<'s> = &'s str; /// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* pub fn enc_name(input: &str) -> IResult<&str, EncName> { recognize(pair( satisfy(|c| matches!(c, 'A'..='Z' | 'a'..='z' )), many0(satisfy( |c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' 
| '_' | '-' ), )), ))(input) } #[derive(Debug)] pub struct NotationDecl<'s> { name: &'s str, id: NotationDeclID<'s>, } #[derive(Debug)] pub enum NotationDeclID<'s> { External(ExternalID<'s>), Public(PublicID<'s>), } /// [82] NotationDecl ::= '' pub fn notation_decl(input: &str) -> IResult<&str, NotationDecl> { map( delimited( pair(tag("")), ), |(name, id)| NotationDecl { name, id }, )(input) } pub type PublicID<'s> = &'s str; /// [83] PublicID ::= 'PUBLIC' S PubidLiteral pub fn public_id(input: &str) -> IResult<&str, PublicID> { preceded(pair(tag("PUBLIC"), s), pubid_literal)(input) } #[cfg(test)] mod tests { use std::num::NonZero; use super::*; #[test] fn test_char_data() { assert_eq!(Ok(("&def]]>ghi", "abc")), char_data("abc&def]]>ghi")); assert_eq!(Ok(("]]>ghi", "abcdef")), char_data("abcdef]]>ghi")); assert_eq!(Ok(("&defghi", "abc")), char_data("abc&defghi")); assert_eq!(Ok(("]]>def&ghi", "abc")), char_data("abc]]>def&ghi")); assert_eq!(Ok(("&ghi", "abc]>def")), char_data("abc]>def&ghi")); assert_eq!( Err(Err::Incomplete(nom::Needed::Size( NonZero::new(3usize).unwrap() ))), char_data("abcdefghi") ); } #[test] fn test_comment() { assert_eq!(Ok(("", "")), comment("")); assert_eq!(Ok(("", "asdf")), comment("")); assert_eq!(Ok(("", "as-df")), comment("")); assert_eq!( Err(Err::Incomplete(nom::Needed::Size( NonZero::new(2usize).unwrap() ))), comment("