From c08b4504ab326203b2c11abe566e518b6466613a Mon Sep 17 00:00:00 2001 From: cel 🌸 Date: Thu, 27 Jun 2024 20:22:05 +0100 Subject: namespace parsing --- src/xml/mod.rs | 1525 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/xml/parsers.rs | 1 + 2 files changed, 1526 insertions(+) create mode 100644 src/xml/mod.rs create mode 100644 src/xml/parsers.rs (limited to 'src/xml') diff --git a/src/xml/mod.rs b/src/xml/mod.rs new file mode 100644 index 0000000..47c1779 --- /dev/null +++ b/src/xml/mod.rs @@ -0,0 +1,1525 @@ +use std::char; + +use nom::{ + branch::alt, + bytes::streaming::{is_a, tag, take, take_while}, + character::{ + complete::one_of, + streaming::{char, none_of, satisfy}, + }, + combinator::{map, not, opt, peek, recognize, value}, + error::{Error, ErrorKind}, + multi::{many0, many1, many_till}, + sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}, + Err, IResult, Parser, +}; + +// parser: parses tokens from lexer into events +// no well formedness, validity, or data model, simple translation of input into rust types +// output is a rust representation of the input xml +// types could be used for xml production too? + +mod parsers; + +#[derive(Clone, Debug)] +pub enum NSAttName<'s> { + PrefixedAttName(PrefixedAttName<'s>), + DefaultAttName, +} +/// [1] NSAttName ::= PrefixedAttName | DefaultAttName +pub fn ns_att_name(input: &str) -> IResult<&str, NSAttName> { + alt(( + map(prefixed_att_name, |prefixed_att_name| { + NSAttName::PrefixedAttName(prefixed_att_name) + }), + value(NSAttName::DefaultAttName, default_att_name), + ))(input) +} + +#[derive(Clone, Debug)] +pub struct PrefixedAttName<'s>(NCName<'s>); +/// [2] PrefixedAttName ::= 'xmlns:' NCName +pub fn prefixed_att_name(input: &str) -> IResult<&str, PrefixedAttName> { + map(preceded(tag("xmlns:"), nc_name), |nc_name| { + PrefixedAttName(nc_name) + })(input) +} + +#[derive(Clone, Debug)] +pub struct DefaultAttName; +/// [3] DefaultAttName ::= 'xmlns'; +pub fn default_att_name(input: &str) -> IResult<&str, DefaultAttName> { + value(DefaultAttName, tag("xmlns"))(input) +} + +#[derive(Clone, Debug)] +pub struct NCName<'s>(&'s str); +/// [4] NCName ::= Name - (Char* ':' Char*) +pub fn nc_name(input: &str) -> IResult<&str, NCName> { + map( + recognize(pair( + recognize(name_start_char).and_then(satisfy(|c| c != ':')), + many_till(name_char, peek(char(':'))), + )), + |nc_name| NCName(nc_name), + )(input) +} + +#[derive(Clone, Debug)] +pub enum QName<'s> { + PrefixedName(PrefixedName<'s>), + UnprefixedName(UnprefixedName<'s>), +} +/// [7] QName ::= PrefixedName | UnprefixedName +pub fn q_name(input: &str) -> IResult<&str, QName> { + alt(( + map(prefixed_name, |prefixed_name| { + QName::PrefixedName(prefixed_name) + }), + map(unprefixed_name, |unprefixed_name| { + QName::UnprefixedName(unprefixed_name) + }), + ))(input) +} + +#[derive(Clone, Debug)] +pub struct PrefixedName<'s> { + prefix: Prefix<'s>, + local_part: LocalPart<'s>, +} +/// [8] PrefixedName ::= Prefix ':' LocalPart +pub fn prefixed_name(input: &str) -> IResult<&str, PrefixedName> { + map( + separated_pair(prefix, char(':'), local_part), + |(prefix, local_part)| PrefixedName { prefix, local_part }, + )(input) +} + +#[derive(Clone, Debug)] +pub struct UnprefixedName<'s>(LocalPart<'s>); +/// [9] UnprefixedName ::= LocalPart +pub fn unprefixed_name(input: &str) -> IResult<&str, UnprefixedName> { + map(local_part, |local_part| UnprefixedName(local_part))(input) +} + +#[derive(Clone, Debug)] +pub struct Prefix<'s>(NCName<'s>); +/// [10] Prefix ::= NCName +pub fn prefix(input: &str) -> IResult<&str, Prefix> { + map(nc_name, |nc_name| Prefix(nc_name))(input) +} + +#[derive(Clone, Debug)] +pub struct LocalPart<'s>(NCName<'s>); +/// [11] LocalPart ::= NCName +pub fn local_part(input: &str) -> IResult<&str, LocalPart> { + map(nc_name, |nc_name| LocalPart(nc_name))(input) +} + +// xml spec + +pub type Document<'s> = (Prolog<'s>, Element<'s>, Vec>); +/// [1] document ::= prolog element Misc* +pub fn document(input: &str) -> IResult<&str, Document> { + tuple((prolog, element, many0(misc)))(input) +} + +pub type Char = char; +/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ +pub fn xmlchar(input: &str) -> IResult<&str, Char> { + satisfy( + |c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'), + )(input) +} + +pub type S<'s> = &'s str; +/// [3] S ::= (#x20 | #x9 | #xD | #xA)+ +pub fn s(input: &str) -> IResult<&str, S> { + is_a("\u{20}\u{9}\u{D}\u{A}")(input) +} + +pub type NameStartChar = char; +/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] +pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> { + satisfy( + |c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'), + )(input) +} + +pub type NameChar = char; +/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] +pub fn name_char(input: &str) -> IResult<&str, NameChar> { + alt(( + name_start_char, + satisfy( + |c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'), + ), + ))(input) +} + +pub type Name<'s> = &'s str; +/// [5] Name ::= NameStartChar (NameChar)* +pub fn name(input: &str) -> IResult<&str, Name> { + recognize(pair(name_start_char, many0(name_char)))(input) +} + +pub type Names<'s> = &'s str; +/// [6] Names ::= Name (#x20 Name)* +pub fn names(input: &str) -> IResult<&str, Names> { + recognize(pair(name, many0(pair(char('\u{20}'), name))))(input) +} + +pub type Nmtoken<'s> = &'s str; +/// [7] Nmtoken ::= (NameChar)+ +pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> { + recognize(many1(name_char))(input) +} + +pub type Nmtokens<'s> = &'s str; +/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* +pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> { + recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input) +} + +#[derive(Clone, Debug)] +pub enum LiteralData<'s> { + String(&'s str), + PEReference(PEReference<'s>), + Reference(Reference<'s>), +} + +pub type EntityValue<'s> = Vec>; +/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' +/// | "'" ([^%&'] | PEReference | Reference)* "'" +pub fn entity_value(input: &str) -> IResult<&str, EntityValue> { + alt(( + delimited( + char('"'), + many0(alt(( + map( + recognize(many_till(take(1usize), peek(one_of("%&\"")))), + |string| LiteralData::String(string), + ), + map(pe_reference, |pe_reference| { + LiteralData::PEReference(pe_reference) + }), + map(reference, |reference| LiteralData::Reference(reference)), + ))), + char('"'), + ), + delimited( + char('\''), + many0(alt(( + map( + recognize(many_till(take(1usize), peek(one_of("%&'")))), + |string| LiteralData::String(string), + ), + map(pe_reference, |pe_reference| { + LiteralData::PEReference(pe_reference) + }), + map(reference, |reference| LiteralData::Reference(reference)), + ))), + char('\''), + ), + ))(input) +} + +pub type AttValue<'s> = Vec>; +/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"' +/// | "'" ([^<&'] | Reference)* "'" +pub fn att_value(input: &str) -> IResult<&str, AttValue> { + alt(( + delimited( + char('"'), + many0(alt(( + map( + recognize(many_till(take(1usize), peek(one_of("%&\"")))), + |string| LiteralData::String(string), + ), + map(reference, |reference| LiteralData::Reference(reference)), + ))), + char('"'), + ), + delimited( + char('\''), + many0(alt(( + map( + recognize(many_till(take(1usize), peek(one_of("%&'")))), + |string| LiteralData::String(string), + ), + map(reference, |reference| LiteralData::Reference(reference)), + ))), + char('\''), + ), + ))(input) +} + +pub type SystemLiteral<'s> = &'s str; +/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") +pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> { + alt(( + delimited(char('"'), recognize(many0(none_of("\""))), char('"')), + delimited(char('\''), recognize(many0(none_of("'"))), char('\'')), + ))(input) +} + +pub type PubidLiteral<'s> = &'s str; +/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" +pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> { + alt(( + delimited(char('"'), recognize(many0(pubid_char)), char('"')), + delimited( + char('\''), + recognize(many0(recognize(not(char('\''))).and_then(pubid_char))), + char('\''), + ), + ))(input) +} + +pub type PubidChar<'s> = char; +/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] +pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> { + satisfy(|c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'))( + input, + ) +} + +pub type CharData<'s> = &'s str; +/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) +pub fn char_data(input: &str) -> IResult<&str, CharData> { + recognize(many_till( + none_of("<&"), + peek(alt((recognize(one_of("<&")), tag("]]>")))), + ))(input) + + // let tagg: &str; + // if let Ok((_, tagg1)) = peek(take_until::<&str, &str, Error<&str>>("]]>"))(input) { + // if let Ok((_, tagg2)) = + // peek::<&str, &str, Error<&str>, _>(take_till(|c: char| c == '<' || c == '&'))(input) + // { + // if tagg1.len() < tagg2.len() { + // tagg = tagg1 + // } else { + // tagg = tagg2 + // } + // } else { + // tagg = tagg1; + // } + // } else { + // (_, tagg) = peek(take_till(|c| c == '<' || c == '&'))(input)? + // } + // tag(tagg)(input) + + // recognize(many0(permutation((none_of("<&"), not(tag("]]>"))))))(input) + // recognize(many0(not(alt((tag("<"), tag("&"), tag("]]>"))))))(input) + // take_till(|c| c == '<' || c == '&').and_then(take_until("]]>"))(input) +} + +pub type Comment<'s> = &'s str; +/// Comment ::= '' +pub fn comment(input: &str) -> IResult<&str, Comment> { + delimited( + tag(""), + )(input) +} + +#[derive(Clone, Debug)] +pub struct PI<'s> { + target: &'s str, + instruction: Option<&'s str>, +} +/// [16] PI ::= '' Char*)))? '?>' +pub fn pi(input: &str) -> IResult<&str, PI> { + map( + delimited( + tag("")))))), + ), + tag("?>"), + ), + |(target, instruction)| PI { + target, + instruction, + }, + )(input) +} + +pub type PITarget<'s> = &'s str; +/// [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) +pub fn pi_target(input: &str) -> IResult<&str, PITarget> { + let (rest, name) = name(input)?; + if name.to_lowercase() == "xml" { + return Err(Err::Error(Error { + input, + // TODO: check if better error to return + code: ErrorKind::Tag, + })); + } else { + return Ok((rest, name)); + } +} + +pub type CDSect<'s> = (CDStart<'s>, CData<'s>, CDEnd<'s>); +/// [18] CDSect ::= CDStart CData CDEnd +pub fn cd_sect(input: &str) -> IResult<&str, CDSect> { + tuple((cd_start, cdata, cd_end))(input) +} + +pub type CDStart<'s> = &'s str; +/// [19] CDStart ::= ' IResult<&str, CDStart> { + tag(" = &'s str; +/// [20] CData ::= (Char* - (Char* ']]>' Char*)) +pub fn cdata(input: &str) -> IResult<&str, CData> { + recognize(many_till(xmlchar, peek(tag("]]>"))))(input) +} + +pub type CDEnd<'s> = &'s str; +/// [21] CDEnd ::= ']]>' +pub fn cd_end(input: &str) -> IResult<&str, CDEnd> { + tag("]]>")(input) +} + +pub type Prolog<'s> = ( + Option>, + Vec>, + Option<(DoctypeDecl<'s>, Vec>)>, +); +/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? +pub fn prolog(input: &str) -> IResult<&str, Prolog> { + tuple(( + opt(xml_decl), + many0(misc), + opt(tuple((doctypedecl, many0(misc)))), + ))(input) +} + +#[derive(Debug)] +pub struct XMLDecl<'s> { + version_info: VersionInfo, + encoding_decl: Option>, + sd_decl: Option, +} +/// [23] XMLDecl ::= '' +pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { + map( + delimited( + tag("")), + ), + |(version_info, encoding_decl, sd_decl)| XMLDecl { + version_info, + encoding_decl, + sd_decl, + }, + )(input) +} + +pub type VersionInfo = VersionNum; +/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') +pub fn version_info(input: &str) -> IResult<&str, VersionInfo> { + preceded( + tuple((s, tag("version"), eq)), + alt(( + delimited(char('\''), version_num, char('\'')), + delimited(char('"'), version_num, char('"')), + )), + )(input) +} + +/// [25] Eq ::= S? '=' S? +pub fn eq(input: &str) -> IResult<&str, &str> { + recognize(tuple((opt(s), char('='), opt(s))))(input) +} + +#[derive(Clone, Debug)] +pub enum VersionNum { + One, + OneDotOne, +} +/// [26] VersionNum ::= '1.' [0-9]+ +pub fn version_num(input: &str) -> IResult<&str, VersionNum> { + preceded( + tag("1."), + alt(( + value(VersionNum::One, char('0')), + value(VersionNum::OneDotOne, char('1')), + )), + )(input) +} + +#[derive(Clone, Debug)] +pub enum Misc<'s> { + Comment(Comment<'s>), + PI(PI<'s>), + // TODO: how to deal with whitespace + S, +} +/// [27] Misc ::= Comment | PI | S +pub fn misc(input: &str) -> IResult<&str, Misc> { + alt(( + map(comment, |comment| Misc::Comment(comment)), + map(pi, |pi| Misc::PI(pi)), + value(Misc::S, s), + ))(input) +} + +#[derive(Debug)] +pub struct DoctypeDecl<'s> { + name: QName<'s>, + external_id: Option>, + int_subset: Option>, +} +/// [16] doctypedecl ::= '' +/// [28] doctypedecl ::= '' +pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { + map( + delimited( + pair(tag(""), + ), + |(name, external_id, int_subset)| DoctypeDecl { + name, + external_id, + int_subset, + }, + )(input) +} + +#[derive(Clone, Debug)] +pub enum DeclSep<'s> { + PEReference(PEReference<'s>), + // TODO: tackle whitespace + S, +} +/// [28a] DeclSep ::= PEReference | S +pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> { + alt(( + map(pe_reference, |pe_reference| { + DeclSep::PEReference(pe_reference) + }), + value(DeclSep::S, s), + ))(input) +} + +#[derive(Debug)] +pub enum IntSubsetDeclaration<'s> { + MarkupDecl(MarkupDecl<'s>), + DeclSep(DeclSep<'s>), +} +type IntSubset<'s> = Vec>; +/// [28b] intSubset ::= (markupdecl | DeclSep)* +pub fn int_subset(input: &str) -> IResult<&str, IntSubset> { + many0(alt(( + map(markup_decl, |markup_decl| { + IntSubsetDeclaration::MarkupDecl(markup_decl) + }), + map(decl_sep, |decl_sep| IntSubsetDeclaration::DeclSep(decl_sep)), + )))(input) +} + +#[derive(Debug)] +pub enum MarkupDecl<'s> { + Elementdecl(Elementdecl<'s>), + AttlistDecl(AttlistDecl<'s>), + EntityDecl(EntityDecl<'s>), + NotationDecl(NotationDecl<'s>), + PI(PI<'s>), + Comment(Comment<'s>), +} +/// [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment +pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> { + alt(( + map(elementdecl, |elementdecl| { + MarkupDecl::Elementdecl(elementdecl) + }), + map(attlist_decl, |attlist_decl| { + MarkupDecl::AttlistDecl(attlist_decl) + }), + map(entity_decl, |entity_decl| { + MarkupDecl::EntityDecl(entity_decl) + }), + map(notation_decl, |notation_decl| { + MarkupDecl::NotationDecl(notation_decl) + }), + map(pi, |pi| MarkupDecl::PI(pi)), + map(comment, |comment| MarkupDecl::Comment(comment)), + ))(input) +} + +pub struct ExtSubset<'s> { + text_decl: Option>, + ext_subset_decl: ExtSubsetDecl<'s>, +} +/// [30] extSubset ::= TextDecl? extSubsetDecl +pub fn ext_subset(input: &str) -> IResult<&str, ExtSubset> { + map( + pair(opt(text_decl), ext_subset_decl), + |(text_decl, ext_subset_decl)| ExtSubset { + text_decl, + ext_subset_decl, + }, + )(input) +} + +pub enum ExtSubsetDeclaration<'s> { + MarkupDecl(MarkupDecl<'s>), + ConditionalSect(ConditionalSect<'s>), + DeclSep(DeclSep<'s>), +} +type ExtSubsetDecl<'s> = Vec>; +/// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* +pub fn ext_subset_decl(input: &str) -> IResult<&str, ExtSubsetDecl> { + many0(alt(( + map(markup_decl, |markup_decl| { + ExtSubsetDeclaration::MarkupDecl(markup_decl) + }), + map(conditional_sect, |conditional_sect| { + ExtSubsetDeclaration::ConditionalSect(conditional_sect) + }), + map(decl_sep, |decl_sep| ExtSubsetDeclaration::DeclSep(decl_sep)), + )))(input) +} + +pub type SDDecl = bool; +/// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) +pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> { + preceded( + tuple((s, tag("standalone"), eq)), + alt(( + delimited( + char('\''), + alt((value(true, tag("yes")), value(false, tag("no")))), + char('\''), + ), + delimited( + char('"'), + alt((value(true, tag("yes")), value(false, tag("no")))), + char('"'), + ), + )), + )(input) +} + +// (Productions 33 through 38 have been removed.) + +#[derive(Debug, Clone)] +pub enum Element<'s> { + Empty(EmptyElemTag<'s>), + NotEmpty(STag<'s>, Content<'s>, ETag<'s>), +} +/// [39] element ::= EmptyElemTag | STag content ETag +pub fn element(input: &str) -> IResult<&str, Element> { + alt(( + map(empty_elem_tag, |empty_elem_tag| { + Element::Empty(empty_elem_tag) + }), + map(tuple((s_tag, content, e_tag)), |(s_tag, content, e_tag)| { + Element::NotEmpty(s_tag, content, e_tag) + }), + ))(input) +} + +#[derive(Debug, Clone)] +pub struct STag<'s> { + name: QName<'s>, + attributes: Vec>, +} +/// [12] STag ::= '<' QName (S Attribute)* S? '>' +/// [40] STag ::= '<' Name (S Attribute)* S? '>' +pub fn s_tag(input: &str) -> IResult<&str, STag> { + map( + delimited( + tag("<"), + pair(q_name, many0(preceded(s, attribute))), + pair(opt(s), tag(">")), + ), + |(name, attributes)| STag { name, attributes }, + )(input) +} + +#[derive(Debug, Clone)] +pub enum Attribute<'s> { + NamespaceDeclaration { + ns_name: NSAttName<'s>, + value: AttValue<'s>, + }, + Attribute { + name: QName<'s>, + value: AttValue<'s>, + }, +} +/// [15] Attribute ::= NSAttName Eq AttValue | QName Eq AttValue +pub fn attribute(input: &str) -> IResult<&str, Attribute> { + alt(( + map( + separated_pair(ns_att_name, eq, att_value), + |(ns_name, value)| Attribute::NamespaceDeclaration { ns_name, value }, + ), + map(separated_pair(q_name, eq, att_value), |(name, value)| { + Attribute::Attribute { name, value } + }), + ))(input) +} +// pub type Attribute<'s> = (Name<'s>, AttValue<'s>); +/// [41] Attribute ::= Name Eq AttValue +// pub fn attribute(input: &str) -> IResult<&str, Attribute> { +// separated_pair(name, eq, att_value)(input) +// } + +#[derive(Debug, Clone)] +pub struct ETag<'s> { + name: QName<'s>, +} +/// [13] ETag ::= '' +/// [42] ETag ::= '' +pub fn e_tag(input: &str) -> IResult<&str, ETag> { + map( + delimited(tag(""))), + |name| ETag { name }, + )(input) +} + +#[derive(Debug, Clone)] +pub enum ContentItem<'s> { + // CharData(&'s str), + Element(Element<'s>), + Reference(Reference<'s>), + CDSect(CDSect<'s>), + PI(PI<'s>), + Comment(Comment<'s>), +} +#[derive(Debug, Clone)] +pub struct Content<'s> { + char_data: Option>, + content: Vec<(ContentItem<'s>, Option>)>, +} +/// [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* +pub fn content(input: &str) -> IResult<&str, Content> { + map( + pair( + opt(char_data), + many0(pair( + alt(( + map(element, |element| ContentItem::Element(element)), + map(reference, |reference| ContentItem::Reference(reference)), + map(cd_sect, |cd_sect| ContentItem::CDSect(cd_sect)), + map(pi, |pi| ContentItem::PI(pi)), + map(comment, |comment| ContentItem::Comment(comment)), + )), + opt(char_data), + )), + ), + |(char_data, content)| Content { char_data, content }, + )(input) +} + +#[derive(Debug, Clone)] +pub struct EmptyElemTag<'s> { + name: QName<'s>, + attributes: Vec>, +} +/// [14] EmptyElemTag ::= '<' QName (S Attribute)* S? '/>' +/// [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec] +pub fn empty_elem_tag(input: &str) -> IResult<&str, EmptyElemTag> { + map( + delimited( + tag("<"), + pair(q_name, many0(preceded(s, attribute))), + pair(opt(s), tag("/>")), + ), + |(name, attributes)| EmptyElemTag { name, attributes }, + )(input) +} + +#[derive(Debug)] +pub struct Elementdecl<'s> { + name: QName<'s>, + contentspec: Contentspec<'s>, +} +/// [17] elementdecl ::= '' +/// [45] elementdecl ::= '' +pub fn elementdecl(input: &str) -> IResult<&str, Elementdecl> { + map( + delimited( + pair(tag("")), + ), + |(name, contentspec)| Elementdecl { name, contentspec }, + )(input) +} + +// TODO: casings??? +#[derive(Clone, Debug)] +pub enum Contentspec<'s> { + Empty, + Any, + Mixed(Mixed<'s>), + Children(Children<'s>), +} +/// [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children +pub fn contentspec(input: &str) -> IResult<&str, Contentspec> { + alt(( + value(Contentspec::Empty, tag("EMPTY")), + value(Contentspec::Any, tag("ANY")), + map(mixed, |mixed| Contentspec::Mixed(mixed)), + map(children, |children| Contentspec::Children(children)), + ))(input) +} + +#[derive(Clone, Debug)] +pub enum Occurence { + Once, + Optional, + Many0, + Many1, +} +/// Occurence ::= ('?' | '*' | '+')? +pub fn occurence(input: &str) -> IResult<&str, Occurence> { + map( + opt(alt((tag("?"), tag("*"), tag("+")))), + |occurence| match occurence { + Some("?") => Occurence::Optional, + Some("*") => Occurence::Many0, + Some("+") => Occurence::Many1, + _ => Occurence::Once, + }, + )(input) +} + +#[derive(Clone, Debug)] +pub enum ChildrenKind<'s> { + Choice(Choice<'s>), + Seq(Seq<'s>), +} +#[derive(Clone, Debug)] +pub struct Children<'s> { + kind: ChildrenKind<'s>, + occurence: Occurence, +} +/// [47] children ::= (choice | seq) ('?' | '*' | '+')? +pub fn children(input: &str) -> IResult<&str, Children> { + map( + pair( + alt(( + map(choice, |choice| ChildrenKind::Choice(choice)), + map(seq, |seq| ChildrenKind::Seq(seq)), + )), + occurence, + ), + |(kind, occurence)| Children { kind, occurence }, + )(input) + // alt(( + // map(pair(choice, occurence), |(choice, occurence)| Children::Choice(choice, occurence)), + // map(pair(seq, occurence), |(seq, occurence)| Children::Seq(seq, occurence)) + // ))(input) +} + +#[derive(Clone, Debug)] +pub enum CpKind<'s> { + Name(QName<'s>), + Choice(Choice<'s>), + Seq(Seq<'s>), +} +#[derive(Clone, Debug)] +pub struct Cp<'s> { + kind: CpKind<'s>, + occurence: Occurence, +} +/// [18] cp ::= (QName | choice | seq) ('?' | '*' | '+')? +/// [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? +pub fn cp(input: &str) -> IResult<&str, Cp> { + map( + pair( + alt(( + map(q_name, |name| CpKind::Name(name)), + map(choice, |choice| CpKind::Choice(choice)), + map(seq, |seq| CpKind::Seq(seq)), + )), + occurence, + ), + |(kind, occurence)| Cp { kind, occurence }, + )(input) +} + +#[derive(Clone, Debug)] +pub struct Choice<'s>(Vec>); +/// [49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' +pub fn choice(input: &str) -> IResult<&str, Choice> { + map( + delimited( + pair(tag("("), opt(s)), + pair(cp, many1(preceded(tuple((opt(s), tag("|"), opt(s))), cp))), + pair(opt(s), tag(")")), + ), + |(head, tail)| { + let choice = vec![vec![head], tail].concat(); + Choice(choice) + }, + )(input) +} + +#[derive(Clone, Debug)] +pub struct Seq<'s>(Vec>); +/// [50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' +pub fn seq(input: &str) -> IResult<&str, Seq> { + map( + delimited( + pair(tag("("), opt(s)), + pair(cp, many0(preceded(tuple((opt(s), tag(","), opt(s))), cp))), + pair(opt(s), tag(")")), + ), + |(head, tail)| { + let seq = vec![vec![head], tail].concat(); + Seq(seq) + }, + )(input) +} + +// always contains #PCDATA +#[derive(Clone, Debug)] +pub struct Mixed<'s>(Vec>); +/// [19] Mixed ::= '(' S? '#PCDATA' (S? '|' S? QName)* S? ')*' | '(' S? '#PCDATA' S? ')' +/// [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' +pub fn mixed(input: &str) -> IResult<&str, Mixed> { + alt(( + map( + delimited( + tuple((tag("("), s, tag("#PCDATA"))), + many0(preceded(tuple((opt(s), tag("|"), opt(s))), q_name)), + pair(opt(s), tag(")*")), + ), + |names| Mixed(names), + ), + value( + Mixed(Vec::new()), + tuple((tag("("), opt(s), tag("#PCDATA"), opt(s), tag(")"))), + ), + ))(input) +} + +#[derive(Debug)] +pub struct AttlistDecl<'s> { + element_type: QName<'s>, + att_defs: Vec>, +} +/// [20] AttlistDecl ::= '' +/// [52] AttlistDecl ::= '' +pub fn attlist_decl(input: &str) -> IResult<&str, AttlistDecl> { + map( + delimited( + pair(tag("")), + ), + |(element_type, att_defs)| AttlistDecl { + element_type, + att_defs, + }, + )(input) +} + +#[derive(Debug)] +pub enum AttDefName<'s> { + QName(QName<'s>), + NSAttName(NSAttName<'s>), +} +#[derive(Debug)] +pub struct AttDef<'s> { + name: AttDefName<'s>, + att_type: AttType<'s>, + default_decl: DefaultDecl<'s>, +} +/// [21] AttDef ::= S (QName | NSAttName) S AttType S DefaultDecl +/// [53] AttDef ::= S Name S AttType S DefaultDecl +pub fn att_def(input: &str) -> IResult<&str, AttDef> { + map( + tuple(( + preceded( + s, + alt(( + map(q_name, |q_name| AttDefName::QName(q_name)), + map(ns_att_name, |ns_att_name| { + AttDefName::NSAttName(ns_att_name) + }), + )), + ), + preceded(s, att_type), + preceded(s, default_decl), + )), + |(name, att_type, default_decl)| AttDef { + name, + att_type, + default_decl, + }, + )(input) +} + +#[derive(Clone, Debug)] +pub enum AttType<'s> { + StringType, + TokenizedType(TokenizedType), + EnumeratedType(EnumeratedType<'s>), +} +/// [54] AttType ::= StringType | TokenizedType | EnumeratedType +pub fn att_type(input: &str) -> IResult<&str, AttType> { + alt(( + value(AttType::StringType, string_type), + map(tokenized_type, |tokenized_type| { + AttType::TokenizedType(tokenized_type) + }), + map(enumerated_type, |enumerated_type| { + AttType::EnumeratedType(enumerated_type) + }), + ))(input) +} + +pub type StringType<'s> = &'s str; +/// [55] StringType ::= 'CDATA' +pub fn string_type(input: &str) -> IResult<&str, StringType> { + tag("CDATA")(input) +} + +#[derive(Clone, Debug)] +pub enum TokenizedType { + ID, + IDRef, + IDRefs, + Entity, + Entities, + NMToken, + NMTokens, +} +/// [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' +pub fn tokenized_type(input: &str) -> IResult<&str, TokenizedType> { + alt(( + value(TokenizedType::ID, tag("ID")), + // TODO: check if this is required + // try idrefs first to avoid losing 'S' + value(TokenizedType::IDRefs, tag("IDREFS")), + value(TokenizedType::IDRef, tag("IDREF")), + value(TokenizedType::Entity, tag("ENTITY")), + value(TokenizedType::Entities, tag("ENTITIES")), + // same here + value(TokenizedType::NMTokens, tag("NMTOKENS")), + value(TokenizedType::NMToken, tag("NMTOKEN")), + ))(input) +} + +#[derive(Debug, Clone)] +pub enum EnumeratedType<'s> { + NotationType(NotationType<'s>), + Enumeration(Enumeration<'s>), +} +/// [57] EnumeratedType ::= NotationType | Enumeration +pub fn enumerated_type(input: &str) -> IResult<&str, EnumeratedType> { + alt(( + map(notation_type, |notation_type| { + EnumeratedType::NotationType(notation_type) + }), + map(enumeration, |enumeration| { + EnumeratedType::Enumeration(enumeration) + }), + ))(input) +} + +#[derive(Debug, Clone)] +pub struct NotationType<'s>(Vec>); +/// [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' +pub fn notation_type(input: &str) -> IResult<&str, NotationType> { + map( + delimited( + tuple((tag("NOTATION"), s, tag("("), opt(s))), + pair( + name, + many0(preceded(tuple((opt(s), tag("|"), opt(s))), name)), + ), + pair(opt(s), tag(")")), + ), + |(head, tail)| { + let notation_type = vec![vec![head], tail].concat(); + NotationType(notation_type) + }, + )(input) +} + +#[derive(Debug, Clone)] +pub struct Enumeration<'s>(Vec>); +/// [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' +pub fn enumeration(input: &str) -> IResult<&str, Enumeration> { + map( + delimited( + pair(tag("("), opt(s)), + pair( + nmtoken, + many0(preceded(tuple((opt(s), tag("|"), opt(s))), nmtoken)), + ), + pair(opt(s), tag(")")), + ), + |(head, tail)| { + let enumeration = vec![vec![head], tail].concat(); + Enumeration(enumeration) + }, + )(input) +} + +#[derive(Debug, Clone)] +pub enum DefaultDecl<'s> { + Required, + Implied, + Fixed(AttValue<'s>), +} +/// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) +pub fn default_decl(input: &str) -> IResult<&str, DefaultDecl> { + alt(( + value(DefaultDecl::Required, tag("#REQUIRED")), + value(DefaultDecl::Implied, tag("#IMPLIED")), + map( + preceded(opt(pair(tag("#FIXED"), s)), att_value), + |att_value| DefaultDecl::Fixed(att_value), + ), + ))(input) +} + +pub enum ConditionalSect<'s> { + IncludeSect(IncludeSect<'s>), + IgnoreSect(IgnoreSect<'s>), +} +/// [61] conditionalSect ::= includeSect | ignoreSect +pub fn conditional_sect(input: &str) -> IResult<&str, ConditionalSect> { + alt(( + map(include_sect, |include_sect| { + ConditionalSect::IncludeSect(include_sect) + }), + map(ignore_sect, |ignore_sect| { + ConditionalSect::IgnoreSect(ignore_sect) + }), + ))(input) +} + +pub struct IncludeSect<'s>(ExtSubsetDecl<'s>); +/// [62] includeSect ::= '' +pub fn include_sect(input: &str) -> IResult<&str, IncludeSect> { + map( + delimited( + tuple((tag(""), + ), + |ext_subset_decl| IncludeSect(ext_subset_decl), + )(input) +} + +pub struct IgnoreSect<'s>(Vec>); +/// [63] ignoreSect ::= '' +pub fn ignore_sect(input: &str) -> IResult<&str, IgnoreSect> { + map( + delimited( + tuple((tag(""), + ), + |ignore_sect_contents| IgnoreSect(ignore_sect_contents), + )(input) +} + +pub struct IgnoreSectContents<'s> { + // TODO: what the fuck does this mean + ignore: Ignore<'s>, + ignore_list: Vec<(IgnoreSectContents<'s>, Ignore<'s>)>, +} +/// [64] ignoreSectContents ::= Ignore ('' Ignore)* +pub fn ignore_sect_contents(input: &str) -> IResult<&str, IgnoreSectContents> { + map( + pair( + ignore, + many0(tuple(( + delimited(tag("")), + ignore, + ))), + ), + |(ignore, ignore_list)| IgnoreSectContents { + ignore, + ignore_list, + }, + )(input) +} + +pub type Ignore<'s> = &'s str; +/// [65] Ignore ::= Char* - (Char* ('') Char*) +pub fn ignore(input: &str) -> IResult<&str, Ignore> { + recognize(many_till(xmlchar, peek(alt((tag(""))))))(input) +} + +#[derive(Clone, Debug)] +pub enum CharRef<'s> { + Decimal(&'s str), + Hexadecimal(&'s str), +} +/// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' +pub fn char_ref(input: &str) -> IResult<&str, CharRef> { + alt(( + delimited( + tag("&#"), + map(take_while(|c| matches!(c, '0'..='9')), |decimal| { + CharRef::Decimal(decimal) + }), + tag(";"), + ), + delimited( + tag("&#x"), + map( + take_while(|c| matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F' )), + |hexadecimal| CharRef::Hexadecimal(hexadecimal), + ), + tag(";"), + ), + ))(input) +} + +#[derive(Clone, Debug)] +pub enum Reference<'s> { + EntityRef(EntityRef<'s>), + CharRef(CharRef<'s>), +} +/// [67] Reference ::= EntityRef | CharRef +pub fn reference(input: &str) -> IResult<&str, Reference> { + alt(( + map(entity_ref, |entity_ref| Reference::EntityRef(entity_ref)), + map(char_ref, |char_ref| Reference::CharRef(char_ref)), + ))(input) +} + +pub type EntityRef<'s> = &'s str; +/// [68] EntityRef ::= '&' Name ';' +pub fn entity_ref(input: &str) -> IResult<&str, EntityRef> { + delimited(tag("&"), name, tag(";"))(input) +} + +pub type PEReference<'s> = &'s str; +/// [69] PEReference ::= '%' Name ';' +pub fn pe_reference(input: &str) -> IResult<&str, PEReference> { + delimited(tag("%"), name, tag(";"))(input) +} + +#[derive(Debug)] +pub enum EntityDecl<'s> { + GEDecl(GEDecl<'s>), + PEDecl(PEDecl<'s>), +} +/// [70] EntityDecl ::= GEDecl | PEDecl +pub fn entity_decl(input: &str) -> IResult<&str, EntityDecl> { + alt(( + map(ge_decl, |ge_decl| EntityDecl::GEDecl(ge_decl)), + map(pe_decl, |pe_decl| EntityDecl::PEDecl(pe_decl)), + ))(input) +} + +#[derive(Debug)] +pub struct GEDecl<'s> { + name: Name<'s>, + entity_def: EntityDef<'s>, +} +/// [71] GEDecl ::= '' +pub fn ge_decl(input: &str) -> IResult<&str, GEDecl> { + map( + delimited( + pair(tag("")), + ), + |(name, entity_def)| GEDecl { name, entity_def }, + )(input) +} + +#[derive(Debug)] +pub struct PEDecl<'s> { + name: Name<'s>, + pe_def: PEDef<'s>, +} +/// [72] PEDecl ::= '' +pub fn pe_decl(input: &str) -> IResult<&str, PEDecl> { + map( + delimited( + tuple((tag("")), + ), + |(name, pe_def)| PEDecl { name, pe_def }, + )(input) +} + +#[derive(Debug)] +pub enum EntityDef<'s> { + EntityValue(EntityValue<'s>), + ExternalID { + external_id: ExternalID<'s>, + ndata_decl: Option>, + }, +} +/// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) +pub fn entity_def(input: &str) -> IResult<&str, EntityDef> { + alt(( + map(entity_value, |entity_value| { + EntityDef::EntityValue(entity_value) + }), + map( + pair(external_id, opt(ndata_decl)), + |(external_id, ndata_decl)| EntityDef::ExternalID { + external_id, + ndata_decl, + }, + ), + ))(input) +} + +#[derive(Debug)] +pub enum PEDef<'s> { + EntityValue(EntityValue<'s>), + ExternalID(ExternalID<'s>), +} +/// [74] PEDef ::= EntityValue | ExternalID +pub fn pe_def(input: &str) -> IResult<&str, PEDef> { + alt(( + map(entity_value, |entity_value| { + PEDef::EntityValue(entity_value) + }), + map(external_id, |external_id| PEDef::ExternalID(external_id)), + ))(input) +} + +#[derive(Debug)] +pub enum ExternalID<'s> { + SYSTEM { + system_identifier: &'s str, + }, + PUBLIC { + public_identifier: &'s str, + system_identifier: &'s str, + }, +} +/// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral +// pub fn external_id(input: &str) -> IResult<&str, ExternalID> { +pub fn external_id(input: &str) -> IResult<&str, ExternalID> { + alt(( + map( + preceded(pair(tag("SYSTEM"), s), system_literal), + |system_identifier| ExternalID::SYSTEM { system_identifier }, + ), + map( + preceded( + pair(tag("PUBLIC"), s), + separated_pair(pubid_literal, s, system_literal), + ), + |(public_identifier, system_identifier)| ExternalID::PUBLIC { + public_identifier, + system_identifier, + }, + ), + ))(input) +} + +pub type NDataDecl<'s> = &'s str; +/// [76] NDataDecl ::= S 'NDATA' S Name +pub fn ndata_decl(input: &str) -> IResult<&str, NDataDecl> { + preceded(tuple((s, tag("NDATA"), s)), name)(input) +} + +pub struct TextDecl<'s> { + version_info: Option, + encoding_decl: EncodingDecl<'s>, +} +/// [77] TextDecl ::= '' +pub fn text_decl(input: &str) -> IResult<&str, TextDecl> { + map( + delimited( + tag(""), + ), + |(version_info, encoding_decl)| TextDecl { + version_info, + encoding_decl, + }, + )(input) +} + +pub struct ExtParsedEnt<'s> { + text_decl: Option>, + content: Content<'s>, +} +/// [78] extParsedEnt ::= TextDecl? content +pub fn ext_parsed_ent(input: &str) -> IResult<&str, ExtParsedEnt> { + map(pair(opt(text_decl), content), |(text_decl, content)| { + ExtParsedEnt { text_decl, content } + })(input) +} + +pub type EncodingDecl<'s> = EncName<'s>; +/// [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName +pub fn encoding_decl(input: &str) -> IResult<&str, EncodingDecl> { + preceded( + tuple((s, tag("encoding"), eq)), + alt(( + delimited(char('"'), enc_name, char('"')), + delimited(char('\''), enc_name, char('\'')), + )), + )(input) +} + +pub type EncName<'s> = &'s str; +/// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* +pub fn enc_name(input: &str) -> IResult<&str, EncName> { + recognize(pair( + satisfy(|c| matches!(c, 'A'..='Z' | 'a'..='z' )), + many0(satisfy( + |c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '_' | '-' ), + )), + ))(input) +} + +#[derive(Debug)] +pub struct NotationDecl<'s> { + name: &'s str, + id: NotationDeclID<'s>, +} +#[derive(Debug)] +pub enum NotationDeclID<'s> { + External(ExternalID<'s>), + Public(PublicID<'s>), +} +/// [82] NotationDecl ::= '' +pub fn notation_decl(input: &str) -> IResult<&str, NotationDecl> { + map( + delimited( + pair(tag("")), + ), + |(name, id)| NotationDecl { name, id }, + )(input) +} + +pub type PublicID<'s> = &'s str; +/// [83] PublicID ::= 'PUBLIC' S PubidLiteral +pub fn public_id(input: &str) -> IResult<&str, PublicID> { + preceded(pair(tag("PUBLIC"), s), pubid_literal)(input) +} + +#[cfg(test)] +mod tests { + use std::num::NonZero; + + use super::*; + + #[test] + fn test_char_data() { + assert_eq!(Ok(("&def]]>ghi", "abc")), char_data("abc&def]]>ghi")); + assert_eq!(Ok(("]]>ghi", "abcdef")), char_data("abcdef]]>ghi")); + assert_eq!(Ok(("&defghi", "abc")), char_data("abc&defghi")); + assert_eq!(Ok(("]]>def&ghi", "abc")), char_data("abc]]>def&ghi")); + assert_eq!(Ok(("&ghi", "abc]>def")), char_data("abc]>def&ghi")); + assert_eq!( + Err(Err::Incomplete(nom::Needed::Size( + NonZero::new(3usize).unwrap() + ))), + char_data("abcdefghi") + ); + } + + #[test] + fn test_comment() { + assert_eq!(Ok(("", "")), comment("")); + assert_eq!(Ok(("", "asdf")), comment("")); + assert_eq!(Ok(("", "as-df")), comment("")); + assert_eq!( + Err(Err::Incomplete(nom::Needed::Size( + NonZero::new(2usize).unwrap() + ))), + comment("