use std::char;
use nom::{
branch::alt,
bytes::streaming::{is_a, tag, take, take_while},
character::{
complete::one_of,
streaming::{char, none_of, satisfy},
},
combinator::{map, not, opt, peek, recognize, value},
error::{Error, ErrorKind},
multi::{many0, many1, many_till},
sequence::{delimited, pair, preceded, separated_pair, terminated, tuple},
Err, IResult, Parser,
};
// parser: parses tokens from the lexer into events
// no well-formedness, validity, or data-model checks: a simple translation of the input into Rust types
// output is a Rust representation of the input XML
// TODO: could these types be used for XML production too?
pub type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
/// [1] document ::= prolog element Misc*
pub fn document(input: &str) -> IResult<&str, Document> {
tuple((prolog, element, many0(misc)))(input)
}
/// A single character allowed by production [2].
pub type Char = char;
/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
///
/// Consumes exactly one character if it lies in one of the ranges above.
pub fn xmlchar(input: &str) -> IResult<&str, Char> {
    fn is_xml_char(c: char) -> bool {
        matches!(
            c,
            '\u{9}'
                | '\u{A}'
                | '\u{D}'
                | '\u{20}'..='\u{D7FF}'
                | '\u{E000}'..='\u{FFFD}'
                | '\u{10000}'..='\u{10FFFF}'
        )
    }
    satisfy(is_xml_char)(input)
}
/// A run of XML white space, borrowed from the input.
pub type S<'s> = &'s str;
/// [3] S ::= (#x20 | #x9 | #xD | #xA)+
///
/// Returns the longest leading run of space, tab, carriage return and line feed.
pub fn s(input: &str) -> IResult<&str, S> {
    // Space, tab, carriage return, line feed: the four characters of the production.
    const WHITE_SPACE: &str = "\u{20}\u{9}\u{D}\u{A}";
    is_a(WHITE_SPACE)(input)
}
/// A character that may begin a `Name`.
pub type NameStartChar = char;
/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
///
/// Consumes exactly one character if it belongs to the set above.
pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
    fn is_name_start_char(c: char) -> bool {
        matches!(
            c,
            ':' | 'A'..='Z'
                | '_'
                | 'a'..='z'
                | '\u{C0}'..='\u{D6}'
                | '\u{D8}'..='\u{F6}'
                | '\u{F8}'..='\u{2FF}'
                | '\u{370}'..='\u{37D}'
                | '\u{37F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}'
        )
    }
    satisfy(is_name_start_char)(input)
}
/// A character that may appear anywhere in a `Name` after the first position.
pub type NameChar = char;
/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
///
/// Tries `name_start_char` first, then the additional characters listed above.
pub fn name_char(input: &str) -> IResult<&str, NameChar> {
    fn is_extra_name_char(c: char) -> bool {
        matches!(
            c,
            '-' | '.'
                | '0'..='9'
                | '\u{B7}'
                | '\u{0300}'..='\u{036F}'
                | '\u{203F}'..='\u{2040}'
        )
    }
    alt((name_start_char, satisfy(is_extra_name_char)))(input)
}
/// An XML name, borrowed from the input.
pub type Name<'s> = &'s str;
/// [5] Name ::= NameStartChar (NameChar)*
///
/// Returns the matched slice of the input rather than the individual characters.
pub fn name(input: &str) -> IResult<&str, Name> {
    let remaining_chars = many0(name_char);
    recognize(pair(name_start_char, remaining_chars))(input)
}
/// A space-separated list of names, kept as one unsplit slice of the input.
pub type Names<'s> = &'s str;
/// [6] Names ::= Name (#x20 Name)*
///
/// The separator is exactly one `#x20`; the whole list is returned as a single slice.
pub fn names(input: &str) -> IResult<&str, Names> {
    let space_then_name = pair(char('\u{20}'), name);
    recognize(pair(name, many0(space_then_name)))(input)
}
/// A name token, borrowed from the input.
pub type Nmtoken<'s> = &'s str;
/// [7] Nmtoken ::= (NameChar)+
///
/// Returns the slice covering one or more consecutive name characters.
pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
    let one_or_more_name_chars = many1(name_char);
    recognize(one_or_more_name_chars)(input)
}
/// A space-separated list of name tokens, kept as one unsplit slice of the input.
pub type Nmtokens<'s> = &'s str;
/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
///
/// The separator is exactly one `#x20`; the whole list is returned as a single slice.
pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
    let space_then_token = pair(char('\u{20}'), nmtoken);
    recognize(pair(nmtoken, many0(space_then_token)))(input)
}
#[derive(Clone, Debug)]
pub enum LiteralData<'s> {
String(&'s str),
PEReference(PEReference<'s>),
Reference(Reference<'s>),
}
pub type EntityValue<'s> = Vec<LiteralData<'s>>;
/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
/// | "'" ([^%&'] | PEReference | Reference)* "'"
pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
alt((
delimited(
char('"'),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&\"")))),
|string| LiteralData::String(string),
),
map(pe_reference, |pe_reference| {
LiteralData::PEReference(pe_reference)
}),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('"'),
),
delimited(
char('\''),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&'")))),
|string| LiteralData::String(string),
),
map(pe_reference, |pe_reference| {
LiteralData::PEReference(pe_reference)
}),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('\''),
),
))(input)
}
pub type AttValue<'s> = Vec<LiteralData<'s>>;
/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
/// | "'" ([^<&'] | Reference)* "'"
pub fn att_value(input: &str) -> IResult<&str, AttValue> {
alt((
delimited(
char('"'),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&\"")))),
|string| LiteralData::String(string),
),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('"'),
),
delimited(
char('\''),
many0(alt((
map(
recognize(many_till(take(1usize), peek(one_of("%&'")))),
|string| LiteralData::String(string),
),
map(reference, |reference| LiteralData::Reference(reference)),
))),
char('\''),
),
))(input)
}
/// The text of a system literal, without its surrounding quotes.
pub type SystemLiteral<'s> = &'s str;
/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
///
/// Returns everything between the matching quotes; the quotes themselves are dropped.
pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
    let double_quoted = delimited(char('"'), recognize(many0(none_of("\""))), char('"'));
    let single_quoted = delimited(char('\''), recognize(many0(none_of("'"))), char('\''));
    alt((double_quoted, single_quoted))(input)
}
pub type PubidLiteral<'s> = &'s str;
/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
alt((
delimited(char('"'), recognize(many0(pubid_char)), char('"')),
delimited(
char('\''),
recognize(many0(recognize(not(char('\''))).and_then(pubid_char))),
char('\''),
),
))(input)
}
/// A single character allowed inside a public identifier.
pub type PubidChar<'s> = char;
/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
///
/// Consumes exactly one character if it belongs to the set above, including the
/// punctuation class at the end of the production.
pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
    satisfy(|c| {
        matches!(
            c,
            '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'
                // [-'()+,./:=?;!*#@$_%]
                | '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' | '='
                | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%'
        )
    })(input)
}
/// A run of character data, borrowed from the input.
pub type CharData<'s> = &'s str;
/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
///
/// Returns the text up to, but not including, the next `<`, `&` or `]]>`.
/// The returned slice may be empty.
pub fn char_data(input: &str) -> IResult<&str, CharData> {
    // Lookahead only: the terminator itself is left in the remaining input.
    let terminator = peek(alt((recognize(one_of("<&")), tag("]]>"))));
    recognize(many_till(none_of("<&"), terminator))(input)
}
/// The text of a comment, without the `<!--` and `-->` delimiters.
pub type Comment<'s> = &'s str;
/// [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
///
/// The body ends at the first `--`, which must then be the start of `-->`.
pub fn comment(input: &str) -> IResult<&str, Comment> {
    let (rest, _) = tag("<!--")(input)?;
    let (rest, text) = recognize(many_till(xmlchar, peek(tag("--"))))(rest)?;
    let (rest, _) = tag("-->")(rest)?;
    Ok((rest, text))
}
#[derive(Clone, Debug)]
pub struct PI<'s> {
target: &'s str,
instruction: Option<&'s str>,
}
/// [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
pub fn pi(input: &str) -> IResult<&str, PI> {
map(
delimited(
tag("<?"),
pair(
pi_target,
opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))),
),
tag("?>"),
),
|(target, instruction)| PI {
target,
instruction,
},
)(input)
}
pub type PITarget<'s> = &'s str;
/// [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
pub fn pi_target(input: &str) -> IResult<&str, PITarget> {
let (rest, name) = name(input)?;
if name.to_lowercase() == "xml" {
return Err(Err::Error(Error {
input,
// TODO: check if better error to return
code: ErrorKind::Tag,
}));
} else {
return Ok((rest, name));
}
}
pub type CDSect<'s> = (CDStart<'s>, CData<'s>, CDEnd<'s>);
/// [18] CDSect ::= CDStart CData CDEnd
pub fn cd_sect(input: &str) -> IResult<&str, CDSect> {
tuple((cd_start, cdata, cd_end))(input)
}
/// The literal opening marker of a CDATA section.
pub type CDStart<'s> = &'s str;
/// [19] CDStart ::= '<![CDATA['
pub fn cd_start(input: &str) -> IResult<&str, CDStart> {
    const CDATA_OPEN: &str = "<![CDATA[";
    tag(CDATA_OPEN)(input)
}
/// The content of a CDATA section, borrowed from the input.
pub type CData<'s> = &'s str;
/// [20] CData ::= (Char* - (Char* ']]>' Char*))
///
/// Returns the characters before the first `]]>`, which is left unconsumed.
pub fn cdata(input: &str) -> IResult<&str, CData> {
    let section_end = peek(tag("]]>"));
    recognize(many_till(xmlchar, section_end))(input)
}
/// The literal closing marker of a CDATA section.
pub type CDEnd<'s> = &'s str;
/// [21] CDEnd ::= ']]>'
pub fn cd_end(input: &str) -> IResult<&str, CDEnd> {
    const CDATA_CLOSE: &str = "]]>";
    tag(CDATA_CLOSE)(input)
}
pub type Prolog<'s> = (
Option<XMLDecl<'s>>,
Vec<Misc<'s>>,
Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
);
/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
pub fn prolog(input: &str) -> IResult<&str, Prolog> {
tuple((
opt(xml_decl),
many0(misc),
opt(tuple((doctypedecl, many0(misc)))),
))(input)
}
#[derive(Debug)]
pub struct XMLDecl<'s> {
version_info: VersionInfo,
encoding_decl: Option<EncodingDecl<'s>>,
sd_decl: Option<SDDecl>,
}
/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
map(
delimited(
tag("<?xml"),
tuple((version_info, opt(encoding_decl), opt(sd_decl))),
pair(opt(s), tag("?>")),
),
|(version_info, encoding_decl, sd_decl)| XMLDecl {
version_info,
encoding_decl,
sd_decl,
},
)(input)
}
/// The version carried by a `version="…"` pseudo-attribute.
pub type VersionInfo = VersionNum;
/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
///
/// Returns only the version number; the keyword, `=` and quotes are dropped.
pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
    let (rest, _) = tuple((s, tag("version"), eq))(input)?;
    let single_quoted = delimited(char('\''), version_num, char('\''));
    let double_quoted = delimited(char('"'), version_num, char('"'));
    alt((single_quoted, double_quoted))(rest)
}
/// [25] Eq ::= S? '=' S?
///
/// Returns the matched slice: the `=` together with any surrounding white space.
pub fn eq(input: &str) -> IResult<&str, &str> {
    let padded_equals = tuple((opt(s), char('='), opt(s)));
    recognize(padded_equals)(input)
}
/// The XML versions this parser recognises.
#[derive(Clone, Debug)]
pub enum VersionNum {
    /// Version `1.0`.
    One,
    /// Version `1.1`.
    OneDotOne,
}
/// [26] VersionNum ::= '1.' [0-9]+
///
/// Only a single `0` or `1` is accepted after `1.`, which is narrower than the
/// production above.
pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
    let (rest, _) = tag("1.")(input)?;
    alt((
        value(VersionNum::One, char('0')),
        value(VersionNum::OneDotOne, char('1')),
    ))(rest)
}
/// Markup that may appear around the root element.
#[derive(Clone, Debug)]
pub enum Misc<'s> {
    /// A comment.
    Comment(Comment<'s>),
    /// A processing instruction.
    PI(PI<'s>),
    // TODO: how to deal with whitespace
    /// A run of white space; the text itself is not kept.
    S,
}
/// [27] Misc ::= Comment | PI | S
pub fn misc(input: &str) -> IResult<&str, Misc> {
    alt((
        map(comment, Misc::Comment),
        map(pi, Misc::PI),
        value(Misc::S, s),
    ))(input)
}
#[derive(Debug)]
pub struct DoctypeDecl<'s> {
name: &'s str,
external_id: Option<ExternalID<'s>>,
int_subset: Option<IntSubset<'s>>,
}
/// [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
map(
delimited(
pair(tag("<!DOCTYPE"), s),
tuple((
name,
opt(preceded(s, external_id)),
preceded(
opt(s),
opt(terminated(
delimited(tag("["), int_subset, tag("]")),
opt(s),
)),
),
)),
tag(">"),
),
|(name, external_id, int_subset)| DoctypeDecl {
name,
external_id,
int_subset,
},
)(input)
}
/// A separator between declarations in a DTD subset.
#[derive(Clone, Debug)]
pub enum DeclSep<'s> {
    /// A parameter-entity reference.
    PEReference(PEReference<'s>),
    // TODO: tackle whitespace
    /// A run of white space; the text itself is not kept.
    S,
}
/// [28a] DeclSep ::= PEReference | S
pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> {
    alt((
        map(pe_reference, DeclSep::PEReference),
        value(DeclSep::S, s),
    ))(input)
}
/// One item of the internal DTD subset.
#[derive(Debug)]
pub enum IntSubsetDeclaration<'s> {
    /// A markup declaration.
    MarkupDecl(MarkupDecl<'s>),
    /// A declaration separator.
    DeclSep(DeclSep<'s>),
}
/// The items of the internal subset, in input order.
type IntSubset<'s> = Vec<IntSubsetDeclaration<'s>>;
/// [28b] intSubset ::= (markupdecl | DeclSep)*
pub fn int_subset(input: &str) -> IResult<&str, IntSubset> {
    let declaration = alt((
        map(markup_decl, IntSubsetDeclaration::MarkupDecl),
        map(decl_sep, IntSubsetDeclaration::DeclSep),
    ));
    many0(declaration)(input)
}
/// A markup declaration inside a DTD.
#[derive(Debug)]
pub enum MarkupDecl<'s> {
    /// An `<!ELEMENT …>` declaration.
    Elementdecl(Elementdecl<'s>),
    /// An `<!ATTLIST …>` declaration.
    AttlistDecl(AttlistDecl<'s>),
    /// An `<!ENTITY …>` declaration.
    EntityDecl(EntityDecl<'s>),
    /// A `<!NOTATION …>` declaration.
    NotationDecl(NotationDecl<'s>),
    /// A processing instruction.
    PI(PI<'s>),
    /// A comment.
    Comment(Comment<'s>),
}
/// [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
///
/// The alternatives are tried in the order of the production.
pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> {
    alt((
        map(elementdecl, MarkupDecl::Elementdecl),
        map(attlist_decl, MarkupDecl::AttlistDecl),
        map(entity_decl, MarkupDecl::EntityDecl),
        map(notation_decl, MarkupDecl::NotationDecl),
        map(pi, MarkupDecl::PI),
        map(comment, MarkupDecl::Comment),
    ))(input)
}
pub struct ExtSubset<'s> {
text_decl: Option<TextDecl<'s>>,
ext_subset_decl: ExtSubsetDecl<'s>,
}
/// [30] extSubset ::= TextDecl? extSubsetDecl
pub fn ext_subset(input: &str) -> IResult<&str, ExtSubset> {
map(
pair(opt(text_decl), ext_subset_decl),
|(text_decl, ext_subset_decl)| ExtSubset {
text_decl,
ext_subset_decl,
},
)(input)
}
/// One item of the external DTD subset.
pub enum ExtSubsetDeclaration<'s> {
    /// A markup declaration.
    MarkupDecl(MarkupDecl<'s>),
    /// A conditional (`INCLUDE` / `IGNORE`) section.
    ConditionalSect(ConditionalSect<'s>),
    /// A declaration separator.
    DeclSep(DeclSep<'s>),
}
/// The items of the external subset, in input order.
type ExtSubsetDecl<'s> = Vec<ExtSubsetDeclaration<'s>>;
/// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
pub fn ext_subset_decl(input: &str) -> IResult<&str, ExtSubsetDecl> {
    let declaration = alt((
        map(markup_decl, ExtSubsetDeclaration::MarkupDecl),
        map(conditional_sect, ExtSubsetDeclaration::ConditionalSect),
        map(decl_sep, ExtSubsetDeclaration::DeclSep),
    ));
    many0(declaration)(input)
}
/// The standalone flag: `true` for `yes`, `false` for `no`.
pub type SDDecl = bool;
/// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> {
    let (rest, _) = tuple((s, tag("standalone"), eq))(input)?;
    let single_quoted = delimited(
        char('\''),
        alt((value(true, tag("yes")), value(false, tag("no")))),
        char('\''),
    );
    let double_quoted = delimited(
        char('"'),
        alt((value(true, tag("yes")), value(false, tag("no")))),
        char('"'),
    );
    alt((single_quoted, double_quoted))(rest)
}
// (Productions 33 through 38 have been removed.)
/// An element: either a self-closing tag or a start tag, content and end tag.
#[derive(Debug)]
pub enum Element<'s> {
    /// `<name … />`
    Empty(EmptyElemTag<'s>),
    /// `<name …> content </name>`
    NotEmpty(STag<'s>, Content<'s>, ETag<'s>),
}
/// [39] element ::= EmptyElemTag | STag content ETag
///
/// The empty-element form is tried first.
pub fn element(input: &str) -> IResult<&str, Element> {
    let with_content = map(tuple((s_tag, content, e_tag)), |(open, body, close)| {
        Element::NotEmpty(open, body, close)
    });
    alt((map(empty_elem_tag, Element::Empty), with_content))(input)
}
#[derive(Debug)]
pub struct STag<'s> {
name: Name<'s>,
attributes: Vec<Attribute<'s>>,
}
/// [40] STag ::= '<' Name (S Attribute)* S? '>'
pub fn s_tag(input: &str) -> IResult<&str, STag> {
map(
delimited(
tag("<"),
pair(name, many0(preceded(s, attribute))),
pair(opt(s), tag(">")),
),
|(name, attributes)| STag { name, attributes },
)(input)
}
pub type Attribute<'s> = (Name<'s>, AttValue<'s>);
/// [41] Attribute ::= Name Eq AttValue
pub fn attribute(input: &str) -> IResult<&str, Attribute> {
separated_pair(name, eq, att_value)(input)
}
#[derive(Debug)]
pub struct ETag<'s> {
name: Name<'s>,
}
/// [42] ETag ::= '</' Name S? '>'
pub fn e_tag(input: &str) -> IResult<&str, ETag> {
map(delimited(tag("</"), name, pair(opt(s), tag(">"))), |name| {
ETag { name }
})(input)
}
/// A non-text item inside element content.
#[derive(Debug)]
pub enum ContentItem<'s> {
    // CharData(&'s str),
    /// A child element.
    Element(Element<'s>),
    /// An entity or character reference.
    Reference(Reference<'s>),
    /// A CDATA section.
    CDSect(CDSect<'s>),
    /// A processing instruction.
    PI(PI<'s>),
    /// A comment.
    Comment(Comment<'s>),
}
/// The content of an element.
#[derive(Debug)]
pub struct Content<'s> {
    /// Character data before the first item, if any.
    char_data: Option<CharData<'s>>,
    /// Each item paired with the character data that follows it, if any.
    content: Vec<(ContentItem<'s>, Option<CharData<'s>>)>,
}
/// [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
pub fn content(input: &str) -> IResult<&str, Content> {
    let item = alt((
        map(element, ContentItem::Element),
        map(reference, ContentItem::Reference),
        map(cd_sect, ContentItem::CDSect),
        map(pi, ContentItem::PI),
        map(comment, ContentItem::Comment),
    ));
    let (rest, leading_text) = opt(char_data)(input)?;
    let (rest, items) = many0(pair(item, opt(char_data)))(rest)?;
    Ok((
        rest,
        Content {
            char_data: leading_text,
            content: items,
        },
    ))
}
#[derive(Debug)]
pub struct EmptyElemTag<'s> {
name: Name<'s>,
attributes: Vec<Attribute<'s>>,
}
/// [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec]
pub fn empty_elem_tag(input: &str) -> IResult<&str, EmptyElemTag> {
map(
delimited(
tag("<"),
pair(name, many0(preceded(s, attribute))),
pair(opt(s), tag("/>")),
),
|(name, attributes)| EmptyElemTag { name, attributes },
)(input)
}
#[derive(Debug)]
pub struct Elementdecl<'s> {
name: Name<'s>,
contentspec: Contentspec<'s>,
}
/// [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
pub fn elementdecl(input: &str) -> IResult<&str, Elementdecl> {
map(
delimited(
pair(tag("<!ELEMENT"), s),
separated_pair(name, s, contentspec),
pair(opt(s), tag(">")),
),
|(name, contentspec)| Elementdecl { name, contentspec },
)(input)
}
// TODO: casings???
/// The content model of an element declaration.
#[derive(Clone, Debug)]
pub enum Contentspec<'s> {
    /// The keyword `EMPTY`.
    Empty,
    /// The keyword `ANY`.
    Any,
    /// Mixed content.
    Mixed(Mixed<'s>),
    /// Element content.
    Children(Children<'s>),
}
/// [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
///
/// The keywords are matched exactly as written (upper case).
pub fn contentspec(input: &str) -> IResult<&str, Contentspec> {
    alt((
        value(Contentspec::Empty, tag("EMPTY")),
        value(Contentspec::Any, tag("ANY")),
        map(mixed, Contentspec::Mixed),
        map(children, Contentspec::Children),
    ))(input)
}
#[derive(Clone, Debug)]
pub enum Occurence {
Once,
Optional,
Many0,
Many1,
}
/// Occurence ::= ('?' | '*' | '+')?
pub fn occurence(input: &str) -> IResult<&str, Occurence> {
map(
opt(alt((tag("?"), tag("*"), tag("+")))),
|occurence| match occurence {
Some("?") => Occurence::Optional,
Some("*") => Occurence::Many0,
Some("+") => Occurence::Many1,
_ => Occurence::Once,
},
)(input)
}
#[derive(Clone, Debug)]
pub enum ChildrenKind<'s> {
Choice(Choice<'s>),
Seq(Seq<'s>),
}
#[derive(Clone, Debug)]
pub struct Children<'s> {
kind: ChildrenKind<'s>,
occurence: Occurence,
}
/// [47] children ::= (choice | seq) ('?' | '*' | '+')?
pub fn children(input: &str) -> IResult<&str, Children> {
map(
pair(
alt((
map(choice, |choice| ChildrenKind::Choice(choice)),
map(seq, |seq| ChildrenKind::Seq(seq)),
)),
occurence,
),
|(kind, occurence)| Children { kind, occurence },
)(input)
// alt((
// map(pair(choice, occurence), |(choice, occurence)| Children::Choice(choice, occurence)),
// map(pair(seq, occurence), |(seq, occurence)| Children::Seq(seq, occurence))
// ))(input)
}
#[derive(Clone, Debug)]
pub enum CpKind<'s> {
Name(Name<'s>),
Choice(Choice<'s>),
Seq(Seq<'s>),
}
#[derive(Clone, Debug)]
pub struct Cp<'s> {
kind: CpKind<'s>,
occurence: Occurence,
}
/// [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
pub fn cp(input: &str) -> IResult<&str, Cp> {
map(
pair(
alt((
map(name, |name| CpKind::Name(name)),
map(choice, |choice| CpKind::Choice(choice)),
map(seq, |seq| CpKind::Seq(seq)),
)),
occurence,
),
|(kind, occurence)| Cp { kind, occurence },
)(input)
}
#[derive(Clone, Debug)]
pub struct Choice<'s>(Vec<Cp<'s>>);
/// [49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
pub fn choice(input: &str) -> IResult<&str, Choice> {
map(
delimited(
pair(tag("("), opt(s)),
pair(cp, many1(preceded(tuple((opt(s), tag("|"), opt(s))), cp))),
pair(opt(s), tag(")")),
),
|(head, tail)| {
let choice = vec![vec![head], tail].concat();
Choice(choice)
},
)(input)
}
#[derive(Clone, Debug)]
pub struct Seq<'s>(Vec<Cp<'s>>);
/// [50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
pub fn seq(input: &str) -> IResult<&str, Seq> {
map(
delimited(
pair(tag("("), opt(s)),
pair(cp, many0(preceded(tuple((opt(s), tag(","), opt(s))), cp))),
pair(opt(s), tag(")")),
),
|(head, tail)| {
let seq = vec![vec![head], tail].concat();
Seq(seq)
},
)(input)
}
/// A mixed-content model. `#PCDATA` is always part of it and is therefore not
/// stored; the vector holds only the element names listed after it.
#[derive(Clone, Debug)]
pub struct Mixed<'s>(Vec<Name<'s>>);
/// [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')'
///
/// The `)*` form is tried first; the plain `(#PCDATA)` form yields an empty name list.
pub fn mixed(input: &str) -> IResult<&str, Mixed> {
    alt((
        map(
            delimited(
                // White space after `(` is optional (`S?`), as in the second alternative.
                tuple((tag("("), opt(s), tag("#PCDATA"))),
                many0(preceded(tuple((opt(s), tag("|"), opt(s))), name)),
                pair(opt(s), tag(")*")),
            ),
            Mixed,
        ),
        value(
            Mixed(Vec::new()),
            tuple((tag("("), opt(s), tag("#PCDATA"), opt(s), tag(")"))),
        ),
    ))(input)
}
#[derive(Debug)]
pub struct AttlistDecl<'s> {
element_type: Name<'s>,
att_defs: Vec<AttDef<'s>>,
}
/// [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
pub fn attlist_decl(input: &str) -> IResult<&str, AttlistDecl> {
map(
delimited(
pair(tag("<!ATTLIST"), s),
pair(name, many0(att_def)),
pair(opt(s), tag(">")),
),
|(element_type, att_defs)| AttlistDecl {
element_type,
att_defs,
},
)(input)
}
/// A single attribute definition inside an attribute-list declaration.
#[derive(Debug)]
pub struct AttDef<'s> {
    /// The attribute name.
    name: Name<'s>,
    /// The declared attribute type.
    att_type: AttType<'s>,
    /// The default declaration.
    default_decl: DefaultDecl<'s>,
}
/// [53] AttDef ::= S Name S AttType S DefaultDecl
pub fn att_def(input: &str) -> IResult<&str, AttDef> {
    let (rest, attribute_name) = preceded(s, name)(input)?;
    let (rest, kind) = preceded(s, att_type)(rest)?;
    let (rest, default) = preceded(s, default_decl)(rest)?;
    Ok((
        rest,
        AttDef {
            name: attribute_name,
            att_type: kind,
            default_decl: default,
        },
    ))
}
/// The declared type of an attribute.
#[derive(Clone, Debug)]
pub enum AttType<'s> {
    /// `CDATA`
    StringType,
    /// One of the tokenized types.
    TokenizedType(TokenizedType),
    /// A notation type or an enumeration.
    EnumeratedType(EnumeratedType<'s>),
}
/// [54] AttType ::= StringType | TokenizedType | EnumeratedType
pub fn att_type(input: &str) -> IResult<&str, AttType> {
    alt((
        value(AttType::StringType, string_type),
        map(tokenized_type, AttType::TokenizedType),
        map(enumerated_type, AttType::EnumeratedType),
    ))(input)
}
/// The matched `CDATA` keyword.
pub type StringType<'s> = &'s str;
/// [55] StringType ::= 'CDATA'
pub fn string_type(input: &str) -> IResult<&str, StringType> {
    const KEYWORD: &str = "CDATA";
    tag(KEYWORD)(input)
}
/// The tokenized attribute types.
#[derive(Clone, Debug)]
pub enum TokenizedType {
    /// `ID`
    ID,
    /// `IDREF`
    IDRef,
    /// `IDREFS`
    IDRefs,
    /// `ENTITY`
    Entity,
    /// `ENTITIES`
    Entities,
    /// `NMTOKEN`
    NMToken,
    /// `NMTOKENS`
    NMTokens,
}
/// [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
///
/// Keywords that are prefixes of other keywords are tried after the longer ones,
/// so `IDREFS` is never parsed as `ID` followed by leftover `REFS`.
pub fn tokenized_type(input: &str) -> IResult<&str, TokenizedType> {
    alt((
        // Longest first: IDREFS, then IDREF, then ID.
        value(TokenizedType::IDRefs, tag("IDREFS")),
        value(TokenizedType::IDRef, tag("IDREF")),
        value(TokenizedType::ID, tag("ID")),
        // Neither of these two is a prefix of the other, so their order is free.
        value(TokenizedType::Entity, tag("ENTITY")),
        value(TokenizedType::Entities, tag("ENTITIES")),
        // Longest first again.
        value(TokenizedType::NMTokens, tag("NMTOKENS")),
        value(TokenizedType::NMToken, tag("NMTOKEN")),
    ))(input)
}
/// An enumerated attribute type.
#[derive(Debug, Clone)]
pub enum EnumeratedType<'s> {
    /// `NOTATION (…)`
    NotationType(NotationType<'s>),
    /// `(…)` listing name tokens.
    Enumeration(Enumeration<'s>),
}
/// [57] EnumeratedType ::= NotationType | Enumeration
pub fn enumerated_type(input: &str) -> IResult<&str, EnumeratedType> {
    alt((
        map(notation_type, EnumeratedType::NotationType),
        map(enumeration, EnumeratedType::Enumeration),
    ))(input)
}
#[derive(Debug, Clone)]
pub struct NotationType<'s>(Vec<Name<'s>>);
/// [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
pub fn notation_type(input: &str) -> IResult<&str, NotationType> {
map(
delimited(
tuple((tag("NOTATION"), s, tag("("), opt(s))),
pair(
name,
many0(preceded(tuple((opt(s), tag("|"), opt(s))), name)),
),
pair(opt(s), tag(")")),
),
|(head, tail)| {
let notation_type = vec![vec![head], tail].concat();
NotationType(notation_type)
},
)(input)
}
#[derive(Debug, Clone)]
pub struct Enumeration<'s>(Vec<Nmtoken<'s>>);
/// [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
pub fn enumeration(input: &str) -> IResult<&str, Enumeration> {
map(
delimited(
pair(tag("("), opt(s)),
pair(
nmtoken,
many0(preceded(tuple((opt(s), tag("|"), opt(s))), nmtoken)),
),
pair(opt(s), tag(")")),
),
|(head, tail)| {
let enumeration = vec![vec![head], tail].concat();
Enumeration(enumeration)
},
)(input)
}
/// The default declaration of an attribute.
#[derive(Debug, Clone)]
pub enum DefaultDecl<'s> {
    /// `#REQUIRED`
    Required,
    /// `#IMPLIED`
    Implied,
    /// An attribute value, with or without a preceding `#FIXED` keyword.
    Fixed(AttValue<'s>),
}
/// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
///
/// A plain default value and a `#FIXED` value both come back as `DefaultDecl::Fixed`;
/// whether the keyword was present is not recorded.
pub fn default_decl(input: &str) -> IResult<&str, DefaultDecl> {
    let default_value = preceded(opt(pair(tag("#FIXED"), s)), att_value);
    alt((
        value(DefaultDecl::Required, tag("#REQUIRED")),
        value(DefaultDecl::Implied, tag("#IMPLIED")),
        map(default_value, DefaultDecl::Fixed),
    ))(input)
}
/// A conditional section of the external DTD subset.
pub enum ConditionalSect<'s> {
    /// `<![INCLUDE[ … ]]>`
    IncludeSect(IncludeSect<'s>),
    /// `<![IGNORE[ … ]]>`
    IgnoreSect(IgnoreSect<'s>),
}
/// [61] conditionalSect ::= includeSect | ignoreSect
pub fn conditional_sect(input: &str) -> IResult<&str, ConditionalSect> {
    alt((
        map(include_sect, ConditionalSect::IncludeSect),
        map(ignore_sect, ConditionalSect::IgnoreSect),
    ))(input)
}
pub struct IncludeSect<'s>(ExtSubsetDecl<'s>);
/// [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
pub fn include_sect(input: &str) -> IResult<&str, IncludeSect> {
map(
delimited(
tuple((tag("<!["), opt(s), tag("INCLUDE"), opt(s), tag("["))),
ext_subset_decl,
tag("]]>"),
),
|ext_subset_decl| IncludeSect(ext_subset_decl),
)(input)
}
pub struct IgnoreSect<'s>(Vec<IgnoreSectContents<'s>>);
/// [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
pub fn ignore_sect(input: &str) -> IResult<&str, IgnoreSect> {
map(
delimited(
tuple((tag("<!["), opt(s), tag("IGNORE"), opt(s), tag("["))),
many0(ignore_sect_contents),
tag("]]>"),
),
|ignore_sect_contents| IgnoreSect(ignore_sect_contents),
)(input)
}
pub struct IgnoreSectContents<'s> {
// TODO: what the fuck does this mean
ignore: Ignore<'s>,
ignore_list: Vec<(IgnoreSectContents<'s>, Ignore<'s>)>,
}
/// [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
pub fn ignore_sect_contents(input: &str) -> IResult<&str, IgnoreSectContents> {
map(
pair(
ignore,
many0(tuple((
delimited(tag("<!["), ignore_sect_contents, tag("]]>")),
ignore,
))),
),
|(ignore, ignore_list)| IgnoreSectContents {
ignore,
ignore_list,
},
)(input)
}
/// A run of ignored text, borrowed from the input.
pub type Ignore<'s> = &'s str;
/// [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
///
/// Returns the characters before the next `<![` or `]]>`, which is left unconsumed.
/// The returned slice may be empty.
pub fn ignore(input: &str) -> IResult<&str, Ignore> {
    let section_marker = peek(alt((tag("<!["), tag("]]>"))));
    recognize(many_till(xmlchar, section_marker))(input)
}
#[derive(Clone, Debug)]
pub enum CharRef<'s> {
Decimal(&'s str),
Hexadecimal(&'s str),
}
/// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
pub fn char_ref(input: &str) -> IResult<&str, CharRef> {
alt((
delimited(
tag("&#"),
map(take_while(|c| matches!(c, '0'..='9')), |decimal| {
CharRef::Decimal(decimal)
}),
tag(";"),
),
delimited(
tag("&#x"),
map(
take_while(|c| matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F' )),
|hexadecimal| CharRef::Hexadecimal(hexadecimal),
),
tag(";"),
),
))(input)
}
/// An entity reference or a character reference.
#[derive(Clone, Debug)]
pub enum Reference<'s> {
    /// `&name;`
    EntityRef(EntityRef<'s>),
    /// `&#N;` or `&#xN;`
    CharRef(CharRef<'s>),
}
/// [67] Reference ::= EntityRef | CharRef
pub fn reference(input: &str) -> IResult<&str, Reference> {
    alt((
        map(entity_ref, Reference::EntityRef),
        map(char_ref, Reference::CharRef),
    ))(input)
}
pub type EntityRef<'s> = &'s str;
/// [68] EntityRef ::= '&' Name ';'
pub fn entity_ref(input: &str) -> IResult<&str, EntityRef> {
delimited(tag("&"), name, tag(";"))(input)
}
pub type PEReference<'s> = &'s str;
/// [69] PEReference ::= '%' Name ';'
pub fn pe_reference(input: &str) -> IResult<&str, PEReference> {
delimited(tag("%"), name, tag(";"))(input)
}
/// An entity declaration.
#[derive(Debug)]
pub enum EntityDecl<'s> {
    /// A general entity declaration.
    GEDecl(GEDecl<'s>),
    /// A parameter entity declaration.
    PEDecl(PEDecl<'s>),
}
/// [70] EntityDecl ::= GEDecl | PEDecl
pub fn entity_decl(input: &str) -> IResult<&str, EntityDecl> {
    alt((
        map(ge_decl, EntityDecl::GEDecl),
        map(pe_decl, EntityDecl::PEDecl),
    ))(input)
}
#[derive(Debug)]
pub struct GEDecl<'s> {
name: Name<'s>,
entity_def: EntityDef<'s>,
}
/// [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
pub fn ge_decl(input: &str) -> IResult<&str, GEDecl> {
map(
delimited(
pair(tag("<!ENTITY"), s),
separated_pair(name, s, entity_def),
pair(opt(s), tag(">")),
),
|(name, entity_def)| GEDecl { name, entity_def },
)(input)
}
#[derive(Debug)]
pub struct PEDecl<'s> {
name: Name<'s>,
pe_def: PEDef<'s>,
}
/// [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
pub fn pe_decl(input: &str) -> IResult<&str, PEDecl> {
map(
delimited(
tuple((tag("<!ENTITY"), s, tag("%"), s)),
separated_pair(name, s, pe_def),
pair(opt(s), tag(">")),
),
|(name, pe_def)| PEDecl { name, pe_def },
)(input)
}
/// The definition of a general entity.
#[derive(Debug)]
pub enum EntityDef<'s> {
    /// An inline, quoted entity value.
    EntityValue(EntityValue<'s>),
    /// An external entity.
    ExternalID {
        /// The external identifier.
        external_id: ExternalID<'s>,
        /// The notation name from an `NDATA` declaration, if present.
        ndata_decl: Option<NDataDecl<'s>>,
    },
}
/// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
pub fn entity_def(input: &str) -> IResult<&str, EntityDef> {
    let external = map(pair(external_id, opt(ndata_decl)), |(id, ndata)| {
        EntityDef::ExternalID {
            external_id: id,
            ndata_decl: ndata,
        }
    });
    alt((map(entity_value, EntityDef::EntityValue), external))(input)
}
/// The definition of a parameter entity.
#[derive(Debug)]
pub enum PEDef<'s> {
    /// An inline, quoted entity value.
    EntityValue(EntityValue<'s>),
    /// An external identifier.
    ExternalID(ExternalID<'s>),
}
/// [74] PEDef ::= EntityValue | ExternalID
pub fn pe_def(input: &str) -> IResult<&str, PEDef> {
    alt((
        map(entity_value, PEDef::EntityValue),
        map(external_id, PEDef::ExternalID),
    ))(input)
}
/// An external identifier.
#[derive(Debug)]
pub enum ExternalID<'s> {
    /// `SYSTEM "…"`
    SYSTEM {
        /// The system literal, without quotes.
        system_identifier: &'s str,
    },
    /// `PUBLIC "…" "…"`
    PUBLIC {
        /// The public identifier literal, without quotes.
        public_identifier: &'s str,
        /// The system literal, without quotes.
        system_identifier: &'s str,
    },
}
/// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
pub fn external_id(input: &str) -> IResult<&str, ExternalID> {
    let system = map(
        preceded(pair(tag("SYSTEM"), s), system_literal),
        |literal| ExternalID::SYSTEM {
            system_identifier: literal,
        },
    );
    let public = map(
        preceded(
            pair(tag("PUBLIC"), s),
            separated_pair(pubid_literal, s, system_literal),
        ),
        |(pubid, literal)| ExternalID::PUBLIC {
            public_identifier: pubid,
            system_identifier: literal,
        },
    );
    alt((system, public))(input)
}
pub type NDataDecl<'s> = &'s str;
/// [76] NDataDecl ::= S 'NDATA' S Name
pub fn ndata_decl(input: &str) -> IResult<&str, NDataDecl> {
preceded(tuple((s, tag("NDATA"), s)), name)(input)
}
/// The text declaration at the start of an external parsed entity.
pub struct TextDecl<'s> {
    /// The declared XML version, if present.
    version_info: Option<VersionInfo>,
    /// The declared encoding name.
    encoding_decl: EncodingDecl<'s>,
}
/// [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
pub fn text_decl(input: &str) -> IResult<&str, TextDecl> {
    let (rest, _) = tag("<?xml")(input)?;
    let (rest, version) = opt(version_info)(rest)?;
    let (rest, encoding) = terminated(encoding_decl, opt(s))(rest)?;
    let (rest, _) = tag("?>")(rest)?;
    Ok((
        rest,
        TextDecl {
            version_info: version,
            encoding_decl: encoding,
        },
    ))
}
pub struct ExtParsedEnt<'s> {
text_decl: Option<TextDecl<'s>>,
content: Content<'s>,
}
/// [78] extParsedEnt ::= TextDecl? content
pub fn ext_parsed_ent(input: &str) -> IResult<&str, ExtParsedEnt> {
map(pair(opt(text_decl), content), |(text_decl, content)| {
ExtParsedEnt { text_decl, content }
})(input)
}
/// The encoding name carried by an `encoding="…"` pseudo-attribute.
pub type EncodingDecl<'s> = EncName<'s>;
/// [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
///
/// Returns only the encoding name; the keyword, `=` and quotes are dropped.
pub fn encoding_decl(input: &str) -> IResult<&str, EncodingDecl> {
    let (rest, _) = tuple((s, tag("encoding"), eq))(input)?;
    let double_quoted = delimited(char('"'), enc_name, char('"'));
    let single_quoted = delimited(char('\''), enc_name, char('\''));
    alt((double_quoted, single_quoted))(rest)
}
/// An encoding name, borrowed from the input.
pub type EncName<'s> = &'s str;
/// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
///
/// Returns the matched slice of the input.
pub fn enc_name(input: &str) -> IResult<&str, EncName> {
    fn is_first_char(c: char) -> bool {
        matches!(c, 'A'..='Z' | 'a'..='z')
    }
    fn is_following_char(c: char) -> bool {
        matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '_' | '-')
    }
    recognize(pair(
        satisfy(is_first_char),
        many0(satisfy(is_following_char)),
    ))(input)
}
#[derive(Debug)]
pub struct NotationDecl<'s> {
name: &'s str,
id: NotationDeclID<'s>,
}
#[derive(Debug)]
pub enum NotationDeclID<'s> {
External(ExternalID<'s>),
Public(PublicID<'s>),
}
/// [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
pub fn notation_decl(input: &str) -> IResult<&str, NotationDecl> {
map(
delimited(
pair(tag("<!NOTATION"), s),
separated_pair(
name,
s,
alt((
map(external_id, |external_id| {
NotationDeclID::External(external_id)
}),
map(public_id, |public_id| NotationDeclID::Public(public_id)),
)),
),
pair(opt(s), tag(">")),
),
|(name, id)| NotationDecl { name, id },
)(input)
}
pub type PublicID<'s> = &'s str;
/// [83] PublicID ::= 'PUBLIC' S PubidLiteral
pub fn public_id(input: &str) -> IResult<&str, PublicID> {
preceded(pair(tag("PUBLIC"), s), pubid_literal)(input)
}
#[cfg(test)]
mod tests {
    use std::num::NonZero;

    use super::*;

    // Each assertion is written as (actual, expected).

    /// Character data stops in front of `<`, `&` and `]]>`, and reports
    /// `Incomplete` when no terminator has been seen yet.
    #[test]
    fn test_char_data() {
        assert_eq!(char_data("abc&def]]>ghi"), Ok(("&def]]>ghi", "abc")));
        assert_eq!(char_data("abcdef]]>ghi"), Ok(("]]>ghi", "abcdef")));
        assert_eq!(char_data("abc&defghi"), Ok(("&defghi", "abc")));
        assert_eq!(char_data("abc]]>def&ghi"), Ok(("]]>def&ghi", "abc")));
        assert_eq!(char_data("abc]>def&ghi"), Ok(("&ghi", "abc]>def")));
        assert_eq!(
            char_data("abcdefghi"),
            Err(Err::Incomplete(nom::Needed::Size(
                NonZero::new(3usize).unwrap()
            )))
        );
    }

    /// Comments yield their inner text; an unterminated comment is `Incomplete`.
    #[test]
    fn test_comment() {
        assert_eq!(comment("<!---->"), Ok(("", "")));
        assert_eq!(comment("<!--asdf-->"), Ok(("", "asdf")));
        assert_eq!(comment("<!--as-df-->"), Ok(("", "as-df")));
        assert_eq!(
            comment("<!--asdf"),
            Err(Err::Incomplete(nom::Needed::Size(
                NonZero::new(2usize).unwrap()
            )))
        );
    }

    /// Any name is a valid target except `xml` in any letter case.
    #[test]
    fn test_pi_target() {
        assert_eq!(pi_target("asdf "), Ok((" ", "asdf")));
        assert_eq!(pi_target("xmlasdf "), Ok((" ", "xmlasdf")));
        assert_eq!(
            pi_target("xml "),
            Err(Err::Error(Error {
                input: "xml ",
                code: ErrorKind::Tag
            }))
        );
        assert_eq!(
            pi_target("xMl "),
            Err(Err::Error(Error {
                input: "xMl ",
                code: ErrorKind::Tag
            }))
        );
    }

    /// A full CDATA section is split into start marker, data and end marker.
    #[test]
    fn test_cd_sect() {
        assert_eq!(
            cd_sect("<![CDATA[<greeting>Hello, world!</greeting>]]>"),
            Ok((
                "",
                ("<![CDATA[", "<greeting>Hello, world!</greeting>", "]]>")
            ))
        )
    }

    #[test]
    fn test_cd_start() {
        assert_eq!(cd_start("<![CDATA[asdf"), Ok(("asdf", "<![CDATA[")))
    }

    /// CDATA content runs up to the first `]]>`, which is left in the input.
    #[test]
    fn test_cdata() {
        assert_eq!(cdata("asdf]]>asdf"), Ok(("]]>asdf", "asdf")));
        assert_eq!(
            cdata("<![CDATA[asdf]]>asdf"),
            Ok(("]]>asdf", "<![CDATA[asdf"))
        );
        assert_eq!(
            cdata("<greeting>Hello, world!</greeting>]]>asdf"),
            Ok(("]]>asdf", "<greeting>Hello, world!</greeting>"))
        )
    }

    #[test]
    fn test_cd_end() {
        assert_eq!(cd_end("]]>asdf"), Ok(("asdf", "]]>")))
    }
}