diff options
| author | 2024-06-24 18:02:21 +0100 | |
|---|---|---|
| committer | 2024-06-24 18:02:21 +0100 | |
| commit | afda87a8d7f347b0c4d34aa798f041d05b41bff0 (patch) | |
| tree | fbfb9df53552f3f380f8d454b2a2f8c89092bfc8 /src | |
| parent | feb13be926cbfb5204fa651d7c86809e20954f9d (diff) | |
| download | peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.tar.gz peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.tar.bz2 peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.zip | |
WIP: dtd garbo
Diffstat (limited to '')
| -rw-r--r-- | src/parser.rs | 282 | 
1 files changed, 244 insertions, 38 deletions
| diff --git a/src/parser.rs b/src/parser.rs index d049c5c..e689a53 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -10,11 +10,12 @@ use nom::{      combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify},      error::{Error, ErrorKind},      multi::{many0, many1, many_till}, -    sequence::{delimited, pair, preceded, tuple}, +    sequence::{delimited, pair, preceded, separated_pair, terminated, tuple},      Err, IResult, Parser,  };  // parser: parses tokens from lexer into events +// no well formedness, validity, or data model, simple translation of input into rust types  enum ContentItem<'s> {      CharData(&'s str), @@ -25,15 +26,6 @@ enum ContentItem<'s> {  type Content<'s> = Option<Vec<ContentItem<'s>>>; -struct DoctypeDecl<'s> { -    name: &'s str, -    // TODO: doctype declaration parsing -} -/// -pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { -    todo!() -} -  type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);  /// [1]   	document	   ::=   	prolog element Misc*  pub fn document(input: &str) -> IResult<&str, Document> { @@ -211,21 +203,20 @@ struct PI<'s> {  }  /// [16]   	PI	   ::=   	'<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'  pub fn pi(input: &str) -> IResult<&str, PI> { -    let (rest, (target, instruction)) = delimited( -        tag("<?"), -        pair( -            pi_target, -            opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))), +    map( +        delimited( +            tag("<?"), +            pair( +                pi_target, +                opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))), +            ), +            tag("?>"),          ), -        tag("?>"), -    )(input)?; -    Ok(( -        rest, -        PI { +        |(target, instruction)| PI {              target,              instruction,          }, -    )) +    )(input)  }  type PITarget<'s> = &'s str; @@ -288,21 +279,18 @@ struct XMLDecl<'s> {  }  /// [23]   	XMLDecl	   ::=   	'<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'  pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { -    // (VersionInfo, Option<EncodingDecl>, Option<SDDecl>) -    let (leftover, (version_info, encoding_decl, sd_decl)) = delimited( -        tag("<?xml"), -        tuple((version_info, opt(encoding_decl), opt(sd_decl))), -        pair(opt(s), tag("?>")), -    )(input)?; -    // TODO: change to map -    Ok(( -        leftover, -        XMLDecl { +    map( +        delimited( +            tag("<?xml"), +            tuple((version_info, opt(encoding_decl), opt(sd_decl))), +            pair(opt(s), tag("?>")), +        ), +        |(version_info, encoding_decl, sd_decl)| XMLDecl {              version_info,              encoding_decl,              sd_decl,          }, -    )) +    )(input)  }  type VersionInfo = VersionNum; @@ -342,6 +330,7 @@ pub fn version_num(input: &str) -> IResult<&str, VersionNum> {  enum Misc<'s> {      Comment(Comment<'s>),      PI(PI<'s>), +    // TODO: how to deal with whitespace      S,  }  /// [27]   	Misc	   ::=   	Comment | PI | S @@ -353,6 +342,100 @@ pub fn misc(input: &str) -> IResult<&str, Misc> {      ))(input)  } +struct DoctypeDecl<'s> { +    name: &'s str, +    external_id: Option<ExternalID<'s>>, +    int_subset: Option<IntSubset<'s>>, +} +/// [28]   	doctypedecl	   ::=   	'<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' +pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { +    map( +        delimited( +            pair(tag("<!DOCTYPE"), s), +            tuple(( +                name, +                opt(preceded(s, external_id)), +                preceded( +                    opt(s), +                    opt(terminated( +                        delimited(tag("["), int_subset, tag("]")), +                        opt(s), +                    )), +                ), +            )), +            tag(">"), +        ), +        |(name, external_id, int_subset)| DoctypeDecl { +            name, +            external_id, +            int_subset, +        }, +    )(input) +} + +#[derive(Clone)] +enum DeclSep<'s> { +    PEReference(PEReference<'s>), +    // TODO: tackle whitespace +    S, +} +/// [28a]   	DeclSep	   ::=   	PEReference | S +pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> { +    alt(( +        map(pe_reference, |pe_reference| { +            DeclSep::PEReference(pe_reference) +        }), +        value(DeclSep::S, s), +    ))(input) +} + +enum IntSubsetItem<'s> { +    MarkupDecl(MarkupDecl<'s>), +    DeclSep(DeclSep<'s>), +} +type IntSubset<'s> = Vec<IntSubsetItem<'s>>; +/// [28b]   	intSubset	   ::=   	(markupdecl | DeclSep)* +pub fn int_subset(input: &str) -> IResult<&str, IntSubset> { +    many0(alt(( +        map(markup_decl, |markup_decl| { +            IntSubsetItem::MarkupDecl(markup_decl) +        }), +        map(decl_sep, |decl_sep| IntSubsetItem::DeclSep(decl_sep)), +    )))(input) +} + +enum MarkupDecl<'s> { +    ElementDecl(ElementDecl<'s>), +    AttlistDecl(AttlistDecl<'s>), +    EntityDecl(EntityDecl<'s>), +    NotationDecl(NotationDecl<'s>), +    PI(PI<'s>), +    Comment(Comment<'s>), +} +/// [29]   	markupdecl	   ::=   	elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment +pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> { +    alt(( +        map(element_decl, |element_decl| { +            MarkupDecl::ElementDecl(element_decl) +        }), +        map(attlist_decl, |attlist_decl| { +            MarkupDecl::AttlistDecl(attlist_decl) +        }), +        map(entity_decl, |entity_decl| { +            MarkupDecl::EntityDecl(entity_decl) +        }), +        map(notation_decl, |notation_decl| { +            MarkupDecl::NotationDecl(notation_decl) +        }), +        map(pi, |pi| MarkupDecl::PI(pi)), +        map(comment, |comment| MarkupDecl::Comment(comment)), +    ))(input) +} + +/// [30]   	extSubset	   ::=   	TextDecl? extSubsetDecl + +/// [31]   	extSubsetDecl	   ::=   	( markupdecl | conditionalSect | DeclSep)* +  type SDDecl = bool;  /// [32]   	SDDecl	   ::=   	S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))  pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> { @@ -388,18 +471,106 @@ pub fn element(input: &str) -> IResult<&str, Element> {      ))(input)  } -let  +// let STag<'s> = (Name<'s>, );  /// [40]   	STag	   ::=   	'<' Name (S Attribute)* S? '>' -type Attribute<'s> = (&'s str, &'s str) -/// [41]   	Attribute	   ::=   	Name Eq AttValue  +type Attribute<'s> = (Name<'s>, AttValue<'s>); +/// [41]   	Attribute	   ::=   	Name Eq AttValue +pub fn attribute(input: &str) -> IResult<&str, Attribute> { +    separated_pair(name, eq, att_value)(input) +} -pub fn reference(input: &str) -> IResult<&str, char> { +type CharRef<'s> = &'s str; +/// [66]   	CharRef	   ::=   	'&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' +pub fn char_ref(input: &str) -> IResult<&str, CharRef> {      todo!()  } -pub fn pe_reference(input: &str) -> IResult<&str, char> { -    todo!() +enum Reference<'s> { +    EntityRef(EntityRef<'s>), +    CharRef(CharRef<'s>), +} +/// [67]   	Reference	   ::=   	EntityRef | CharRef +pub fn reference(input: &str) -> IResult<&str, Reference> { +    alt(( +        map(entity_ref, |entity_ref| Reference::EntityRef(entity_ref)), +        map(char_ref, |char_ref| Reference::CharRef(char_ref)), +    ))(input) +} + +type EntityRef<'s> = &'s str; +/// [68]   	EntityRef	   ::=   	'&' Name ';' +pub fn entity_ref(input: &str) -> IResult<&str, EntityRef> { +    delimited(tag("&"), name, tag(";"))(input) +} + +type PEReference<'s> = &'s str; +/// [69]   	PEReference	   ::=   	'%' Name ';' +pub fn pe_reference(input: &str) -> IResult<&str, PEReference> { +    delimited(tag("%"), name, tag(";"))(input) +} + +/// TODO: entity declarations + +enum ExternalID<'s> { +    SYSTEM { +        system_identifier: &'s str, +    }, +    PUBLIC { +        public_identifier: &'s str, +        system_identifier: &'s str, +    }, +} +/// [75]   	ExternalID	   ::=   	'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral +// pub fn external_id(input: &str) -> IResult<&str, ExternalID> { +pub fn external_id(input: &str) -> IResult<&str, ExternalID> { +    alt(( +        map( +            preceded(pair(tag("SYSTEM"), s), system_literal), +            |system_identifier| ExternalID::SYSTEM { system_identifier }, +        ), +        map( +            preceded( +                pair(tag("PUBLIC"), s), +                separated_pair(pubid_literal, s, system_literal), +            ), +            |(public_identifier, system_identifier)| ExternalID::PUBLIC { +                public_identifier, +                system_identifier, +            }, +        ), +    ))(input) +} + +type NDataDecl<'s> = &'s str; +/// [76]   	NDataDecl	   ::=   	S 'NDATA' S Name +pub fn ndata_decl(input: &str) -> IResult<&str, NDataDecl> { +    preceded(tuple((s, tag("NDATA"), s)), name)(input) +} + +struct TextDecl<'s> { +    version_info: Option<VersionInfo>, +    encoding_decl: EncodingDecl<'s>, +} +/// [77]   	TextDecl	   ::=   	'<?xml' VersionInfo? EncodingDecl S? '?>' +pub fn text_decl(input: &str) -> IResult<&str, TextDecl> { +    map( +        delimited( +            tag("<?xml"), +            pair(opt(version_info), terminated(encoding_decl, opt(s))), +            tag("?>"), +        ), +        |(version_info, encoding_decl)| TextDecl { +            version_info, +            encoding_decl, +        }, +    )(input) +} + +type extParsedEnt<'s> = (Option<TextDecl<'s>>, Content<'s>); +/// [78]   	extParsedEnt	   ::=   	TextDecl? content +pub fn ext_parsed_ent(input: &str) -> IResult<&str, extParsedEnt> { +    pair(opt(text_decl), content)(input)  }  type EncodingDecl<'s> = EncName<'s>; @@ -425,6 +596,41 @@ pub fn enc_name(input: &str) -> IResult<&str, EncName> {      ))(input)  } +struct NotationDecl<'s> { +    name: &'s str, +    id: NotationDeclID<'s>, +} +enum NotationDeclID<'s> { +    External(ExternalID<'s>), +    Public(PublicID<'s>), +} +/// [82]   	NotationDecl	   ::=   	'<!NOTATION' S Name S (ExternalID | PublicID) S? '>' +pub fn notation_decl(input: &str) -> IResult<&str, NotationDecl> { +    map( +        delimited( +            pair(tag("<!NOTATION"), s), +            separated_pair( +                name, +                s, +                alt(( +                    map(external_id, |external_id| { +                        NotationDeclID::External(external_id) +                    }), +                    map(public_id, |public_id| NotationDeclID::Public(public_id)), +                )), +            ), +            pair(opt(s), tag(">")), +        ), +        |(name, id)| NotationDecl { name, id }, +    )(input) +} + +type PublicID<'s> = &'s str; +/// [83]   	PublicID	   ::=   	'PUBLIC' S PubidLiteral +pub fn public_id(input: &str) -> IResult<&str, PublicID> { +    preceded(pair(tag("PUBLIC"), s), pubid_literal)(input) +} +  #[cfg(test)]  mod tests {      use std::num::NonZero; | 
