diff options
author | cel 🌸 <cel@blos.sm> | 2024-06-24 18:02:21 +0100 |
---|---|---|
committer | cel 🌸 <cel@blos.sm> | 2024-06-24 18:02:21 +0100 |
commit | afda87a8d7f347b0c4d34aa798f041d05b41bff0 (patch) | |
tree | fbfb9df53552f3f380f8d454b2a2f8c89092bfc8 /src/parser.rs | |
parent | feb13be926cbfb5204fa651d7c86809e20954f9d (diff) | |
download | peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.tar.gz peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.tar.bz2 peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.zip |
WIP: dtd garbo
Diffstat (limited to 'src/parser.rs')
-rw-r--r-- | src/parser.rs | 282 |
1 files changed, 244 insertions, 38 deletions
diff --git a/src/parser.rs b/src/parser.rs index d049c5c..e689a53 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -10,11 +10,12 @@ use nom::{ combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify}, error::{Error, ErrorKind}, multi::{many0, many1, many_till}, - sequence::{delimited, pair, preceded, tuple}, + sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}, Err, IResult, Parser, }; // parser: parses tokens from lexer into events +// no well formedness, validity, or data model, simple translation of input into rust types enum ContentItem<'s> { CharData(&'s str), @@ -25,15 +26,6 @@ enum ContentItem<'s> { type Content<'s> = Option<Vec<ContentItem<'s>>>; -struct DoctypeDecl<'s> { - name: &'s str, - // TODO: doctype declaration parsing -} -/// -pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { - todo!() -} - type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>); /// [1] document ::= prolog element Misc* pub fn document(input: &str) -> IResult<&str, Document> { @@ -211,21 +203,20 @@ struct PI<'s> { } /// [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' pub fn pi(input: &str) -> IResult<&str, PI> { - let (rest, (target, instruction)) = delimited( - tag("<?"), - pair( - pi_target, - opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))), + map( + delimited( + tag("<?"), + pair( + pi_target, + opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))), + ), + tag("?>"), ), - tag("?>"), - )(input)?; - Ok(( - rest, - PI { + |(target, instruction)| PI { target, instruction, }, - )) + )(input) } type PITarget<'s> = &'s str; @@ -288,21 +279,18 @@ struct XMLDecl<'s> { } /// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { - // (VersionInfo, Option<EncodingDecl>, Option<SDDecl>) - let (leftover, (version_info, encoding_decl, sd_decl)) = delimited( - tag("<?xml"), - tuple((version_info, opt(encoding_decl), opt(sd_decl))), - pair(opt(s), tag("?>")), - )(input)?; - // TODO: change to map - Ok(( - leftover, - XMLDecl { + map( + delimited( + tag("<?xml"), + tuple((version_info, opt(encoding_decl), opt(sd_decl))), + pair(opt(s), tag("?>")), + ), + |(version_info, encoding_decl, sd_decl)| XMLDecl { version_info, encoding_decl, sd_decl, }, - )) + )(input) } type VersionInfo = VersionNum; @@ -342,6 +330,7 @@ pub fn version_num(input: &str) -> IResult<&str, VersionNum> { enum Misc<'s> { Comment(Comment<'s>), PI(PI<'s>), + // TODO: how to deal with whitespace S, } /// [27] Misc ::= Comment | PI | S @@ -353,6 +342,100 @@ pub fn misc(input: &str) -> IResult<&str, Misc> { ))(input) } +struct DoctypeDecl<'s> { + name: &'s str, + external_id: Option<ExternalID<'s>>, + int_subset: Option<IntSubset<'s>>, +} +/// [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' +pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { + map( + delimited( + pair(tag("<!DOCTYPE"), s), + tuple(( + name, + opt(preceded(s, external_id)), + preceded( + opt(s), + opt(terminated( + delimited(tag("["), int_subset, tag("]")), + opt(s), + )), + ), + )), + tag(">"), + ), + |(name, external_id, int_subset)| DoctypeDecl { + name, + external_id, + int_subset, + }, + )(input) +} + +#[derive(Clone)] +enum DeclSep<'s> { + PEReference(PEReference<'s>), + // TODO: tackle whitespace + S, +} +/// [28a] DeclSep ::= PEReference | S +pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> { + alt(( + map(pe_reference, |pe_reference| { + DeclSep::PEReference(pe_reference) + }), + value(DeclSep::S, s), + ))(input) +} + +enum IntSubsetItem<'s> { + MarkupDecl(MarkupDecl<'s>), + DeclSep(DeclSep<'s>), +} +type IntSubset<'s> = Vec<IntSubsetItem<'s>>; +/// [28b] intSubset ::= (markupdecl | DeclSep)* +pub fn int_subset(input: &str) -> IResult<&str, IntSubset> { + many0(alt(( + map(markup_decl, |markup_decl| { + IntSubsetItem::MarkupDecl(markup_decl) + }), + map(decl_sep, |decl_sep| IntSubsetItem::DeclSep(decl_sep)), + )))(input) +} + +enum MarkupDecl<'s> { + ElementDecl(ElementDecl<'s>), + AttlistDecl(AttlistDecl<'s>), + EntityDecl(EntityDecl<'s>), + NotationDecl(NotationDecl<'s>), + PI(PI<'s>), + Comment(Comment<'s>), +} +/// [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment +pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> { + alt(( + map(element_decl, |element_decl| { + MarkupDecl::ElementDecl(element_decl) + }), + map(attlist_decl, |attlist_decl| { + MarkupDecl::AttlistDecl(attlist_decl) + }), + map(entity_decl, |entity_decl| { + MarkupDecl::EntityDecl(entity_decl) + }), + map(notation_decl, |notation_decl| { + MarkupDecl::NotationDecl(notation_decl) + }), + map(pi, |pi| MarkupDecl::PI(pi)), + map(comment, |comment| MarkupDecl::Comment(comment)), + ))(input) +} + +/// [30] extSubset ::= TextDecl? extSubsetDecl + +/// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* + type SDDecl = bool; /// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> { @@ -388,18 +471,106 @@ pub fn element(input: &str) -> IResult<&str, Element> { ))(input) } -let +// let STag<'s> = (Name<'s>, ); /// [40] STag ::= '<' Name (S Attribute)* S? '>' -type Attribute<'s> = (&'s str, &'s str) -/// [41] Attribute ::= Name Eq AttValue +type Attribute<'s> = (Name<'s>, AttValue<'s>); +/// [41] Attribute ::= Name Eq AttValue +pub fn attribute(input: &str) -> IResult<&str, Attribute> { + separated_pair(name, eq, att_value)(input) +} -pub fn reference(input: &str) -> IResult<&str, char> { +type CharRef<'s> = &'s str; +/// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' +pub fn char_ref(input: &str) -> IResult<&str, CharRef> { todo!() } -pub fn pe_reference(input: &str) -> IResult<&str, char> { - todo!() +enum Reference<'s> { + EntityRef(EntityRef<'s>), + CharRef(CharRef<'s>), +} +/// [67] Reference ::= EntityRef | CharRef +pub fn reference(input: &str) -> IResult<&str, Reference> { + alt(( + map(entity_ref, |entity_ref| Reference::EntityRef(entity_ref)), + map(char_ref, |char_ref| Reference::CharRef(char_ref)), + ))(input) +} + +type EntityRef<'s> = &'s str; +/// [68] EntityRef ::= '&' Name ';' +pub fn entity_ref(input: &str) -> IResult<&str, EntityRef> { + delimited(tag("&"), name, tag(";"))(input) +} + +type PEReference<'s> = &'s str; +/// [69] PEReference ::= '%' Name ';' +pub fn pe_reference(input: &str) -> IResult<&str, PEReference> { + delimited(tag("%"), name, tag(";"))(input) +} + +/// TODO: entity declarations + +enum ExternalID<'s> { + SYSTEM { + system_identifier: &'s str, + }, + PUBLIC { + public_identifier: &'s str, + system_identifier: &'s str, + }, +} +/// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral +// pub fn external_id(input: &str) -> IResult<&str, ExternalID> { +pub fn external_id(input: &str) -> IResult<&str, ExternalID> { + alt(( + map( + preceded(pair(tag("SYSTEM"), s), system_literal), + |system_identifier| ExternalID::SYSTEM { system_identifier }, + ), + map( + preceded( + pair(tag("PUBLIC"), s), + separated_pair(pubid_literal, s, system_literal), + ), + |(public_identifier, system_identifier)| ExternalID::PUBLIC { + public_identifier, + system_identifier, + }, + ), + ))(input) +} + +type NDataDecl<'s> = &'s str; +/// [76] NDataDecl ::= S 'NDATA' S Name +pub fn ndata_decl(input: &str) -> IResult<&str, NDataDecl> { + preceded(tuple((s, tag("NDATA"), s)), name)(input) +} + +struct TextDecl<'s> { + version_info: Option<VersionInfo>, + encoding_decl: EncodingDecl<'s>, +} +/// [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' +pub fn text_decl(input: &str) -> IResult<&str, TextDecl> { + map( + delimited( + tag("<?xml"), + pair(opt(version_info), terminated(encoding_decl, opt(s))), + tag("?>"), + ), + |(version_info, encoding_decl)| TextDecl { + version_info, + encoding_decl, + }, + )(input) +} + +type extParsedEnt<'s> = (Option<TextDecl<'s>>, Content<'s>); +/// [78] extParsedEnt ::= TextDecl? content +pub fn ext_parsed_ent(input: &str) -> IResult<&str, extParsedEnt> { + pair(opt(text_decl), content)(input) } type EncodingDecl<'s> = EncName<'s>; @@ -425,6 +596,41 @@ pub fn enc_name(input: &str) -> IResult<&str, EncName> { ))(input) } +struct NotationDecl<'s> { + name: &'s str, + id: NotationDeclID<'s>, +} +enum NotationDeclID<'s> { + External(ExternalID<'s>), + Public(PublicID<'s>), +} +/// [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' +pub fn notation_decl(input: &str) -> IResult<&str, NotationDecl> { + map( + delimited( + pair(tag("<!NOTATION"), s), + separated_pair( + name, + s, + alt(( + map(external_id, |external_id| { + NotationDeclID::External(external_id) + }), + map(public_id, |public_id| NotationDeclID::Public(public_id)), + )), + ), + pair(opt(s), tag(">")), + ), + |(name, id)| NotationDecl { name, id }, + )(input) +} + +type PublicID<'s> = &'s str; +/// [83] PublicID ::= 'PUBLIC' S PubidLiteral +pub fn public_id(input: &str) -> IResult<&str, PublicID> { + preceded(pair(tag("PUBLIC"), s), pubid_literal)(input) +} + #[cfg(test)] mod tests { use std::num::NonZero; |