aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar cel 🌸 <cel@blos.sm>2024-06-24 18:02:21 +0100
committerLibravatar cel 🌸 <cel@blos.sm>2024-06-24 18:02:21 +0100
commitafda87a8d7f347b0c4d34aa798f041d05b41bff0 (patch)
treefbfb9df53552f3f380f8d454b2a2f8c89092bfc8
parentfeb13be926cbfb5204fa651d7c86809e20954f9d (diff)
downloadpeanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.tar.gz
peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.tar.bz2
peanuts-afda87a8d7f347b0c4d34aa798f041d05b41bff0.zip
WIP: dtd garbo
Diffstat (limited to '')
-rw-r--r--src/parser.rs282
1 files changed, 244 insertions, 38 deletions
diff --git a/src/parser.rs b/src/parser.rs
index d049c5c..e689a53 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -10,11 +10,12 @@ use nom::{
combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify},
error::{Error, ErrorKind},
multi::{many0, many1, many_till},
- sequence::{delimited, pair, preceded, tuple},
+ sequence::{delimited, pair, preceded, separated_pair, terminated, tuple},
Err, IResult, Parser,
};
// parser: parses tokens from lexer into events
+// performs no well-formedness or validity checking and builds no data model: simply translates the input into Rust types
enum ContentItem<'s> {
CharData(&'s str),
@@ -25,15 +26,6 @@ enum ContentItem<'s> {
type Content<'s> = Option<Vec<ContentItem<'s>>>;
-struct DoctypeDecl<'s> {
- name: &'s str,
- // TODO: doctype declaration parsing
-}
-///
-pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
- todo!()
-}
-
type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
/// [1] document ::= prolog element Misc*
pub fn document(input: &str) -> IResult<&str, Document> {
@@ -211,21 +203,20 @@ struct PI<'s> {
}
/// [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
pub fn pi(input: &str) -> IResult<&str, PI> {
- let (rest, (target, instruction)) = delimited(
- tag("<?"),
- pair(
- pi_target,
- opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))),
+ map(
+ delimited(
+ tag("<?"),
+ pair(
+ pi_target,
+ opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))),
+ ),
+ tag("?>"),
),
- tag("?>"),
- )(input)?;
- Ok((
- rest,
- PI {
+ |(target, instruction)| PI {
target,
instruction,
},
- ))
+ )(input)
}
type PITarget<'s> = &'s str;
@@ -288,21 +279,18 @@ struct XMLDecl<'s> {
}
/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
- // (VersionInfo, Option<EncodingDecl>, Option<SDDecl>)
- let (leftover, (version_info, encoding_decl, sd_decl)) = delimited(
- tag("<?xml"),
- tuple((version_info, opt(encoding_decl), opt(sd_decl))),
- pair(opt(s), tag("?>")),
- )(input)?;
- // TODO: change to map
- Ok((
- leftover,
- XMLDecl {
+ map(
+ delimited(
+ tag("<?xml"),
+ tuple((version_info, opt(encoding_decl), opt(sd_decl))),
+ pair(opt(s), tag("?>")),
+ ),
+ |(version_info, encoding_decl, sd_decl)| XMLDecl {
version_info,
encoding_decl,
sd_decl,
},
- ))
+ )(input)
}
type VersionInfo = VersionNum;
@@ -342,6 +330,7 @@ pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
enum Misc<'s> {
Comment(Comment<'s>),
PI(PI<'s>),
+ // TODO: how to deal with whitespace
S,
}
/// [27] Misc ::= Comment | PI | S
@@ -353,6 +342,100 @@ pub fn misc(input: &str) -> IResult<&str, Misc> {
))(input)
}
+/// Parsed `<!DOCTYPE ...>` declaration, as produced by `doctypedecl`.
+struct DoctypeDecl<'s> {
+    /// Name that follows `<!DOCTYPE`.
+    name: &'s str,
+    /// External identifier, when one follows the name.
+    external_id: Option<ExternalID<'s>>,
+    /// Contents of the bracketed internal subset, when one is present.
+    int_subset: Option<IntSubset<'s>>,
+}
+/// [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
+pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
+    // Walk the production left to right, threading the remaining input through each step.
+    let (rest, _) = pair(tag("<!DOCTYPE"), s)(input)?;
+    let (rest, doctype_name) = name(rest)?;
+    let (rest, ext_id) = opt(preceded(s, external_id))(rest)?;
+    let (rest, _) = opt(s)(rest)?;
+    let (rest, subset) = opt(terminated(
+        delimited(tag("["), int_subset, tag("]")),
+        opt(s),
+    ))(rest)?;
+    let (rest, _) = tag(">")(rest)?;
+    Ok((
+        rest,
+        DoctypeDecl {
+            name: doctype_name,
+            external_id: ext_id,
+            int_subset: subset,
+        },
+    ))
+}
+
+/// Separator between declarations in a DTD subset, as produced by `decl_sep`.
+#[derive(Clone)]
+enum DeclSep<'s> {
+    /// A parameter-entity reference.
+    PEReference(PEReference<'s>),
+    // TODO: tackle whitespace
+    /// A run of whitespace; the matched text itself is not kept.
+    S,
+}
+/// [28a] DeclSep ::= PEReference | S
+pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> {
+    alt((
+        map(pe_reference, DeclSep::PEReference),
+        // The whitespace that `s` matched is discarded; only the marker variant is returned.
+        map(s, |_| DeclSep::S),
+    ))(input)
+}
+
+/// A single entry of an internal subset: either a markup declaration or a separator.
+enum IntSubsetItem<'s> {
+    MarkupDecl(MarkupDecl<'s>),
+    DeclSep(DeclSep<'s>),
+}
+/// Every item of an internal subset, in source order.
+type IntSubset<'s> = Vec<IntSubsetItem<'s>>;
+/// [28b] intSubset ::= (markupdecl | DeclSep)*
+pub fn int_subset(input: &str) -> IResult<&str, IntSubset> {
+    // Zero or more items; a markup declaration is attempted before a separator.
+    many0(alt((
+        map(markup_decl, IntSubsetItem::MarkupDecl),
+        map(decl_sep, IntSubsetItem::DeclSep),
+    )))(input)
+}
+
+/// One declaration inside a DTD subset, as produced by `markup_decl`.
+enum MarkupDecl<'s> {
+    ElementDecl(ElementDecl<'s>),
+    AttlistDecl(AttlistDecl<'s>),
+    EntityDecl(EntityDecl<'s>),
+    NotationDecl(NotationDecl<'s>),
+    PI(PI<'s>),
+    Comment(Comment<'s>),
+}
+/// [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
+pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> {
+    // Alternatives are tried in grammar order; each result is wrapped in its matching variant.
+    alt((
+        map(element_decl, MarkupDecl::ElementDecl),
+        map(attlist_decl, MarkupDecl::AttlistDecl),
+        map(entity_decl, MarkupDecl::EntityDecl),
+        map(notation_decl, MarkupDecl::NotationDecl),
+        map(pi, MarkupDecl::PI),
+        map(comment, MarkupDecl::Comment),
+    ))(input)
+}
+
+// [30] extSubset ::= TextDecl? extSubsetDecl
+// TODO: not yet implemented
+
+// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
+// TODO: not yet implemented
+
type SDDecl = bool;
/// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> {
@@ -388,18 +471,106 @@ pub fn element(input: &str) -> IResult<&str, Element> {
))(input)
}
-let
+// let STag<'s> = (Name<'s>, );
/// [40] STag ::= '<' Name (S Attribute)* S? '>'
-type Attribute<'s> = (&'s str, &'s str)
-/// [41] Attribute ::= Name Eq AttValue
+type Attribute<'s> = (Name<'s>, AttValue<'s>);
+/// [41] Attribute ::= Name Eq AttValue
+///
+/// Returns the attribute name paired with its value; the `Eq` match is consumed and dropped.
+pub fn attribute(input: &str) -> IResult<&str, Attribute> {
+    let (rest, (attr_name, _, attr_value)) = tuple((name, eq, att_value))(input)?;
+    Ok((rest, (attr_name, attr_value)))
+}
-pub fn reference(input: &str) -> IResult<&str, char> {
+type CharRef<'s> = &'s str;
+/// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' (stub: the body is `todo!()`, so calling this currently panics)
+pub fn char_ref(input: &str) -> IResult<&str, CharRef> {
todo!()
}
-pub fn pe_reference(input: &str) -> IResult<&str, char> {
- todo!()
+/// Either kind of reference, as produced by `reference`.
+enum Reference<'s> {
+    EntityRef(EntityRef<'s>),
+    CharRef(CharRef<'s>),
+}
+/// [67] Reference ::= EntityRef | CharRef
+pub fn reference(input: &str) -> IResult<&str, Reference> {
+    // An entity reference is attempted first; a character reference is the fallback.
+    alt((
+        map(entity_ref, Reference::EntityRef),
+        map(char_ref, Reference::CharRef),
+    ))(input)
+}
+
+/// The name found between `&` and `;`.
+type EntityRef<'s> = &'s str;
+/// [68] EntityRef ::= '&' Name ';'
+pub fn entity_ref(input: &str) -> IResult<&str, EntityRef> {
+    // Drop the leading '&', keep the name, then require the closing ';'.
+    terminated(preceded(tag("&"), name), tag(";"))(input)
+}
+
+/// The name found between `%` and `;`.
+type PEReference<'s> = &'s str;
+/// [69] PEReference ::= '%' Name ';'
+pub fn pe_reference(input: &str) -> IResult<&str, PEReference> {
+    let (rest, _) = tag("%")(input)?;
+    let (rest, entity_name) = name(rest)?;
+    let (rest, _) = tag(";")(rest)?;
+    Ok((rest, entity_name))
+}
+
+// TODO: entity declarations
+
+/// An external identifier, as produced by `external_id`.
+enum ExternalID<'s> {
+    /// `SYSTEM` form: carries only the system literal.
+    SYSTEM {
+        system_identifier: &'s str,
+    },
+    /// `PUBLIC` form: carries the public identifier literal followed by the system literal.
+    PUBLIC {
+        public_identifier: &'s str,
+        system_identifier: &'s str,
+    },
+}
+/// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
+pub fn external_id(input: &str) -> IResult<&str, ExternalID> {
+    alt((
+        // 'SYSTEM' S SystemLiteral
+        map(
+            preceded(pair(tag("SYSTEM"), s), system_literal),
+            |system_identifier| ExternalID::SYSTEM { system_identifier },
+        ),
+        // 'PUBLIC' S PubidLiteral S SystemLiteral
+        map(
+            preceded(
+                pair(tag("PUBLIC"), s),
+                separated_pair(pubid_literal, s, system_literal),
+            ),
+            |(public_identifier, system_identifier)| ExternalID::PUBLIC {
+                public_identifier,
+                system_identifier,
+            },
+        ),
+    ))(input)
+}
+
+/// The notation name that follows the `NDATA` keyword.
+type NDataDecl<'s> = &'s str;
+/// [76] NDataDecl ::= S 'NDATA' S Name
+pub fn ndata_decl(input: &str) -> IResult<&str, NDataDecl> {
+    // The keyword must have whitespace on both sides; only the trailing name is returned.
+    preceded(delimited(s, tag("NDATA"), s), name)(input)
+}
+
+/// Parsed text declaration, as produced by `text_decl`.
+struct TextDecl<'s> {
+    /// Version information, when present.
+    version_info: Option<VersionInfo>,
+    /// The encoding declaration, which the production requires.
+    encoding_decl: EncodingDecl<'s>,
+}
+/// [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
+pub fn text_decl(input: &str) -> IResult<&str, TextDecl> {
+    map(
+        // One flat sequence mirroring the production; delimiters and whitespace are dropped below.
+        tuple((
+            tag("<?xml"),
+            opt(version_info),
+            encoding_decl,
+            opt(s),
+            tag("?>"),
+        )),
+        |(_, version_info, encoding_decl, _, _)| TextDecl {
+            version_info,
+            encoding_decl,
+        },
+    )(input)
+}
+
+// NOTE(review): this alias is not UpperCamelCase, so rustc's `non_camel_case_types` lint will
+// flag it; the name is kept here so any existing uses keep compiling.
+type extParsedEnt<'s> = (Option<TextDecl<'s>>, Content<'s>);
+/// [78] extParsedEnt ::= TextDecl? content
+pub fn ext_parsed_ent(input: &str) -> IResult<&str, extParsedEnt> {
+    let (rest, declaration) = opt(text_decl)(input)?;
+    let (rest, body) = content(rest)?;
+    Ok((rest, (declaration, body)))
+}
type EncodingDecl<'s> = EncName<'s>;
@@ -425,6 +596,41 @@ pub fn enc_name(input: &str) -> IResult<&str, EncName> {
))(input)
}
+/// Parsed `<!NOTATION ...>` declaration, as produced by `notation_decl`.
+struct NotationDecl<'s> {
+    /// Name that follows `<!NOTATION`.
+    name: &'s str,
+    /// The identifier that follows the name.
+    id: NotationDeclID<'s>,
+}
+/// Identifier of a notation declaration: a full external ID or a public ID on its own.
+enum NotationDeclID<'s> {
+    External(ExternalID<'s>),
+    Public(PublicID<'s>),
+}
+/// [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
+pub fn notation_decl(input: &str) -> IResult<&str, NotationDecl> {
+    map(
+        tuple((
+            tag("<!NOTATION"),
+            s,
+            name,
+            s,
+            // ExternalID is attempted first; PublicID is the fallback when it does not match.
+            alt((
+                map(external_id, NotationDeclID::External),
+                map(public_id, NotationDeclID::Public),
+            )),
+            opt(s),
+            tag(">"),
+        )),
+        |(_, _, name, _, id, _, _)| NotationDecl { name, id },
+    )(input)
+}
+
+/// The public identifier literal that follows the `PUBLIC` keyword.
+type PublicID<'s> = &'s str;
+/// [83] PublicID ::= 'PUBLIC' S PubidLiteral
+pub fn public_id(input: &str) -> IResult<&str, PublicID> {
+    // Keyword and mandatory whitespace are consumed; only the literal is returned.
+    preceded(terminated(tag("PUBLIC"), s), pubid_literal)(input)
+}
+
#[cfg(test)]
mod tests {
use std::num::NonZero;