aboutsummaryrefslogtreecommitdiffstats
path: root/src/parser.rs
diff options
context:
space:
mode:
authorLibravatar cel 🌸 <cel@blos.sm>2024-06-25 00:18:18 +0100
committerLibravatar cel 🌸 <cel@blos.sm>2024-06-25 00:18:18 +0100
commit0b11cbbfd8904c11f425eb43aa10ebe3e69a758c (patch)
treede80bffb3e21ea50f65e7e8bd61a70d66e495e02 /src/parser.rs
parentafda87a8d7f347b0c4d34aa798f041d05b41bff0 (diff)
downloadpeanuts-0b11cbbfd8904c11f425eb43aa10ebe3e69a758c.tar.gz
peanuts-0b11cbbfd8904c11f425eb43aa10ebe3e69a758c.tar.bz2
peanuts-0b11cbbfd8904c11f425eb43aa10ebe3e69a758c.zip
WIP: extSubset
Diffstat (limited to 'src/parser.rs')
-rw-r--r--src/parser.rs208
1 files changed, 188 insertions, 20 deletions
diff --git a/src/parser.rs b/src/parser.rs
index e689a53..882ebae 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,7 +2,7 @@ use std::char;
use nom::{
branch::{alt, permutation},
- bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until},
+ bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until, take_while},
character::{
complete::one_of,
streaming::{alpha1, char, digit1, none_of, satisfy},
@@ -16,6 +16,8 @@ use nom::{
// parser: parses tokens from lexer into events
// no well formedness, validity, or data model, simple translation of input into rust types
+// output is a Rust representation of the input XML
+// TODO: could these types also be used for XML production (serialization)?
enum ContentItem<'s> {
CharData(&'s str),
@@ -89,37 +91,73 @@ pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input)
}
-type EntityValue<'s> = &'s str;
+/// One segment of a quoted literal (`EntityValue` / `AttValue`).
+///
+/// A literal is kept as an ordered list of segments instead of one raw slice, so the
+/// references embedded in it remain individually addressable.
+enum LiteralData<'s> {
+    /// A non-empty run of ordinary characters (no delimiter, no reference start).
+    String(&'s str),
+    /// A parameter-entity reference, as parsed by `pe_reference`.
+    PEReference(PEReference<'s>),
+    /// A reference, as parsed by `reference`.
+    Reference(Reference<'s>),
+}
+
+type EntityValue<'s> = Vec<LiteralData<'s>>;
/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
/// | "'" ([^%&'] | PEReference | Reference)* "'"
+///
+/// Returns the literal's segments in source order, without the surrounding quotes.
+///
+/// Plain text is taken with `is_not`, which only succeeds on at least one character.
+/// This matters because `many0` rejects a sub-parser that succeeds without consuming
+/// input: a text parser that can match the empty string would make every literal fail
+/// as soon as the closing quote (or a reference) is reached.
pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
alt((
delimited(
char('"'),
- recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
+            many0(alt((
+                // Longest non-empty run up to the next `%`, `&` or closing `"`.
+                map(is_not("%&\""), LiteralData::String),
+                map(pe_reference, LiteralData::PEReference),
+                map(reference, LiteralData::Reference),
+            ))),
char('"'),
),
delimited(
char('\''),
- recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
+            many0(alt((
+                // Longest non-empty run up to the next `%`, `&` or closing `'`.
+                map(is_not("%&'"), LiteralData::String),
+                map(pe_reference, LiteralData::PEReference),
+                map(reference, LiteralData::Reference),
+            ))),
char('\''),
),
))(input)
}
-type AttValue<'s> = &'s str;
+type AttValue<'s> = Vec<LiteralData<'s>>;
/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
/// | "'" ([^<&'] | Reference)* "'"
+///
+/// Returns the attribute value's segments in source order, without the surrounding quotes.
+///
+/// The characters excluded from plain text are `<`, `&` and the active quote, exactly as
+/// in the production above; `%` is ordinary text inside an attribute value.
+/// Plain text is taken with `is_not`, which needs at least one character, so `many0`
+/// never sees a zero-length success (which it would turn into an error).
pub fn att_value(input: &str) -> IResult<&str, AttValue> {
alt((
delimited(
char('"'),
- recognize(many0(alt((none_of("<&\""), reference)))),
+            many0(alt((
+                // Longest non-empty run up to the next `<`, `&` or closing `"`.
+                map(is_not("<&\""), LiteralData::String),
+                map(reference, LiteralData::Reference),
+            ))),
char('"'),
),
delimited(
char('\''),
- recognize(many0(alt((none_of("<&'"), reference)))),
+            many0(alt((
+                // Longest non-empty run up to the next `<`, `&` or closing `'`.
+                map(is_not("<&'"), LiteralData::String),
+                map(reference, LiteralData::Reference),
+            ))),
char('\''),
),
))(input)
@@ -389,18 +427,18 @@ pub fn decl_sep(input: &str) -> IResult<&str, DeclSep> {
))(input)
}
-enum IntSubsetItem<'s> {
+/// One entry of the internal DTD subset: either a markup declaration or a separator.
+enum IntSubsetDeclaration<'s> {
MarkupDecl(MarkupDecl<'s>),
DeclSep(DeclSep<'s>),
}
-type IntSubset<'s> = Vec<IntSubsetItem<'s>>;
+type IntSubset<'s> = Vec<IntSubsetDeclaration<'s>>;
/// [28b] intSubset ::= (markupdecl | DeclSep)*
pub fn int_subset(input: &str) -> IResult<&str, IntSubset> {
many0(alt((
-map(markup_decl, |markup_decl| {
-IntSubsetItem::MarkupDecl(markup_decl)
-}),
-map(decl_sep, |decl_sep| IntSubsetItem::DeclSep(decl_sep)),
+        // Each alternative is wrapped directly by its variant constructor.
+        map(markup_decl, IntSubsetDeclaration::MarkupDecl),
+        map(decl_sep, IntSubsetDeclaration::DeclSep),
)))(input)
}
@@ -432,9 +470,39 @@ pub fn markup_decl(input: &str) -> IResult<&str, MarkupDecl> {
))(input)
}
+/// The external DTD subset: an optional text declaration followed by its declarations.
+struct ExtSubset<'s> {
+    /// The leading `TextDecl`, if one was present.
+    text_decl: Option<TextDecl<'s>>,
+    /// The declarations parsed by `ext_subset_decl`.
+    ext_subset_decl: ExtSubsetDecl<'s>,
+}
/// [30] extSubset ::= TextDecl? extSubsetDecl
+///
+/// Parses an optional text declaration and then the subset declarations, returning
+/// both packed into an `ExtSubset`.
+pub fn ext_subset(input: &str) -> IResult<&str, ExtSubset> {
+    let (rest, (declaration, body)) = pair(opt(text_decl), ext_subset_decl)(input)?;
+    Ok((
+        rest,
+        ExtSubset {
+            text_decl: declaration,
+            ext_subset_decl: body,
+        },
+    ))
+}
+/// One entry of the external subset: a markup declaration, a conditional section,
+/// or a declaration separator.
+enum ExtSubsetDeclaration<'s> {
+    MarkupDecl(MarkupDecl<'s>),
+    ConditionalSect(ConditionalSect<'s>),
+    DeclSep(DeclSep<'s>),
+}
+type ExtSubsetDecl<'s> = Vec<ExtSubsetDeclaration<'s>>;
/// [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
+///
+/// Collects zero or more entries, trying the alternatives in the order listed above.
+pub fn ext_subset_decl(input: &str) -> IResult<&str, ExtSubsetDecl> {
+    let entry = alt((
+        map(markup_decl, ExtSubsetDeclaration::MarkupDecl),
+        map(conditional_sect, ExtSubsetDeclaration::ConditionalSect),
+        map(decl_sep, ExtSubsetDeclaration::DeclSep),
+    ));
+    many0(entry)(input)
+}
type SDDecl = bool;
/// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
@@ -458,10 +526,9 @@ pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> {
// (Productions 33 through 38 have been removed.)
-struct Element<'s> {
- name: &'s str,
- attributes: Vec<Attribute<'s>>,
- content: Content<'s>,
+/// An element, with one variant per alternative of production [39] below.
+enum Element<'s> {
+ /// The `EmptyElemTag` alternative.
+ Empty(EmptyElemTag<'s>),
+ /// The `STag content ETag` alternative: start tag, content, end tag, in that order.
+ NotEmpty(STag<'s>, Content<'s>, ETag<'s>),
}
/// [39] element ::= EmptyElemTag | STag content ETag
pub fn element(input: &str) -> IResult<&str, Element> {
@@ -480,10 +547,29 @@ pub fn attribute(input: &str) -> IResult<&str, Attribute> {
separated_pair(name, eq, att_value)(input)
}
-type CharRef<'s> = &'s str;
+/// A character reference, holding only the digits between the prefix and the `;`.
+enum CharRef<'s> {
+    /// Digits of a `&#...;` reference.
+    Decimal(&'s str),
+    /// Hex digits of a `&#x...;` reference.
+    Hexadecimal(&'s str),
+}
/// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+///
+/// Both forms require at least one digit, as the `+` in the production demands, so
+/// `&#;` and `&#x;` are rejected. The digits are returned unconverted; no check is made
+/// here that they denote a legal character.
pub fn char_ref(input: &str) -> IResult<&str, CharRef> {
- todo!()
+    alt((
+        // Decimal form. On `&#x..` this branch fails at the digit parser and `alt`
+        // falls through to the hexadecimal branch.
+        delimited(tag("&#"), map(digit1, CharRef::Decimal), tag(";")),
+        delimited(
+            tag("&#x"),
+            // `is_a` needs at least one matching character, unlike `take_while`.
+            map(is_a("0123456789abcdefABCDEF"), CharRef::Hexadecimal),
+            tag(";"),
+        ),
+    ))(input)
}
enum Reference<'s> {
@@ -510,7 +596,86 @@ pub fn pe_reference(input: &str) -> IResult<&str, PEReference> {
delimited(tag("%"), name, tag(";"))(input)
}
-/// TODO: entity declarations
+/// An entity declaration: general (`GEDecl`) or parameter (`PEDecl`).
+enum EntityDecl<'s> {
+    GEDecl(GEDecl<'s>),
+    PEDecl(PEDecl<'s>),
+}
+/// [70] EntityDecl ::= GEDecl | PEDecl
+///
+/// Tries the general-entity form first, then the parameter-entity form.
+pub fn entity_decl(input: &str) -> IResult<&str, EntityDecl> {
+    let general = map(ge_decl, EntityDecl::GEDecl);
+    let parameter = map(pe_decl, EntityDecl::PEDecl);
+    alt((general, parameter))(input)
+}
+
+/// A general entity declaration: the entity's name and its definition.
+struct GEDecl<'s> {
+    name: Name<'s>,
+    entity_def: EntityDef<'s>,
+}
+/// [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
+///
+/// The opening keyword, the separators and the closing `>` are discarded; only the
+/// name and the definition are kept.
+pub fn ge_decl(input: &str) -> IResult<&str, GEDecl> {
+    let (rest, (entity_name, definition)) = delimited(
+        pair(tag("<!ENTITY"), s),
+        separated_pair(name, s, entity_def),
+        pair(opt(s), tag(">")),
+    )(input)?;
+    Ok((
+        rest,
+        GEDecl {
+            name: entity_name,
+            entity_def: definition,
+        },
+    ))
+}
+
+/// A parameter entity declaration: the entity's name and its definition.
+struct PEDecl<'s> {
+    name: Name<'s>,
+    pe_def: PEDef<'s>,
+}
+/// [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
+///
+/// The opening keyword, the `%` marker, the separators and the closing `>` are
+/// discarded; only the name and the definition are kept.
+pub fn pe_decl(input: &str) -> IResult<&str, PEDecl> {
+    let (rest, (entity_name, definition)) = delimited(
+        tuple((tag("<!ENTITY"), s, tag("%"), s)),
+        separated_pair(name, s, pe_def),
+        pair(opt(s), tag(">")),
+    )(input)?;
+    Ok((
+        rest,
+        PEDecl {
+            name: entity_name,
+            pe_def: definition,
+        },
+    ))
+}
+
+/// The definition of a general entity: a literal value, or an external identifier
+/// optionally followed by an NDATA declaration.
+enum EntityDef<'s> {
+    EntityValue(EntityValue<'s>),
+    ExternalID {
+        external_id: ExternalID<'s>,
+        ndata_decl: Option<NDataDecl<'s>>,
+    },
+}
+/// [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
+///
+/// Tries the literal value first, then the external form.
+pub fn entity_def(input: &str) -> IResult<&str, EntityDef> {
+    let literal = map(entity_value, EntityDef::EntityValue);
+    let external = map(
+        pair(external_id, opt(ndata_decl)),
+        |(external_id, ndata_decl)| EntityDef::ExternalID {
+            external_id,
+            ndata_decl,
+        },
+    );
+    alt((literal, external))(input)
+}
+
+/// The definition of a parameter entity: a literal value or an external identifier.
+enum PEDef<'s> {
+    EntityValue(EntityValue<'s>),
+    ExternalID(ExternalID<'s>),
+}
+/// [74] PEDef ::= EntityValue | ExternalID
+///
+/// Tries the literal value first, then the external identifier.
+pub fn pe_def(input: &str) -> IResult<&str, PEDef> {
+    let literal = map(entity_value, PEDef::EntityValue);
+    let external = map(external_id, PEDef::ExternalID);
+    alt((literal, external))(input)
+}
enum ExternalID<'s> {
SYSTEM {
@@ -567,9 +732,12 @@ pub fn text_decl(input: &str) -> IResult<&str, TextDecl> {
)(input)
}
-type extParsedEnt<'s> = (Option<TextDecl<'s>>, Content<'s>);
+/// An external parsed entity: an optional text declaration followed by content.
+struct ExtParsedEnt<'s> {
+    /// The leading `TextDecl`, if one was present.
+    text_decl: Option<TextDecl<'s>>,
+    /// The content that follows the optional text declaration.
+    content: Content<'s>,
+}
/// [78] extParsedEnt ::= TextDecl? content
-pub fn ext_parsed_ent(input: &str) -> IResult<&str, extParsedEnt> {
+///
+/// `pair` yields a tuple, so the result is mapped into the `ExtParsedEnt` struct that
+/// the signature promises.
+pub fn ext_parsed_ent(input: &str) -> IResult<&str, ExtParsedEnt> {
-pair(opt(text_decl), content)(input)
+    map(pair(opt(text_decl), content), |(text_decl, content)| {
+        ExtParsedEnt { text_decl, content }
+    })(input)
}