aboutsummaryrefslogtreecommitdiffstats
path: root/src/parser.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/parser.rs')
-rw-r--r--src/parser.rs283
1 files changed, 283 insertions, 0 deletions
diff --git a/src/parser.rs b/src/parser.rs
index b2a8579..518aad4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1 +1,284 @@
+use std::char;
+
+use nom::{
+ branch::alt,
+ bytes::{
+ complete::take_until,
+ streaming::{is_a, tag, take},
+ },
+ character::{
+ complete::one_of,
+ streaming::{char, digit1, none_of, satisfy},
+ },
+ combinator::{cond, map, map_parser, map_res, not, opt, recognize, value, verify},
+ error::ErrorKind,
+ multi::{many0, many1},
+ sequence::{delimited, pair, preceded, tuple},
+ Err, IResult, Parser,
+};
+
// parser: parses tokens from lexer into events
+
+/// Miscellaneous item produced by the [`misc`] parser.
+///
+/// Declared `pub` because it appears in the return types of the public
+/// `misc`, `prolog` and `document` parsers; a private type there is
+/// rejected (or linted against) as a private type in a public interface.
+pub enum Misc<'s> {
+    /// A comment.
+    Comment(Comment<'s>),
+    /// A processing instruction.
+    PI(PI<'s>),
+}
+
+/// Comment text, borrowed from the parsed input.
+pub type Comment<'s> = &'s str;
+
+/// A processing instruction: a target plus an optional instruction string,
+/// both borrowed from the parsed input.
+pub struct PI<'s> {
+    target: &'s str,
+    instruction: Option<&'s str>,
+}
+
+enum ContentItem<'s> { // One item of an element's content; each variant wraps one kind of item.
+ CharData(&'s str), // Text slice borrowed from the input.
+ Element(Element<'s>), // A nested child element.
+ Reference(Reference<'s>), // NOTE(review): `Reference` is not defined in the visible part of this file.
+ CDSect(CDSect<'s>), // NOTE(review): `CDSect` is not defined in the visible part of this file.
+}
+
+type Content<'s> = Option<Vec<ContentItem<'s>>>; // Either no content at all (`None`) or a list of items. NOTE(review): presumably `None` marks an empty-element tag; confirm once `element` is implemented.
+
+/// A parsed element: its name, its attributes and its (optional) content.
+///
+/// Declared `pub` because it is the output type of the public [`element`]
+/// parser and part of the public `Document` tuple; a private type there is
+/// rejected (or linted against) as a private type in a public interface.
+pub struct Element<'s> {
+    /// Element name, borrowed from the input.
+    name: &'s str,
+    /// Attributes attached to the element.
+    attributes: Vec<Attribute<'s>>,
+    /// Content items, if any.
+    content: Content<'s>,
+}
+
+struct Attribute<'s> { // A key/value pair, as stored in `Element::attributes`.
+ key: &'s str, // Attribute name slice.
+ value: &'s str, // Attribute value slice.
+}
+
+// type VersionNum<'s> = &'s str;
+/// Encoding name slice; meant to contain only Latin characters, with a dash allowed after the first character (the alias itself does not enforce this).
+type EncName<'s> = &'s str;
+
+// struct XMLDecl<'s> {
+// version_info: VersionNum<'s>,
+// encoding_decl: Option<EncName<'s>>,
+// sd_decl: Option<bool>,
+// }
+
+/// Parsed document type declaration, the output of [`doctypedecl`].
+///
+/// Declared `pub` because it appears in the return types of the public
+/// `doctypedecl` and `prolog` parsers; a private type there is rejected
+/// (or linted against) as a private type in a public interface.
+pub struct DoctypeDecl<'s> {
+    /// Name slice borrowed from the input.
+    name: &'s str,
+    // TODO
+}
+
+pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { // Stub for the `doctypedecl` production; used by `prolog`.
+ todo!() // Not implemented yet: calling this parser panics.
+}
+
+pub fn element(input: &str) -> IResult<&str, Element> { // Stub for the `element` production; used by `document`.
+ todo!() // Not implemented yet: calling this parser panics.
+}
+
+pub fn misc(input: &str) -> IResult<&str, Misc> { // Stub for the `Misc` production; used by `document` and `prolog`.
+ todo!() // Not implemented yet: calling this parser panics.
+}
+
+/// Output of [`document`]: the prolog, the root element and any trailing
+/// `Misc` items, in source order.
+type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
+
+/// [1] document ::= prolog element Misc*
+///
+/// Runs `prolog`, then `element`, then `misc` zero or more times, and
+/// returns the three results as a tuple. The first failing step aborts the
+/// whole parse with that step's error.
+pub fn document(input: &str) -> IResult<&str, Document> {
+    let (input, head) = prolog(input)?;
+    let (input, root) = element(input)?;
+    let (input, trailing) = many0(misc)(input)?;
+    Ok((input, (head, root, trailing)))
+}
+
+/// A single character accepted by production [2].
+type Char = char;
+
+/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
+///
+/// Consumes exactly one character if it belongs to the set above.
+pub fn xmlchar(input: &str) -> IResult<&str, Char> {
+    /// Membership test for the `Char` production.
+    fn is_xml_char(c: char) -> bool {
+        match c {
+            // Tab, line feed, carriage return.
+            '\u{9}' | '\u{A}' | '\u{D}' => true,
+            '\u{20}'..='\u{D7FF}' => true,
+            '\u{E000}'..='\u{FFFD}' => true,
+            '\u{10000}'..='\u{10FFFF}' => true,
+            _ => false,
+        }
+    }
+
+    satisfy(is_xml_char)(input)
+}
+
+/// A run of whitespace, borrowed from the input.
+type S<'s> = &'s str;
+
+/// [3] S ::= (#x20 | #x9 | #xD | #xA)+
+///
+/// Matches one or more of space, tab, carriage return and line feed.
+pub fn s(input: &str) -> IResult<&str, S> {
+    // Space (#x20), tab (#x9), carriage return (#xD), line feed (#xA).
+    const WHITESPACE: &str = " \t\r\n";
+    is_a(WHITESPACE)(input)
+}
+
+/// A character that may begin a `Name`.
+type NameStartChar = char;
+
+/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+///
+/// Consumes exactly one character if it belongs to the set above.
+pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
+    /// Membership test for the `NameStartChar` production.
+    fn is_name_start_char(c: char) -> bool {
+        match c {
+            ':' | '_' => true,
+            'A'..='Z' | 'a'..='z' => true,
+            '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' => true,
+            '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' => true,
+            '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' => true,
+            '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' => true,
+            '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' => true,
+            '\u{10000}'..='\u{EFFFF}' => true,
+            _ => false,
+        }
+    }
+
+    satisfy(is_name_start_char)(input)
+}
+
+/// A character that may appear anywhere in a `Name` after the first position.
+type NameChar = char;
+
+/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+///
+/// Tries `name_start_char` first, then the extra characters that are only
+/// allowed after the first position.
+pub fn name_char(input: &str) -> IResult<&str, NameChar> {
+    /// Characters that are `NameChar` but not `NameStartChar`.
+    fn is_extra_name_char(c: char) -> bool {
+        match c {
+            '-' | '.' | '\u{B7}' => true,
+            '0'..='9' => true,
+            '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}' => true,
+            _ => false,
+        }
+    }
+
+    alt((name_start_char, satisfy(is_extra_name_char)))(input)
+}
+
+/// A name, borrowed from the input.
+type Name<'s> = &'s str;
+
+/// [5] Name ::= NameStartChar (NameChar)*
+///
+/// Returns the matched slice rather than the individual characters.
+pub fn name(input: &str) -> IResult<&str, Name> {
+    let tail = many0(name_char);
+    let mut whole_name = recognize(pair(name_start_char, tail));
+    whole_name(input)
+}
+
+/// A space-separated list of names, kept as one slice of the input.
+type Names<'s> = &'s str;
+
+/// [6] Names ::= Name (#x20 Name)*
+///
+/// Returns the whole matched span, separators included.
+pub fn names(input: &str) -> IResult<&str, Names> {
+    // Each further name must be preceded by exactly one space (#x20).
+    let further_name = pair(char(' '), name);
+    let mut name_list = recognize(pair(name, many0(further_name)));
+    name_list(input)
+}
+
+/// A name token, borrowed from the input.
+type Nmtoken<'s> = &'s str;
+
+/// [7] Nmtoken ::= (NameChar)+
+///
+/// Returns the matched slice of one or more name characters.
+pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
+    let one_or_more = many1(name_char);
+    let mut token = recognize(one_or_more);
+    token(input)
+}
+
+/// A space-separated list of name tokens, kept as one slice of the input.
+type Nmtokens<'s> = &'s str;
+
+/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+///
+/// Returns the whole matched span, separators included.
+pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
+    // Each further token must be preceded by exactly one space (#x20).
+    let further_token = pair(char(' '), nmtoken);
+    let mut token_list = recognize(pair(nmtoken, many0(further_token)));
+    token_list(input)
+}
+
+/// The text between the quotes of an entity value, borrowed from the input.
+type EntityValue<'s> = &'s str;
+
+/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
+/// | "'" ([^%&'] | PEReference | Reference)* "'"
+///
+/// Returns the quoted body without the surrounding quote characters.
+pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
+    // Body delimited by double quotes; `"` itself is excluded from the body.
+    let double_quoted = delimited(
+        char('"'),
+        recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
+        char('"'),
+    );
+    // Body delimited by single quotes; `'` itself is excluded from the body.
+    let single_quoted = delimited(
+        char('\''),
+        recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
+        char('\''),
+    );
+
+    alt((double_quoted, single_quoted))(input)
+}
+
+/// The text between the quotes of an attribute value, borrowed from the input.
+type AttValue<'s> = &'s str;
+
+/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
+/// | "'" ([^<&'] | Reference)* "'"
+///
+/// Returns the quoted body without the surrounding quote characters.
+pub fn att_value(input: &str) -> IResult<&str, AttValue> {
+    // Body delimited by double quotes; `"` itself is excluded from the body.
+    let double_quoted = delimited(
+        char('"'),
+        recognize(many0(alt((none_of("<&\""), reference)))),
+        char('"'),
+    );
+    // Body delimited by single quotes; `'` itself is excluded from the body.
+    let single_quoted = delimited(
+        char('\''),
+        recognize(many0(alt((none_of("<&'"), reference)))),
+        char('\''),
+    );
+
+    alt((double_quoted, single_quoted))(input)
+}
+
+/// The text between the quotes of a system literal, borrowed from the input.
+type SystemLiteral<'s> = &'s str;
+
+/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
+///
+/// Returns the quoted body without the surrounding quote characters.
+pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
+    // Anything up to the next double quote.
+    let double_quoted = delimited(char('"'), recognize(many0(none_of("\""))), char('"'));
+    // Anything up to the next single quote.
+    let single_quoted = delimited(char('\''), recognize(many0(none_of("'"))), char('\''));
+
+    alt((double_quoted, single_quoted))(input)
+}
+
+/// The text between the quotes of a public identifier literal, borrowed from
+/// the input.
+type PubidLiteral<'s> = &'s str;
+
+/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
+///
+/// Returns the quoted body without the surrounding quote characters.
+pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
+    alt((
+        delimited(char('"'), recognize(many0(pubid_char)), char('"')),
+        delimited(
+            char('\''),
+            // `PubidChar - "'"`: peek that the next character is not the
+            // closing quote, then consume one `PubidChar` from the same
+            // position. `not` consumes nothing, so `preceded` is what makes
+            // `pubid_char` run on the real input; chaining with `and_then`
+            // would instead run it on the empty output of `not`.
+            recognize(many0(preceded(not(char('\'')), pubid_char))),
+            char('\''),
+        ),
+    ))(input)
+}
+
+/// A single character allowed inside a public identifier literal.
+type PubidChar<'s> = char;
+
+/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
+///
+/// Consumes exactly one character if it belongs to the set above, including
+/// the punctuation class at the end of the production.
+pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
+    /// Membership test for the `PubidChar` production.
+    fn is_pubid_char(c: char) -> bool {
+        matches!(
+            c,
+            // Space, carriage return, line feed.
+            '\u{20}' | '\u{D}' | '\u{A}'
+                | 'a'..='z' | 'A'..='Z' | '0'..='9'
+                // [-'()+,./:=?;!*#@$_%]
+                | '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' | '='
+                | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%'
+        )
+    }
+
+    satisfy(is_pubid_char)(input)
+}
+
+/// A run of character data, borrowed from the input.
+type CharData<'s> = &'s str;
+
+/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+///
+/// Returns the longest prefix of `input` that contains neither `<` nor `&`
+/// and does not contain the sequence `]]>`. The match may be empty, in which
+/// case the parser still succeeds and consumes nothing; it never fails and
+/// never reports incomplete input.
+pub fn char_data(input: &str) -> IResult<&str, CharData> {
+    // Markup (`<`) or a reference (`&`) ends the character data.
+    let markup_start = input
+        .find(|c: char| matches!(c, '<' | '&'))
+        .unwrap_or(input.len());
+    // `]]>` may not appear in character data, so stop just before it when it
+    // comes first.
+    let end = input
+        .find("]]>")
+        .map_or(markup_start, |cdata_end| cdata_end.min(markup_start));
+
+    let (data, rest) = input.split_at(end);
+    Ok((rest, data))
+}
+
+/// Output of [`prolog`]: the optional XML declaration, the `Misc` items that
+/// follow it, and optionally a doctype declaration with its own trailing
+/// `Misc` items.
+type Prolog<'s> = (
+    Option<XMLDecl>,
+    Vec<Misc<'s>>,
+    Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
+);
+
+/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+///
+/// Each optional part yields `None` (or an empty vector) when it is absent.
+pub fn prolog(input: &str) -> IResult<&str, Prolog> {
+    let (input, declaration) = opt(xml_decl)(input)?;
+    let (input, leading_misc) = many0(misc)(input)?;
+    let (input, doctype) = opt(pair(doctypedecl, many0(misc)))(input)?;
+    Ok((input, (declaration, leading_misc, doctype)))
+}
+
+/// Parsed XML declaration, the output of [`xml_decl`].
+///
+/// Declared `pub` because it appears in the return types of the public
+/// `xml_decl` and `prolog` parsers; a private type there is rejected (or
+/// linted against) as a private type in a public interface.
+pub struct XMLDecl {
+    /// Declared XML version.
+    version_info: VersionInfo,
+    /// Encoding declaration, when present.
+    encoding_decl: Option<EncodingDecl>,
+    /// Standalone declaration, when present.
+    sd_decl: Option<SDDecl>,
+}
+
+/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+///
+/// Parses the declaration and collects its three parts into an [`XMLDecl`].
+/// Optional whitespace before the closing `?>` is accepted and discarded.
+pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
+    let (leftover, (version_info, encoding_decl, sd_decl)) = delimited(
+        tag("<?xml"),
+        tuple((version_info, opt(encoding_decl), opt(sd_decl))),
+        // `S? '?>'`: the production allows whitespace before the closing
+        // delimiter, e.g. `<?xml version="1.0" ?>`.
+        pair(opt(s), tag("?>")),
+    )(input)?;
+
+    Ok((
+        leftover,
+        XMLDecl {
+            version_info,
+            encoding_decl,
+            sd_decl,
+        },
+    ))
+}
+
+/// The version carried by a `VersionInfo` production.
+type VersionInfo = VersionNum;
+
+/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
+///
+/// Skips the leading whitespace, the `version` keyword and the equals sign,
+/// then returns the version number found between matching quotes.
+pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
+    let keyword = tuple((s, tag("version"), eq));
+    let single_quoted = delimited(char('\''), version_num, char('\''));
+    let double_quoted = delimited(char('"'), version_num, char('"'));
+
+    preceded(keyword, alt((single_quoted, double_quoted)))(input)
+}
+
+/// [25] Eq ::= S? '=' S?
+///
+/// Returns the optional whitespace before the sign, the `=` character
+/// itself, and the optional whitespace after it.
+pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> {
+    let (input, before) = opt(s)(input)?;
+    let (input, sign) = char('=')(input)?;
+    let (input, after) = opt(s)(input)?;
+    Ok((input, (before, sign, after)))
+}
+
+/// XML version recognised by [`version_num`].
+///
+/// Declared `pub` because it appears in the return types of the public
+/// `version_num` and `version_info` parsers; a private type there is
+/// rejected (or linted against) as a private type in a public interface.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum VersionNum {
+    /// Version `1.0`.
+    One,
+    /// Version `1.1`.
+    OneDotOne,
+}
+
+/// [26] VersionNum ::= '1.' [0-9]+
+///
+/// Only `1.0` and `1.1` are recognised, which is narrower than the
+/// production above: after the `1.` prefix exactly one digit, `0` or `1`, is
+/// consumed, and any following digits are left in the remaining input.
+pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
+    preceded(
+        tag("1."),
+        alt((
+            value(VersionNum::One, char('0')),
+            value(VersionNum::OneDotOne, char('1')),
+        )),
+    )(input)
+}