diff options
Diffstat (limited to 'src/parser.rs')
-rw-r--r-- | src/parser.rs | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/src/parser.rs b/src/parser.rs index b2a8579..518aad4 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1 +1,284 @@ +use std::char; + +use nom::{ + branch::alt, + bytes::{ + complete::take_until, + streaming::{is_a, tag, take}, + }, + character::{ + complete::one_of, + streaming::{char, digit1, none_of, satisfy}, + }, + combinator::{cond, map, map_parser, map_res, not, opt, recognize, value, verify}, + error::ErrorKind, + multi::{many0, many1}, + sequence::{delimited, pair, preceded, tuple}, + Err, IResult, Parser, +}; + // parser: parses tokens from lexer into events + +enum Misc<'s> { + Comment(Comment<'s>), + PI(PI<'s>), +} + +type Comment<'s> = &'s str; + +struct PI<'s> { + target: &'s str, + instruction: Option<&'s str>, +} + +enum ContentItem<'s> { + CharData(&'s str), + Element(Element<'s>), + Reference(Reference<'s>), + CDSect(CDSect<'s>), +} + +type Content<'s> = Option<Vec<ContentItem<'s>>>; + +struct Element<'s> { + name: &'s str, + attributes: Vec<Attribute<'s>>, + content: Content<'s>, +} + +struct Attribute<'s> { + key: &'s str, + value: &'s str, +} + +// type VersionNum<'s> = &'s str; +/// Contains only latin characters or dash after first char +type EncName<'s> = &'s str; + +// struct XMLDecl<'s> { +// version_info: VersionNum<'s>, +// encoding_decl: Option<EncName<'s>>, +// sd_decl: Option<bool>, +// } + +struct DoctypeDecl<'s> { + name: &'s str, + // TODO +} + +pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { + todo!() +} + +pub fn element(input: &str) -> IResult<&str, Element> { + todo!() +} + +pub fn misc(input: &str) -> IResult<&str, Misc> { + todo!() +} + +type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>); +/// [1] document ::= prolog element Misc* +pub fn document(input: &str) -> IResult<&str, Document> { + tuple((prolog, element, many0(misc)))(input) +} + +type Char = char; +/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ +pub fn xmlchar(input: &str) -> IResult<&str, Char> { + satisfy( + |c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'), + )(input) +} + +type S<'s> = &'s str; +/// [3] S ::= (#x20 | #x9 | #xD | #xA)+ +pub fn s(input: &str) -> IResult<&str, S> { + is_a("\u{20}\u{9}\u{D}\u{A}")(input) +} + +type NameStartChar = char; +/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] +pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> { + satisfy( + |c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'), + )(input) +} + +type NameChar = char; +/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] +pub fn name_char(input: &str) -> IResult<&str, NameChar> { + alt(( + name_start_char, + satisfy( + |c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'), + ), + ))(input) +} + +type Name<'s> = &'s str; +/// [5] Name ::= NameStartChar (NameChar)* +pub fn name(input: &str) -> IResult<&str, Name> { + recognize(pair(name_start_char, many0(name_char)))(input) +} + +type Names<'s> = &'s str; +/// [6] Names ::= Name (#x20 Name)* +pub fn names(input: &str) -> IResult<&str, Names> { + recognize(pair(name, many0(pair(char('\u{20}'), name))))(input) +} + +type Nmtoken<'s> = &'s str; +/// [7] Nmtoken ::= (NameChar)+ +pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> { + recognize(many1(name_char))(input) +} + +type Nmtokens<'s> = &'s str; +/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* +pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> { + recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input) +} + +type EntityValue<'s> = &'s str; +/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' +/// | "'" ([^%&'] | PEReference | Reference)* "'" +pub fn entity_value(input: &str) -> IResult<&str, EntityValue> { + alt(( + delimited( + char('"'), + recognize(many0(alt((none_of("%&\""), pe_reference, reference)))), + char('"'), + ), + delimited( + char('\''), + recognize(many0(alt((none_of("%&'"), pe_reference, reference)))), + char('\''), + ), + ))(input) +} + +type AttValue<'s> = &'s str; +/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"' +/// | "'" ([^<&'] | Reference)* "'" +pub fn att_value(input: &str) -> IResult<&str, AttValue> { + alt(( + delimited( + char('"'), + recognize(many0(alt((none_of("<&\""), reference)))), + char('"'), + ), + delimited( + char('\''), + recognize(many0(alt((none_of("<&'"), reference)))), + char('\''), + ), + ))(input) +} + +type SystemLiteral<'s> = &'s str; +/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") +pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> { + alt(( + delimited(char('"'), recognize(many0(none_of("\""))), char('"')), + delimited(char('\''), recognize(many0(none_of("'"))), char('\'')), + ))(input) +} + +type PubidLiteral<'s> = &'s str; +/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" +pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> { + alt(( + delimited(char('"'), recognize(many0(pubid_char)), char('"')), + delimited( + char('\''), + recognize(many0(recognize(not(char('\''))).and_then(pubid_char))), + char('\''), + ), + ))(input) +} + +type PubidChar<'s> = char; +/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] +pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> { + satisfy(|c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'))( + input, + ) +} + +type CharData<'s> = &'s str; +/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) +pub fn char_data(input: &str) -> IResult<&str, CharData> { + take_until()(input) +} + +type Prolog<'s> = ( + Option<XMLDecl>, + Vec<Misc<'s>>, + Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>, +); +/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? +pub fn prolog(input: &str) -> IResult<&str, Prolog> { + tuple(( + opt(xml_decl), + many0(misc), + opt(tuple((doctypedecl, many0(misc)))), + ))(input) +} + +struct XMLDecl { + version_info: VersionInfo, + encoding_decl: Option<EncodingDecl>, + sd_decl: Option<SDDecl>, +} +/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' +pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { + // (VersionInfo, Option<EncodingDecl>, Option<SDDecl>) + let (leftover, (version_info, encoding_decl, sd_decl)) = delimited( + tag("<?xml"), + tuple((version_info, opt(encoding_decl), opt(sd_decl))), + tag("?>"), + )(input)?; + Ok(( + leftover, + XMLDecl { + version_info, + encoding_decl, + sd_decl, + }, + )) +} + +type VersionInfo = VersionNum; +/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') +pub fn version_info(input: &str) -> IResult<&str, VersionInfo> { + preceded( + tuple((s, tag("version"), eq)), + alt(( + delimited(char('\''), version_num, char('\'')), + delimited(char('"'), version_num, char('"')), + )), + )(input) +} + +/// [25] Eq ::= S? '=' S? +pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> { + tuple((opt(s), char('='), opt(s)))(input) +} + +#[derive(Clone)] +enum VersionNum { + One, + OneDotOne, +} +/// [26] VersionNum ::= '1.' [0-9]+ +pub fn version_num(input: &str) -> IResult<&str, VersionNum> { + preceded( + tag("1."), + alt(( + value(VersionNum::One, char('0')), + value(VersionNum::OneDotOne, char('1')), + )), + )(input) +} |