aboutsummaryrefslogblamecommitdiffstats
path: root/src/parser.rs
blob: 2acd579a2cf4705a8b91a3c10f3670ee5c6e4cc7 (plain) (tree)
1
2
3
4
5
6
7
8
9


              
                               
                                                                                    

                         
                                                            
      


                                                                                           



                                                 
                                               
 


                         

                                



                                                




                      

                        
                                        
 
   



                                                               





                                   



                                                       



































































































































                                                                                                                                                                                                                                                                                                                                                                       













                                                                                                   
                

                            
            

                                                                        
        

                       


                                                                             

 









                                                                                 
                





































                                                                                            























                                                                     
                   
                        











                                                                           
                    
                              

                                            



                                                                                         
                                                                       
                     

                                                                
              
                          



                         

                          
















                                                                                                        

                                                        
















                                                              
 


































                                                                                                               







                                                         






















                                                                                              

            

                          






                                                                          



                                                                          
                                             


                                  
     
































                                                                

































                                                                          
 
use std::char;

use nom::{
    branch::{alt, permutation},
    bytes::streaming::{is_a, is_not, tag, tag_no_case, take, take_till, take_until},
    character::{
        complete::one_of,
        streaming::{alpha1, char, digit1, none_of, satisfy},
    },
    combinator::{cond, map, map_parser, map_res, not, opt, peek, recognize, value, verify},
    error::{Error, ErrorKind},
    multi::{many0, many1, many_till},
    sequence::{delimited, pair, preceded, tuple},
    Err, IResult, Parser,
};

// parser: parses tokens from lexer into events

/// One item of an element's content.
enum ContentItem<'s> {
    /// A run of character data, borrowed from the input.
    CharData(&'s str),
    /// A nested child element.
    Element(Element<'s>),
    // Variants for references and CDATA sections are not enabled yet.
    // Reference(Reference<'s>),
    // CDSect(CDSect<'s>),
}

/// Content of an element: an optional list of content items.
// NOTE(review): presumably `None` stands for an empty-element tag — confirm once `element` is implemented.
type Content<'s> = Option<Vec<ContentItem<'s>>>;

/// A single attribute, stored as a pair of slices borrowed from the input.
struct Attribute<'s> {
    /// Attribute name.
    key: &'s str,
    /// Attribute value.
    value: &'s str,
}

/// A document type declaration; only its name is represented so far.
struct DoctypeDecl<'s> {
    /// Name given in the declaration, borrowed from the input.
    name: &'s str,
    // TODO: doctype declaration parsing
}
/// [28]   	doctypedecl	   ::=   	'<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
///
/// Not implemented yet: calling this function panics through `todo!()`.
pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
    todo!()
}

/// An element: its name, its attributes and its content.
struct Element<'s> {
    /// Element name, borrowed from the input.
    name: &'s str,
    /// Attributes of the element.
    attributes: Vec<Attribute<'s>>,
    /// Content of the element (see `Content`).
    content: Content<'s>,
}
/// [39]   	element	   ::=   	EmptyElemTag | STag content ETag
///
/// Not implemented yet: calling this function panics through `todo!()`.
pub fn element(input: &str) -> IResult<&str, Element> {
    todo!()
}

/// A parsed document: the prolog, the root element and any trailing `Misc` items.
type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
/// [1] document ::= prolog element Misc*
///
/// Runs `prolog`, then `element`, then zero or more `misc` parsers in
/// sequence and returns their results as a tuple.
pub fn document(input: &str) -> IResult<&str, Document> {
    let trailing_misc = many0(misc);
    let mut parse_document = tuple((prolog, element, trailing_misc));
    parse_document(input)
}

/// A single character that is legal in an XML document.
type Char = char;
/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
///
/// Accepts any Unicode character except the surrogate blocks, FFFE and FFFF,
/// and the control characters not listed above.
pub fn xmlchar(input: &str) -> IResult<&str, Char> {
    // One arm per alternative of production [2].
    fn is_xml_char(c: char) -> bool {
        match c {
            '\u{9}' | '\u{A}' | '\u{D}' => true,
            '\u{20}'..='\u{D7FF}' => true,
            '\u{E000}'..='\u{FFFD}' => true,
            '\u{10000}'..='\u{10FFFF}' => true,
            _ => false,
        }
    }

    satisfy(is_xml_char)(input)
}

/// A run of XML white space, borrowed from the input.
type S<'s> = &'s str;
/// [3] S ::= (#x20 | #x9 | #xD | #xA)+
///
/// Matches one or more space, tab, carriage-return or line-feed characters.
pub fn s(input: &str) -> IResult<&str, S> {
    // Space, tab, carriage return, line feed.
    const XML_WHITESPACE: &str = "\u{20}\u{9}\u{D}\u{A}";
    is_a(XML_WHITESPACE)(input)
}

/// A character that may begin an XML name.
type NameStartChar = char;
/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
    // The arms follow the order of the alternatives in production [4].
    fn starts_name(c: char) -> bool {
        match c {
            ':' | '_' => true,
            'A'..='Z' | 'a'..='z' => true,
            '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' => true,
            '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' => true,
            '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' => true,
            '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' => true,
            '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' => true,
            '\u{10000}'..='\u{EFFFF}' => true,
            _ => false,
        }
    }

    satisfy(starts_name)(input)
}

/// A character that may appear anywhere in an XML name.
type NameChar = char;
/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
///
/// Tries `name_start_char` first and falls back to the additional
/// characters that are only allowed after the first position.
pub fn name_char(input: &str) -> IResult<&str, NameChar> {
    let continuation_only = satisfy(|c: char| match c {
        '-' | '.' | '\u{B7}' => true,
        '0'..='9' => true,
        '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}' => true,
        _ => false,
    });
    alt((name_start_char, continuation_only))(input)
}

/// An XML name, borrowed from the input.
type Name<'s> = &'s str;
/// [5] Name ::= NameStartChar (NameChar)*
///
/// Returns the slice covering the start character and every following
/// name character.
pub fn name(input: &str) -> IResult<&str, Name> {
    let following_chars = many0(name_char);
    let mut whole_name = recognize(pair(name_start_char, following_chars));
    whole_name(input)
}

/// A space-separated list of names, borrowed from the input as one slice.
type Names<'s> = &'s str;
/// [6] Names ::= Name (#x20 Name)*
///
/// Names are separated by exactly one space character (#x20).
pub fn names(input: &str) -> IResult<&str, Names> {
    let space_then_name = pair(char('\u{20}'), name);
    let mut name_list = recognize(pair(name, many0(space_then_name)));
    name_list(input)
}

/// A name token, borrowed from the input.
type Nmtoken<'s> = &'s str;
/// [7] Nmtoken ::= (NameChar)+
///
/// Returns the slice covering one or more name characters.
pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
    let mut token = recognize(many1(name_char));
    token(input)
}

/// A space-separated list of name tokens, borrowed from the input as one slice.
type Nmtokens<'s> = &'s str;
/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
///
/// Tokens are separated by exactly one space character (#x20).
pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
    let space_then_token = pair(char('\u{20}'), nmtoken);
    let mut token_list = recognize(pair(nmtoken, many0(space_then_token)));
    token_list(input)
}

/// The text of an entity value without its surrounding quotes.
type EntityValue<'s> = &'s str;
/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
///                  |  "'" ([^%&'] | PEReference | Reference)* "'"
///
/// Returns the slice between the quotes; the double-quoted form is tried first.
///
/// NOTE: `pe_reference` and `reference` are `todo!()` stubs. They are tried
/// as soon as `none_of` rejects a character, which includes the closing
/// quote, so this parser panics until they are implemented.
pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
    let double_quoted_body = recognize(many0(alt((none_of("%&\""), pe_reference, reference))));
    let single_quoted_body = recognize(many0(alt((none_of("%&'"), pe_reference, reference))));

    let double_quoted = delimited(char('"'), double_quoted_body, char('"'));
    let single_quoted = delimited(char('\''), single_quoted_body, char('\''));

    alt((double_quoted, single_quoted))(input)
}

/// The text of an attribute value without its surrounding quotes.
type AttValue<'s> = &'s str;
/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
///                |  "'" ([^<&'] | Reference)* "'"
///
/// Returns the slice between the quotes; the double-quoted form is tried first.
///
/// NOTE: `reference` is a `todo!()` stub. It is tried as soon as `none_of`
/// rejects a character, which includes the closing quote, so this parser
/// panics until it is implemented.
pub fn att_value(input: &str) -> IResult<&str, AttValue> {
    let double_quoted_body = recognize(many0(alt((none_of("<&\""), reference))));
    let single_quoted_body = recognize(many0(alt((none_of("<&'"), reference))));

    let double_quoted = delimited(char('"'), double_quoted_body, char('"'));
    let single_quoted = delimited(char('\''), single_quoted_body, char('\''));

    alt((double_quoted, single_quoted))(input)
}

/// The text of a system literal without its surrounding quotes.
type SystemLiteral<'s> = &'s str;
/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
///
/// Returns everything between a pair of matching quotes; the double-quoted
/// form is tried first.
pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
    let double_quoted = delimited(char('"'), recognize(many0(none_of("\""))), char('"'));
    let single_quoted = delimited(char('\''), recognize(many0(none_of("'"))), char('\''));
    alt((double_quoted, single_quoted))(input)
}

type PubidLiteral<'s> = &'s str;
/// [12]   	PubidLiteral	   ::=   	'"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
    alt((
        delimited(char('"'), recognize(many0(pubid_char)), char('"')),
        delimited(
            char('\''),
            recognize(many0(recognize(not(char('\''))).and_then(pubid_char))),
            char('\''),
        ),
    ))(input)
}

/// A character allowed inside a public identifier literal.
type PubidChar<'s> = char;
/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
///
/// Accepts space, carriage return, line feed, ASCII letters and digits, and
/// the punctuation characters listed in the last character class.
pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
    satisfy(|c| {
        matches!(
            c,
            '\u{20}' | '\u{D}' | '\u{A}'
                | 'a'..='z' | 'A'..='Z' | '0'..='9'
                // [-'()+,./:=?;!*#@$_%]
                | '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' | '='
                | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%'
        )
    })(input)
}

/// A run of character data, borrowed from the input.
type CharData<'s> = &'s str;
/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
///
/// Returns the text up to, but not including, the next `<`, `&` or `]]>`.
/// The terminator is only peeked at and stays in the remaining input.
/// `tag` is the streaming variant, so input that ends before any terminator
/// is reported as `Err::Incomplete`.
pub fn char_data(input: &str) -> IResult<&str, CharData> {
    // Whatever ends a run of character data. `recognize` turns the single
    // character matched by `one_of` into a slice so both branches of `alt`
    // have the same output type.
    //
    // Earlier formulations built on `take_until`/`take_till`, `permutation`
    // and `not` were dropped in favour of `many_till` with this look-ahead.
    let terminator = peek(alt((recognize(one_of("<&")), tag("]]>"))));
    let mut text_run = recognize(many_till(none_of("<&"), terminator));
    text_run(input)
}

/// The text of a comment without the `<!--` and `-->` delimiters.
type Comment<'s> = &'s str;
/// [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
///
/// The body runs up to the first `--`, which must then be the start of the
/// closing `-->`; a single `-` inside the body is accepted.
pub fn comment(input: &str) -> IResult<&str, Comment> {
    let body = recognize(many_till(xmlchar, peek(tag("--"))));
    let mut whole_comment = delimited(tag("<!--"), body, tag("-->"));
    whole_comment(input)
}

/// A processing instruction.
#[derive(Clone)]
struct PI<'s> {
    /// The instruction's target name.
    target: &'s str,
    /// Text following the target, if any; as produced by `pi` it starts with
    /// the white space that separates it from the target.
    instruction: Option<&'s str>,
}
/// [16]   	PI	   ::=   	'<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
pub fn pi(input: &str) -> IResult<&str, PI> {
    let (rest, (target, instruction)) = delimited(
        tag("<?"),
        pair(
            pi_target,
            opt(recognize(pair(s, many_till(xmlchar, peek(tag("?>")))))),
        ),
        tag("?>"),
    )(input)?;
    Ok((
        rest,
        PI {
            target,
            instruction,
        },
    ))
}

type PITarget<'s> = &'s str;
/// [17]   	PITarget	   ::=   	Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
pub fn pi_target(input: &str) -> IResult<&str, PITarget> {
    let (rest, name) = name(input)?;
    if name.to_lowercase() == "xml" {
        return Err(Err::Error(Error {
            input,
            // TODO: check if better error to return
            code: ErrorKind::Tag,
        }));
    } else {
        return Ok((rest, name));
    }
}

type CDSect<'s> = (CDStart<'s>, CData<'s>, CDEnd<'s>);
/// [18]   	CDSect	   ::=   	CDStart CData CDEnd
pub fn cd_sect(input: &str) -> IResult<&str, CDSect> {
    tuple((cd_start, cdata, cd_end))(input)
}

/// The opening marker of a CDATA section, borrowed from the input.
type CDStart<'s> = &'s str;
/// [19] CDStart ::= '<![CDATA['
pub fn cd_start(input: &str) -> IResult<&str, CDStart> {
    const OPENING_MARKER: &str = "<![CDATA[";
    tag(OPENING_MARKER)(input)
}

/// The raw text inside a CDATA section, borrowed from the input.
type CData<'s> = &'s str;
/// [20] CData ::= (Char* - (Char* ']]>' Char*))
///
/// Returns every character up to, but not including, the first `]]>`; the
/// end marker is only peeked at and stays in the remaining input.
pub fn cdata(input: &str) -> IResult<&str, CData> {
    let end_marker_ahead = peek(tag("]]>"));
    let mut raw_text = recognize(many_till(xmlchar, end_marker_ahead));
    raw_text(input)
}

/// The closing marker of a CDATA section, borrowed from the input.
type CDEnd<'s> = &'s str;
/// [21] CDEnd ::= ']]>'
pub fn cd_end(input: &str) -> IResult<&str, CDEnd> {
    const CLOSING_MARKER: &str = "]]>";
    tag(CLOSING_MARKER)(input)
}

/// The document prolog: optional XML declaration, leading `Misc` items and
/// an optional doctype declaration followed by further `Misc` items.
type Prolog<'s> = (
    Option<XMLDecl<'s>>,
    Vec<Misc<'s>>,
    Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
);
/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
///
/// NOTE: `doctypedecl` is a `todo!()` stub and is attempted once the
/// optional declaration and the leading `Misc` items have been parsed, so
/// reaching that point panics until it is implemented.
pub fn prolog(input: &str) -> IResult<&str, Prolog> {
    let declaration = opt(xml_decl);
    let leading_misc = many0(misc);
    let doctype_section = opt(tuple((doctypedecl, many0(misc))));

    let mut parse_prolog = tuple((declaration, leading_misc, doctype_section));
    parse_prolog(input)
}

/// The XML declaration found at the start of a document.
struct XMLDecl<'s> {
    /// The declared XML version.
    version_info: VersionInfo,
    /// The declared encoding name, if present.
    encoding_decl: Option<EncodingDecl<'s>>,
    /// The standalone flag, if present (`true` for `yes`).
    sd_decl: Option<SDDecl>,
}
/// [23]   	XMLDecl	   ::=   	'<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
    // (VersionInfo, Option<EncodingDecl>, Option<SDDecl>)
    let (leftover, (version_info, encoding_decl, sd_decl)) = delimited(
        tag("<?xml"),
        tuple((version_info, opt(encoding_decl), opt(sd_decl))),
        pair(opt(s), tag("?>")),
    )(input)?;
    // TODO: change to map
    Ok((
        leftover,
        XMLDecl {
            version_info,
            encoding_decl,
            sd_decl,
        },
    ))
}

/// The version announced by the XML declaration.
type VersionInfo = VersionNum;
/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
///
/// Skips the leading white space, the `version` keyword and the equals
/// sign, then returns the quoted version number. The single-quoted form is
/// tried first.
pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
    let keyword_and_eq = tuple((s, tag("version"), eq));
    let single_quoted = delimited(char('\''), version_num, char('\''));
    let double_quoted = delimited(char('"'), version_num, char('"'));

    preceded(keyword_and_eq, alt((single_quoted, double_quoted)))(input)
}

/// [25] Eq ::= S? '=' S?
///
/// Returns the slice covering the equals sign together with any white space
/// on either side of it.
pub fn eq(input: &str) -> IResult<&str, &str> {
    let padded_equals = tuple((opt(s), char('='), opt(s)));
    let mut equals_slice = recognize(padded_equals);
    equals_slice(input)
}

/// The XML versions recognised by `version_num`.
#[derive(Clone)]
enum VersionNum {
    /// Version `1.0`.
    One,
    /// Version `1.1`.
    OneDotOne,
}
/// [26] VersionNum ::= '1.' [0-9]+
///
/// Only a single minor digit of `0` or `1` is recognised here (versions
/// `1.0` and `1.1`); any other digit after `1.` is rejected, which is
/// narrower than the production above.
pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
    let minor_version = alt((
        value(VersionNum::One, char('0')),
        value(VersionNum::OneDotOne, char('1')),
    ));
    let mut full_version = preceded(tag("1."), minor_version);
    full_version(input)
}

/// An item that may appear between the main parts of a document.
#[derive(Clone)]
enum Misc<'s> {
    /// A comment, holding its text.
    Comment(Comment<'s>),
    /// A processing instruction.
    PI(PI<'s>),
    /// A run of white space; the text itself is not kept.
    S,
}
/// [27] Misc ::= Comment | PI | S
///
/// Tries a comment, then a processing instruction, then white space, and
/// wraps whichever matches in the corresponding `Misc` variant. White space
/// is recorded as `Misc::S` without keeping the matched text.
pub fn misc(input: &str) -> IResult<&str, Misc> {
    alt((
        // The variant constructors are functions, so no wrapper closure is needed.
        map(comment, Misc::Comment),
        map(pi, Misc::PI),
        value(Misc::S, s),
    ))(input)
}

/// The standalone document declaration: `true` for `yes`, `false` for `no`.
type SDDecl = bool;
/// [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
///
/// Skips the leading white space, the `standalone` keyword and the equals
/// sign, then reads the quoted `yes`/`no` value. The single-quoted form is
/// tried first.
pub fn sd_decl(input: &str) -> IResult<&str, SDDecl> {
    // Shared by both quoting styles.
    fn yes_or_no(input: &str) -> IResult<&str, bool> {
        alt((value(true, tag("yes")), value(false, tag("no"))))(input)
    }

    let keyword_and_eq = tuple((s, tag("standalone"), eq));
    let single_quoted = delimited(char('\''), yes_or_no, char('\''));
    let double_quoted = delimited(char('"'), yes_or_no, char('"'));

    preceded(keyword_and_eq, alt((single_quoted, double_quoted)))(input)
}

/// [67]   	Reference	   ::=   	EntityRef | CharRef
///
/// Not implemented yet: calling this function panics through `todo!()`.
pub fn reference(input: &str) -> IResult<&str, char> {
    todo!()
}

/// [69]   	PEReference	   ::=   	'%' Name ';'
///
/// Not implemented yet: calling this function panics through `todo!()`.
pub fn pe_reference(input: &str) -> IResult<&str, char> {
    todo!()
}

/// The encoding name announced by the XML declaration.
type EncodingDecl<'s> = EncName<'s>;
/// [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
///
/// Skips the leading white space, the `encoding` keyword and the equals
/// sign, then returns the quoted encoding name without its quotes. The
/// double-quoted form is tried first.
pub fn encoding_decl(input: &str) -> IResult<&str, EncodingDecl> {
    let keyword_and_eq = tuple((s, tag("encoding"), eq));
    let double_quoted = delimited(char('"'), enc_name, char('"'));
    let single_quoted = delimited(char('\''), enc_name, char('\''));

    preceded(keyword_and_eq, alt((double_quoted, single_quoted)))(input)
}

/// An encoding name, borrowed from the input.
type EncName<'s> = &'s str;
/// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
///
/// The name must start with an ASCII letter; ASCII letters, digits, `.`,
/// `_` and `-` may follow.
pub fn enc_name(input: &str) -> IResult<&str, EncName> {
    let first_char = satisfy(|c: char| c.is_ascii_alphabetic());
    let later_char =
        satisfy(|c: char| c.is_ascii_alphanumeric() || c == '.' || c == '_' || c == '-');

    let mut encoding_name = recognize(pair(first_char, many0(later_char)));
    encoding_name(input)
}

// Unit tests for the parsers above. Each assertion compares the expected
// `(remaining input, output)` pair, or the expected error, with the result.
#[cfg(test)]
mod tests {
    use std::num::NonZero;

    use super::*;

    /// Character data stops before `&` and before `]]>`, leaving them in the rest.
    #[test]
    fn test_char_data() {
        assert_eq!(Ok(("&def]]>ghi", "abc")), char_data("abc&def]]>ghi"));
        assert_eq!(Ok(("]]>ghi", "abcdef")), char_data("abcdef]]>ghi"));
        assert_eq!(Ok(("&defghi", "abc")), char_data("abc&defghi"));
        assert_eq!(Ok(("]]>def&ghi", "abc")), char_data("abc]]>def&ghi"));
        // A lone `]>` is ordinary character data.
        assert_eq!(Ok(("&ghi", "abc]>def")), char_data("abc]>def&ghi"));
        // No terminator before the end of the input: more input is requested.
        assert_eq!(
            Err(Err::Incomplete(nom::Needed::Size(
                NonZero::new(3usize).unwrap()
            ))),
            char_data("abcdefghi")
        );
    }

    /// Comments yield their body without the delimiters.
    #[test]
    fn test_comment() {
        assert_eq!(Ok(("", "")), comment("<!---->"));
        assert_eq!(Ok(("", "asdf")), comment("<!--asdf-->"));
        // A single `-` inside the body is accepted.
        assert_eq!(Ok(("", "as-df")), comment("<!--as-df-->"));
        // Unterminated comment: more input is requested.
        assert_eq!(
            Err(Err::Incomplete(nom::Needed::Size(
                NonZero::new(2usize).unwrap()
            ))),
            comment("<!--asdf")
        );
    }

    /// `xml` in any letter case is rejected; longer names starting with it are fine.
    #[test]
    fn test_pi_target() {
        assert_eq!(Ok((" ", "asdf")), pi_target("asdf "));
        assert_eq!(Ok((" ", "xmlasdf")), pi_target("xmlasdf "));
        assert_eq!(
            Err(Err::Error(Error {
                input: "xml ",
                code: ErrorKind::Tag
            })),
            pi_target("xml ")
        );
        assert_eq!(
            Err(Err::Error(Error {
                input: "xMl ",
                code: ErrorKind::Tag
            })),
            pi_target("xMl ")
        );
    }

    /// A whole CDATA section splits into start marker, data and end marker.
    #[test]
    fn test_cd_sect() {
        assert_eq!(
            Ok((
                "",
                ("<![CDATA[", "<greeting>Hello, world!</greeting>", "]]>")
            )),
            cd_sect("<![CDATA[<greeting>Hello, world!</greeting>]]>")
        )
    }

    /// The start marker is consumed and the rest is left over.
    #[test]
    fn test_cd_start() {
        assert_eq!(Ok(("asdf", "<![CDATA[")), cd_start("<![CDATA[asdf"))
    }

    /// CDATA text runs up to, but does not include, the first `]]>`.
    #[test]
    fn test_cdata() {
        assert_eq!(Ok(("]]>asdf", "asdf")), cdata("asdf]]>asdf"));
        // Markup-like text, including a start marker, is plain data here.
        assert_eq!(
            Ok(("]]>asdf", "<![CDATA[asdf")),
            cdata("<![CDATA[asdf]]>asdf")
        );
        assert_eq!(
            Ok(("]]>asdf", "<greeting>Hello, world!</greeting>")),
            cdata("<greeting>Hello, world!</greeting>]]>asdf")
        )
    }

    /// The end marker is consumed and the rest is left over.
    #[test]
    fn test_cd_end() {
        assert_eq!(Ok(("asdf", "]]>")), cd_end("]]>asdf"))
    }
}