From a92aee921d6e3cfcb8bf2e08ceefd40a66df940f Mon Sep 17 00:00:00 2001 From: cel 🌸 Date: Wed, 12 Jun 2024 10:15:48 +0100 Subject: WIP: parsers --- src/event.rs | 11 +++ src/lib.rs | 1 + src/parser.rs | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/reader.rs | 14 ++- 4 files changed, 305 insertions(+), 4 deletions(-) diff --git a/src/event.rs b/src/event.rs index 1eab55b..244d3aa 100644 --- a/src/event.rs +++ b/src/event.rs @@ -1 +1,12 @@ // tags, declaration, comments, text. individual bits and what they contain, e.g. tag contains attributes and namespace declarations, lang, ONLY within the tag + +pub enum Event<'s> { + StartTag(Vec>), + EmptyTag(Vec), + Attribute(()) + CData(&'s str), + Comment(&'s str), + Declaration(Vec>), + Attribute((&'str)) + EndTag, +} diff --git a/src/lib.rs b/src/lib.rs index 5d1046f..3d71373 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ mod element; mod error; +mod parser; mod reader; mod writer; diff --git a/src/parser.rs b/src/parser.rs index b2a8579..518aad4 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1 +1,284 @@ +use std::char; + +use nom::{ + branch::alt, + bytes::{ + complete::take_until, + streaming::{is_a, tag, take}, + }, + character::{ + complete::one_of, + streaming::{char, digit1, none_of, satisfy}, + }, + combinator::{cond, map, map_parser, map_res, not, opt, recognize, value, verify}, + error::ErrorKind, + multi::{many0, many1}, + sequence::{delimited, pair, preceded, tuple}, + Err, IResult, Parser, +}; + // parser: parses tokens from lexer into events + +enum Misc<'s> { + Comment(Comment<'s>), + PI(PI<'s>), +} + +type Comment<'s> = &'s str; + +struct PI<'s> { + target: &'s str, + instruction: Option<&'s str>, +} + +enum ContentItem<'s> { + CharData(&'s str), + Element(Element<'s>), + Reference(Reference<'s>), + CDSect(CDSect<'s>), +} + +type Content<'s> = Option>>; + +struct Element<'s> { + name: &'s str, + attributes: Vec>, + content: Content<'s>, +} + +struct Attribute<'s> { + key: &'s str, + value: &'s str, +} + +// type VersionNum<'s> = &'s str; +/// Contains only latin characters or dash after first char +type EncName<'s> = &'s str; + +// struct XMLDecl<'s> { +// version_info: VersionNum<'s>, +// encoding_decl: Option>, +// sd_decl: Option, +// } + +struct DoctypeDecl<'s> { + name: &'s str, + // TODO +} + +pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> { + todo!() +} + +pub fn element(input: &str) -> IResult<&str, Element> { + todo!() +} + +pub fn misc(input: &str) -> IResult<&str, Misc> { + todo!() +} + +type Document<'s> = (Prolog<'s>, Element<'s>, Vec>); +/// [1] document ::= prolog element Misc* +pub fn document(input: &str) -> IResult<&str, Document> { + tuple((prolog, element, many0(misc)))(input) +} + +type Char = char; +/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ +pub fn xmlchar(input: &str) -> IResult<&str, Char> { + satisfy( + |c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'), + )(input) +} + +type S<'s> = &'s str; +/// [3] S ::= (#x20 | #x9 | #xD | #xA)+ +pub fn s(input: &str) -> IResult<&str, S> { + is_a("\u{20}\u{9}\u{D}\u{A}")(input) +} + +type NameStartChar = char; +/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] +pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> { + satisfy( + |c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'), + )(input) +} + +type NameChar = char; +/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] +pub fn name_char(input: &str) -> IResult<&str, NameChar> { + alt(( + name_start_char, + satisfy( + |c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'), + ), + ))(input) +} + +type Name<'s> = &'s str; +/// [5] Name ::= NameStartChar (NameChar)* +pub fn name(input: &str) -> IResult<&str, Name> { + recognize(pair(name_start_char, many0(name_char)))(input) +} + +type Names<'s> = &'s str; +/// [6] Names ::= Name (#x20 Name)* +pub fn names(input: &str) -> IResult<&str, Names> { + recognize(pair(name, many0(pair(char('\u{20}'), name))))(input) +} + +type Nmtoken<'s> = &'s str; +/// [7] Nmtoken ::= (NameChar)+ +pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> { + recognize(many1(name_char))(input) +} + +type Nmtokens<'s> = &'s str; +/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* +pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> { + recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input) +} + +type EntityValue<'s> = &'s str; +/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' +/// | "'" ([^%&'] | PEReference | Reference)* "'" +pub fn entity_value(input: &str) -> IResult<&str, EntityValue> { + alt(( + delimited( + char('"'), + recognize(many0(alt((none_of("%&\""), pe_reference, reference)))), + char('"'), + ), + delimited( + char('\''), + recognize(many0(alt((none_of("%&'"), pe_reference, reference)))), + char('\''), + ), + ))(input) +} + +type AttValue<'s> = &'s str; +/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"' +/// | "'" ([^<&'] | Reference)* "'" +pub fn att_value(input: &str) -> IResult<&str, AttValue> { + alt(( + delimited( + char('"'), + recognize(many0(alt((none_of("<&\""), reference)))), + char('"'), + ), + delimited( + char('\''), + recognize(many0(alt((none_of("<&'"), reference)))), + char('\''), + ), + ))(input) +} + +type SystemLiteral<'s> = &'s str; +/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") +pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> { + alt(( + delimited(char('"'), recognize(many0(none_of("\""))), char('"')), + delimited(char('\''), recognize(many0(none_of("'"))), char('\'')), + ))(input) +} + +type PubidLiteral<'s> = &'s str; +/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" +pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> { + alt(( + delimited(char('"'), recognize(many0(pubid_char)), char('"')), + delimited( + char('\''), + recognize(many0(recognize(not(char('\''))).and_then(pubid_char))), + char('\''), + ), + ))(input) +} + +type PubidChar<'s> = char; +/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] +pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> { + satisfy(|c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9'))( + input, + ) +} + +type CharData<'s> = &'s str; +/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) +pub fn char_data(input: &str) -> IResult<&str, CharData> { + take_until()(input) +} + +type Prolog<'s> = ( + Option, + Vec>, + Option<(DoctypeDecl<'s>, Vec>)>, +); +/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? +pub fn prolog(input: &str) -> IResult<&str, Prolog> { + tuple(( + opt(xml_decl), + many0(misc), + opt(tuple((doctypedecl, many0(misc)))), + ))(input) +} + +struct XMLDecl { + version_info: VersionInfo, + encoding_decl: Option, + sd_decl: Option, +} +/// [23] XMLDecl ::= '' +pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> { + // (VersionInfo, Option, Option) + let (leftover, (version_info, encoding_decl, sd_decl)) = delimited( + tag(""), + )(input)?; + Ok(( + leftover, + XMLDecl { + version_info, + encoding_decl, + sd_decl, + }, + )) +} + +type VersionInfo = VersionNum; +/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') +pub fn version_info(input: &str) -> IResult<&str, VersionInfo> { + preceded( + tuple((s, tag("version"), eq)), + alt(( + delimited(char('\''), version_num, char('\'')), + delimited(char('"'), version_num, char('"')), + )), + )(input) +} + +/// [25] Eq ::= S? '=' S? +pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> { + tuple((opt(s), char('='), opt(s)))(input) +} + +#[derive(Clone)] +enum VersionNum { + One, + OneDotOne, +} +/// [26] VersionNum ::= '1.' [0-9]+ +pub fn version_num(input: &str) -> IResult<&str, VersionNum> { + preceded( + tag("1."), + alt(( + value(VersionNum::One, char('0')), + value(VersionNum::OneDotOne, char('1')), + )), + )(input) +} diff --git a/src/reader.rs b/src/reader.rs index 05afc73..26e540e 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,5 +1,5 @@ use futures::Stream; -use tokio::io::AsyncRead; +use tokio::io::AsyncBufRead; use crate::{ element::{Element, Name, Namespace}, @@ -14,13 +14,19 @@ pub struct Reader { namespaces: Vec<(usize, Namespace)>, } -impl Reader { - pub async fn read(&self) -> Result, Error> {} +impl Reader +where + R: AsyncBufRead, +{ + pub async fn read(&self) -> Result, Error> { + let buf = self.stream.poll_fill_buf().await?; + todo!() + } pub async fn read_start(&self) -> Result, Error> {} pub async fn read_end(&self) -> Result<(), Error> {} } -impl Stream for Reader { +impl Stream for Reader { type Item = impl From; async fn poll_next( -- cgit