WIP: parsers

author: cel 🌸 <cel@blos.sm> 2024-06-12 10:15:48 +0100
committer: cel 🌸 <cel@blos.sm> 2024-06-12 10:15:48 +0100
commit: a92aee921d6e3cfcb8bf2e08ceefd40a66df940f (patch)
tree: c60ee2a490f99a7d5861c865a9788660af213074 /src
parent: 844f3a5d11e4360e9d6bdb79cfed49287aa8b14d (diff)
download: peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.tar.gz
peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.tar.bz2
peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.zip
4 files changed, 305 insertions, 4 deletions
diff --git a/src/event.rs b/src/event.rs
index 1eab55b..244d3aa 100644
--- a/src/event.rs
+++ b/src/event.rs
@@ -1 +1,12 @@
 // tags, declaration, comments, text. individual bits and what they contain, e.g. tag contains attributes and namespace declarations, lang, ONLY within the tag
+
+pub enum Event<'s> {
+    StartTag(Vec<Event<'s>>),
+    EmptyTag(Vec<Event<'s>>),
+    // NOTE(review): the two conflicting `Attribute` drafts are collapsed into the single variant below — confirm the intended payload
+    CData(&'s str),
+    Comment(&'s str),
+    Declaration(Vec<Attribute<'s>>), // NOTE(review): no `Attribute` type is in scope in this file — confirm where it should come from
+    Attribute(&'s str),
+    EndTag,
+}
diff --git a/src/lib.rs b/src/lib.rs
index 5d1046f..3d71373 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,6 @@
 mod element;
 mod error;
+mod parser;
 mod reader;
 mod writer;
 
diff --git a/src/parser.rs b/src/parser.rs
index b2a8579..518aad4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1 +1,284 @@
+use std::char;
+
+use nom::{
+    branch::alt,
+    bytes::{
+        complete::take_until,
+        streaming::{is_a, tag, take},
+    },
+    character::{
+        complete::one_of,
+        streaming::{char, digit1, none_of, satisfy},
+    },
+    combinator::{cond, map, map_parser, map_res, not, opt, recognize, value, verify},
+    error::ErrorKind,
+    multi::{many0, many1},
+    sequence::{delimited, pair, preceded, tuple},
+    Err, IResult, Parser,
+};
+
 // parser: parses tokens from lexer into events
+
+enum Misc<'s> {
+    Comment(Comment<'s>),
+    PI(PI<'s>),
+}
+
+type Comment<'s> = &'s str;
+
+struct PI<'s> {
+    target: &'s str,
+    instruction: Option<&'s str>,
+}
+
+enum ContentItem<'s> {
+    CharData(&'s str),
+    Element(Element<'s>),
+    Reference(Reference<'s>),
+    CDSect(CDSect<'s>),
+}
+
+type Content<'s> = Option<Vec<ContentItem<'s>>>;
+
+struct Element<'s> {
+    name: &'s str,
+    attributes: Vec<Attribute<'s>>,
+    content: Content<'s>,
+}
+
+struct Attribute<'s> {
+    key: &'s str,
+    value: &'s str,
+}
+
+// type VersionNum<'s> = &'s str;
+/// Contains only latin characters or dash after first char
+type EncName<'s> = &'s str;
+
+// struct XMLDecl<'s> {
+//     version_info: VersionNum<'s>,
+//     encoding_decl: Option<EncName<'s>>,
+//     sd_decl: Option<bool>,
+// }
+
+struct DoctypeDecl<'s> {
+    name: &'s str,
+    // TODO
+}
+
+pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
+    todo!()
+}
+
+pub fn element(input: &str) -> IResult<&str, Element> {
+    todo!()
+}
+
+pub fn misc(input: &str) -> IResult<&str, Misc> {
+    todo!()
+}
+
+type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
+/// [1]   	document	   ::=   	prolog element Misc*
+pub fn document(input: &str) -> IResult<&str, Document> {
+    tuple((prolog, element, many0(misc)))(input)
+}
+
+type Char = char;
+/// [2]   	Char	   ::=   	#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]	/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
+pub fn xmlchar(input: &str) -> IResult<&str, Char> {
+    satisfy(
+        |c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'),
+    )(input)
+}
+
+type S<'s> = &'s str;
+/// [3]   	S	   ::=   	(#x20 | #x9 | #xD | #xA)+
+pub fn s(input: &str) -> IResult<&str, S> {
+    is_a("\u{20}\u{9}\u{D}\u{A}")(input)
+}
+
+type NameStartChar = char;
+/// [4]   	NameStartChar	   ::=   	":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
+    satisfy(
+        |c| matches!(c, ':' | 'A'..='Z' | '_' | 'a'..='z' | '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' | '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' | '\u{10000}'..='\u{EFFFF}'),
+    )(input)
+}
+
+type NameChar = char;
+/// [4a]   	NameChar	   ::=   	NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+pub fn name_char(input: &str) -> IResult<&str, NameChar> {
+    alt((
+        name_start_char,
+        satisfy(
+            |c| matches!(c, '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'),
+        ),
+    ))(input)
+}
+
+type Name<'s> = &'s str;
+/// [5]   	Name	   ::=   	NameStartChar (NameChar)*
+pub fn name(input: &str) -> IResult<&str, Name> {
+    recognize(pair(name_start_char, many0(name_char)))(input)
+}
+
+type Names<'s> = &'s str;
+/// [6]   	Names	   ::=   	Name (#x20 Name)*
+pub fn names(input: &str) -> IResult<&str, Names> {
+    recognize(pair(name, many0(pair(char('\u{20}'), name))))(input)
+}
+
+type Nmtoken<'s> = &'s str;
+/// [7]   	Nmtoken	   ::=   	(NameChar)+
+pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
+    recognize(many1(name_char))(input)
+}
+
+type Nmtokens<'s> = &'s str;
+/// [8]   	Nmtokens	   ::=   	Nmtoken (#x20 Nmtoken)*
+pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
+    recognize(pair(nmtoken, many0(pair(char('\u{20}'), nmtoken))))(input)
+}
+
+type EntityValue<'s> = &'s str;
+/// [9]   	EntityValue	   ::=   	'"' ([^%&"] | PEReference | Reference)* '"'
+///			|  "'" ([^%&'] | PEReference | Reference)* "'"
+pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
+    alt((
+        delimited(
+            char('"'),
+            recognize(many0(alt((none_of("%&\""), pe_reference, reference)))),
+            char('"'),
+        ),
+        delimited(
+            char('\''),
+            recognize(many0(alt((none_of("%&'"), pe_reference, reference)))),
+            char('\''),
+        ),
+    ))(input)
+}
+
+type AttValue<'s> = &'s str;
+/// [10]   	AttValue	   ::=   	'"' ([^<&"] | Reference)* '"'
+/// 			|  "'" ([^<&'] | Reference)* "'"
+pub fn att_value(input: &str) -> IResult<&str, AttValue> {
+    alt((
+        delimited(
+            char('"'),
+            recognize(many0(alt((none_of("<&\""), reference)))),
+            char('"'),
+        ),
+        delimited(
+            char('\''),
+            recognize(many0(alt((none_of("<&'"), reference)))),
+            char('\''),
+        ),
+    ))(input)
+}
+
+type SystemLiteral<'s> = &'s str;
+/// [11]   	SystemLiteral	   ::=   	('"' [^"]* '"') | ("'" [^']* "'")
+pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
+    alt((
+        delimited(char('"'), recognize(many0(none_of("\""))), char('"')),
+        delimited(char('\''), recognize(many0(none_of("'"))), char('\'')),
+    ))(input)
+}
+
+type PubidLiteral<'s> = &'s str;
+/// [12]   	PubidLiteral	   ::=   	'"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
+pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
+    alt((
+        delimited(char('"'), recognize(many0(pubid_char)), char('"')),
+        delimited(
+            char('\''),
+            recognize(many0(preceded(not(char('\'')), pubid_char))), // (PubidChar - "'"): lookahead rejects the quote, then one PubidChar is consumed
+            char('\''),
+        ),
+    ))(input)
+}
+
+type PubidChar<'s> = char;
+/// [13]   	PubidChar	   ::=   	#x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
+pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
+    satisfy( // accepts the whitespace chars, ASCII alphanumerics and the punctuation class of the production
+        |c| matches!(c, '\u{20}' | '\u{D}' | '\u{A}' | 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '\'' | '(' | ')' | '+' | ',' | '.' | '/' | ':' | '=' | '?' | ';' | '!' | '*' | '#' | '@' | '$' | '_' | '%'),
+    )(input)
+}
+
+type CharData<'s> = &'s str;
+/// [14]   	CharData	   ::=   	[^<&]* - ([^<&]* ']]>' [^<&]*)
+pub fn char_data(input: &str) -> IResult<&str, CharData> {
+    take_until()(input) // FIXME: nom's `take_until` requires a pattern argument; unfinished as written
+}
+
+type Prolog<'s> = (
+    Option<XMLDecl>,
+    Vec<Misc<'s>>,
+    Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
+);
+/// [22]   	prolog	   ::=   	XMLDecl? Misc* (doctypedecl Misc*)?
+pub fn prolog(input: &str) -> IResult<&str, Prolog> {
+    tuple((
+        opt(xml_decl),
+        many0(misc),
+        opt(tuple((doctypedecl, many0(misc)))),
+    ))(input)
+}
+
+struct XMLDecl {
+    version_info: VersionInfo,
+    encoding_decl: Option<EncodingDecl>,
+    sd_decl: Option<SDDecl>,
+}
+/// [23]   	XMLDecl	   ::=   	'<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
+    // the inner tuple yields (VersionInfo, Option<EncodingDecl>, Option<SDDecl>)
+    let (leftover, (version_info, encoding_decl, sd_decl)) = delimited(
+        tag("<?xml"),
+        tuple((version_info, opt(encoding_decl), opt(sd_decl))),
+        pair(opt(s), tag("?>")), // the production permits optional whitespace (S?) before '?>'
+    )(input)?;
+    Ok((
+        leftover,
+        XMLDecl {
+            version_info,
+            encoding_decl,
+            sd_decl,
+        },
+    ))
+}
+
+type VersionInfo = VersionNum;
+/// [24]   	VersionInfo	   ::=   	S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
+pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
+    preceded(
+        tuple((s, tag("version"), eq)),
+        alt((
+            delimited(char('\''), version_num, char('\'')),
+            delimited(char('"'), version_num, char('"')),
+        )),
+    )(input)
+}
+
+/// [25]   	Eq	   ::=   	S? '=' S?
+pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> {
+    tuple((opt(s), char('='), opt(s)))(input)
+}
+
+#[derive(Clone)]
+enum VersionNum {
+    One,
+    OneDotOne,
+}
+/// [26]   	VersionNum	   ::=   	'1.' [0-9]+
+pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
+    preceded(
+        tag("1."),
+        alt((
+            value(VersionNum::One, char('0')),
+            value(VersionNum::OneDotOne, char('1')),
+        )),
+    )(input)
+}
diff --git a/src/reader.rs b/src/reader.rs
index 05afc73..26e540e 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -1,5 +1,5 @@
 use futures::Stream;
-use tokio::io::AsyncRead;
+use tokio::io::AsyncBufRead;
 
 use crate::{
     element::{Element, Name, Namespace},
@@ -14,13 +14,19 @@ pub struct Reader<R> {
     namespaces: Vec<(usize, Namespace)>,
 }
 
-impl<R: AsyncRead> Reader<R> {
-    pub async fn read(&self) -> Result<impl From<Element>, Error> {}
+impl<R> Reader<R>
+where
+    R: AsyncBufRead,
+{
+    pub async fn read(&self) -> Result<impl From<Element>, Error> {
+        let buf = self.stream.poll_fill_buf().await?;
+        todo!()
+    }
     pub async fn read_start(&self) -> Result<impl From<Element>, Error> {}
     pub async fn read_end(&self) -> Result<(), Error> {}
 }
 
-impl<R: AsyncRead> Stream for Reader<R> {
+impl<R: AsyncBufRead> Stream for Reader<R> {
     type Item = impl From<Element>;
 
     async fn poll_next(
author	cel 🌸 <cel@blos.sm>	2024-06-12 10:15:48 +0100
committer	cel 🌸 <cel@blos.sm>	2024-06-12 10:15:48 +0100
commit	a92aee921d6e3cfcb8bf2e08ceefd40a66df940f (patch)
tree	c60ee2a490f99a7d5861c865a9788660af213074 /src
parent	844f3a5d11e4360e9d6bdb79cfed49287aa8b14d (diff)
download	peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.tar.gz peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.tar.bz2 peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.zip