aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar cel 🌸 <cel@blos.sm>2024-06-12 10:15:48 +0100
committerLibravatar cel 🌸 <cel@blos.sm>2024-06-12 10:15:48 +0100
commita92aee921d6e3cfcb8bf2e08ceefd40a66df940f (patch)
treec60ee2a490f99a7d5861c865a9788660af213074
parent844f3a5d11e4360e9d6bdb79cfed49287aa8b14d (diff)
downloadpeanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.tar.gz
peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.tar.bz2
peanuts-a92aee921d6e3cfcb8bf2e08ceefd40a66df940f.zip
WIP: parsers
-rw-r--r--src/event.rs11
-rw-r--r--src/lib.rs1
-rw-r--r--src/parser.rs283
-rw-r--r--src/reader.rs14
4 files changed, 305 insertions, 4 deletions
diff --git a/src/event.rs b/src/event.rs
index 1eab55b..244d3aa 100644
--- a/src/event.rs
+++ b/src/event.rs
@@ -1 +1,12 @@
// tags, declaration, comments, text. individual bits and what they contain, e.g. tag contains attributes and namespace declarations, lang, ONLY within the tag
+
+/// A single parsed XML event whose text is borrowed from the source buffer `'s`.
+///
+/// Work in progress: the payload of several variants is still provisional.
+pub enum Event<'s> {
+    /// An opening tag, carrying the events found inside the tag itself.
+    StartTag(Vec<Event<'s>>),
+    /// A self-closing tag, carrying the events found inside the tag itself.
+    EmptyTag(Vec<Event<'s>>),
+    /// An attribute.
+    // NOTE(review): the original declared this variant twice, as `Attribute(())` and
+    // `Attribute((&'str))`; they are merged into one borrowed-text payload — confirm the
+    // intended shape.
+    Attribute(&'s str),
+    /// A CDATA section's text.
+    CData(&'s str),
+    /// A comment's text.
+    Comment(&'s str),
+    /// A declaration and its attributes.
+    // NOTE(review): `Attribute` in type position is not declared in this file; import or
+    // define the intended type before this compiles.
+    Declaration(Vec<Attribute<'s>>),
+    /// A closing tag.
+    EndTag,
+}
diff --git a/src/lib.rs b/src/lib.rs
index 5d1046f..3d71373 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,6 @@
mod element;
mod error;
+mod parser;
mod reader;
mod writer;
diff --git a/src/parser.rs b/src/parser.rs
index b2a8579..518aad4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1 +1,284 @@
+use std::char;
+
+use nom::{
+ branch::alt,
+ bytes::{
+ complete::take_until,
+ streaming::{is_a, tag, take},
+ },
+ character::{
+ complete::one_of,
+ streaming::{char, digit1, none_of, satisfy},
+ },
+ combinator::{cond, map, map_parser, map_res, not, opt, recognize, value, verify},
+ error::ErrorKind,
+ multi::{many0, many1},
+ sequence::{delimited, pair, preceded, tuple},
+ Err, IResult, Parser,
+};
+
// parser: parses tokens from lexer into events
+
+/// Markup that is either a comment or a processing instruction.
+enum Misc<'s> {
+ /// A comment, held as a borrowed string slice.
+ Comment(Comment<'s>),
+ /// A processing instruction (target plus optional instruction text).
+ PI(PI<'s>),
+}
+
+/// A comment, represented as a slice borrowed from the input.
+type Comment<'s> = &'s str;
+
+/// A processing instruction; both fields borrow from the input.
+struct PI<'s> {
+ /// The target name of the instruction.
+ target: &'s str,
+ /// The instruction text, if any was given.
+ instruction: Option<&'s str>,
+}
+
+/// One item inside an element's content.
+enum ContentItem<'s> {
+ /// Character data, borrowed from the input.
+ CharData(&'s str),
+ /// A nested element.
+ Element(Element<'s>),
+ /// A reference. NOTE(review): `Reference` is not declared in the visible part of this diff.
+ Reference(Reference<'s>),
+ /// A CDATA section. NOTE(review): `CDSect` is not declared in the visible part of this diff.
+ CDSect(CDSect<'s>),
+}
+
+/// The content of an element: either absent or a list of content items.
+// NOTE(review): presumably `None` distinguishes an empty-element tag from an empty list — confirm.
+type Content<'s> = Option<Vec<ContentItem<'s>>>;
+
+/// A parsed element: its name, its attributes and its (optional) content.
+struct Element<'s> {
+ /// The element name, borrowed from the input.
+ name: &'s str,
+ /// The attributes attached to the element.
+ attributes: Vec<Attribute<'s>>,
+ /// The element's content, see [`Content`].
+ content: Content<'s>,
+}
+
+/// A single attribute as a key/value pair of slices borrowed from the input.
+struct Attribute<'s> {
+ /// The attribute name.
+ key: &'s str,
+ /// The attribute value.
+ value: &'s str,
+}
+
+// type VersionNum<'s> = &'s str;
+/// An encoding name, borrowed from the input.
+///
+/// [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+/// i.e. a Latin letter followed by Latin letters, digits, `.`, `_` or `-`.
+type EncName<'s> = &'s str;
+
+// struct XMLDecl<'s> {
+// version_info: VersionNum<'s>,
+// encoding_decl: Option<EncName<'s>>,
+// sd_decl: Option<bool>,
+// }
+
+/// A document type declaration. Only the name is modelled so far.
+struct DoctypeDecl<'s> {
+ /// The name given in the declaration, borrowed from the input.
+ name: &'s str,
+ // TODO
+}
+
+/// Parser for a document type declaration.
+///
+/// Not implemented yet: calling it panics via `todo!()`.
+pub fn doctypedecl(input: &str) -> IResult<&str, DoctypeDecl> {
+ todo!()
+}
+
+/// Parser for an element.
+///
+/// Not implemented yet: calling it panics via `todo!()`.
+pub fn element(input: &str) -> IResult<&str, Element> {
+ todo!()
+}
+
+/// Parser for a [`Misc`] item (comment or processing instruction).
+///
+/// Not implemented yet: calling it panics via `todo!()`.
+pub fn misc(input: &str) -> IResult<&str, Misc> {
+ todo!()
+}
+
+/// The parts of a document in source order: prolog, root element, trailing misc items.
+type Document<'s> = (Prolog<'s>, Element<'s>, Vec<Misc<'s>>);
+/// [1] document ::= prolog element Misc*
+///
+/// Runs the three sub-parsers one after another and returns their results as a tuple
+/// together with the unconsumed remainder of `input`.
+pub fn document(input: &str) -> IResult<&str, Document> {
+    let (after_prolog, head) = prolog(input)?;
+    let (after_root, root) = element(after_prolog)?;
+    let (remainder, trailing) = many0(misc)(after_root)?;
+    Ok((remainder, (head, root, trailing)))
+}
+
+type Char = char;
+/// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
+///
+/// Consumes exactly one character that belongs to the `Char` production.
+pub fn xmlchar(input: &str) -> IResult<&str, Char> {
+    /// Membership test for the `Char` production.
+    fn is_xml_char(c: char) -> bool {
+        matches!(
+            c,
+            '\u{9}'
+                | '\u{A}'
+                | '\u{D}'
+                | '\u{20}'..='\u{D7FF}'
+                | '\u{E000}'..='\u{FFFD}'
+                | '\u{10000}'..='\u{10FFFF}'
+        )
+    }
+
+    satisfy(is_xml_char)(input)
+}
+
+type S<'s> = &'s str;
+/// [3] S ::= (#x20 | #x9 | #xD | #xA)+
+///
+/// Recognizes a non-empty run of XML whitespace and returns it as a slice.
+pub fn s(input: &str) -> IResult<&str, S> {
+    // The four characters admitted by the production: space, tab, CR, LF.
+    const WHITESPACE: &str = "\u{20}\u{9}\u{D}\u{A}";
+    is_a(WHITESPACE)(input)
+}
+
+type NameStartChar = char;
+/// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+///
+/// Consumes exactly one character that may begin a name.
+pub fn name_start_char(input: &str) -> IResult<&str, NameStartChar> {
+    /// Membership test for the `NameStartChar` production.
+    fn starts_name(c: char) -> bool {
+        matches!(
+            c,
+            ':' | 'A'..='Z'
+                | '_'
+                | 'a'..='z'
+                | '\u{C0}'..='\u{D6}'
+                | '\u{D8}'..='\u{F6}'
+                | '\u{F8}'..='\u{2FF}'
+                | '\u{370}'..='\u{37D}'
+                | '\u{37F}'..='\u{1FFF}'
+                | '\u{200C}'..='\u{200D}'
+                | '\u{2070}'..='\u{218F}'
+                | '\u{2C00}'..='\u{2FEF}'
+                | '\u{3001}'..='\u{D7FF}'
+                | '\u{F900}'..='\u{FDCF}'
+                | '\u{FDF0}'..='\u{FFFD}'
+                | '\u{10000}'..='\u{EFFFF}'
+        )
+    }
+
+    satisfy(starts_name)(input)
+}
+
+type NameChar = char;
+/// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+///
+/// Consumes exactly one character that may appear inside a name: first tries
+/// [`name_start_char`], then the additional characters listed in the production.
+pub fn name_char(input: &str) -> IResult<&str, NameChar> {
+    /// Characters allowed in a name but not as its first character.
+    fn continues_name(c: char) -> bool {
+        matches!(
+            c,
+            '-' | '.'
+                | '0'..='9'
+                | '\u{B7}'
+                | '\u{0300}'..='\u{036F}'
+                | '\u{203F}'..='\u{2040}'
+        )
+    }
+
+    alt((name_start_char, satisfy(continues_name)))(input)
+}
+
+type Name<'s> = &'s str;
+/// [5] Name ::= NameStartChar (NameChar)*
+///
+/// Returns the matched name as a slice of `input`.
+pub fn name(input: &str) -> IResult<&str, Name> {
+    // Only the consumed span matters to `recognize`, so the individual chars are discarded.
+    let tail = many0(name_char);
+    recognize(preceded(name_start_char, tail))(input)
+}
+
+type Names<'s> = &'s str;
+/// [6] Names ::= Name (#x20 Name)*
+///
+/// Returns the whole space-separated list of names as one slice of `input`.
+pub fn names(input: &str) -> IResult<&str, Names> {
+    // Every further name must be introduced by exactly one #x20.
+    let more_names = many0(preceded(char('\u{20}'), name));
+    recognize(preceded(name, more_names))(input)
+}
+
+type Nmtoken<'s> = &'s str;
+/// [7] Nmtoken ::= (NameChar)+
+///
+/// Returns the matched token (one or more name characters) as a slice of `input`.
+pub fn nmtoken(input: &str) -> IResult<&str, Nmtoken> {
+    let one_or_more = many1(name_char);
+    recognize(one_or_more)(input)
+}
+
+type Nmtokens<'s> = &'s str;
+/// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+///
+/// Returns the whole space-separated list of tokens as one slice of `input`.
+pub fn nmtokens(input: &str) -> IResult<&str, Nmtokens> {
+    // Every further token must be introduced by exactly one #x20.
+    let more_tokens = many0(preceded(char('\u{20}'), nmtoken));
+    recognize(preceded(nmtoken, more_tokens))(input)
+}
+
+type EntityValue<'s> = &'s str;
+/// [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
+/// | "'" ([^%&'] | PEReference | Reference)* "'"
+///
+/// Returns the text between the quotes as a slice of `input`; the quotes themselves are
+/// consumed but not included.
+pub fn entity_value(input: &str) -> IResult<&str, EntityValue> {
+    // All branches of an `alt` must produce the same output type. `none_of` yields a `char`,
+    // so every branch is wrapped in `recognize` to make each one yield `&str`.
+    // NOTE(review): `pe_reference` and `reference` are not visible in this diff; the wrapping
+    // works whatever they return.
+    alt((
+        delimited(
+            char('"'),
+            recognize(many0(alt((
+                recognize(none_of("%&\"")),
+                recognize(pe_reference),
+                recognize(reference),
+            )))),
+            char('"'),
+        ),
+        delimited(
+            char('\''),
+            recognize(many0(alt((
+                recognize(none_of("%&'")),
+                recognize(pe_reference),
+                recognize(reference),
+            )))),
+            char('\''),
+        ),
+    ))(input)
+}
+
+type AttValue<'s> = &'s str;
+/// [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
+/// | "'" ([^<&'] | Reference)* "'"
+///
+/// Returns the text between the quotes as a slice of `input`; the quotes themselves are
+/// consumed but not included.
+pub fn att_value(input: &str) -> IResult<&str, AttValue> {
+    // All branches of an `alt` must produce the same output type. `none_of` yields a `char`,
+    // so both branches are wrapped in `recognize` to make each one yield `&str`.
+    // NOTE(review): `reference` is not visible in this diff; the wrapping works whatever it
+    // returns.
+    alt((
+        delimited(
+            char('"'),
+            recognize(many0(alt((
+                recognize(none_of("<&\"")),
+                recognize(reference),
+            )))),
+            char('"'),
+        ),
+        delimited(
+            char('\''),
+            recognize(many0(alt((
+                recognize(none_of("<&'")),
+                recognize(reference),
+            )))),
+            char('\''),
+        ),
+    ))(input)
+}
+
+type SystemLiteral<'s> = &'s str;
+/// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
+///
+/// Returns the text between the quotes as a slice of `input`.
+pub fn system_literal(input: &str) -> IResult<&str, SystemLiteral> {
+    // Each form may contain anything except its own delimiter.
+    let double_quoted = delimited(char('"'), recognize(many0(none_of("\""))), char('"'));
+    let single_quoted = delimited(char('\''), recognize(many0(none_of("'"))), char('\''));
+    alt((double_quoted, single_quoted))(input)
+}
+
+type PubidLiteral<'s> = &'s str;
+/// [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
+///
+/// Returns the text between the quotes as a slice of `input`. Inside the single-quoted form
+/// an apostrophe ends the literal instead of being taken as a `PubidChar`.
+pub fn pubid_literal(input: &str) -> IResult<&str, PubidLiteral> {
+    alt((
+        delimited(char('"'), recognize(many0(pubid_char)), char('"')),
+        delimited(
+            char('\''),
+            // `not` is a zero-width lookahead, so it is sequenced *before* `pubid_char`.
+            // Chaining it with `and_then` would run `pubid_char` over the empty slice that
+            // `not` leaves behind, which can never match.
+            recognize(many0(preceded(not(char('\'')), pubid_char))),
+            char('\''),
+        ),
+    ))(input)
+}
+
+type PubidChar<'s> = char;
+/// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
+///
+/// Consumes exactly one character allowed in a public identifier, including the punctuation
+/// class at the end of the production.
+pub fn pubid_char(input: &str) -> IResult<&str, PubidChar> {
+    satisfy(|c| {
+        matches!(
+            c,
+            '\u{20}'
+                | '\u{D}'
+                | '\u{A}'
+                | 'a'..='z'
+                | 'A'..='Z'
+                | '0'..='9'
+                | '-'
+                | '\''
+                | '('
+                | ')'
+                | '+'
+                | ','
+                | '.'
+                | '/'
+                | ':'
+                | '='
+                | '?'
+                | ';'
+                | '!'
+                | '*'
+                | '#'
+                | '@'
+                | '$'
+                | '_'
+                | '%'
+        )
+    })(input)
+}
+
+type CharData<'s> = &'s str;
+/// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+///
+/// Returns the longest (possibly empty) prefix of `input` that contains neither `<` nor `&`
+/// and stops before any occurrence of `]]>`.
+pub fn char_data(input: &str) -> IResult<&str, CharData> {
+    // Advance one character at a time, refusing to step onto the start of `]]>`.
+    recognize(many0(preceded(not(tag("]]>")), none_of("<&"))))(input)
+}
+
+/// The parts of a prolog in source order: optional XML declaration, misc items, and an
+/// optional doctype declaration with the misc items that follow it.
+type Prolog<'s> = (
+    Option<XMLDecl>,
+    Vec<Misc<'s>>,
+    Option<(DoctypeDecl<'s>, Vec<Misc<'s>>)>,
+);
+/// [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+///
+/// Runs the sub-parsers one after another and returns their results as a tuple together
+/// with the unconsumed remainder of `input`.
+pub fn prolog(input: &str) -> IResult<&str, Prolog> {
+    let (rest, declaration) = opt(xml_decl)(input)?;
+    let (rest, leading_misc) = many0(misc)(rest)?;
+    let (rest, doctype) = opt(pair(doctypedecl, many0(misc)))(rest)?;
+    Ok((rest, (declaration, leading_misc, doctype)))
+}
+
+/// The contents of an XML declaration.
+struct XMLDecl {
+    /// The declared XML version.
+    version_info: VersionInfo,
+    /// The encoding declaration, if present.
+    encoding_decl: Option<EncodingDecl>,
+    /// The standalone-document declaration, if present.
+    sd_decl: Option<SDDecl>,
+}
+/// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+///
+/// Parses a complete XML declaration, including the optional whitespace that may precede
+/// the closing `?>`.
+pub fn xml_decl(input: &str) -> IResult<&str, XMLDecl> {
+    let (leftover, (version_info, encoding_decl, sd_decl)) = delimited(
+        tag("<?xml"),
+        tuple((version_info, opt(encoding_decl), opt(sd_decl))),
+        // The production allows `S?` before the terminator, e.g. `<?xml version="1.0" ?>`.
+        preceded(opt(s), tag("?>")),
+    )(input)?;
+    Ok((
+        leftover,
+        XMLDecl {
+            version_info,
+            encoding_decl,
+            sd_decl,
+        },
+    ))
+}
+
+type VersionInfo = VersionNum;
+/// [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
+///
+/// Skips the leading whitespace, the `version` keyword and the equals sign, then returns
+/// the quoted version number.
+pub fn version_info(input: &str) -> IResult<&str, VersionInfo> {
+    let (after_eq, _) = tuple((s, tag("version"), eq))(input)?;
+    let single_quoted = delimited(char('\''), version_num, char('\''));
+    let double_quoted = delimited(char('"'), version_num, char('"'));
+    alt((single_quoted, double_quoted))(after_eq)
+}
+
+/// [25] Eq ::= S? '=' S?
+///
+/// Returns the optional whitespace before the sign, the `=` itself, and the optional
+/// whitespace after it.
+pub fn eq(input: &str) -> IResult<&str, (Option<&str>, char, Option<&str>)> {
+    let (rest, before) = opt(s)(input)?;
+    let (rest, sign) = char('=')(rest)?;
+    let (rest, after) = opt(s)(rest)?;
+    Ok((rest, (before, sign, after)))
+}
+
+/// The XML versions this parser distinguishes.
+#[derive(Clone)]
+enum VersionNum {
+    /// Version `1.0`.
+    One,
+    /// Version `1.1`.
+    OneDotOne,
+}
+/// [26] VersionNum ::= '1.' [0-9]+
+///
+/// NOTE(review): only `1.0` and `1.1` are accepted, which is narrower than the production
+/// quoted above; exactly one digit is consumed after `1.`.
+pub fn version_num(input: &str) -> IResult<&str, VersionNum> {
+    let (minor, _) = tag("1.")(input)?;
+    let one_dot_zero = value(VersionNum::One, char('0'));
+    let one_dot_one = value(VersionNum::OneDotOne, char('1'));
+    alt((one_dot_zero, one_dot_one))(minor)
+}
diff --git a/src/reader.rs b/src/reader.rs
index 05afc73..26e540e 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -1,5 +1,5 @@
use futures::Stream;
-use tokio::io::AsyncRead;
+use tokio::io::AsyncBufRead;
use crate::{
element::{Element, Name, Namespace},
@@ -14,13 +14,19 @@ pub struct Reader<R> {
namespaces: Vec<(usize, Namespace)>,
}
-impl<R: AsyncRead> Reader<R> {
- pub async fn read(&self) -> Result<impl From<Element>, Error> {}
+// Reading API. The `where` clause requires the underlying reader to implement `AsyncBufRead`.
+impl<R> Reader<R>
+where
+ R: AsyncBufRead,
+{
+ /// Work in progress: asks `self.stream` for buffered data, then panics via `todo!()`.
+ // NOTE(review): `poll_fill_buf` is the poll-style trait method of `AsyncBufRead` and does not
+ // return a future; the awaitable form is presumably `AsyncBufReadExt::fill_buf`, which would
+ // also need mutable access to the stream — confirm.
+ pub async fn read(&self) -> Result<impl From<Element>, Error> {
+ let buf = self.stream.poll_fill_buf().await?;
+ todo!()
+ }
+// Placeholder: empty body, not implemented yet.
pub async fn read_start(&self) -> Result<impl From<Element>, Error> {}
+// Placeholder: empty body, not implemented yet.
pub async fn read_end(&self) -> Result<(), Error> {}
}
-impl<R: AsyncRead> Stream for Reader<R> {
+impl<R: AsyncBufRead> Stream for Reader<R> {
type Item = impl From<Element>;
async fn poll_next(