From 1f0103cbecc6a4dfe3f34fb6441d4d491b385142 Mon Sep 17 00:00:00 2001 From: cel 🌸 Date: Thu, 27 Jun 2024 20:22:16 +0100 Subject: WIP: stream parsing --- Cargo.lock | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 +- src/element.rs | 4 +- src/error.rs | 20 ++++++- src/main.rs | 2 +- src/reader.rs | 46 ++++++++++++--- 6 files changed, 239 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f9658ed..530c7ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -38,6 +38,12 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bytes" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" + [[package]] name = "cc" version = "1.0.89" @@ -173,6 +179,17 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + [[package]] name = "nom" version = "7.1.3" @@ -243,6 +260,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "syn" version = "2.0.52" @@ -261,7 +288,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", + "bytes", + "libc", + "mio", "pin-project-lite", + "socket2", + "windows-sys 0.48.0", ] [[package]] @@ -269,3 +301,148 @@ name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + 
+[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.5", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + +[[package]] +name = 
"windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/Cargo.toml b/Cargo.toml index ca9d389..224673e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,4 +8,4 @@ edition = "2021" [dependencies] futures = "0.3.30" nom = "7.1.3" -tokio = "1.36.0" +tokio = { version = "1.36.0", features = ["io-util", "net"] } diff --git a/src/element.rs b/src/element.rs index 3273ba0..4dcb616 100644 --- a/src/element.rs +++ b/src/element.rs @@ -23,13 +23,13 @@ pub enum Node { // should this be a trait? pub struct Element { name: Name, - // namespace: (Name, String), // can't have this, must be external method that is called within the context of a reader/writer + // namespace: Name, // each element once created contains the qualified namespace information for that element // the name contains the qualified namespace so this is unnecessary // namespace: String, // hashmap of explicit namespace declarations on the element itself only // possibly not needed as can be calculated at write time depending on context and qualified namespace, and for reading, element validity and namespaces are kept track of by the reader. 
- // namespaces: HashMap, String>, + namespaces: HashMap, String>, // attributes can be in a different namespace than the element. how to make sure they are valid? // maybe include the namespace instead of or with the prefix // you can calculate the prefix from the namespaced name and the current writer context diff --git a/src/error.rs b/src/error.rs index 12fcaf2..78508ae 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1 +1,19 @@ -pub enum Error {} +use std::str::Utf8Error; + +pub enum Error { + ReadError(std::io::Error), + Utf8Error(Utf8Error), + ParseError(String), +} + +impl From<std::io::Error> for Error { + fn from(e: std::io::Error) -> Self { + Self::ReadError(e) + } +} + +impl From<Utf8Error> for Error { + fn from(e: Utf8Error) -> Self { + Self::Utf8Error(e) + } +} diff --git a/src/main.rs b/src/main.rs index 424046e..b08c197 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,4 @@ -use peanuts::parser::document; +use peanuts::xml::document; fn main() { let document = document( diff --git a/src/reader.rs b/src/reader.rs index 6e622f4..2785c88 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,5 +1,7 @@ use futures::Stream; -use tokio::io::AsyncBufRead; +use nom::Err; +use std::str; +use tokio::io::AsyncBufReadExt; use crate::{ element::{Element, Name, Namespace}, @@ -8,20 +10,50 @@ use crate::{ /// streaming reader that tracks depth and available namespaces at current depth pub struct Reader<R> { - stream: R, + inner: R, // holds which tags we are in atm over depth depth: Vec, namespaces: Vec<(usize, Namespace)>, } +impl<R> Reader<R> { + pub fn new(reader: R) -> Self { + Self { + inner: reader, + depth: Vec::new(), + namespaces: Vec::new(), + } + } +} + impl<R> Reader<R> where - R: AsyncBufRead, + R: AsyncBufReadExt + Unpin, { - // pub async fn read(&self) -> Result, Error> { - // let buf = self.stream.poll_fill_buf().await?; - // todo!() - // } + /// reads entire next prolog, element, or misc + pub async fn read<'s>(&'s mut self) -> Result, Error> { + let element; + let len; + loop { + let 
buf = self.inner.fill_buf().await?; + let input = str::from_utf8(buf)?; + match crate::xml::element(input) { + Ok((rest, e)) => { + element = e; + len = buf.len() - rest.len(); + break; + } + Err(e) => match e { + Err::Incomplete(_) => (), + e => return Err(Error::ParseError(input.to_owned())), + }, + } + } + self.inner.consume(len); + + // Ok(element) + todo!() + } // pub async fn read_start(&self) -> Result, Error> { // todo!() // } -- cgit