From aef4dbb071d6078561fc2244e40113d94b5cd120 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Sun, 24 Mar 2024 12:55:49 -0400 Subject: [PATCH 01/20] add ignored any test --- src/lib.rs | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 3b65a9d..ea54fa2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -428,6 +428,55 @@ mod tests { assert_eq!(actual, TestSeq); } + #[cfg(feature = "value")] + #[tokio::test] + async fn test_ignored_any() { + struct IgnoredValue; + + #[async_trait] + impl FromStream for IgnoredValue { + type Context = (); + async fn from_stream(_: (), decoder: &mut D) -> Result { + use destream::Visitor; + struct IgnoredVisitor; + impl Visitor for IgnoredVisitor { + type Value = IgnoredValue; + fn expecting() -> &'static str { + todo!() + } + } + decoder.decode_ignored_any(IgnoredVisitor).await + } + } + + let encoded = "{ + \"string\": \"Hello, world!\", + \"number\": 42, + \"boolean\": true, + \"array\": [1, 2, 3], + \"object\": {\"key\": \"value\"}, + \"null_value\": null, + \"nested_object\": { + \"nested_string\": \"Nested string\", + \"nested_number\": 3.14, + \"nested_boolean\": false, + \"nested_array\": [\"apple\", \"banana\", \"orange\"], + \"nested_null\": null + }, + \"unicode_characters\": \"💡🌟🔑\", + \"empty_array\": [], + \"empty_object\": {}, + \"multiline_string\": \"This is a\nmultiline\nstring.\", + \"escaped_characters\": \"Escaped characters: \\\" \\\\ \\/ \\b \\f \\n \\r \\t\" + }"; + + let source = stream::iter(encoded.as_bytes().iter().copied()) + .chunks(2) + .map(Bytes::from); + + let _: IgnoredValue = decode((), source).await.unwrap(); + } + #[cfg(feature = "value")] #[tokio::test] async fn test_complex_list_with_err() { From 12d7ef9b792adac924b994e7e3aabdd0750b2f53 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Sun, 24 Mar 2024 23:04:19 -0400 Subject: [PATCH 02/20] fix ignore_value --- Cargo.toml | 2 + src/constants.rs | 53 ++++++ src/de.rs | 443 +++++++++++++++++++++++++++++++++++++++++++++-- src/lib.rs | 13 +- 4 files changed, 495 insertions(+), 16 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3f8ad01..1778437 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ value = ["number-general/stream"] all = ["tokio-io", "value"] [dependencies] +async-recursion = "1.1.0" async-trait = "0.1.78" base64 = "0.22" bytes = "1.5" @@ -33,3 +34,4 @@ number-general = { version = "0.11.1", features=["stream"] } tokio = { version = "1.35", features = ["fs", "macros"] } tokio-util = { version = "0.7", features = ["io"] } tokio-test = "0.4" +test-case = "3.3.1" diff --git a/src/constants.rs b/src/constants.rs index b946f41..d9e0c68 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -14,3 +14,56 @@ pub const NUMERIC: [u8; 15] = [ b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'0', b'-', b'e', b'E', b'.', ]; pub const QUOTE: &[u8] = b"\""; + +// Lookup table of bytes that must be escaped. A value of true at +// index i means that byte i requires an escape sequence in the input. +// taken from +// https://github.com/serde-rs/json/blob/4a0be88b5ac6cda971a52df9f027b551fe566347/src/read.rs#L789 +pub static ESCAPE_CHARS: [bool; 256] = { + const CT: bool = true; // control character \x00..=\x1F + const QU: bool = true; // quote \x22 + const BS: bool = true; // backslash \x5C + const __: bool = false; // allow unescaped + [ + // 1 2 3 4 5 6 7 8 9 A B C D E F + CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 0 + CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 1 + __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 + __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F + ] +}; + +pub static HEX: [u8; 256] = { + const __: u8 = 255; // not a hex digit + [ + // 1 2 3 4 5 6 7 8 9 A B C D E F + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 + 00, 01, 02, 03, 04, 05, 06, 07, 08, 09, __, __, __, __, __, __, // 3 + __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 4 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 5 + __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 6 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F + ] +}; diff --git a/src/de.rs b/src/de.rs index 0476355..ed864e1 100644 --- a/src/de.rs +++ b/src/de.rs @@ -4,7 +4,9 @@ use std::collections::HashSet; use std::fmt; use std::str::FromStr; +use async_recursion::async_recursion; use async_trait::async_trait; + use bytes::{BufMut, Bytes}; use destream::{de, FromStream, Visitor}; use futures::stream::{Fuse, FusedStream, Stream, StreamExt, TryStreamExt}; @@ -398,6 +400,50 @@ impl Decoder { Ok(i) } + pub(crate) async fn peek(&mut self) -> Result, Error> { + while self.buffer.is_empty() && !self.source.is_terminated() { + self.buffer().await?; + } + + if self.buffer.is_empty() { + Ok(None) + } else { + Ok(Some(self.buffer[0])) + } + } + + async fn peek_or_null(&mut self) -> Result { + Ok(self.peek().await?.unwrap_or(b'\x00')) + } + + async fn next_char(&mut self) -> Result, Error> { + while self.buffer.is_empty() && !self.source.is_terminated() { + self.buffer().await?; + } + + Ok(Some(self.buffer.remove(0))) + } + + async fn eat_char(&mut self) -> Result<(), Error> { + self.next_char().await?; + Ok(()) + } + + async fn next_char_or_null(&mut self) -> Result { + Ok(self.next_char().await?.unwrap_or(b'\x00')) + } + + async fn next_or_eof(&mut self) -> Result { + while self.buffer.is_empty() && !self.source.is_terminated() { + self.buffer().await?; + } + if self.buffer.is_empty() { + Err(Error::unexpected_end()) + } else { + Ok(self.buffer.remove(0)) + } + } + async fn decode_number(&mut self, visitor: V) -> Result { let mut i = 0; loop { @@ -452,23 +498,101 @@ impl Decoder { self.buffer().await?; } - if self.buffer.is_empty() { - Ok(()) - } else { - if self.buffer.starts_with(QUOTE) { - self.parse_string().await?; - } else if self.numeric.contains(&self.buffer[0]) { - self.parse_number::().await?; - } else if self.buffer[0] == b'n' { - self.parse_unit().await?; - } else { - self.parse_bool().await?; + if !self.buffer.is_empty() { + // Determine the type of JSON value based on the first character in the buffer + match self.buffer[0] { + b'"' => self.ignore_string().await?, + b'-' => { + self.eat_char().await?; + self.ignore_number().await?; + } + b'0'..=b'9' => self.ignore_number().await?, + b't' => self.ignore_exactly("true").await?, + b'f' => self.ignore_exactly("false").await?, + b'n' => self.ignore_exactly("null").await?, + b'[' => self.ignore_array().await?, + b'{' => self.ignore_object().await?, + // If the first character doesn't match any JSON value type, return an error + _ => { + return Err(Error::invalid_utf8(format!( + "unexpected token ignoring value: {}", + self.buffer[0] + ))) + } } + } - Ok(()) + Ok(()) + } + + async fn ignore_string(&mut self) -> Result<(), Error> { + // eat the first char, which is a quote + self.next_or_eof().await?; + loop { + if self.buffer.is_empty() { + self.buffer().await?; + } + if self.buffer.is_empty() && self.source.is_terminated() { + return Err(Error::unexpected_end()); + } + let ch = self.next_or_eof().await?; + if !ESCAPE_CHARS[ch as usize] { + continue; + } + match ch { + b'"' => { + return Ok(()); + } + b'\\' => { + self.ignore_escaped_char().await?; + } + ch => { + return Err(Error::invalid_utf8(format!( + "invalid control character in string: {ch}" + ))); + } + } } } + /// Parses a JSON escape sequence and discards the value. Assumes the previous + /// byte read was a backslash. + async fn ignore_escaped_char(&mut self) -> Result<(), Error> { + let ch = self.next_or_eof().await?; + + match ch { + b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {} + b'u' => { + // At this point we don't care if the codepoint is valid. We just + // want to consume it. We don't actually know what is valid or not + // at this point, because that depends on if this string will + // ultimately be parsed into a string or a byte buffer in the "real" + // parse. + + self.decode_hex_escape().await?; + } + _ => { + return Err(Error::invalid_utf8("invalid escape character in string")); + } + } + + Ok(()) + } + + async fn decode_hex_escape(&mut self) -> Result { + let mut n = 0; + for _ in 0..4 { + let ch = decode_hex_val(self.next_or_eof().await?); + match ch { + None => return Err(Error::invalid_utf8("invalid escape decoding hex escape")), + Some(val) => { + n = (n << 4) + val; + } + } + } + Ok(n) + } + async fn maybe_delimiter(&mut self, delimiter: &'static [u8]) -> Result { while self.buffer.is_empty() && !self.source.is_terminated() { self.buffer().await?; @@ -556,6 +680,137 @@ impl Decoder { Err(de::Error::invalid_type(as_str, "null")) } } + + async fn ignore_exactly(&mut self, s: &str) -> Result<(), Error> { + for ch in s.as_bytes() { + let next = self.next_or_eof().await?; + if &next != ch { + return Err(Error::invalid_utf8(format!( + "invalid char {next}, expected {ch}" + ))); + } + } + Ok(()) + } + + async fn ignore_number(&mut self) -> Result<(), Error> { + let ch = self.next_char_or_null().await?; + match ch { + b'0' => { + // There cannot be any leading zeroes. + // If there is a leading '0', it cannot be followed by more digits. + if let b'0'..=b'9' = self.peek_or_null().await? { + return Err(Error::invalid_utf8("invalid number, two leading zeroes")); + } + } + b'1'..=b'9' => { + while let b'0'..=b'9' = self.peek_or_null().await? { + self.eat_char().await?; + } + } + _ => { + return Err(Error::invalid_utf8(format!( + "invalid number: {}", + self.peek_or_null().await? + ))); + } + } + + match self.peek_or_null().await? { + b'.' => self.ignore_decimal().await, + b'e' | b'E' => self.ignore_exponent().await, + _ => Ok(()), + } + } + + async fn ignore_decimal(&mut self) -> Result<(), Error> { + self.next_char().await?; + + let mut at_least_one_digit = false; + while let b'0'..=b'9' = self.peek_or_null().await? { + self.next_char().await?; + at_least_one_digit = true; + } + + if !at_least_one_digit { + return Err(Error::invalid_utf8( + "invalid number, expected at least one digit after decimal", + )); + } + + match self.peek_or_null().await? { + b'e' | b'E' => self.ignore_exponent().await, + _ => Ok(()), + } + } + + async fn ignore_exponent(&mut self) -> Result<(), Error> { + self.next_char().await?; + + match self.peek_or_null().await? { + b'+' | b'-' => self.eat_char().await?, + _ => {} + } + + // Make sure a digit follows the exponent place. + match self.next_char_or_null().await? { + b'0'..=b'9' => {} + ch => { + return Err(Error::invalid_utf8(format!( + "expected a digit to follow the exponent, found {ch}" + ))); + } + } + + while let b'0'..=b'9' = self.peek_or_null().await? { + self.eat_char().await?; + } + + Ok(()) + } + + #[async_recursion] + async fn ignore_array(&mut self) -> Result<(), Error> { + self.eat_char().await?; + + loop { + self.expect_whitespace().await?; + match self.peek_or_null().await? { + b']' => { + self.eat_char().await?; + return Ok(()); + } + b',' => { + self.ignore_exactly(",").await?; + } + _ => { + self.ignore_value().await?; + } + } + } + } + + #[async_recursion] + async fn ignore_object(&mut self) -> Result<(), Error> { + self.eat_char().await?; // { + + loop { + self.expect_whitespace().await?; + match self.peek_or_null().await? { + b'}' => { + self.eat_char().await?; + return Ok(()); + } + b',' => self.eat_char().await?, + _ => { + self.ignore_string().await?; // key + self.expect_whitespace().await?; + self.ignore_exactly(":").await?; + self.ignore_value().await?; + } + } + } + } } #[async_trait] @@ -895,3 +1150,167 @@ pub async fn read_from( ) -> Result { T::from_stream(context, &mut Decoder::from(SourceReader::from(source))).await } + +fn decode_hex_val(val: u8) -> Option { + let n = HEX[val as usize] as u16; + if n == 255 { + None + } else { + Some(n) + } +} + +#[cfg(test)] +mod tests { + use futures::stream; + use test_case::test_case; + + use super::*; + + /// next_or_eof should return the next char in the buffer/stream, or + /// if we've hit the EOF, throw an error. + #[tokio::test] + async fn test_next_or_eof() { + let s = b"bar"; + for num_chunks in (1..s.len()).rev() { + let source = stream::iter(s.iter().copied()) + .chunks(num_chunks) + .map(Bytes::from) + .map(Result::::Ok); + + let mut decoder = Decoder::from_stream(source); + for i in 0..s.len() { + let ch = decoder.next_or_eof().await.unwrap(); + assert_eq!(ch, s[i]); + } + let res = decoder.next_or_eof().await; + assert!(res.is_err()); + } + } + + /// ignore_exactly takes a vector of bytes, and consumes exactly those characters. + #[tokio::test] + async fn test_ignore_exactly() { + let inputs = [("foo", "foo", true, 0), ("foobar", "foo", true, 3)]; + + for input in inputs { + let source = input.0; + let to_ignore = input.1; + let success = input.2; + let chars_left = input.3; + + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); + + let mut decoder = Decoder::from_stream(source); + + let res = decoder.ignore_exactly(to_ignore).await; + assert_eq!(res.is_ok(), success); + assert_eq!(decoder.buffer.len(), chars_left); + } + } + + /// ignore_string should consume the stream until the end of the current string. + #[tokio::test] + async fn test_ignore_string() { + let inputs = [ + ("\"foo\"bar", 3), + ("\"test\"", 0), + ("\"\"", 0), + ("\"\\r\"", 0), + ("\"hello\"world\"", 6), + ("\" hello\"", 0), + ("\"hello \" ", 3), + ("\"\\t\\n\\r\"", 0), + ("\"\"test\\\"", 6), + ("\"\\\\\\\\\"", 0), + ]; + + for input in inputs { + let source = input.0; + let end_length = input.1; + + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); + + let mut decoder = Decoder::from_stream(source); + + decoder.ignore_string().await.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); + } + } + + #[tokio::test] + async fn test_ignore_number() { + let inputs = [ + ("0", 0), + ("123", 0), + ("-123", 0), + ("123.45", 0), + ("-123.45", 0), + ("0.0", 0), + ("123, 45", 4), + ]; + + for input in inputs { + let source = input.0; + let end_length = input.1; + + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); + + let mut decoder = Decoder::from_stream(source); + + // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' + decoder.ignore_value().await.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); + } + } + + #[test_case("[]", 0; "empty array")] + #[test_case("[1]", 0; "single array")] + #[test_case("[ ] ", 1; "whitespace empty array")] + #[test_case("[ 1 ] ", 1; "whitespace single array")] + #[test_case("[1,2]", 0; "multi array")] + #[test_case("[],[]", 3; "ends correctly")] + #[test_case("[\"foo\",\"bar\"]", 0; "string array")] + #[tokio::test] + async fn test_ignore_array(source: &str, end_length: usize) { + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); + + let mut decoder = Decoder::from_stream(source); + + decoder.ignore_array().await.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); + } + + #[test_case("{}", 0; "empty object")] + #[test_case("{},{}", 3; "ends correctly")] + #[test_case("{\"k\":2, \"k\":3}", 0; "multi object")] + #[test_case("{\"k\":1}", 0; "single object")] + #[test_case("{\"foo\":\"bar\"}", 0; "string value")] + #[test_case("{ } ", 1; "whitespace empty object")] + #[test_case("{\"k\" : 2 , \" k \" : 3 }", 0; "whitespace multi object")] + #[test_case("{ \" k \" : 1 } ", 1; "whitespace single object")] + #[tokio::test] + async fn test_ignore_object(source: &str, end_length: usize) { + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); + + let mut decoder = Decoder::from_stream(source); + + decoder.ignore_object().await.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); + } +} diff --git a/src/lib.rs b/src/lib.rs index ea54fa2..4f5c84d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -431,7 +431,9 @@ mod tests { #[cfg(feature = "value")] #[tokio::test] async fn test_ignored_any() { - struct IgnoredValue; + enum IgnoredValue { + None, + } #[async_trait] impl FromStream for IgnoredValue { @@ -442,7 +444,10 @@ mod tests { impl Visitor for IgnoredVisitor { type Value = IgnoredValue; fn expecting() -> &'static str { - todo!() + "any json to be ignored" + } + fn visit_unit(self) -> Result { + Ok(Self::Value::None) } } decoder.decode_ignored_any(IgnoredVisitor).await @@ -466,8 +471,8 @@ mod tests { \"unicode_characters\": \"💡🌟🔑\", \"empty_array\": [], \"empty_object\": {}, - \"multiline_string\": \"This is a\nmultiline\nstring.\", - \"escaped_characters\": \"Escaped characters: \\\" \\\\ \\/ \\b \\f \\n \\r \\t\" + \"multiline_string\": \"This is a\\nmultiline\\nstring.\", + \"escaped_characters\": \"Escaped characters: \\\" \\\\ \\/ \\b \\f \\n \\r \\t \\u1234\" }"; let source = stream::iter(encoded.as_bytes().iter().copied()) From b1f07affbbcb8fffd6acba9621e103071beeade0 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Sun, 24 Mar 2024 23:11:30 -0400 Subject: [PATCH 03/20] fix clippy warning --- src/constants.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/constants.rs b/src/constants.rs index d9e0c68..ce02647 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -52,7 +52,7 @@ pub static HEX: [u8; 256] = { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 - 00, 01, 02, 03, 04, 05, 06, 07, 08, 09, __, __, __, __, __, __, // 3 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, __, __, __, __, __, __, // 3 __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 4 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 5 __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 6 From 0b799ab2f44ea8a75923f01d23a82d648f8133f7 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Sun, 24 Mar 2024 23:15:10 -0400 Subject: [PATCH 04/20] fix cargo clippy test --- src/de.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/de.rs b/src/de.rs index ed864e1..87f58ec 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1179,9 +1179,9 @@ mod tests { .map(Result::::Ok); let mut decoder = Decoder::from_stream(source); - for i in 0..s.len() { - let ch = decoder.next_or_eof().await.unwrap(); - assert_eq!(ch, s[i]); + for expected in s { + let actual = decoder.next_or_eof().await.unwrap(); + assert_eq!(&actual, expected); } let res = decoder.next_or_eof().await; assert!(res.is_err()); From 36861a8f8d493ae2b177c62bf4fa28d6a63c69fe Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Sun, 24 Mar 2024 23:19:03 -0400 Subject: [PATCH 05/20] bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 1778437..54fe045 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "destream_json" -version = "0.12.1" +version = "0.12.2" authors = ["code@tinychain.net"] edition = "2021" license = "Apache-2.0" From 8c10c3b9ebf60c247dec860475610435d42ccaf3 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Tue, 26 Mar 2024 22:01:27 -0400 Subject: [PATCH 06/20] use raw string for test --- src/lib.rs | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4f5c84d..b7735f1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -454,26 +454,26 @@ mod tests { } } - let encoded = "{ - \"string\": \"Hello, world!\", - \"number\": 42, - \"boolean\": true, - \"array\": [1, 2, 3], - \"object\": {\"key\": \"value\"}, - \"null_value\": null, - \"nested_object\": { - \"nested_string\": \"Nested string\", - \"nested_number\": 3.14, - \"nested_boolean\": false, - \"nested_array\": [\"apple\", \"banana\", \"orange\"], - \"nested_null\": null + let encoded = r#"{ + "string": "Hello, world!", + "number": 42, + "boolean": true, + "array": [1, 2, 3], + "object": {"key": "value"}, + "null_value": null, + "nested_object": { + "nested_string": "Nested string", + "nested_number": 3.14, + "nested_boolean": false, + "nested_array": ["apple", "banana", "orange"], + "nested_null": null }, - \"unicode_characters\": \"💡🌟🔑\", - \"empty_array\": [], - \"empty_object\": {}, - \"multiline_string\": \"This is a\\nmultiline\\nstring.\", - \"escaped_characters\": \"Escaped characters: \\\" \\\\ \\/ \\b \\f \\n \\r \\t \\u1234\" - }"; + "unicode_characters": "💡🌟🔑", + "empty_array": [], + "empty_object": {}, + "multiline_string": "This is a\nmultiline\nstring.", + "escaped_characters": "Escaped characters: \" \\ \/ \b \f \n \r \t \u1234" + }"#; let source = stream::iter(encoded.as_bytes().iter().copied()) .chunks(2) From 8555926f47ad6150ee024f262415aa5e6363a64b Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Tue, 26 Mar 2024 22:02:46 -0400 Subject: [PATCH 07/20] remove pub from peek --- src/de.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/de.rs b/src/de.rs index 87f58ec..b72c2a9 100644 --- a/src/de.rs +++ b/src/de.rs @@ -400,7 +400,7 @@ impl Decoder { Ok(i) } - pub(crate) async fn peek(&mut self) -> Result, Error> { + async fn peek(&mut self) -> Result, Error> { while self.buffer.is_empty() && !self.source.is_terminated() { self.buffer().await?; } From a9df7059cf029341ee4a81a3d058e558e53f063f Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Tue, 26 Mar 2024 22:45:08 -0400 Subject: [PATCH 08/20] rearrange ignoring --- src/de.rs | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/de.rs b/src/de.rs index b72c2a9..92b161b 100644 --- a/src/de.rs +++ b/src/de.rs @@ -724,11 +724,11 @@ impl Decoder { } async fn ignore_decimal(&mut self) -> Result<(), Error> { - self.next_char().await?; + self.eat_char().await?; let mut at_least_one_digit = false; while let b'0'..=b'9' = self.peek_or_null().await? { - self.next_char().await?; + self.eat_char().await?; at_least_one_digit = true; } @@ -775,40 +775,40 @@ impl Decoder { loop { self.expect_whitespace().await?; - match self.peek_or_null().await? { - b']' => { + match self.peek().await? { + Some(b',') => self.eat_char().await?, + Some(b']') => { self.eat_char().await?; return Ok(()); } - b',' => { - self.ignore_exactly(",").await?; - } - _ => { - self.ignore_value().await?; - } + Some(_) => {} + None => return Err(Error::unexpected_end()), } + self.ignore_value().await?; } } #[async_recursion] async fn ignore_object(&mut self) -> Result<(), Error> { - self.eat_char().await?; // { + self.eat_char().await?; // b'{' loop { self.expect_whitespace().await?; - match self.peek_or_null().await? { - b'}' => { + match self.peek().await? { + Some(b'}') => { self.eat_char().await?; return Ok(()); } - b',' => self.eat_char().await?, - _ => { - self.ignore_string().await?; // key - self.expect_whitespace().await?; - self.ignore_exactly(":").await?; - self.ignore_value().await?; - } + Some(b',') => self.eat_char().await?, + Some(_) => {} + None => return Err(Error::unexpected_end()), } + + self.expect_whitespace().await?; + self.ignore_string().await?; // key + self.expect_whitespace().await?; + self.ignore_exactly(":").await?; + self.ignore_value().await?; } } } From 47724367c22160b2891b8a99b4970071243510f5 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Tue, 26 Mar 2024 22:51:32 -0400 Subject: [PATCH 09/20] delete `or_null` methods --- src/de.rs | 51 +++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/de.rs b/src/de.rs index 92b161b..3e741e3 100644 --- a/src/de.rs +++ b/src/de.rs @@ -412,10 +412,6 @@ impl Decoder { } } - async fn peek_or_null(&mut self) -> Result { - Ok(self.peek().await?.unwrap_or(b'\x00')) - } - async fn next_char(&mut self) -> Result, Error> { while self.buffer.is_empty() && !self.source.is_terminated() { self.buffer().await?; @@ -429,10 +425,6 @@ impl Decoder { Ok(()) } - async fn next_char_or_null(&mut self) -> Result { - Ok(self.next_char().await?.unwrap_or(b'\x00')) - } - async fn next_or_eof(&mut self) -> Result { while self.buffer.is_empty() && !self.source.is_terminated() { self.buffer().await?; @@ -694,31 +686,29 @@ impl Decoder { } async fn ignore_number(&mut self) -> Result<(), Error> { - let ch = self.next_char_or_null().await?; + let ch = self.next_char().await?; match ch { - b'0' => { + Some(b'0') => { // There cannot be any leading zeroes. // If there is a leading '0', it cannot be followed by more digits. - if let b'0'..=b'9' = self.peek_or_null().await? { + if let Some(b'0'..=b'9') = self.peek().await? { return Err(Error::invalid_utf8("invalid number, two leading zeroes")); } } - b'1'..=b'9' => { - while let b'0'..=b'9' = self.peek_or_null().await? { + Some(b'1'..=b'9') => { + while let Some(b'0'..=b'9') = self.peek().await? { self.eat_char().await?; } } - _ => { - return Err(Error::invalid_utf8(format!( - "invalid number: {}", - self.peek_or_null().await? - ))); + Some(ch) => { + return Err(Error::invalid_utf8(format!("invalid number: {}", ch))); } + None => return Err(Error::unexpected_end()), } - match self.peek_or_null().await? { - b'.' => self.ignore_decimal().await, - b'e' | b'E' => self.ignore_exponent().await, + match self.peek().await? { + Some(b'.') => self.ignore_decimal().await, + Some(b'e' | b'E') => self.ignore_exponent().await, _ => Ok(()), } } @@ -727,7 +717,7 @@ impl Decoder { self.eat_char().await?; let mut at_least_one_digit = false; - while let b'0'..=b'9' = self.peek_or_null().await? { + while let Some(b'0'..=b'9') = self.peek().await? { self.eat_char().await?; at_least_one_digit = true; } @@ -738,8 +728,8 @@ impl Decoder { )); } - match self.peek_or_null().await? { - b'e' | b'E' => self.ignore_exponent().await, + match self.peek().await? { + Some(b'e' | b'E') => self.ignore_exponent().await, _ => Ok(()), } } @@ -747,22 +737,23 @@ impl Decoder { async fn ignore_exponent(&mut self) -> Result<(), Error> { self.next_char().await?; - match self.peek_or_null().await? { - b'+' | b'-' => self.eat_char().await?, + match self.peek().await? { + Some(b'+' | b'-') => self.eat_char().await?, _ => {} } // Make sure a digit follows the exponent place. - match self.next_char_or_null().await? { - b'0'..=b'9' => {} - ch => { + match self.next_char().await? { + Some(b'0'..=b'9') => {} + Some(ch) => { return Err(Error::invalid_utf8(format!( "expected a digit to follow the exponent, found {ch}" ))); } + None => return Err(Error::unexpected_end()), } - while let b'0'..=b'9' = self.peek_or_null().await? { + while let Some(b'0'..=b'9') = self.peek().await? { self.eat_char().await?; } From b99fec634a84eca1f2b730880f93b1f0b386b636 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Tue, 26 Mar 2024 22:52:56 -0400 Subject: [PATCH 10/20] vertical spacing --- src/de.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/de.rs b/src/de.rs index 3e741e3..9b475c0 100644 --- a/src/de.rs +++ b/src/de.rs @@ -519,18 +519,21 @@ impl Decoder { async fn ignore_string(&mut self) -> Result<(), Error> { // eat the first char, which is a quote - self.next_or_eof().await?; + self.eat_char().await?; loop { if self.buffer.is_empty() { self.buffer().await?; } + if self.buffer.is_empty() && self.source.is_terminated() { return Err(Error::unexpected_end()); } + let ch = self.next_or_eof().await?; if !ESCAPE_CHARS[ch as usize] { continue; } + match ch { b'"' => { return Ok(()); From 74ba00a82a92c2715ca1282430cf7a50532fc0a1 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Tue, 26 Mar 2024 22:56:54 -0400 Subject: [PATCH 11/20] clippy --- src/de.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/de.rs b/src/de.rs index 9b475c0..126ed5f 100644 --- a/src/de.rs +++ b/src/de.rs @@ -740,9 +740,8 @@ impl Decoder { async fn ignore_exponent(&mut self) -> Result<(), Error> { self.next_char().await?; - match self.peek().await? { - Some(b'+' | b'-') => self.eat_char().await?, - _ => {} + if let Some(b'+' | b'-') = self.peek().await? { + self.eat_char().await?; } // Make sure a digit follows the exponent place. From d92a5244ce004cc2c358ec7873febd835ae67a10 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Tue, 26 Mar 2024 23:20:25 -0400 Subject: [PATCH 12/20] fix ignore exactly; don't eat first char --- src/de.rs | 132 +++++++++++++++++++++++------------------------------- 1 file changed, 56 insertions(+), 76 deletions(-) diff --git a/src/de.rs b/src/de.rs index 126ed5f..24ee36a 100644 --- a/src/de.rs +++ b/src/de.rs @@ -678,11 +678,14 @@ impl Decoder { async fn ignore_exactly(&mut self, s: &str) -> Result<(), Error> { for ch in s.as_bytes() { - let next = self.next_or_eof().await?; - if &next != ch { - return Err(Error::invalid_utf8(format!( - "invalid char {next}, expected {ch}" - ))); + match self.peek().await?.as_ref() { + None => return Err(Error::unexpected_end()), + Some(next) if next == ch => self.eat_char().await?, + Some(next) => { + return Err(Error::invalid_utf8(format!( + "invalid char {next}, expected {ch}" + ))); + } } } Ok(()) @@ -1182,88 +1185,65 @@ mod tests { } /// ignore_exactly takes a vector of bytes, and consumes exactly those characters. + #[test_case("foo", "foo", true, 0; "ignore foo")] + #[test_case("foobar", "foo", true, 3; "ignore foo not bar")] + #[test_case("foobar", "bar", false, 6; "wrong expected str")] #[tokio::test] - async fn test_ignore_exactly() { - let inputs = [("foo", "foo", true, 0), ("foobar", "foo", true, 3)]; - - for input in inputs { - let source = input.0; - let to_ignore = input.1; - let success = input.2; - let chars_left = input.3; - - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) - .map(Bytes::from) - .map(Result::::Ok); - - let mut decoder = Decoder::from_stream(source); + async fn test_ignore_exactly(source: &str, to_ignore: &str, success: bool, chars_left: usize) { + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); - let res = decoder.ignore_exactly(to_ignore).await; - assert_eq!(res.is_ok(), success); - assert_eq!(decoder.buffer.len(), chars_left); - } - } + let mut decoder = Decoder::from_stream(source); - /// ignore_string should consume the stream until the end of the current string. + let res = decoder.ignore_exactly(to_ignore).await; + assert_eq!(res.is_ok(), success); + assert_eq!(decoder.buffer.len(), chars_left); + } + + #[test_case("\"foo\"bar", 3; "ends correctly")] + #[test_case("\"test\"", 0; "string value")] + #[test_case("\"\"", 0; "empty")] + #[test_case("\"\\r\"", 0; "carriage return")] + #[test_case("\"hello\"world\"", 6; "multiple quotes")] + #[test_case("\" hello\"", 0; "whitespace before")] + #[test_case("\"hello \" ", 3; "whitespace after")] + #[test_case("\"\\t\\n\\r\"", 0; "whitespace chars")] + #[test_case("\"\"test\\\"", 6; "chars after empty string")] + #[test_case("\"\\\\\\\\\"", 0; "backslashs")] #[tokio::test] - async fn test_ignore_string() { - let inputs = [ - ("\"foo\"bar", 3), - ("\"test\"", 0), - ("\"\"", 0), - ("\"\\r\"", 0), - ("\"hello\"world\"", 6), - ("\" hello\"", 0), - ("\"hello \" ", 3), - ("\"\\t\\n\\r\"", 0), - ("\"\"test\\\"", 6), - ("\"\\\\\\\\\"", 0), - ]; - - for input in inputs { - let source = input.0; - let end_length = input.1; - - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) - .map(Bytes::from) - .map(Result::::Ok); + async fn test_ignore_string(source: &str, end_length: usize) { + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); - let mut decoder = Decoder::from_stream(source); + let mut decoder = Decoder::from_stream(source); - decoder.ignore_string().await.unwrap(); - assert_eq!(decoder.buffer.len(), end_length); - } + decoder.ignore_string().await.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); } + #[test_case("0", 0; "zero")] + #[test_case("123", 0; "positive number")] + #[test_case("-123", 0; "negative number")] + #[test_case("123.45", 0; "positive float")] + #[test_case("-123.45", 0; "negative float")] + #[test_case("0.0", 0; "zero float")] + #[test_case("123, 45", 4; "parses only one number")] #[tokio::test] - async fn test_ignore_number() { - let inputs = [ - ("0", 0), - ("123", 0), - ("-123", 0), - ("123.45", 0), - ("-123.45", 0), - ("0.0", 0), - ("123, 45", 4), - ]; - - for input in inputs { - let source = input.0; - let end_length = input.1; - - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) - .map(Bytes::from) - .map(Result::::Ok); + async fn test_ignore_number(source: &str, end_length: usize) { + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(source.len()) + .map(Bytes::from) + .map(Result::::Ok); - let mut decoder = Decoder::from_stream(source); + let mut decoder = Decoder::from_stream(source); - // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' - decoder.ignore_value().await.unwrap(); - assert_eq!(decoder.buffer.len(), end_length); - } + // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' + decoder.ignore_value().await.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); } #[test_case("[]", 0; "empty array")] From 760d85788c80db9d10d0b82c4795b990fe8ed485 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Thu, 28 Mar 2024 20:07:11 -0400 Subject: [PATCH 13/20] add some error tests, not done yet --- src/de.rs | 90 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 23 deletions(-) diff --git a/src/de.rs b/src/de.rs index 24ee36a..34f9f4e 100644 --- a/src/de.rs +++ b/src/de.rs @@ -92,6 +92,7 @@ impl From for SourceReader { } /// An error encountered while decoding a JSON stream. +#[derive(PartialEq)] pub struct Error { message: String, } @@ -417,7 +418,10 @@ impl Decoder { self.buffer().await?; } - Ok(Some(self.buffer.remove(0))) + match self.buffer.len() { + 0 => Ok(None), + _ => Ok(Some(self.buffer.remove(0))), + } } async fn eat_char(&mut self) -> Result<(), Error> { @@ -1158,6 +1162,8 @@ fn decode_hex_val(val: u8) -> Option { #[cfg(test)] mod tests { + use std::cmp::max; + use futures::stream; use test_case::test_case; @@ -1202,6 +1208,8 @@ mod tests { assert_eq!(decoder.buffer.len(), chars_left); } + #[test_case(r#""""#, 0; "empty string")] + #[test_case(r#""","#, 1; "empty string then leave char")] #[test_case("\"foo\"bar", 3; "ends correctly")] #[test_case("\"test\"", 0; "string value")] #[test_case("\"\"", 0; "empty")] @@ -1211,7 +1219,7 @@ mod tests { #[test_case("\"hello \" ", 3; "whitespace after")] #[test_case("\"\\t\\n\\r\"", 0; "whitespace chars")] #[test_case("\"\"test\\\"", 6; "chars after empty string")] - #[test_case("\"\\\\\\\\\"", 0; "backslashs")] + #[test_case("\"\\\\\\\\\"", 0; "backslashes")] #[tokio::test] async fn test_ignore_string(source: &str, end_length: usize) { let source = stream::iter(source.as_bytes().iter().copied()) @@ -1225,25 +1233,55 @@ mod tests { assert_eq!(decoder.buffer.len(), end_length); } - #[test_case("0", 0; "zero")] - #[test_case("123", 0; "positive number")] - #[test_case("-123", 0; "negative number")] - #[test_case("123.45", 0; "positive float")] - #[test_case("-123.45", 0; "negative float")] - #[test_case("0.0", 0; "zero float")] - #[test_case("123, 45", 4; "parses only one number")] + #[test_case("-123", Ok(0); "negative number")] + #[test_case("-123.45", Ok(0); "negative float")] + #[test_case("abc", Err(Error::invalid_utf8("unexpected token ignoring value: 97")); "non number")] + #[test_case("", Ok(0); "empty source")] #[tokio::test] - async fn test_ignore_number(source: &str, end_length: usize) { + async fn test_ignore_value(source: &str, expected: Result) { + let chunk_size = max(source.len(), 1); let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) + .chunks(chunk_size) .map(Bytes::from) .map(Result::::Ok); let mut decoder = Decoder::from_stream(source); // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' - decoder.ignore_value().await.unwrap(); - assert_eq!(decoder.buffer.len(), end_length); + let res = decoder.ignore_value().await; + if let Ok(end_length) = expected { + res.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); + } else { + assert_eq!(res.unwrap_err(), expected.unwrap_err()) + } + } + + #[test_case("0", Ok(0); "zero")] + #[test_case("123", Ok(0); "positive number")] + #[test_case("123.45", Ok(0); "positive float")] + #[test_case("0.0", Ok(0); "zero float")] + #[test_case("123, 45", Ok(4); "parses only one number")] + #[test_case("abc", Err(Error::invalid_utf8("invalid number: 97")); "unexpected token")] + #[test_case("", Err(Error::unexpected_end()); "unexpected end")] + #[tokio::test] + async fn test_ignore_number(source: &str, expected: Result) { + let chunk_size = max(source.len(), 1); + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(chunk_size) + .map(Bytes::from) + .map(Result::::Ok); + + let mut decoder = Decoder::from_stream(source); + + // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' + let res = decoder.ignore_number().await; + if let Ok(end_length) = expected { + res.unwrap(); + assert_eq!(decoder.buffer.len(), end_length); + } else { + assert_eq!(res.unwrap_err(), expected.unwrap_err()) + } } #[test_case("[]", 0; "empty array")] @@ -1266,16 +1304,18 @@ mod tests { assert_eq!(decoder.buffer.len(), end_length); } - #[test_case("{}", 0; "empty object")] - #[test_case("{},{}", 3; "ends correctly")] - #[test_case("{\"k\":2, \"k\":3}", 0; "multi object")] - #[test_case("{\"k\":1}", 0; "single object")] - #[test_case("{\"foo\":\"bar\"}", 0; "string value")] - #[test_case("{ } ", 1; "whitespace empty object")] - #[test_case("{\"k\" : 2 , \" k \" : 3 }", 0; "whitespace multi object")] - #[test_case("{ \" k \" : 1 } ", 1; "whitespace single object")] + #[test_case("{}", 0, None; "empty object")] + #[test_case("{},{}", 3, None; "ends correctly")] + #[test_case(r#"{"k":2, "k":3}"#, 0, None; "multi object")] + #[test_case(r#"{"k":1}"#, 0, None; "single object")] + #[test_case(r#"{"foo":"bar"}"#, 0, None; "string value")] + #[test_case(r#"{ } "#, 1, None; "whitespace empty object")] + #[test_case(r#"{"k" : 2 , " k " : 3 }"#, 0, None; "whitespace multi object")] + #[test_case(r#"{ " k " : 1 } "#, 1, None; "whitespace single object")] + #[test_case(r#"{"k""v"}"#, 4, Some(Error::invalid_utf8("invalid char 34, expected 58")); "missing colon")] + #[test_case(r#"{"k","v"}"#, 5, Some(Error::invalid_utf8("invalid char 44, expected 58")); "comma when expecting colon")] #[tokio::test] - async fn test_ignore_object(source: &str, end_length: usize) { + async fn test_ignore_object(source: &str, end_length: usize, expected_error: Option) { let source = stream::iter(source.as_bytes().iter().copied()) .chunks(source.len()) .map(Bytes::from) @@ -1283,7 +1323,11 @@ mod tests { let mut decoder = Decoder::from_stream(source); - decoder.ignore_object().await.unwrap(); + let res = decoder.ignore_object().await; + match expected_error { + Some(e) => assert_eq!(Err(e), res), + None => assert!(res.is_ok()), + } assert_eq!(decoder.buffer.len(), end_length); } } From b1170ddbb73f82fe178a008cb4f8d3b0e9af9a00 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Fri, 29 Mar 2024 15:36:58 -0400 Subject: [PATCH 14/20] update tests --- src/de.rs | 73 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/src/de.rs b/src/de.rs index 34f9f4e..8dd0660 100644 --- a/src/de.rs +++ b/src/de.rs @@ -772,8 +772,14 @@ impl Decoder { #[async_recursion] async fn ignore_array(&mut self) -> Result<(), Error> { self.eat_char().await?; + self.expect_whitespace().await?; + if self.peek().await? == Some(b']') { + self.eat_char().await?; + return Ok(()); + } loop { + self.ignore_value().await?; self.expect_whitespace().await?; match self.peek().await? { Some(b',') => self.eat_char().await?, @@ -781,18 +787,31 @@ impl Decoder { self.eat_char().await?; return Ok(()); } - Some(_) => {} + Some(ch) => { + return Err(Error::invalid_utf8(format!( + "invalid char {ch}, expected , or ]" + ))) + } None => return Err(Error::unexpected_end()), } - self.ignore_value().await?; } } #[async_recursion] async fn ignore_object(&mut self) -> Result<(), Error> { self.eat_char().await?; // b'{' + self.expect_whitespace().await?; + if self.peek().await? == Some(b'}') { + self.eat_char().await?; + return Ok(()); + } loop { + self.expect_whitespace().await?; + self.ignore_string().await?; // key + self.expect_whitespace().await?; + self.ignore_exactly(":").await?; + self.ignore_value().await?; self.expect_whitespace().await?; match self.peek().await? { Some(b'}') => { @@ -800,15 +819,13 @@ impl Decoder { return Ok(()); } Some(b',') => self.eat_char().await?, - Some(_) => {} + Some(ch) => { + return Err(Error::invalid_utf8(format!( + "invalid char {ch}, expected , or }}" + ))) + } None => return Err(Error::unexpected_end()), } - - self.expect_whitespace().await?; - self.ignore_string().await?; // key - self.expect_whitespace().await?; - self.ignore_exactly(":").await?; - self.ignore_value().await?; } } } @@ -1272,10 +1289,10 @@ mod tests { .map(Bytes::from) .map(Result::::Ok); - let mut decoder = Decoder::from_stream(source); - // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' + let mut decoder = Decoder::from_stream(source); let res = decoder.ignore_number().await; + if let Ok(end_length) = expected { res.unwrap(); assert_eq!(decoder.buffer.len(), end_length); @@ -1284,24 +1301,30 @@ mod tests { } } - #[test_case("[]", 0; "empty array")] - #[test_case("[1]", 0; "single array")] - #[test_case("[ ] ", 1; "whitespace empty array")] - #[test_case("[ 1 ] ", 1; "whitespace single array")] - #[test_case("[1,2]", 0; "multi array")] - #[test_case("[],[]", 3; "ends correctly")] - #[test_case("[\"foo\",\"bar\"]", 0; "string array")] + #[test_case("[]", Ok(0); "empty array")] + #[test_case("[1]", Ok(0); "single array")] + #[test_case("[ ] ", Ok(1); "whitespace empty array")] + #[test_case("[ 1 ] ", Ok(1); "whitespace single array")] + #[test_case("[1,2]", Ok(0); "multi array")] + #[test_case("[],[]", Ok(3); "ends correctly")] + #[test_case("[\"foo\",\"bar\"]", Ok(0); "string array")] + #[test_case(r#""#, Err(Error::unexpected_end()); "unexpected end")] + #[test_case(r#"["test""test"]"#, Err(Error::invalid_utf8("invalid char 34, expected , or ]")); "no comma")] #[tokio::test] - async fn test_ignore_array(source: &str, end_length: usize) { + async fn test_ignore_array(source: &str, expected: Result) { + let chunk_size = max(source.len(), 1); let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) + .chunks(chunk_size) .map(Bytes::from) .map(Result::::Ok); let mut decoder = Decoder::from_stream(source); + let res = decoder.ignore_array().await; - decoder.ignore_array().await.unwrap(); - assert_eq!(decoder.buffer.len(), end_length); + match expected { + Ok(end_length) => assert_eq!(decoder.buffer.len(), end_length), + Err(e) => assert_eq!(res.unwrap_err(), e), + } } #[test_case("{}", 0, None; "empty object")] @@ -1314,6 +1337,7 @@ mod tests { #[test_case(r#"{ " k " : 1 } "#, 1, None; "whitespace single object")] #[test_case(r#"{"k""v"}"#, 4, Some(Error::invalid_utf8("invalid char 34, expected 58")); "missing colon")] #[test_case(r#"{"k","v"}"#, 5, Some(Error::invalid_utf8("invalid char 44, expected 58")); "comma when expecting colon")] + #[test_case(r#"{,"k":"v"}"#, 7, Some(Error::invalid_utf8("invalid char 107, expected 58")); "comma when expecting value")] #[tokio::test] async fn test_ignore_object(source: &str, end_length: usize, expected_error: Option) { let source = stream::iter(source.as_bytes().iter().copied()) @@ -1322,12 +1346,11 @@ mod tests { .map(Result::::Ok); let mut decoder = Decoder::from_stream(source); - let res = decoder.ignore_object().await; + match expected_error { Some(e) => assert_eq!(Err(e), res), - None => assert!(res.is_ok()), + None => assert_eq!(decoder.buffer.len(), end_length), } - assert_eq!(decoder.buffer.len(), end_length); } } From 3de4fd17a7e9e46b5000853f93066bb4c5eeea0c Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Fri, 29 Mar 2024 15:39:01 -0400 Subject: [PATCH 15/20] match test format --- src/de.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/de.rs b/src/de.rs index 8dd0660..a70c4e0 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1327,19 +1327,19 @@ mod tests { } } - #[test_case("{}", 0, None; "empty object")] - #[test_case("{},{}", 3, None; "ends correctly")] - #[test_case(r#"{"k":2, "k":3}"#, 0, None; "multi object")] - #[test_case(r#"{"k":1}"#, 0, None; "single object")] - #[test_case(r#"{"foo":"bar"}"#, 0, None; "string value")] - #[test_case(r#"{ } "#, 1, None; "whitespace empty object")] - #[test_case(r#"{"k" : 2 , " k " : 3 }"#, 0, None; "whitespace multi object")] - #[test_case(r#"{ " k " : 1 } "#, 1, None; "whitespace single object")] - #[test_case(r#"{"k""v"}"#, 4, Some(Error::invalid_utf8("invalid char 34, expected 58")); "missing colon")] - #[test_case(r#"{"k","v"}"#, 5, Some(Error::invalid_utf8("invalid char 44, expected 58")); "comma when expecting colon")] - #[test_case(r#"{,"k":"v"}"#, 7, Some(Error::invalid_utf8("invalid char 107, expected 58")); "comma when expecting value")] + #[test_case("{}", Ok(0); "empty object")] + #[test_case("{},{}", Ok(3); "ends correctly")] + #[test_case(r#"{"k":2, "k":3}"#, Ok(0); "multi object")] + #[test_case(r#"{"k":1}"#, Ok(0); "single object")] + #[test_case(r#"{"foo":"bar"}"#, Ok(0); "string value")] + #[test_case(r#"{ } "#, Ok(1); "whitespace empty object")] + #[test_case(r#"{"k" : 2 , " k " : 3 }"#, Ok(0); "whitespace multi object")] + #[test_case(r#"{ " k " : 1 } "#, Ok(1); "whitespace single object")] + #[test_case(r#"{"k""v"}"#, Err(Error::invalid_utf8("invalid char 34, expected 58")); "missing colon")] + #[test_case(r#"{"k","v"}"#, Err(Error::invalid_utf8("invalid char 44, expected 58")); "comma when expecting colon")] + #[test_case(r#"{,"k":"v"}"#, Err(Error::invalid_utf8("invalid char 107, expected 58")); "comma when expecting value")] #[tokio::test] - async fn test_ignore_object(source: &str, end_length: usize, expected_error: Option) { + async fn test_ignore_object(source: &str, expected: Result) { let source = stream::iter(source.as_bytes().iter().copied()) .chunks(source.len()) .map(Bytes::from) @@ -1348,9 +1348,9 @@ mod tests { let mut decoder = Decoder::from_stream(source); let res = decoder.ignore_object().await; - match expected_error { - Some(e) => assert_eq!(Err(e), res), - None => assert_eq!(decoder.buffer.len(), end_length), + match expected { + Err(e) => assert_eq!(Err(e), res), + Ok(end_length) => assert_eq!(decoder.buffer.len(), end_length), } } } From 9a9f703c73511429ce89682a715df8edecf16543 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Mon, 1 Apr 2024 18:32:26 -0600 Subject: [PATCH 16/20] broken: extract out decoder creation --- src/de.rs | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/de.rs b/src/de.rs index a70c4e0..e5ab553 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1207,6 +1207,30 @@ mod tests { } } + fn test_decoder( + source: &str, + ) -> Decoder< + SourceStream< + futures::stream::Map< + futures::stream::Map< + futures::stream::Chunks< + futures::stream::Iter>>, + >, + fn(Vec) -> bytes::Bytes, + >, + fn(bytes::Bytes) -> Result, + >, + >, + > { + let chunk_size = max(source.len(), 1); + let source = stream::iter(source.as_bytes().iter().copied()) + .chunks(chunk_size) + .map(Bytes::from) + .map(Result::::Ok); + + Decoder::from_stream(source) + } + /// ignore_exactly takes a vector of bytes, and consumes exactly those characters. #[test_case("foo", "foo", true, 0; "ignore foo")] #[test_case("foobar", "foo", true, 3; "ignore foo not bar")] @@ -1283,14 +1307,7 @@ mod tests { #[test_case("", Err(Error::unexpected_end()); "unexpected end")] #[tokio::test] async fn test_ignore_number(source: &str, expected: Result) { - let chunk_size = max(source.len(), 1); - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(chunk_size) - .map(Bytes::from) - .map(Result::::Ok); - - // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' - let mut decoder = Decoder::from_stream(source); + let decoder = test_decoder(source); let res = decoder.ignore_number().await; if let Ok(end_length) = expected { From 33ba2563f4a913bf7d77dea591c6f3a11623a056 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Mon, 1 Apr 2024 18:46:34 -0600 Subject: [PATCH 17/20] works: impl Stream --- src/de.rs | 56 +++++++++---------------------------------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/src/de.rs b/src/de.rs index e5ab553..bde9d42 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1209,19 +1209,7 @@ mod tests { fn test_decoder( source: &str, - ) -> Decoder< - SourceStream< - futures::stream::Map< - futures::stream::Map< - futures::stream::Chunks< - futures::stream::Iter>>, - >, - fn(Vec) -> bytes::Bytes, - >, - fn(bytes::Bytes) -> Result, - >, - >, - > { + ) -> Decoder> + '_>> { let chunk_size = max(source.len(), 1); let source = stream::iter(source.as_bytes().iter().copied()) .chunks(chunk_size) @@ -1237,14 +1225,9 @@ mod tests { #[test_case("foobar", "bar", false, 6; "wrong expected str")] #[tokio::test] async fn test_ignore_exactly(source: &str, to_ignore: &str, success: bool, chars_left: usize) { - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) - .map(Bytes::from) - .map(Result::::Ok); - - let mut decoder = Decoder::from_stream(source); - + let mut decoder = test_decoder(source); let res = decoder.ignore_exactly(to_ignore).await; + assert_eq!(res.is_ok(), success); assert_eq!(decoder.buffer.len(), chars_left); } @@ -1263,12 +1246,7 @@ mod tests { #[test_case("\"\\\\\\\\\"", 0; "backslashes")] #[tokio::test] async fn test_ignore_string(source: &str, end_length: usize) { - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) - .map(Bytes::from) - .map(Result::::Ok); - - let mut decoder = Decoder::from_stream(source); + let mut decoder = test_decoder(source); decoder.ignore_string().await.unwrap(); assert_eq!(decoder.buffer.len(), end_length); @@ -1280,16 +1258,11 @@ mod tests { #[test_case("", Ok(0); "empty source")] #[tokio::test] async fn test_ignore_value(source: &str, expected: Result) { - let chunk_size = max(source.len(), 1); - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(chunk_size) - .map(Bytes::from) - .map(Result::::Ok); - - let mut decoder = Decoder::from_stream(source); + let mut decoder = test_decoder(source); // `ignore_number` only works on positive numbers. `ignore_value` will eat that b'-' let res = decoder.ignore_value().await; + if let Ok(end_length) = expected { res.unwrap(); assert_eq!(decoder.buffer.len(), end_length); @@ -1307,7 +1280,7 @@ mod tests { #[test_case("", Err(Error::unexpected_end()); "unexpected end")] #[tokio::test] async fn test_ignore_number(source: &str, expected: Result) { - let decoder = test_decoder(source); + let mut decoder = test_decoder(source); let res = decoder.ignore_number().await; if let Ok(end_length) = expected { @@ -1329,13 +1302,7 @@ mod tests { #[test_case(r#"["test""test"]"#, Err(Error::invalid_utf8("invalid char 34, expected , or ]")); "no comma")] #[tokio::test] async fn test_ignore_array(source: &str, expected: Result) { - let chunk_size = max(source.len(), 1); - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(chunk_size) - .map(Bytes::from) - .map(Result::::Ok); - - let mut decoder = Decoder::from_stream(source); + let mut decoder = test_decoder(source); let res = decoder.ignore_array().await; match expected { @@ -1357,12 +1324,7 @@ mod tests { #[test_case(r#"{,"k":"v"}"#, Err(Error::invalid_utf8("invalid char 107, expected 58")); "comma when expecting value")] #[tokio::test] async fn test_ignore_object(source: &str, expected: Result) { - let source = stream::iter(source.as_bytes().iter().copied()) - .chunks(source.len()) - .map(Bytes::from) - .map(Result::::Ok); - - let mut decoder = Decoder::from_stream(source); + let mut decoder = test_decoder(source); let res = decoder.ignore_object().await; match expected { From d26f288d5241d56ee3eb1e6a2a86e466e51ad937 Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Mon, 1 Apr 2024 19:01:42 -0600 Subject: [PATCH 18/20] add more tests --- src/de.rs | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/de.rs b/src/de.rs index bde9d42..cb1bc9b 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1223,6 +1223,8 @@ mod tests { #[test_case("foo", "foo", true, 0; "ignore foo")] #[test_case("foobar", "foo", true, 3; "ignore foo not bar")] #[test_case("foobar", "bar", false, 6; "wrong expected str")] + #[test_case("", "", true, 0; "empty good")] + #[test_case("", "a", false, 0; "empty bad")] #[tokio::test] async fn test_ignore_exactly(source: &str, to_ignore: &str, success: bool, chars_left: usize) { let mut decoder = test_decoder(source); @@ -1232,24 +1234,34 @@ mod tests { assert_eq!(decoder.buffer.len(), chars_left); } - #[test_case(r#""""#, 0; "empty string")] - #[test_case(r#""","#, 1; "empty string then leave char")] - #[test_case("\"foo\"bar", 3; "ends correctly")] - #[test_case("\"test\"", 0; "string value")] - #[test_case("\"\"", 0; "empty")] - #[test_case("\"\\r\"", 0; "carriage return")] - #[test_case("\"hello\"world\"", 6; "multiple quotes")] - #[test_case("\" hello\"", 0; "whitespace before")] - #[test_case("\"hello \" ", 3; "whitespace after")] - #[test_case("\"\\t\\n\\r\"", 0; "whitespace chars")] - #[test_case("\"\"test\\\"", 6; "chars after empty string")] - #[test_case("\"\\\\\\\\\"", 0; "backslashes")] + #[test_case(r#""""#, Ok(0); "empty string")] + #[test_case(r#""","#, Ok(1); "empty string then leave char")] + #[test_case("\"foo\"bar", Ok(3); "ends correctly")] + #[test_case("\"test\"", Ok(0); "string value")] + #[test_case("\"\"", Ok(0); "empty")] + #[test_case("\"\\r\"", Ok(0); "carriage return")] + #[test_case("\"hello\"world\"", Ok(6); "multiple quotes")] + #[test_case("\" hello\"", Ok(0); "whitespace before")] + #[test_case("\"hello \" ", Ok(3); "whitespace after")] + #[test_case("\"\\t\\n\\r\"", Ok(0); "whitespace chars")] + #[test_case("\"\"test\\\"", Ok(6); "chars after empty string")] + #[test_case("\"\\\\\\\\\"", Ok(0); "backslashes")] + #[test_case("", Err(Error::unexpected_end()); "eof")] + #[test_case(r#""a"#, Err(Error::unexpected_end()); "unfinished string")] + #[test_case( + r#""\x01""#, + Err(Error::invalid_utf8("invalid escape character in string")) + )] #[tokio::test] - async fn test_ignore_string(source: &str, end_length: usize) { + async fn test_ignore_string(source: &str, expected: Result) { let mut decoder = test_decoder(source); - decoder.ignore_string().await.unwrap(); - assert_eq!(decoder.buffer.len(), end_length); + let res = decoder.ignore_string().await; + + match expected { + Ok(end_length) => assert_eq!(decoder.buffer.len(), end_length), + Err(e) => assert_eq!(Err(e), res), + } } #[test_case("-123", Ok(0); "negative number")] From 2dfba8ffdbf32efe3084382e2f3f7c99b6630a5c Mon Sep 17 00:00:00 2001 From: drewmcarthur Date: Mon, 1 Apr 2024 19:26:31 -0600 Subject: [PATCH 19/20] coverage for all ignore functions and paths --- src/de.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/de.rs b/src/de.rs index cb1bc9b..f54a01c 100644 --- a/src/de.rs +++ b/src/de.rs @@ -745,7 +745,7 @@ impl Decoder { } async fn ignore_exponent(&mut self) -> Result<(), Error> { - self.next_char().await?; + self.eat_char().await?; if let Some(b'+' | b'-') = self.peek().await? { self.eat_char().await?; @@ -1247,7 +1247,9 @@ mod tests { #[test_case("\"\"test\\\"", Ok(6); "chars after empty string")] #[test_case("\"\\\\\\\\\"", Ok(0); "backslashes")] #[test_case("", Err(Error::unexpected_end()); "eof")] - #[test_case(r#""a"#, Err(Error::unexpected_end()); "unfinished string")] + #[test_case(r#""a"#, Err(Error::unexpected_end()); "unterminatedstring")] + #[test_case("\"\x01\"", Err(Error::invalid_utf8("invalid control character in string: 1")); "invalid control char")] + #[test_case(r#""\u00""#, Err(Error::invalid_utf8("invalid escape decoding hex escape")); "unfinished hex char")] #[test_case( r#""\x01""#, Err(Error::invalid_utf8("invalid escape character in string")) @@ -1284,12 +1286,19 @@ mod tests { } #[test_case("0", Ok(0); "zero")] + #[test_case("00", Err(Error::invalid_utf8("invalid number, two leading zeroes")); "double zero")] #[test_case("123", Ok(0); "positive number")] #[test_case("123.45", Ok(0); "positive float")] #[test_case("0.0", Ok(0); "zero float")] #[test_case("123, 45", Ok(4); "parses only one number")] + #[test_case("1e30, 45", Ok(4); "parses exponent")] + #[test_case("1.2e3, 45", Ok(4); "parses decimal exponent")] #[test_case("abc", Err(Error::invalid_utf8("invalid number: 97")); "unexpected token")] #[test_case("", Err(Error::unexpected_end()); "unexpected end")] + #[test_case("1.", Err(Error::invalid_utf8("invalid number, expected at least one digit after decimal")); "expected a number after the decimal")] + #[test_case("1.1e-1", Ok(0); "negative exponent")] + #[test_case("1.1e-a", Err(Error::invalid_utf8("expected a digit to follow the exponent, found 97")); "invalid exponent")] + #[test_case("1.1e", Err(Error::unexpected_end()); "unterminated number")] #[tokio::test] async fn test_ignore_number(source: &str, expected: Result) { let mut decoder = test_decoder(source); @@ -1334,6 +1343,7 @@ mod tests { #[test_case(r#"{"k""v"}"#, Err(Error::invalid_utf8("invalid char 34, expected 58")); "missing colon")] #[test_case(r#"{"k","v"}"#, Err(Error::invalid_utf8("invalid char 44, expected 58")); "comma when expecting colon")] #[test_case(r#"{,"k":"v"}"#, Err(Error::invalid_utf8("invalid char 107, expected 58")); "comma when expecting value")] + #[test_case(r#"{"k":"v"asdf}"#, Err(Error::invalid_utf8("invalid char 97, expected , or }")); "value when expecting comma")] #[tokio::test] async fn test_ignore_object(source: &str, expected: Result) { let mut decoder = test_decoder(source); From 342c7a8551686bac5e2eb6389bfc404cece63d32 Mon Sep 17 00:00:00 2001 From: Drew McArthur Date: Mon, 1 Apr 2024 19:29:10 -0600 Subject: [PATCH 20/20] nit: remove extra newline in src/de.rs --- src/de.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/de.rs b/src/de.rs index f54a01c..1a93a97 100644 --- a/src/de.rs +++ b/src/de.rs @@ -6,7 +6,6 @@ use std::str::FromStr; use async_recursion::async_recursion; use async_trait::async_trait; - use bytes::{BufMut, Bytes}; use destream::{de, FromStream, Visitor}; use futures::stream::{Fuse, FusedStream, Stream, StreamExt, TryStreamExt};