diff --git a/.gitignore b/.gitignore
index af65c3c..4c6ae88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 __pycache__/
 **/*.rs.bk
 /schemas
+/test_tuple_results
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 0c0fd48..6d7d779 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -96,7 +96,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
 name = "jsonschema-transpiler"
-version = "1.4.1"
+version = "1.5.0"
 dependencies = [
  "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/Cargo.toml b/Cargo.toml
index bae0159..278b9ca 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "jsonschema-transpiler"
-version = "1.4.1"
+version = "1.5.0"
 authors = ["Anthony Miyaguchi <acmiyaguchi@mozilla.com>"]
 description = "A tool to transpile JSON Schema into schemas for data processing"
 license = "MPL-2.0"
diff --git a/scripts/mps-download-schemas.sh b/scripts/mps-download-schemas.sh
index 72092b5..5c3fde1 100755
--- a/scripts/mps-download-schemas.sh
+++ b/scripts/mps-download-schemas.sh
@@ -2,6 +2,6 @@
 # Download production mozilla-pipeline-schemas into a schema folder
 cd "$(dirname "$0")/.." || exit
-curl -o schemas.tar.gz -L https://github.com/mozilla-services/mozilla-pipeline-schemas/archive/dev.tar.gz
-tar --strip-components=1 -xvf schemas.tar.gz mozilla-pipeline-schemas-dev/schemas
+curl -o schemas.tar.gz -L https://github.com/mozilla-services/mozilla-pipeline-schemas/archive/master.tar.gz
+tar --strip-components=1 -xvf schemas.tar.gz mozilla-pipeline-schemas-master/schemas
 rm schemas.tar.gz
diff --git a/scripts/mps-generate-avro-data-helper.py b/scripts/mps-generate-avro-data-helper.py
index bb2d316..c8a9af4 100755
--- a/scripts/mps-generate-avro-data-helper.py
+++ b/scripts/mps-generate-avro-data-helper.py
@@ -15,12 +15,13 @@
 else:
     sys.exit("Error: missing argument for document")
 
+assert os.path.isdir("data")
 assert any(
     [document in name for name in os.listdir("data")]
-), "document not found in data"
+), f"{document} not found in data"
 assert any(
     [document in name for name in os.listdir("avro")]
-), "document not found in avro schemas"
+), f"{document} not found in avro schemas"
 
 
 def format_key(key):
@@ -43,6 +44,9 @@ def convert(data, schema):
     out = {}
     if not data:
         return out
+    # cast tuple into an object before continuing
+    if isinstance(data, list):
+        data = {f"f{i}_": v for i, v in enumerate(data)}
     for key, value in data.items():
         # apply the appropriate transformations on the key
         key = format_key(key)
diff --git a/scripts/mps-generate-avro-data.sh b/scripts/mps-generate-avro-data.sh
index 3aef5da..cce741a 100755
--- a/scripts/mps-generate-avro-data.sh
+++ b/scripts/mps-generate-avro-data.sh
@@ -7,7 +7,7 @@ documents=$(ls data | sed 's/.ndjson//')
 total=0
 failed=0
 for document in $documents; do
-    if ! python3 scripts/mps-generate-avro-data-helper.py $document 2> /dev/null; then
+    if ! python3 scripts/mps-generate-avro-data-helper.py $document; then
        echo "failed to write $document"
        rm "avro-data/$document.avro"
        ((failed++))
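The list-to-object cast in the Avro data helper above is the data-side counterpart of the schema change: positional tuple values are renamed `f0_`, `f1_`, ... so they line up with the fields of the generated record. A minimal Rust sketch of the same transformation (the `tuple_to_record` helper is hypothetical, shown only to illustrate the cast):

```rust
use serde_json::{json, Map, Value};

/// Rename the elements of a JSON array to f0_, f1_, ... so that tuple
/// data matches the record schema emitted with --tuple-struct.
fn tuple_to_record(value: Value) -> Value {
    match value {
        Value::Array(items) => {
            let fields: Map<String, Value> = items
                .into_iter()
                .enumerate()
                .map(|(i, v)| (format!("f{}_", i), v))
                .collect();
            Value::Object(fields)
        }
        // non-arrays pass through unchanged
        other => other,
    }
}

fn main() {
    assert_eq!(
        tuple_to_record(json!([true, 42])),
        json!({"f0_": true, "f1_": 42})
    );
}
```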
diff --git a/scripts/mps-load-avro-bq.sh b/scripts/mps-load-avro-bq.sh
index d581d40..a172c61 100755
--- a/scripts/mps-load-avro-bq.sh
+++ b/scripts/mps-load-avro-bq.sh
@@ -2,30 +2,26 @@
 cd "$(dirname "$0")/.." || exit
 
-project_id="test-avro-ingest"
+project_id=$(gcloud config get-value project)
+dataset_id="test_avro"
 gsutil -m cp avro-data/* gs://${project_id}/data
-
-documents=$(ls avro-data | sed 's/.avro//')
+bq rm -rf $dataset_id
+bq mk $dataset_id
 
 total=0
 error=0
 skip=0
 
 trap "exit" INT
-for document in ${documents}; do
+for document in $(ls avro-data | sed 's/.avro//'); do
     # downcase hyphens to underscores before generating names
     bq_document=$(echo $document | sed 's/-/_/g')
     namespace=$(echo $bq_document | cut -d. -f1)
     doctype=$(echo $bq_document | cut -d. -f2)
     docver=$(echo $bq_document | cut -d. -f3)
 
-    if ! bq ls | grep ${namespace} >/dev/null ; then
-        echo "creating dataset: ${namespace}"
-        bq mk ${namespace}
-    fi
-
-    table_exists=$(bq ls ${namespace} | grep ${doctype}_v${docver})
+    table_exists=$(bq ls ${dataset_id} | grep ${namespace}__${doctype}_v${docver})
 
     if [[ ! -z ${SKIP_EXISTING+x} ]] && [[ ! -z ${table_exists} ]]; then
        echo "skipping bq load for ${document}"
@@ -36,7 +32,7 @@ for document in ${documents}; do
     echo "running bq load for ${document}"
     bq load --source_format=AVRO \
        --replace \
-       ${namespace}.${doctype}_v${docver} \
+       ${dataset_id}.${namespace}__${doctype}_v${docver} \
        gs://${project_id}/data/${document}.avro
 
     if [[ $? -ne 0 ]]; then
diff --git a/scripts/mps-verify-tuple-struct.sh b/scripts/mps-verify-tuple-struct.sh
new file mode 100755
index 0000000..b1692c4
--- /dev/null
+++ b/scripts/mps-verify-tuple-struct.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+cd "$(dirname "$0")/.." || exit
+
+datadir=$(mktemp -d -t tmp.XXXXXXXXXX)
+function cleanup {
+    echo "Running cleanup!"
+    rm -rf "$datadir"
+}
+trap cleanup EXIT
+
+scripts/mps-download-schemas.sh
+
+avro_control=$datadir/avro-control
+avro_no_tuple=$datadir/avro-no-tuple
+avro_tuple=$datadir/avro-tuple
+
+bq_control=$datadir/bq-control
+bq_no_tuple=$datadir/bq-no-tuple
+bq_tuple=$datadir/bq-tuple
+
+
+# get control values
+git checkout v1.4.1
+
+scripts/mps-generate-schemas.sh $avro_control --type avro --resolve drop
+scripts/mps-generate-schemas.sh $bq_control --type bigquery --resolve drop
+
+git checkout -
+
+# get values for tuple/no-tuple
+scripts/mps-generate-schemas.sh $avro_no_tuple --type avro --resolve drop
+scripts/mps-generate-schemas.sh $avro_tuple --type avro --resolve drop --tuple-struct
+scripts/mps-generate-schemas.sh $bq_no_tuple --type bigquery --resolve drop
+scripts/mps-generate-schemas.sh $bq_tuple --type bigquery --resolve drop --tuple-struct
+
+outdir="test_tuple_results"
+mkdir -p $outdir
+
+diff -r $avro_control $avro_no_tuple > $outdir/avro-no-tuple.diff
+diff -r $bq_control $bq_no_tuple > $outdir/bq-no-tuple.diff
+diff -r $avro_no_tuple $avro_tuple > $outdir/avro-tuple.diff
+diff -r $bq_no_tuple $bq_tuple > $outdir/bq-tuple.diff
diff --git a/src/ast.rs b/src/ast.rs
index c339d93..cccb895 100644
--- a/src/ast.rs
+++ b/src/ast.rs
@@ -478,6 +478,12 @@ impl Tag {
                     set_and_recurse(item, "__union__");
                 }
             }
+            Type::Tuple(tuple) => {
+                for (i, item) in tuple.items.iter_mut().enumerate() {
+                    let name = format!("f{}_", i);
+                    set_and_recurse(item, &name);
+                }
+            }
             _ => (),
         }
     }
@@ -567,7 +573,7 @@ impl TranslateFrom<jsonschema::Tag> for Tag {
     type Error = &'static str;
 
     fn translate_from(tag: jsonschema::Tag, context: Context) -> Result<Self, Self::Error> {
-        let mut tag = tag.type_into_ast();
+        let mut tag = tag.type_into_ast(context)?;
         tag.infer_name(context.normalize_case);
         tag.infer_nullability(context.force_nullable);
         tag.is_root = true;
diff --git a/src/avro.rs b/src/avro.rs
index 5e713bd..38dbdb3 100644
--- a/src/avro.rs
+++ b/src/avro.rs
@@ -192,6 +192,40 @@ impl TranslateFrom<ast::Tag> for Type {
                     Type::Complex(Complex::Record(record))
                 }
             }
+            ast::Type::Tuple(tuple) => {
+                let fields = tuple
+                    .items
+                    .iter()
+                    .enumerate()
+                    .map(|(i, v)| {
+                        let default = if v.nullable { Some(json!(null)) } else { None };
+                        (
+                            format!("f{}_", i),
+                            Type::translate_from(v.clone(), context),
+                            default,
+                        )
+                    })
+                    .filter(|(_, v, _)| v.is_ok())
+                    .map(|(name, data_type, default)| Field {
+                        name,
+                        data_type: data_type.unwrap(),
+                        default,
+                        ..Default::default()
+                    })
+                    .collect();
+                let record = Record {
+                    common: CommonAttributes {
+                        name: tag.name.clone().unwrap_or_else(|| "__UNNAMED__".into()),
+                        namespace: tag.namespace.clone(),
+                        ..Default::default()
+                    },
+                    fields,
+                };
+                if record.common.name == "__UNNAMED__" {
+                    warn!("{} - Unnamed field", tag.fully_qualified_name());
+                }
+                Type::Complex(Complex::Record(record))
+            }
             ast::Type::Array(array) => match Type::translate_from(*array.items.clone(), context) {
                 Ok(data_type) => Type::Complex(Complex::Array(Array {
                     items: Box::new(data_type),
@@ -596,7 +630,14 @@ mod tests {
                 }
             }
         });
-        let avro = json!({"type": "string"});
+        let avro = json!({
+            "type": "record",
+            "name": "__UNNAMED__",
+            "fields": [
+                {"name": "f0_", "type": {"type": "boolean"}},
+                {"name": "f1_", "type": {"type": "long"}}
+            ]
+        });
         assert_from_ast_eq(ast, avro);
     }
diff --git a/src/bigquery.rs b/src/bigquery.rs
index 09376b2..85c7ebe 100644
--- a/src/bigquery.rs
+++ b/src/bigquery.rs
@@ -110,6 +110,29 @@ impl TranslateFrom<ast::Tag> for Tag {
                     Type::Record(Record { fields })
                 }
             }
+            ast::Type::Tuple(tuple) => {
+                let fields: Result<Vec<Tag>, _> = tuple
+                    .items
+                    .iter()
+                    .map(|v| Tag::translate_from(v.clone(), context))
+                    .collect();
+
+                let named_fields: HashMap<String, Box<Tag>> = fields?
+                    .into_iter()
+                    .enumerate()
+                    .map(|(i, mut v)| {
+                        // The name is actually derived from the value, not from
+                        // the associated key. Modify the name in place.
+                        let name = format!("f{}_", i);
+                        v.name = Some(name.clone());
+                        (name, Box::new(v))
+                    })
+                    .collect();
+
+                Type::Record(Record {
+                    fields: named_fields,
+                })
+            }
             ast::Type::Array(array) => match Tag::translate_from(*array.items.clone(), context) {
                 Ok(tag) => *tag.data_type,
                 Err(_) => return Err(fmt_reason("untyped array")),
@@ -531,8 +554,12 @@ mod tests {
             }
         });
         let expect = json!({
-            "type": "STRING",
+            "type": "RECORD",
             "mode": "REQUIRED",
+            "fields": [
+                {"name": "f0_", "type": "BOOL", "mode": "REQUIRED"},
+                {"name": "f1_", "type": "INT64", "mode": "REQUIRED"}
+            ]
         });
         assert_eq!(expect, transform_tag(data));
     }
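Both backends derive positional field names with the same `format!("f{}_", i)` scheme; this mirrors the `f0_`, `f1_`, ... names BigQuery itself assigns to anonymous struct fields, so loaded tuple data stays addressable. A small sketch of the naming rule and the resulting column layout (the free function is hypothetical, extracted for illustration):

```rust
use serde_json::{json, Value};

/// Positional tuple items become record fields named f0_, f1_, ...
fn tuple_field_name(index: usize) -> String {
    format!("f{}_", index)
}

fn main() {
    let names: Vec<String> = (0..3).map(tuple_field_name).collect();
    assert_eq!(names, vec!["f0_", "f1_", "f2_"]);
    // A [boolean, integer] tuple therefore surfaces in BigQuery as:
    let _expected: Value = json!([
        {"name": "f0_", "type": "BOOL", "mode": "REQUIRED"},
        {"name": "f1_", "type": "INT64", "mode": "REQUIRED"}
    ]);
}
```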
diff --git a/src/jsonschema.rs b/src/jsonschema.rs
index 2db85f5..3a6b334 100644
--- a/src/jsonschema.rs
+++ b/src/jsonschema.rs
@@ -4,6 +4,7 @@ use serde_json::{json, Value};
 use std::collections::{HashMap, HashSet};
 
 use super::ast;
+use super::Context;
 
 /// The type enumeration does not contain any data and is used to determine
 /// available fields in the flattened tag. In JSONSchema parlance, these are
@@ -60,10 +61,20 @@ enum ArrayType {
 }
 
 #[derive(Serialize, Deserialize, Debug, Default)]
+#[serde(rename_all = "camelCase")]
 struct Array {
     // Using Option<ArrayType> would support tuple validation
     #[serde(skip_serializing_if = "Option::is_none")]
     items: Option<ArrayType>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    additional_items: Option<AdditionalProperties>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    min_items: Option<usize>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    max_items: Option<usize>,
 }
 
 #[derive(Serialize, Deserialize, Debug, PartialEq)]
@@ -120,9 +131,9 @@
         }
     }
 
-    pub fn type_into_ast(&self) -> ast::Tag {
+    pub fn type_into_ast(&self, context: Context) -> Result<ast::Tag, &'static str> {
         match self.get_type() {
-            Type::Atom(atom) => self.atom_into_ast(atom),
+            Type::Atom(atom) => self.atom_into_ast(atom, context),
             Type::List(list) => {
                 let mut nullable: bool = false;
                 let mut items: Vec<ast::Tag> = Vec::new();
@@ -130,15 +141,19 @@
                     if let Atom::Null = &atom {
                         nullable = true;
                     }
-                    items.push(self.atom_into_ast(atom));
+                    items.push(self.atom_into_ast(atom, context)?);
                 }
-                ast::Tag::new(ast::Type::Union(ast::Union::new(items)), None, nullable)
+                Ok(ast::Tag::new(
+                    ast::Type::Union(ast::Union::new(items)),
+                    None,
+                    nullable,
+                ))
             }
         }
     }
 
-    fn atom_into_ast(&self, data_type: Atom) -> ast::Tag {
-        match data_type {
+    fn atom_into_ast(&self, data_type: Atom, context: Context) -> Result<ast::Tag, &'static str> {
+        let result = match data_type {
             Atom::Null => ast::Tag::new(ast::Type::Null, None, true),
             Atom::Boolean => ast::Tag::new(ast::Type::Atom(ast::Atom::Boolean), None, false),
             Atom::Number => ast::Tag::new(ast::Type::Atom(ast::Atom::Number), None, false),
@@ -150,7 +165,7 @@
             Some(properties) => {
                 let mut fields: HashMap<String, ast::Tag> = HashMap::new();
                 for (key, value) in properties {
-                    fields.insert(key.to_string(), value.type_into_ast());
+                    fields.insert(key.to_string(), value.type_into_ast(context)?);
                 }
                 ast::Tag::new(
                     ast::Type::Object(ast::Object::new(fields, self.object.required.clone())),
@@ -166,23 +181,25 @@
             ) {
                 (Some(AdditionalProperties::Object(add)), Some(pat)) => {
                     let mut vec: Vec<ast::Tag> = Vec::new();
-                    vec.push(add.type_into_ast());
-                    vec.extend(pat.values().map(|v| v.type_into_ast()));
+                    vec.push(add.type_into_ast(context)?);
+                    let pat_vec: Result<Vec<ast::Tag>, _> =
+                        pat.values().map(|v| v.type_into_ast(context)).collect();
+                    vec.extend(pat_vec?);
                     let value = ast::Tag::new(ast::Type::Union(ast::Union::new(vec)), None, false);
                     ast::Tag::new(ast::Type::Map(ast::Map::new(None, value)), None, false)
                 }
                 (Some(AdditionalProperties::Object(tag)), None) => ast::Tag::new(
-                    ast::Type::Map(ast::Map::new(None, tag.type_into_ast())),
+                    ast::Type::Map(ast::Map::new(None, tag.type_into_ast(context)?)),
                     None,
                     false,
                 ),
                 (_, Some(tag)) => {
+                    let items: Result<Vec<ast::Tag>, _> =
+                        tag.values().map(|v| v.type_into_ast(context)).collect();
                     let union = ast::Tag::new(
-                        ast::Type::Union(ast::Union::new(
-                            tag.values().map(|v| v.type_into_ast()).collect(),
-                        )),
+                        ast::Type::Union(ast::Union::new(items?)),
                         None,
                         false,
                     );
@@ -192,11 +209,14 @@
             // handle oneOf
             match &self.one_of {
                 Some(vec) => {
-                    let items: Vec<ast::Tag> =
-                        vec.iter().map(|item| item.type_into_ast()).collect();
-                    let nullable: bool = items.iter().any(ast::Tag::is_null);
+                    let items: Result<Vec<ast::Tag>, _> = vec
+                        .iter()
+                        .map(|item| item.type_into_ast(context))
+                        .collect();
+                    let unwrapped = items?;
+                    let nullable: bool = unwrapped.iter().any(ast::Tag::is_null);
                     ast::Tag::new(
-                        ast::Type::Union(ast::Union::new(items)),
+                        ast::Type::Union(ast::Union::new(unwrapped)),
                         None,
                         nullable,
                     )
@@ -213,20 +233,70 @@
             if let Some(items) = &self.array.items {
                 let data_type = match items {
                     ArrayType::Tag(items) => {
-                        ast::Type::Array(ast::Array::new(items.type_into_ast()))
+                        ast::Type::Array(ast::Array::new(items.type_into_ast(context)?))
                     }
                     ArrayType::TagTuple(items) => {
-                        let items: Vec<ast::Tag> =
-                            items.iter().map(|item| item.type_into_ast()).collect();
-                        ast::Type::Tuple(ast::Tuple::new(items))
+                        // Instead of expanding the definition of the AST
+                        // tuple type, only a subset of tuple validation is
+                        // accepted as valid. The type must be set to
+                        // "array", the items a list of sub-schemas,
+                        // additionalItems set to a valid type, and maxItems
+                        // set to a value that is equal to or longer than
+                        // the items list. Anything else will be directly
+                        // translated into a JSON atom.
+                        if context.tuple_struct {
+                            let items: Result<Vec<ast::Tag>, _> = items
+                                .iter()
+                                .map(|item| item.type_into_ast(context))
+                                .collect();
+                            let mut unwrapped = items?;
+                            let min_items: usize =
+                                self.array.min_items.unwrap_or(unwrapped.len());
+                            // set items to optional
+                            for i in min_items..unwrapped.len() {
+                                unwrapped[i].nullable = true;
+                            }
+                            match &self.array.additional_items {
+                                Some(AdditionalProperties::Object(tag)) => {
+                                    let max_items: usize = self.array.max_items.unwrap_or(0);
+                                    if max_items < unwrapped.len() {
+                                        return Err("maxItems is less than tuple length");
+                                    }
+                                    for _ in unwrapped.len()..max_items {
+                                        let mut ast_tag = tag.type_into_ast(context)?;
+                                        ast_tag.nullable = true;
+                                        unwrapped.push(ast_tag);
+                                    }
+                                    ast::Type::Tuple(ast::Tuple::new(unwrapped))
+                                }
+                                Some(AdditionalProperties::Bool(false)) => {
+                                    ast::Type::Tuple(ast::Tuple::new(unwrapped))
+                                }
+                                None => {
+                                    // additionalItems may be unspecified if the max_length
+                                    // matches the number of items in the tuple. This corresponds
+                                    // to optional tuple values (variable length tuple).
+ let max_items: usize = self.array.max_items.unwrap_or(0); + if max_items == unwrapped.len() { + ast::Type::Tuple(ast::Tuple::new(unwrapped)) + } else { + return Err("maxItems must be set if additionalItems are allowed"); + } + } + _ => return Err("additionalItems set incorrectly"), + } + } else { + ast::Type::Atom(ast::Atom::JSON) + } } }; ast::Tag::new(data_type, None, false) } else { - panic!(format!("array missing item: {:#?}", self)) + return Err("array missing item"); } } - } + }; + Ok(result) } } @@ -407,7 +477,9 @@ mod tests { "items": [ {"type": "integer"}, {"type": "boolean"} - ] + ], + "additionalItems": {"type": "string"}, + "maxItems": 4, }); let schema: Tag = serde_json::from_value(data).unwrap(); if let ArrayType::TagTuple(items) = schema.array.items.unwrap() { @@ -416,6 +488,12 @@ mod tests { } else { panic!(); } + if let AdditionalProperties::Object(tag) = schema.array.additional_items.unwrap() { + assert_eq!(tag.data_type, json!("string")); + } else { + panic!(); + } + assert_eq!(schema.array.max_items.unwrap(), 4); } #[test] @@ -683,4 +761,120 @@ mod tests { }); assert_eq!(expect, translate(data)) } + + fn translate_tuple(data: Value) -> Value { + let context = Context { + tuple_struct: true, + ..Default::default() + }; + let schema: Tag = serde_json::from_value(data).unwrap(); + let ast: ast::Tag = schema.translate_into(context).unwrap(); + json!(ast) + } + + #[test] + fn test_into_ast_tuple_default_behavior() { + let data = json!({ + "type": "array", + "items": [ + {"type": "boolean"}, + {"type": "integer"} + ] + }); + let expect = json!({"type": {"atom": "json"}, "nullable": false}); + assert_eq!(expect, translate(data)) + } + + #[test] + #[should_panic(expected = "maxItems must be set")] + fn test_into_ast_tuple_invalid() { + let data = json!({ + "type": "array", + "items": [ + {"type": "boolean"}, + {"type": "integer"} + ] + }); + let expect = json!({"type": {"atom": "json"}, "nullable": false}); + assert_eq!(expect, translate_tuple(data)) + } + + #[test] + #[should_panic(expected = "maxItems")] + fn test_into_ast_tuple_missing_max_items() { + let data = json!({ + "type": "array", + "items": [ + {"type": "boolean"}, + {"type": "integer"} + ], + "additionalItems": {"type": "string"} + }); + let expect = json!({"type": {"atom": "json"}, "nullable": false}); + assert_eq!(expect, translate_tuple(data)) + } + + #[test] + fn test_into_ast_tuple_static() { + let data = json!({ + "type": "array", + "items": [ + {"type": "boolean"}, + {"type": "integer"} + ], + "maxItems": 2 + }); + let expect = json!({ + "type": {"tuple": {"items": [ + {"name": "f0_", "type": {"atom": "boolean"}, "nullable": false}, + {"name": "f1_", "type": {"atom": "integer"}, "nullable": false} + ]}}, + "nullable": false + }); + assert_eq!(expect, translate_tuple(data)) + } + + #[test] + fn test_into_ast_tuple_static_nullable() { + let data = json!({ + "type": "array", + "items": [ + {"type": "boolean"}, + {"type": "integer"} + ], + "minItems": 1, + "maxItems": 2 + }); + let expect = json!({ + "type": {"tuple": {"items": [ + {"name": "f0_", "type": {"atom": "boolean"}, "nullable": false}, + {"name": "f1_", "type": {"atom": "integer"}, "nullable": true} + ]}}, + "nullable": false + }); + assert_eq!(expect, translate_tuple(data)) + } + + #[test] + fn test_into_ast_tuple_valid() { + let data = json!({ + "type": "array", + "items": [ + {"type": "boolean"}, + {"type": "integer"} + ], + "additionalItems": {"type": "string"}, + "maxItems": 4 + }); + let expect = json!({ + "type": {"tuple": {"items": [ + 
{"name": "f0_", "type": {"atom": "boolean"}, "nullable": false}, + {"name": "f1_", "type": {"atom": "integer"}, "nullable": false}, + {"name": "f2_", "type": {"atom": "string"}, "nullable": true}, + {"name": "f3_", "type": {"atom": "string"}, "nullable": true}, + ]}}, + "nullable": false, + }); + assert_eq!(expect, translate_tuple(data)) + } } diff --git a/src/lib.rs b/src/lib.rs index 6dc95bc..1d18606 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,6 +63,7 @@ pub struct Context { pub resolve_method: ResolveMethod, pub normalize_case: bool, pub force_nullable: bool, + pub tuple_struct: bool, } fn into_ast(input: &Value, context: Context) -> ast::Tag { diff --git a/src/main.rs b/src/main.rs index 32843d0..a91b970 100644 --- a/src/main.rs +++ b/src/main.rs @@ -49,6 +49,11 @@ fn main() { .short("n") .long("force-nullable"), ) + .arg( + Arg::with_name("tuple-struct") + .help("Treats tuple validation as an anonymous struct") + .long("tuple-struct"), + ) .get_matches(); let reader: Box = match matches.value_of("file") { @@ -69,6 +74,7 @@ fn main() { }, normalize_case: matches.is_present("normalize-case"), force_nullable: matches.is_present("force-nullable"), + tuple_struct: matches.is_present("tuple-struct"), }; let output = match matches.value_of("type").unwrap() { diff --git a/tests/force_nullable.rs b/tests/force_nullable.rs index e83040e..2e483e0 100644 --- a/tests/force_nullable.rs +++ b/tests/force_nullable.rs @@ -55,9 +55,16 @@ fn test_data() -> Value { "required": ["f"] } ] + }, + "tuple": { + "type": "array", + "items": [ + {"type": "boolean"} + ], + "maxItems": 1 } }, - "required": ["atom", "object", "map", "array", "union"] + "required": ["atom", "object", "map", "array", "union", "tuple"] } "#, ) @@ -68,6 +75,7 @@ fn test_data() -> Value { fn test_bigquery_force_nullable() { let context = Context { force_nullable: true, + tuple_struct: true, ..Default::default() }; @@ -132,6 +140,18 @@ fn test_bigquery_force_nullable() { "name": "object", "type": "RECORD" }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "f0_", + "type": "BOOL" + } + ], + "mode": "NULLABLE", + "name": "tuple", + "type": "RECORD" + }, { "fields": [ { @@ -161,6 +181,7 @@ fn test_bigquery_force_nullable() { fn test_avro_force_nullable() { let context = Context { force_nullable: true, + tuple_struct: true, ..Default::default() }; let expected: Value = serde_json::from_str( @@ -263,6 +284,28 @@ fn test_avro_force_nullable() { } ] }, + { + "default": null, + "name": "tuple", + "type": [ + {"type": "null"}, + { + "name": "tuple", + "namespace": "root", + "type": "record", + "fields": [ + { + "default": null, + "name": "f0_", + "type": [ + {"type": "null"}, + {"type": "boolean"} + ] + } + ] + } + ] + }, { "default": null, "name": "union", diff --git a/tests/tuple_struct.rs b/tests/tuple_struct.rs new file mode 100644 index 0000000..9ced104 --- /dev/null +++ b/tests/tuple_struct.rs @@ -0,0 +1,303 @@ +use jst::{convert_avro, convert_bigquery}; +use jst::{Context, ResolveMethod}; +use pretty_assertions::assert_eq; +use serde_json::Value; + +fn data_atomic() -> Value { + serde_json::from_str( + r#" + { + "additionalItems": false, + "items": [ + {"type": "boolean"}, + {"type": "string"} + ], + "type": "array" + } + "#, + ) + .unwrap() +} + +fn data_atomic_with_additional_properties() -> Value { + serde_json::from_str( + r#" + { + "additionalItems": { + "type": "integer" + }, + "items": [ + {"type": "boolean"}, + {"type": "string"} + ], + "maxItems": 4, + "type": "array" + } + "#, + ) + .unwrap() +} + +fn 
diff --git a/tests/force_nullable.rs b/tests/force_nullable.rs
index e83040e..2e483e0 100644
--- a/tests/force_nullable.rs
+++ b/tests/force_nullable.rs
@@ -55,9 +55,16 @@ fn test_data() -> Value {
                     "required": ["f"]
                 }
             ]
+        },
+        "tuple": {
+            "type": "array",
+            "items": [
+                {"type": "boolean"}
+            ],
+            "maxItems": 1
         }
     },
-    "required": ["atom", "object", "map", "array", "union"]
+    "required": ["atom", "object", "map", "array", "union", "tuple"]
 }
 "#,
     )
@@ -68,6 +75,7 @@ fn test_data() -> Value {
 fn test_bigquery_force_nullable() {
     let context = Context {
         force_nullable: true,
+        tuple_struct: true,
         ..Default::default()
     };
@@ -132,6 +140,18 @@
                 "name": "object",
                 "type": "RECORD"
             },
+            {
+                "fields": [
+                    {
+                        "mode": "NULLABLE",
+                        "name": "f0_",
+                        "type": "BOOL"
+                    }
+                ],
+                "mode": "NULLABLE",
+                "name": "tuple",
+                "type": "RECORD"
+            },
             {
                 "fields": [
                     {
@@ -161,6 +181,7 @@ fn test_avro_force_nullable() {
     let context = Context {
         force_nullable: true,
+        tuple_struct: true,
         ..Default::default()
     };
     let expected: Value = serde_json::from_str(
@@ -263,6 +284,28 @@
                     }
                 ]
             },
+            {
+                "default": null,
+                "name": "tuple",
+                "type": [
+                    {"type": "null"},
+                    {
+                        "name": "tuple",
+                        "namespace": "root",
+                        "type": "record",
+                        "fields": [
+                            {
+                                "default": null,
+                                "name": "f0_",
+                                "type": [
+                                    {"type": "null"},
+                                    {"type": "boolean"}
+                                ]
+                            }
+                        ]
+                    }
+                ]
+            },
             {
                 "default": null,
                 "name": "union",
diff --git a/tests/tuple_struct.rs b/tests/tuple_struct.rs
new file mode 100644
index 0000000..9ced104
--- /dev/null
+++ b/tests/tuple_struct.rs
@@ -0,0 +1,303 @@
+use jst::{convert_avro, convert_bigquery};
+use jst::{Context, ResolveMethod};
+use pretty_assertions::assert_eq;
+use serde_json::Value;
+
+fn data_atomic() -> Value {
+    serde_json::from_str(
+        r#"
+        {
+            "additionalItems": false,
+            "items": [
+                {"type": "boolean"},
+                {"type": "string"}
+            ],
+            "type": "array"
+        }
+    "#,
+    )
+    .unwrap()
+}
+
+fn data_atomic_with_additional_properties() -> Value {
+    serde_json::from_str(
+        r#"
+        {
+            "additionalItems": {
+                "type": "integer"
+            },
+            "items": [
+                {"type": "boolean"},
+                {"type": "string"}
+            ],
+            "maxItems": 4,
+            "type": "array"
+        }
+    "#,
+    )
+    .unwrap()
+}
+
+fn data_object_missing() -> Value {
+    // The second item has an incompatible field, but will be dropped.
+    serde_json::from_str(
+        r#"
+        {
+            "additionalItems": false,
+            "items": [
+                {"type": "integer"},
+                {
+                    "type": "object",
+                    "properties": {
+                        "first": {"type": "string"},
+                        "second": {"type": ["string", "object"]}
+                    },
+                    "required": ["first"]
+                }
+            ],
+            "type": "array"
+        }
+    "#,
+    )
+    .unwrap()
+}
+
+fn data_incompatible() -> Value {
+    serde_json::from_str(
+        r#"
+        {
+            "additionalItems": false,
+            "items": [
+                {"type": ["string", "integer"]}
+            ]
+        }
+    "#,
+    )
+    .unwrap()
+}
+
+#[test]
+fn test_avro_tuple_atomic() {
+    let context = Context {
+        tuple_struct: true,
+        ..Default::default()
+    };
+    let expected: Value = serde_json::from_str(
+        r#"
+        {
+            "fields": [
+                {
+                    "name": "f0_",
+                    "type": {"type": "boolean"}
+                },
+                {
+                    "name": "f1_",
+                    "type": {"type": "string"}
+                }
+            ],
+            "name": "root",
+            "type": "record"
+        }
+    "#,
+    )
+    .unwrap();
+    assert_eq!(expected, convert_avro(&data_atomic(), context));
+}
+
+#[test]
+fn test_bigquery_tuple_atomic() {
+    let context = Context {
+        tuple_struct: true,
+        ..Default::default()
+    };
+    let expected: Value = serde_json::from_str(
+        r#"
+        [
+            {
+                "mode": "REQUIRED",
+                "name": "f0_",
+                "type": "BOOL"
+            },
+            {
+                "mode": "REQUIRED",
+                "name": "f1_",
+                "type": "STRING"
+            }
+        ]
+    "#,
+    )
+    .unwrap();
+    assert_eq!(expected, convert_bigquery(&data_atomic(), context));
+}
+
+#[test]
+fn test_avro_tuple_atomic_with_additional_items() {
+    let context = Context {
+        tuple_struct: true,
+        ..Default::default()
+    };
+    let expected: Value = serde_json::from_str(
+        r#"
+        {
+            "fields": [
+                {
+                    "name": "f0_",
+                    "type": {"type": "boolean"}
+                },
+                {
+                    "name": "f1_",
+                    "type": {"type": "string"}
+                },
+                {
+                    "name": "f2_",
+                    "default": null,
+                    "type": [{"type": "null"}, {"type": "long"}]
+                },
+                {
+                    "name": "f3_",
+                    "default": null,
+                    "type": [{"type": "null"}, {"type": "long"}]
+                }
+            ],
+            "name": "root",
+            "type": "record"
+        }
+    "#,
+    )
+    .unwrap();
+    assert_eq!(
+        expected,
+        convert_avro(&data_atomic_with_additional_properties(), context)
+    );
+}
+
+#[test]
+fn test_bigquery_tuple_atomic_with_additional_items() {
+    let context = Context {
+        tuple_struct: true,
+        ..Default::default()
+    };
+    let expected: Value = serde_json::from_str(
+        r#"
+        [
+            {
+                "mode": "REQUIRED",
+                "name": "f0_",
+                "type": "BOOL"
+            },
+            {
+                "mode": "REQUIRED",
+                "name": "f1_",
+                "type": "STRING"
+            },
+            {
+                "mode": "NULLABLE",
+                "name": "f2_",
+                "type": "INT64"
+            },
+            {
+                "mode": "NULLABLE",
+                "name": "f3_",
+                "type": "INT64"
+            }
+        ]
+    "#,
+    )
+    .unwrap();
+    assert_eq!(
+        expected,
+        convert_bigquery(&data_atomic_with_additional_properties(), context)
+    );
+}
+
+/// Objects within tuples are allowed to have extra fields. The decoding tool
+/// should preserve the ordering of the items in the tuples.
+#[test]
+fn test_avro_tuple_object_drop() {
+    let context = Context {
+        tuple_struct: true,
+        resolve_method: ResolveMethod::Drop,
+        ..Default::default()
+    };
+
+    let expected: Value = serde_json::from_str(
+        r#"
+        {
+            "fields": [
+                {
+                    "name": "f0_",
+                    "type": {"type": "long"}
+                },
+                {
+                    "name": "f1_",
+                    "type": {
+                        "name": "f1_",
+                        "namespace": "root",
+                        "type": "record",
+                        "fields": [
+                            {"name": "first", "type": {"type": "string"}}
+                        ]
+                    }
+                }
+            ],
+            "name": "root",
+            "type": "record"
+        }
+    "#,
+    )
+    .unwrap();
+    assert_eq!(expected, convert_avro(&data_object_missing(), context));
+}
+
+#[test]
+fn test_bigquery_tuple_object_drop() {
+    let context = Context {
+        tuple_struct: true,
+        resolve_method: ResolveMethod::Drop,
+        ..Default::default()
+    };
+
+    let expected: Value = serde_json::from_str(
+        r#"
+        [
+            {
+                "mode": "REQUIRED",
+                "name": "f0_",
+                "type": "INT64"
+            },
+            {
+                "mode": "REQUIRED",
+                "name": "f1_",
+                "type": "RECORD",
+                "fields": [
+                    {"name": "first", "type": "STRING", "mode": "REQUIRED"}
+                ]
+            }
+        ]
+    "#,
+    )
+    .unwrap();
+    assert_eq!(expected, convert_bigquery(&data_object_missing(), context));
+}
+
+#[test]
+#[should_panic]
+fn test_avro_tuple_object_incompatible() {
+    let context = Context {
+        tuple_struct: true,
+        resolve_method: ResolveMethod::Drop,
+        ..Default::default()
+    };
+    convert_avro(&data_incompatible(), context);
+}
+
+#[test]
+#[should_panic]
+fn test_bigquery_tuple_object_incompatible() {
+    let context = Context {
+        tuple_struct: true,
+        resolve_method: ResolveMethod::Drop,
+        ..Default::default()
+    };
+    convert_bigquery(&data_incompatible(), context);
+}
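For reference, a sketch of how the incompatible case surfaces to a caller: the `["string", "integer"]` union has no single concrete column type, so the conversion panics, as asserted by the `#[should_panic]` tests above (the exact panic mechanism is internal to the crate and inferred here, not specified by the diff):

```rust
use jst::{convert_bigquery, Context, ResolveMethod};
use serde_json::json;

fn main() {
    // Same shape as data_incompatible() above: a union atom that cannot
    // be resolved into a single concrete column type.
    let schema = json!({
        "additionalItems": false,
        "items": [
            {"type": ["string", "integer"]}
        ]
    });
    let context = Context {
        tuple_struct: true,
        resolve_method: ResolveMethod::Drop,
        ..Default::default()
    };
    // Panics, mirroring the #[should_panic] tests.
    let _ = convert_bigquery(&schema, context);
}
```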