From eb6921c50995c68c737f68897d892729de6caeea Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Wed, 30 Jun 2021 01:29:40 -0400 Subject: [PATCH 1/5] feat: modify BQ table template to use partitioning and clustering --- templates/terraform/google_bigquery_table.tf.jinja2 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/templates/terraform/google_bigquery_table.tf.jinja2 b/templates/terraform/google_bigquery_table.tf.jinja2 index 89e46c4ea..a5506c17a 100644 --- a/templates/terraform/google_bigquery_table.tf.jinja2 +++ b/templates/terraform/google_bigquery_table.tf.jinja2 @@ -23,6 +23,16 @@ resource "google_bigquery_table" "{{ tf_resource_name }}" { {% if description -%} description = {{ description|tojson }} {%- endif %} + {% if time_partitioning -%} + time_partitioning { + {%- for key, val in time_partitioning.items() %} + {{ key }} = {{ val|tojson }} + {% endfor -%} + } + {%- endif %} + {% if clustering -%} + clustering = {{ clustering|tojson }} + {%- endif %} {% if schema -%} schema = < Date: Fri, 2 Jul 2021 14:51:21 -0400 Subject: [PATCH 2/5] added BQ table optional properties on pipeline.yaml sample --- samples/pipeline.yaml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/samples/pipeline.yaml b/samples/pipeline.yaml index 0be2290d2..dfdd89f0e 100644 --- a/samples/pipeline.yaml +++ b/samples/pipeline.yaml @@ -32,9 +32,31 @@ resources: # table_id table_id: PIPELINE_FOLDER_NAME + # Optional Properties: # Description of the table description: "This is a table description." + # Time-based partitioning configuration. There is no need for this property + # if you have a relatively small dataset to host on a BigQuery table. + time_partitioning: + + # The supported types are DAY, HOUR, MONTH, and YEAR, which will generate one partition per day, hour, month, and year, respectively. + type: "DAY" + + # If set to true, queries over this table require a partition filter that can be used for partition elimination to be specified. + require_partition_filter: false + + # Specifies column names to use for data clustering. Up to four top-level columns are allowed, and should be specified in descending priority order. + clustering: + - "column_1" + - "column_2" + - "column_3" + + # The table cannot be deleted without first disabling this property. + # Unless this field is set to false in Terraform state, a `terraform destroy` + # or `terraform apply` that would delete the table will fail. + deletion_protection: true + dag: # The DAG acronym stands for directed acyclic graph. This block represents # your data pipeline along with every property and configuration it needs to From c0d420bb29971d4d98113c16d4d11ba8b6813df8 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Fri, 2 Jul 2021 14:51:42 -0400 Subject: [PATCH 3/5] added deletion_protection property for BQ tables --- templates/terraform/google_bigquery_table.tf.jinja2 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/templates/terraform/google_bigquery_table.tf.jinja2 b/templates/terraform/google_bigquery_table.tf.jinja2 index a5506c17a..c09a06393 100644 --- a/templates/terraform/google_bigquery_table.tf.jinja2 +++ b/templates/terraform/google_bigquery_table.tf.jinja2 @@ -33,6 +33,9 @@ resource "google_bigquery_table" "{{ tf_resource_name }}" { {% if clustering -%} clustering = {{ clustering|tojson }} {%- endif %} + {% if deletion_protection -%} + deletion_protection = {{ deletion_protection|tojson }} + {%- endif %} {% if schema -%} schema = < Date: Fri, 2 Jul 2021 14:52:03 -0400 Subject: [PATCH 4/5] tests for new optional BQ table properties --- tests/scripts/test_generate_terraform.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/scripts/test_generate_terraform.py b/tests/scripts/test_generate_terraform.py index fd6e48225..261ac9c2d 100644 --- a/tests/scripts/test_generate_terraform.py +++ b/tests/scripts/test_generate_terraform.py @@ -542,7 +542,7 @@ def test_dataset_tf_has_no_bq_dataset_description_when_unspecified( assert not re.search(r"description\s+\=", result.group(1)) -def test_pipeline_tf_contains_bq_table_description_when_specified( +def test_pipeline_tf_contains_optional_properties_when_specified( dataset_path, pipeline_path, project_id, @@ -571,10 +571,13 @@ def test_pipeline_tf_contains_bq_table_description_when_specified( ) assert bq_table assert bq_table["description"] + assert bq_table["time_partitioning"] + assert bq_table["clustering"] + assert bq_table["deletion_protection"] # Match the "google_bigquery_table" properties, i.e. any lines between the # curly braces, in the *_pipeline.tf file - regexp = r"\"google_bigquery_table\" \"" + bq_table["table_id"] + r"\" \{(.*?)\}" + regexp = r"\"google_bigquery_table\" \"" + bq_table["table_id"] + r"\" \{(.*?)^\}" bq_table_tf_string = re.compile(regexp, flags=re.MULTILINE | re.DOTALL) for path_prefix in ( @@ -587,9 +590,12 @@ def test_pipeline_tf_contains_bq_table_description_when_specified( assert re.search(r"table_id\s+\=", result.group(1)) assert re.search(r"description\s+\=", result.group(1)) + assert re.search(r"time_partitioning\s+\{", result.group(1)) + assert re.search(r"clustering\s+\=", result.group(1)) + assert re.search(r"deletion_protection\s+\=", result.group(1)) -def test_pipeline_tf_has_no_bq_table_description_when_unspecified( +def test_pipeline_tf_has_no_optional_properties_when_unspecified( dataset_path, pipeline_path, project_id, @@ -608,6 +614,9 @@ def test_pipeline_tf_has_no_bq_table_description_when_unspecified( (r for r in config["resources"] if r["type"] == "bigquery_table"), None ) del bq_table["description"] + del bq_table["time_partitioning"] + del bq_table["clustering"] + del bq_table["deletion_protection"] with open(pipeline_path / "pipeline.yaml", "w") as file: yaml.dump(config, file) @@ -624,7 +633,7 @@ def test_pipeline_tf_has_no_bq_table_description_when_unspecified( # Match the "google_bigquery_table" properties, i.e. any lines between the # curly braces, in the *_pipeline.tf file - regexp = r"\"google_bigquery_table\" \"" + bq_table["table_id"] + r"\" \{(.*?)\}" + regexp = r"\"google_bigquery_table\" \"" + bq_table["table_id"] + r"\" \{(.*?)^\}" bq_table_tf_string = re.compile(regexp, flags=re.MULTILINE | re.DOTALL) for path_prefix in ( @@ -637,6 +646,9 @@ def test_pipeline_tf_has_no_bq_table_description_when_unspecified( assert re.search(r"table_id\s+\=", result.group(1)) assert not re.search(r"description\s+\=", result.group(1)) + assert not re.search(r"time_partitioning\s+\{", result.group(1)) + assert not re.search(r"clustering\s+\=", result.group(1)) + assert not re.search(r"deletion_protection\s+\=", result.group(1)) def test_bq_table_can_have_a_description_with_newlines_and_quotes( From 8eab2b5f5e23af67091414b5be93302e5debc309 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Fri, 9 Jul 2021 17:04:00 -0400 Subject: [PATCH 5/5] fix: partition Google Trends tables (#118) --- .../_terraform/top_terms_pipeline.tf | 80 +++++++++++++++++++ .../google_trends/top_terms/pipeline.yaml | 70 ++++++++++++++++ 2 files changed, 150 insertions(+) diff --git a/datasets/google_trends/_terraform/top_terms_pipeline.tf b/datasets/google_trends/_terraform/top_terms_pipeline.tf index 61029dd55..5560b317f 100644 --- a/datasets/google_trends/_terraform/top_terms_pipeline.tf +++ b/datasets/google_trends/_terraform/top_terms_pipeline.tf @@ -21,7 +21,47 @@ resource "google_bigquery_table" "top_terms" { table_id = "top_terms" description = "Daily top 25 terms in the United States with score, ranking, time, and designated market area" + time_partitioning { + type = "DAY" + field = "refresh_date" + + require_partition_filter = false + } + + + schema = <