From e383bc236e684025567ff71caf3c75d176d3ab35 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Mon, 28 Jun 2021 16:50:57 -0400 Subject: [PATCH 1/8] YAML config files for C19 VSI dataset --- .../pipeline.yaml | 99 +++++++++++++++++++ .../dataset.yaml | 26 +++++ 2 files changed, 125 insertions(+) create mode 100644 datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml create mode 100644 datasets/covid19_vaccination_search_insights/dataset.yaml diff --git a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml new file mode 100644 index 000000000..5f3b9c278 --- /dev/null +++ b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml @@ -0,0 +1,99 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + - type: bigquery_table + table_id: covid19_vaccination_search_insights + description: "The COVID-19 Vaccination Search Insights shows aggregated, anonymized trends in searches patterns related to COVID-19 vaccination. The dataset provides a time series for each region showing the relative interest of searches per category of interest." + +dag: + initialize: + dag_id: vaccination_search_insights_to_bq + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-06-28' + max_active_runs: 1 + schedule_interval: "@weekly" + catchup: False + default_view: graph + + tasks: + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load global vaccination search insights CSV file from the covid19-open-data bucket to BQ" + args: + task_id: "gcs_to_bq_vaccination_search_insights" + bucket: "{{ var.json.covid19_vaccination_search_insights.source_bucket }}" + source_objects: ["{{ var.json.covid19_vaccination_search_insights.source_prefix }}/Global_vaccination_search_insights.csv"] + source_format: "CSV" + destination_project_dataset_table: "covid19_vaccination_search_insights.covid19_vaccination_search_insights" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: date + description: "The first day of the week (starting on Monday) on which the searches took place. For example, in the weekly data the row labeled 2021-04-19 represents the search activity for the week of April 19 to April 25, 2021, inclusive. Calendar days start and end at midnight Pacific Standard Time." + type: "DATE" + mode: "REQUIRED" + - name: country_region + description: "The name of the country in English. For example, United States." + type: "STRING" + mode: "REQUIRED" + - name: country_region_code + description: "The ISO 3166-1 code for the country. For example, US." + type: "STRING" + mode: "REQUIRED" + - name: sub_region_1 + description: "The name of a region in the country. For example, California." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_1_code + description: "A country-specific ISO 3166-2 code for the region. For example, US-CA." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_2 + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_2_code + description: "In the US, the FIPS code for a US county (or equivalent). For example, 06085." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_3 + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_2. For example, Downtown or postal_code." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_3_code + description: "In the US, the ZIP code. For example 94303." + type: "STRING" + mode: "NULLABLE" + - name: place_id + description: "The Google place ID for the most-specific subregion. Used in the Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." + type: "STRING" + mode: "REQUIRED" + - name: snf_covid19_vaccination + description: "The scaled normalized interest related to all COVID-19 vaccination for the region and date. For example, 87.02. Empty when data isn’t available." + type: "FLOAT" + mode: "NULLABLE" + - name: snf_vaccination_intent + description: "The scaled normalized interest related to vaccination intent for the region and date. For example, 22.69. Empty when data isn’t available." + type: "FLOAT" + mode: "NULLABLE" + - name: snf_safety_side_effects + description: "The scaled normalized interest related to safety and side effects of the vaccines for the region and date. For example, 17.96. Empty when data isn’t available." + type: "FLOAT" + mode: "NULLABLE" + + graph_paths: + - "gcs_to_bq_vaccination_search_insights" diff --git a/datasets/covid19_vaccination_search_insights/dataset.yaml b/datasets/covid19_vaccination_search_insights/dataset.yaml new file mode 100644 index 000000000..cf89942d5 --- /dev/null +++ b/datasets/covid19_vaccination_search_insights/dataset.yaml @@ -0,0 +1,26 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dataset: + name: covid19_vaccination_search_insights + friendly_name: ~ + description: "The COVID-19 Vaccination Search Insights shows aggregated, anonymized trends in searches patterns related to COVID-19 vaccination. The dataset provides a time series for each region showing the relative interest of searches per category of interest." + dataset_sources: ~ + terms_of_use: ~ + + +resources: + - type: bigquery_dataset + dataset_id: covid19_vaccination_search_insights + description: "The COVID-19 Vaccination Search Insights shows aggregated, anonymized trends in searches patterns related to COVID-19 vaccination. The dataset provides a time series for each region showing the relative interest of searches per category of interest." From 50a8bf1fe12196cd6395ffd2e8ecefb92ed09610 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Mon, 28 Jun 2021 16:51:22 -0400 Subject: [PATCH 2/8] generate .tf files --- ...d19_vaccination_search_insights_dataset.tf | 26 ++++++++++++++ ...19_vaccination_search_insights_pipeline.tf | 36 +++++++++++++++++++ .../_terraform/provider.tf | 28 +++++++++++++++ .../_terraform/variables.tf | 23 ++++++++++++ 4 files changed, 113 insertions(+) create mode 100644 datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf create mode 100644 datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf create mode 100644 datasets/covid19_vaccination_search_insights/_terraform/provider.tf create mode 100644 datasets/covid19_vaccination_search_insights/_terraform/variables.tf diff --git a/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf new file mode 100644 index 000000000..055905b69 --- /dev/null +++ b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_dataset" "covid19_vaccination_search_insights" { + dataset_id = "covid19_vaccination_search_insights" + project = var.project_id + description = "The COVID-19 Vaccination Search Insights shows aggregated, anonymized trends in searches patterns related to COVID-19 vaccination. The dataset provides a time series for each region showing the relative interest of searches per category of interest." +} + +output "bigquery_dataset-covid19_vaccination_search_insights-dataset_id" { + value = google_bigquery_dataset.covid19_vaccination_search_insights.dataset_id +} diff --git a/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf new file mode 100644 index 000000000..c5b13ccb3 --- /dev/null +++ b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf @@ -0,0 +1,36 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_table" "covid19_vaccination_search_insights" { + project = var.project_id + dataset_id = "covid19_vaccination_search_insights" + table_id = "covid19_vaccination_search_insights" + + description = "The COVID-19 Vaccination Search Insights shows aggregated, anonymized trends in searches patterns related to COVID-19 vaccination. The dataset provides a time series for each region showing the relative interest of searches per category of interest." + + depends_on = [ + google_bigquery_dataset.covid19_vaccination_search_insights + ] +} + +output "bigquery_table-covid19_vaccination_search_insights-table_id" { + value = google_bigquery_table.covid19_vaccination_search_insights.table_id +} + +output "bigquery_table-covid19_vaccination_search_insights-id" { + value = google_bigquery_table.covid19_vaccination_search_insights.id +} diff --git a/datasets/covid19_vaccination_search_insights/_terraform/provider.tf b/datasets/covid19_vaccination_search_insights/_terraform/provider.tf new file mode 100644 index 000000000..23ab87dcd --- /dev/null +++ b/datasets/covid19_vaccination_search_insights/_terraform/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/covid19_vaccination_search_insights/_terraform/variables.tf b/datasets/covid19_vaccination_search_insights/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/covid19_vaccination_search_insights/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + From ef8c541ce6d1cd9be8ea80f2cf4c3ef2232c7aff Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Mon, 28 Jun 2021 16:51:32 -0400 Subject: [PATCH 3/8] generate DAG file --- ...covid19_vaccination_search_insights_dag.py | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py diff --git a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py new file mode 100644 index 000000000..92442266f --- /dev/null +++ b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py @@ -0,0 +1,128 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.contrib.operators import gcs_to_bq + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-06-28", +} + + +with DAG( + dag_id="covid19_vaccination_search_insights.vaccination_search_insights_to_bq", + default_args=default_args, + max_active_runs=1, + schedule_interval="@weekly", + catchup=False, + default_view="graph", +) as dag: + + # Task to load global vaccination search insights CSV file from the covid19-open-data bucket to BQ + gcs_to_bq_vaccination_search_insights = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( + task_id="gcs_to_bq_vaccination_search_insights", + bucket="{{ var.json.covid19_vaccination_search_insights.source_bucket }}", + source_objects=[ + "{{ var.json.covid19_vaccination_search_insights.source_prefix }}/Global_vaccination_search_insights.csv" + ], + source_format="CSV", + destination_project_dataset_table="covid19_vaccination_search_insights.covid19_vaccination_search_insights", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "date", + "description": "The first day of the week (starting on Monday) on which the searches took place. For example, in the weekly data the row labeled 2021-04-19 represents the search activity for the week of April 19 to April 25, 2021, inclusive. Calendar days start and end at midnight Pacific Standard Time.", + "type": "DATE", + "mode": "REQUIRED", + }, + { + "name": "country_region", + "description": "The name of the country in English. For example, United States.", + "type": "STRING", + "mode": "REQUIRED", + }, + { + "name": "country_region_code", + "description": "The ISO 3166-1 code for the country. For example, US.", + "type": "STRING", + "mode": "REQUIRED", + }, + { + "name": "sub_region_1", + "description": "The name of a region in the country. For example, California.", + "type": "STRING", + "mode": "NULLABLE", + }, + { + "name": "sub_region_1_code", + "description": "A country-specific ISO 3166-2 code for the region. For example, US-CA.", + "type": "STRING", + "mode": "NULLABLE", + }, + { + "name": "sub_region_2", + "description": "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough.", + "type": "STRING", + "mode": "NULLABLE", + }, + { + "name": "sub_region_2_code", + "description": "In the US, the FIPS code for a US county (or equivalent). For example, 06085.", + "type": "STRING", + "mode": "NULLABLE", + }, + { + "name": "sub_region_3", + "description": "The name (or type) of a region in the country. Typically a subdivision of sub_region_2. For example, Downtown or postal_code.", + "type": "STRING", + "mode": "NULLABLE", + }, + { + "name": "sub_region_3_code", + "description": "In the US, the ZIP code. For example 94303.", + "type": "STRING", + "mode": "NULLABLE", + }, + { + "name": "place_id", + "description": "The Google place ID for the most-specific subregion. Used in the Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA.", + "type": "STRING", + "mode": "REQUIRED", + }, + { + "name": "snf_covid19_vaccination", + "description": "The scaled normalized interest related to all COVID-19 vaccination for the region and date. For example, 87.02. Empty when data isn’t available.", + "type": "FLOAT", + "mode": "NULLABLE", + }, + { + "name": "snf_vaccination_intent", + "description": "The scaled normalized interest related to vaccination intent for the region and date. For example, 22.69. Empty when data isn’t available.", + "type": "FLOAT", + "mode": "NULLABLE", + }, + { + "name": "snf_safety_side_effects", + "description": "The scaled normalized interest related to safety and side effects of the vaccines for the region and date. For example, 17.96. Empty when data isn’t available.", + "type": "FLOAT", + "mode": "NULLABLE", + }, + ], + ) + + gcs_to_bq_vaccination_search_insights From 8dc16b5e613c28d77a3fce6e0898788e5bc24ea8 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Tue, 29 Jun 2021 01:36:59 -0400 Subject: [PATCH 4/8] use sni instead of snf columns --- .../covid19_vaccination_search_insights_dag.py | 6 +++--- .../covid19_vaccination_search_insights/pipeline.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py index 92442266f..1e73fb138 100644 --- a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py +++ b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py @@ -105,19 +105,19 @@ "mode": "REQUIRED", }, { - "name": "snf_covid19_vaccination", + "name": "sni_covid19_vaccination", "description": "The scaled normalized interest related to all COVID-19 vaccination for the region and date. For example, 87.02. Empty when data isn’t available.", "type": "FLOAT", "mode": "NULLABLE", }, { - "name": "snf_vaccination_intent", + "name": "sni_vaccination_intent", "description": "The scaled normalized interest related to vaccination intent for the region and date. For example, 22.69. Empty when data isn’t available.", "type": "FLOAT", "mode": "NULLABLE", }, { - "name": "snf_safety_side_effects", + "name": "sni_safety_side_effects", "description": "The scaled normalized interest related to safety and side effects of the vaccines for the region and date. For example, 17.96. Empty when data isn’t available.", "type": "FLOAT", "mode": "NULLABLE", diff --git a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml index 5f3b9c278..655918287 100644 --- a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml +++ b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml @@ -82,15 +82,15 @@ dag: description: "The Google place ID for the most-specific subregion. Used in the Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." type: "STRING" mode: "REQUIRED" - - name: snf_covid19_vaccination + - name: sni_covid19_vaccination description: "The scaled normalized interest related to all COVID-19 vaccination for the region and date. For example, 87.02. Empty when data isn’t available." type: "FLOAT" mode: "NULLABLE" - - name: snf_vaccination_intent + - name: sni_vaccination_intent description: "The scaled normalized interest related to vaccination intent for the region and date. For example, 22.69. Empty when data isn’t available." type: "FLOAT" mode: "NULLABLE" - - name: snf_safety_side_effects + - name: sni_safety_side_effects description: "The scaled normalized interest related to safety and side effects of the vaccines for the region and date. For example, 17.96. Empty when data isn’t available." type: "FLOAT" mode: "NULLABLE" From 2b1fe79743333901713b93dd3b2b6f86f079a21b Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Wed, 30 Jun 2021 01:28:41 -0400 Subject: [PATCH 5/8] use partitioned tables --- ...d19_vaccination_search_insights_dataset.tf | 2 +- ...19_vaccination_search_insights_pipeline.tf | 89 ++++++++++++- ...covid19_vaccination_search_insights_dag.py | 16 +-- .../pipeline.yaml | 121 ++++++++++++++++-- .../dataset.yaml | 26 +++- 5 files changed, 233 insertions(+), 21 deletions(-) diff --git a/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf index 055905b69..d4d69f72b 100644 --- a/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf +++ b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_dataset.tf @@ -18,7 +18,7 @@ resource "google_bigquery_dataset" "covid19_vaccination_search_insights" { dataset_id = "covid19_vaccination_search_insights" project = var.project_id - description = "The COVID-19 Vaccination Search Insights shows aggregated, anonymized trends in searches patterns related to COVID-19 vaccination. The dataset provides a time series for each region showing the relative interest of searches per category of interest." + description = "Terms of use\nTo download or use the data, you must agree to the Google Terms of Service: https://policies.google.com/terms\n\nDescription\nThe COVID-19 Vaccination Search Insights data shows aggregated, anonymized trends in searches related to COVID-19 vaccination. The dataset provides a weekly time series for each region showing the relative interest of Google searches related to COVID-19 vaccination, across several categories.\n\nThe data is intended to help public health officials design, target, and evaluate public education campaigns.\n\nTo explore and download the data, use our interactive dashboard: http://goo.gle/covid19vaccinationinsights\nTo learn more about the dataset, how we generate it and preserve privacy, read the data documentation:\nhttps://storage.googleapis.com/gcs-public-datasets/COVID-19%20Vaccination%20Search%20Insights%20documentation.pdf" } output "bigquery_dataset-covid19_vaccination_search_insights-dataset_id" { diff --git a/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf index c5b13ccb3..eee21dc19 100644 --- a/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf +++ b/datasets/covid19_vaccination_search_insights/_terraform/covid19_vaccination_search_insights_pipeline.tf @@ -20,8 +20,95 @@ resource "google_bigquery_table" "covid19_vaccination_search_insights" { dataset_id = "covid19_vaccination_search_insights" table_id = "covid19_vaccination_search_insights" - description = "The COVID-19 Vaccination Search Insights shows aggregated, anonymized trends in searches patterns related to COVID-19 vaccination. The dataset provides a time series for each region showing the relative interest of searches per category of interest." + description = "Terms of use\nTo download or use the data, you must agree to the Google Terms of Service: https://policies.google.com/terms\n\nDescription\nThe COVID-19 Vaccination Search Insights data shows aggregated, anonymized trends in searches related to COVID-19 vaccination. The dataset provides a weekly time series for each region showing the relative interest of Google searches related to COVID-19 vaccination, across several categories.\n\nThe data is intended to help public health officials design, target, and evaluate public education campaigns.\n\nTo explore and download the data, use our interactive dashboard: http://goo.gle/covid19vaccinationinsights\n\nTo learn more about the dataset, how we generate it and preserve privacy, read the data documentation:\nhttps://storage.googleapis.com/gcs-public-datasets/COVID-19%20Vaccination%20Search%20Insights%20documentation.pdf" + time_partitioning { + type = "DAY" + require_partition_filter = false + } + clustering = ["sub_region_1_code", "sub_region_2_code", "sub_region_3_code", "place_id"] + schema = < Date: Wed, 30 Jun 2021 01:29:40 -0400 Subject: [PATCH 6/8] feat: modify BQ table template to use partitioning and clustering --- templates/terraform/google_bigquery_table.tf.jinja2 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/templates/terraform/google_bigquery_table.tf.jinja2 b/templates/terraform/google_bigquery_table.tf.jinja2 index 89e46c4ea..a5506c17a 100644 --- a/templates/terraform/google_bigquery_table.tf.jinja2 +++ b/templates/terraform/google_bigquery_table.tf.jinja2 @@ -23,6 +23,16 @@ resource "google_bigquery_table" "{{ tf_resource_name }}" { {% if description -%} description = {{ description|tojson }} {%- endif %} + {% if time_partitioning -%} + time_partitioning { + {%- for key, val in time_partitioning.items() %} + {{ key }} = {{ val|tojson }} + {% endfor -%} + } + {%- endif %} + {% if clustering -%} + clustering = {{ clustering|tojson }} + {%- endif %} {% if schema -%} schema = < Date: Fri, 9 Jul 2021 00:28:44 -0400 Subject: [PATCH 7/8] fixed YAML indentation --- .../pipeline.yaml | 104 +++++++++--------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml index 27aedcaa6..ff02fa364 100644 --- a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml +++ b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml @@ -145,58 +145,58 @@ dag: skip_leading_rows: 1 write_disposition: "WRITE_TRUNCATE" schema_fields: - - name: date - description: "The first day of the week (starting on Monday) on which the searches took place. For example, in the weekly data the row labeled 2021-04-19 represents the search activity for the week of April 19 to April 25, 2021, inclusive. Calendar days start and end at midnight Pacific Standard Time." - type: "DATE" - mode: "NULLABLE" - - name: country_region - description: "The name of the country in English. For example, United States." - type: "STRING" - mode: "NULLABLE" - - name: country_region_code - description: "The ISO 3166-1 code for the country. For example, US." - type: "STRING" - mode: "NULLABLE" - - name: sub_region_1 - description: "The name of a region in the country. For example, California." - type: "STRING" - mode: "NULLABLE" - - name: sub_region_1_code - description: "A country-specific ISO 3166-2 code for the region. For example, US-CA." - type: "STRING" - mode: "NULLABLE" - - name: sub_region_2 - description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough." - type: "STRING" - mode: "NULLABLE" - - name: sub_region_2_code - description: "In the US, the FIPS code for a US county (or equivalent). For example, 06085." - type: "STRING" - mode: "NULLABLE" - - name: sub_region_3 - description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_2. For example, Downtown or postal_code." - type: "STRING" - mode: "NULLABLE" - - name: sub_region_3_code - description: "In the US, the ZIP code. For example 94303." - type: "STRING" - mode: "NULLABLE" - - name: place_id - description: "The Google place ID for the most-specific subregion. Used in the Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." - type: "STRING" - mode: "NULLABLE" - - name: sni_covid19_vaccination - description: "The scaled normalized interest related to all COVID-19 vaccination for the region and date. For example, 87.02. Empty when data isn't available." - type: "FLOAT" - mode: "NULLABLE" - - name: sni_vaccination_intent - description: "The scaled normalized interest related to vaccination intent for the region and date. For example, 22.69. Empty when data isn't available." - type: "FLOAT" - mode: "NULLABLE" - - name: sni_safety_side_effects - description: "The scaled normalized interest related to safety and side effects of the vaccines for the region and date. For example, 17.96. Empty when data isn't available." - type: "FLOAT" - mode: "NULLABLE" + - name: date + description: "The first day of the week (starting on Monday) on which the searches took place. For example, in the weekly data the row labeled 2021-04-19 represents the search activity for the week of April 19 to April 25, 2021, inclusive. Calendar days start and end at midnight Pacific Standard Time." + type: "DATE" + mode: "NULLABLE" + - name: country_region + description: "The name of the country in English. For example, United States." + type: "STRING" + mode: "NULLABLE" + - name: country_region_code + description: "The ISO 3166-1 code for the country. For example, US." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_1 + description: "The name of a region in the country. For example, California." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_1_code + description: "A country-specific ISO 3166-2 code for the region. For example, US-CA." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_2 + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_2_code + description: "In the US, the FIPS code for a US county (or equivalent). For example, 06085." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_3 + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_2. For example, Downtown or postal_code." + type: "STRING" + mode: "NULLABLE" + - name: sub_region_3_code + description: "In the US, the ZIP code. For example 94303." + type: "STRING" + mode: "NULLABLE" + - name: place_id + description: "The Google place ID for the most-specific subregion. Used in the Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." + type: "STRING" + mode: "NULLABLE" + - name: sni_covid19_vaccination + description: "The scaled normalized interest related to all COVID-19 vaccination for the region and date. For example, 87.02. Empty when data isn't available." + type: "FLOAT" + mode: "NULLABLE" + - name: sni_vaccination_intent + description: "The scaled normalized interest related to vaccination intent for the region and date. For example, 22.69. Empty when data isn't available." + type: "FLOAT" + mode: "NULLABLE" + - name: sni_safety_side_effects + description: "The scaled normalized interest related to safety and side effects of the vaccines for the region and date. For example, 17.96. Empty when data isn't available." + type: "FLOAT" + mode: "NULLABLE" graph_paths: - "gcs_to_bq_vaccination_search_insights" From 38db643976b0a8d849c36f315e9c59b42b09ef05 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Fri, 9 Jul 2021 11:36:08 -0400 Subject: [PATCH 8/8] fixed DAG ID --- .../covid19_vaccination_search_insights_dag.py | 2 +- .../covid19_vaccination_search_insights/pipeline.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py index 8f91f8621..62f67daec 100644 --- a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py +++ b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/covid19_vaccination_search_insights_dag.py @@ -24,7 +24,7 @@ with DAG( - dag_id="covid19_vaccination_search_insights.vaccination_search_insights_to_bq", + dag_id="covid19_vaccination_search_insights.covid19_vaccination_search_insights", default_args=default_args, max_active_runs=1, schedule_interval="@hourly", diff --git a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml index ff02fa364..98e1feb57 100644 --- a/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml +++ b/datasets/covid19_vaccination_search_insights/covid19_vaccination_search_insights/pipeline.yaml @@ -123,7 +123,7 @@ resources: dag: initialize: - dag_id: vaccination_search_insights_to_bq + dag_id: covid19_vaccination_search_insights default_args: owner: "Google" depends_on_past: False