From 458d53e26badcea4c56b6ac50dcac0fc5b6f61fe Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Fri, 11 Jun 2021 17:49:49 -0400 Subject: [PATCH 1/5] feat: Support BQ to BQ operator --- scripts/dag_imports.json | 4 ++++ scripts/generate_dag.py | 1 + 2 files changed, 5 insertions(+) diff --git a/scripts/dag_imports.json b/scripts/dag_imports.json index 973d4acd2..9af103771 100644 --- a/scripts/dag_imports.json +++ b/scripts/dag_imports.json @@ -16,6 +16,10 @@ "import": "from airflow.contrib.operators import bigquery_operator", "class": "bigquery_operator.BigQueryOperator" }, + "BigQueryToBigQueryOperator": { + "import": "from airflow.contrib.operators import bigquery_to_bigquery", + "class": "bigquery_to_bigquery.BigQueryToBigQueryOperator" + }, "KubernetesPodOperator": { "import": "from airflow.contrib.operators import kubernetes_pod_operator", "class": "kubernetes_pod_operator.KubernetesPodOperator" diff --git a/scripts/generate_dag.py b/scripts/generate_dag.py index 7136082d1..731a0378a 100644 --- a/scripts/generate_dag.py +++ b/scripts/generate_dag.py @@ -33,6 +33,7 @@ "GoogleCloudStorageToGoogleCloudStorageOperator", "GoogleCloudStorageDeleteOperator", "BigQueryOperator", + "BigQueryToBigQueryOperator", "KubernetesPodOperator", } From 4d666baf86e74319d87d3ec242c93f4bec24feb3 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Fri, 11 Jun 2021 17:56:09 -0400 Subject: [PATCH 2/5] feat: Added BigQueryToBigQueryOperator in pipeline.yaml sample --- samples/pipeline.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/samples/pipeline.yaml b/samples/pipeline.yaml index fbb65fec1..0be2290d2 100644 --- a/samples/pipeline.yaml +++ b/samples/pipeline.yaml @@ -278,6 +278,16 @@ dag: limit_memory: "250M" limit_cpu: "1" + - operator: "BigQueryToBigQueryOperator" + description: "Task to run a BQ to BQ operator" + + args: + task_id: "sample_bq_to_bq_task" + source_project_dataset_tables: ["{{ var.json.DATASET_FOLDER_NAME.PIPELINE_NAME.source_project_dataset_table }}"] + destination_project_dataset_table: "{{ var.json.DATASET_FOLDER_NAME.PIPELINE_NAME.destination_project_dataset_table }}" + impersonation_chain: "{{ var.json.DATASET_FOLDER_NAME.service_account }}" + write_disposition: "WRITE_TRUNCATE" + graph_paths: # This is where you specify the relationships (i.e. directed paths/edges) # among the tasks specified above. Use the bitshift operator to define the @@ -286,5 +296,5 @@ dag: # For more info, see # https://airflow.apache.org/docs/apache-airflow/stable/tutorial.html#setting-up-dependencies - "sample_bash_task >> [sample_gcs_to_bq_task, sample_gcs_to_gcs_task]" - - "sample_gcs_to_bq_task >> sample_bq_sql_task" + - "sample_gcs_to_bq_task >> [sample_bq_sql_task, sample_bq_to_bq_task]" - "sample_bq_sql_task >> sample_gcs_delete_task" From 9a1d3704331776f15352e476f77d0c7a0e25c234 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Mon, 14 Jun 2021 10:57:10 -0400 Subject: [PATCH 3/5] feat: Google trends pipeline config --- datasets/google_trends/dataset.yaml | 58 ++++++++++++++++++ .../google_trends/top_n_terms/pipeline.yaml | 60 +++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 datasets/google_trends/dataset.yaml create mode 100644 datasets/google_trends/top_n_terms/pipeline.yaml diff --git a/datasets/google_trends/dataset.yaml b/datasets/google_trends/dataset.yaml new file mode 100644 index 000000000..267dcdfd0 --- /dev/null +++ b/datasets/google_trends/dataset.yaml @@ -0,0 +1,58 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dataset: + # The `dataset` block includes properties for your dataset that will be shown + # to users of your data on the Google Cloud website. + + # Must be exactly the same name as the folder name your dataset.yaml is in. + name: google_trends + + # A friendly, human-readable name of the dataset + friendly_name: ~ + + # A short, descriptive summary of the dataset. + description: ~ + + # A list of sources the dataset is derived from, using the YAML list syntax. + dataset_sources: ~ + + # A list of terms and conditions that users of the dataset should agree on, + # using the YAML list syntax. + terms_of_use: ~ + + +resources: + # A list of Google Cloud resources needed by your dataset. In principle, all + # pipelines under a dataset should be able to share these resources. + # + # The currently supported resources are shown below. Use only the resources + # you need, and delete the rest as needed by your pipeline. + # + # We will keep adding to the list below to support more Google Cloud resources + # over time. If a resource you need isn't supported, please file an issue on + # the repository. + + - type: bigquery_dataset + # Google BigQuery dataset to namespace all tables managed by this folder + # + # Required Properties: + # dataset_id + # + # Optional Properties: + # friendly_name (A user-friendly name of the dataset) + # description (A user-friendly description of the dataset) + # location (The geographic location where the dataset should reside) + dataset_id: google_trends + description: "The Google Trends dataset will provide critical signals that individual users and businesses alike can leverage to make better data-driven decisions. This dataset simplifies the manual interaction with the existing Google Trends UI by automating and exposing anonymized, aggregated, and indexed search data in BigQuery." diff --git a/datasets/google_trends/top_n_terms/pipeline.yaml b/datasets/google_trends/top_n_terms/pipeline.yaml new file mode 100644 index 000000000..ff3f849e7 --- /dev/null +++ b/datasets/google_trends/top_n_terms/pipeline.yaml @@ -0,0 +1,60 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + - type: bigquery_table + table_id: top_n_terms + description: "Daily top 25 terms in the United States with score, ranking, time, and designated market area" + + - type: bigquery_table + table_id: top_rising_terms + description: "Daily top rising terms in the United States with score, ranking, time, and designated market area" + +dag: + initialize: + dag_id: top_n_terms + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-06-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + - operator: "BigQueryToBigQueryOperator" + description: "Task to run a BQ to BQ operation" + + args: + task_id: "fetch_and_load_top_n" + source_project_dataset_tables: ["{{ var.json.google_trends.top_n.source_project_dataset_table }}"] + destination_project_dataset_table: "{{ var.json.google_trends.top_n.destination_project_dataset_table }}" + impersonation_chain: "{{ var.json.google_trends.service_account }}" + write_disposition: "WRITE_TRUNCATE" + + - operator: "BigQueryToBigQueryOperator" + description: "Task to run a BQ to BQ operation" + + args: + task_id: "fetch_and_load_top_rising" + source_project_dataset_tables: ["{{ var.json.google_trends.top_rising.source_project_dataset_table }}"] + destination_project_dataset_table: "{{ var.json.google_trends.top_rising.destination_project_dataset_table }}" + impersonation_chain: "{{ var.json.google_trends.service_account }}" + write_disposition: "WRITE_TRUNCATE" + + graph_paths: + - "fetch_and_load_top_n" + - "fetch_and_load_top_rising" From 679e61b8dcafce771bb39bdaac80f60af8ae726b Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Mon, 14 Jun 2021 13:31:16 -0400 Subject: [PATCH 4/5] renamed top_n_terms table to top_terms --- .../_terraform/google_trends_dataset.tf | 26 +++++++++ datasets/google_trends/_terraform/provider.tf | 28 ++++++++++ .../_terraform/top_terms_pipeline.tf | 56 +++++++++++++++++++ .../google_trends/_terraform/variables.tf | 23 ++++++++ .../{top_n_terms => top_terms}/pipeline.yaml | 4 +- 5 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 datasets/google_trends/_terraform/google_trends_dataset.tf create mode 100644 datasets/google_trends/_terraform/provider.tf create mode 100644 datasets/google_trends/_terraform/top_terms_pipeline.tf create mode 100644 datasets/google_trends/_terraform/variables.tf rename datasets/google_trends/{top_n_terms => top_terms}/pipeline.yaml (97%) diff --git a/datasets/google_trends/_terraform/google_trends_dataset.tf b/datasets/google_trends/_terraform/google_trends_dataset.tf new file mode 100644 index 000000000..709ff4556 --- /dev/null +++ b/datasets/google_trends/_terraform/google_trends_dataset.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_dataset" "google_trends" { + dataset_id = "google_trends" + project = var.project_id + description = "The Google Trends dataset will provide critical signals that individual users and businesses alike can leverage to make better data-driven decisions. This dataset simplifies the manual interaction with the existing Google Trends UI by automating and exposing anonymized, aggregated, and indexed search data in BigQuery." +} + +output "bigquery_dataset-google_trends-dataset_id" { + value = google_bigquery_dataset.google_trends.dataset_id +} diff --git a/datasets/google_trends/_terraform/provider.tf b/datasets/google_trends/_terraform/provider.tf new file mode 100644 index 000000000..23ab87dcd --- /dev/null +++ b/datasets/google_trends/_terraform/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/google_trends/_terraform/top_terms_pipeline.tf b/datasets/google_trends/_terraform/top_terms_pipeline.tf new file mode 100644 index 000000000..61029dd55 --- /dev/null +++ b/datasets/google_trends/_terraform/top_terms_pipeline.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_table" "top_terms" { + project = var.project_id + dataset_id = "google_trends" + table_id = "top_terms" + + description = "Daily top 25 terms in the United States with score, ranking, time, and designated market area" + + depends_on = [ + google_bigquery_dataset.google_trends + ] +} + +output "bigquery_table-top_terms-table_id" { + value = google_bigquery_table.top_terms.table_id +} + +output "bigquery_table-top_terms-id" { + value = google_bigquery_table.top_terms.id +} + +resource "google_bigquery_table" "top_rising_terms" { + project = var.project_id + dataset_id = "google_trends" + table_id = "top_rising_terms" + + description = "Daily top rising terms in the United States with score, ranking, time, and designated market area" + + depends_on = [ + google_bigquery_dataset.google_trends + ] +} + +output "bigquery_table-top_rising_terms-table_id" { + value = google_bigquery_table.top_rising_terms.table_id +} + +output "bigquery_table-top_rising_terms-id" { + value = google_bigquery_table.top_rising_terms.id +} diff --git a/datasets/google_trends/_terraform/variables.tf b/datasets/google_trends/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/google_trends/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/google_trends/top_n_terms/pipeline.yaml b/datasets/google_trends/top_terms/pipeline.yaml similarity index 97% rename from datasets/google_trends/top_n_terms/pipeline.yaml rename to datasets/google_trends/top_terms/pipeline.yaml index ff3f849e7..50e0b1dde 100644 --- a/datasets/google_trends/top_n_terms/pipeline.yaml +++ b/datasets/google_trends/top_terms/pipeline.yaml @@ -15,7 +15,7 @@ --- resources: - type: bigquery_table - table_id: top_n_terms + table_id: top_terms description: "Daily top 25 terms in the United States with score, ranking, time, and designated market area" - type: bigquery_table @@ -24,7 +24,7 @@ resources: dag: initialize: - dag_id: top_n_terms + dag_id: top_terms default_args: owner: "Google" depends_on_past: False From 7dfb031205da9b7c60c0fba94fc5a661de2a0e6d Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Thu, 17 Jun 2021 14:52:16 -0400 Subject: [PATCH 5/5] regenerate DAG --- .../google_trends/top_terms/top_terms_dag.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 datasets/google_trends/top_terms/top_terms_dag.py diff --git a/datasets/google_trends/top_terms/top_terms_dag.py b/datasets/google_trends/top_terms/top_terms_dag.py new file mode 100644 index 000000000..4e98e112c --- /dev/null +++ b/datasets/google_trends/top_terms/top_terms_dag.py @@ -0,0 +1,58 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.contrib.operators import bigquery_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-06-01", +} + + +with DAG( + dag_id="google_trends.top_terms", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Task to run a BQ to BQ operation + fetch_and_load_top_n = bigquery_to_bigquery.BigQueryToBigQueryOperator( + task_id="fetch_and_load_top_n", + source_project_dataset_tables=[ + "{{ var.json.google_trends.top_n.source_project_dataset_table }}" + ], + destination_project_dataset_table="{{ var.json.google_trends.top_n.destination_project_dataset_table }}", + impersonation_chain="{{ var.json.google_trends.service_account }}", + write_disposition="WRITE_TRUNCATE", + ) + + # Task to run a BQ to BQ operation + fetch_and_load_top_rising = bigquery_to_bigquery.BigQueryToBigQueryOperator( + task_id="fetch_and_load_top_rising", + source_project_dataset_tables=[ + "{{ var.json.google_trends.top_rising.source_project_dataset_table }}" + ], + destination_project_dataset_table="{{ var.json.google_trends.top_rising.destination_project_dataset_table }}", + impersonation_chain="{{ var.json.google_trends.service_account }}", + write_disposition="WRITE_TRUNCATE", + ) + + fetch_and_load_top_n + fetch_and_load_top_rising