Skip to content

Commit

Permalink
feat: Onboard Google Trends dataset for top N terms (#92)
Browse files Browse the repository at this point in the history
* feat: Support BQ to BQ operator

* feat: Added BigQueryToBigQueryOperator in pipeline.yaml sample

* feat: Google trends pipeline config

* renamed top_n_terms table to top_terms

* regenerate DAG
  • Loading branch information
adlersantos committed Jun 17, 2021
1 parent 8eaaae9 commit df96d1d
Show file tree
Hide file tree
Showing 7 changed files with 309 additions and 0 deletions.
26 changes: 26 additions & 0 deletions datasets/google_trends/_terraform/google_trends_dataset.tf
@@ -0,0 +1,26 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery dataset created in the target project under the ID "google_trends".
resource "google_bigquery_dataset" "google_trends" {
  project     = var.project_id
  dataset_id  = "google_trends"
  description = "The Google Trends dataset will provide critical signals that individual users and businesses alike can leverage to make better data-driven decisions. This dataset simplifies the manual interaction with the existing Google Trends UI by automating and exposing anonymized, aggregated, and indexed search data in BigQuery."
}

# Exposes the ID of the dataset managed above.
output "bigquery_dataset-google_trends-dataset_id" {
  value = google_bigquery_dataset.google_trends.dataset_id
}
28 changes: 28 additions & 0 deletions datasets/google_trends/_terraform/provider.tf
@@ -0,0 +1,28 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Google provider: default project and region come from input variables, and
# API calls are made by impersonating the configured service account.
provider "google" {
  project                     = var.project_id
  region                      = var.region
  impersonate_service_account = var.impersonating_acct
}

# OpenID user info for the identity the provider is authenticated as.
data "google_client_openid_userinfo" "me" {}

# Email reported by the user-info lookup above.
output "impersonating-account" {
  value = data.google_client_openid_userinfo.me.email
}
56 changes: 56 additions & 0 deletions datasets/google_trends/_terraform/top_terms_pipeline.tf
@@ -0,0 +1,56 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery table "top_terms" inside the google_trends dataset.
resource "google_bigquery_table" "top_terms" {
  project = var.project_id

  # Reference the dataset resource instead of repeating the "google_trends"
  # literal: the value is the same, and Terraform derives the creation order
  # from the reference, so no explicit depends_on is needed.
  dataset_id = google_bigquery_dataset.google_trends.dataset_id
  table_id   = "top_terms"

  description = "Daily top 25 terms in the United States with score, ranking, time, and designated market area"
}

# Short table ID ("top_terms").
output "bigquery_table-top_terms-table_id" {
  value = google_bigquery_table.top_terms.table_id
}

# Resource ID of the table as reported by the provider.
output "bigquery_table-top_terms-id" {
  value = google_bigquery_table.top_terms.id
}

# BigQuery table "top_rising_terms" inside the google_trends dataset.
resource "google_bigquery_table" "top_rising_terms" {
  project = var.project_id

  # Reference the dataset resource instead of repeating the "google_trends"
  # literal: the value is the same, and Terraform derives the creation order
  # from the reference, so no explicit depends_on is needed.
  dataset_id = google_bigquery_dataset.google_trends.dataset_id
  table_id   = "top_rising_terms"

  description = "Daily top rising terms in the United States with score, ranking, time, and designated market area"
}

# Short table ID ("top_rising_terms").
output "bigquery_table-top_rising_terms-table_id" {
  value = google_bigquery_table.top_rising_terms.table_id
}

# Resource ID of the table as reported by the provider.
output "bigquery_table-top_rising_terms-id" {
  value = google_bigquery_table.top_rising_terms.id
}
23 changes: 23 additions & 0 deletions datasets/google_trends/_terraform/variables.tf
@@ -0,0 +1,23 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Used as the provider's default project and as the project of the BigQuery
# dataset and tables.
variable "project_id" {
  type        = string
  description = "ID of the Google Cloud project that hosts the BigQuery resources."
}

# NOTE(review): not referenced by any Terraform file visible in this change;
# presumably declared so a shared variables file can be reused - confirm.
variable "bucket_name_prefix" {}

# Passed to the provider's impersonate_service_account argument.
variable "impersonating_acct" {
  type        = string
  description = "Service account that the Google provider impersonates."
}

# Passed to the provider's region argument.
variable "region" {
  type        = string
  description = "Default region for the Google provider."
}

# NOTE(review): not referenced by any Terraform file visible in this change;
# presumably an environment label supplied by shared tooling - confirm.
variable "env" {}

58 changes: 58 additions & 0 deletions datasets/google_trends/dataset.yaml
@@ -0,0 +1,58 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

dataset:
  # Properties of the dataset that are shown to users of the data on the
  # Google Cloud website.

  # Must match the name of the folder that contains this dataset.yaml exactly.
  name: google_trends

  # Friendly, human-readable name of the dataset (not set).
  friendly_name: null

  # Short, descriptive summary of the dataset (not set).
  description: null

  # Sources the dataset is derived from, as a YAML list (not set).
  dataset_sources: null

  # Terms and conditions that users of the dataset should agree to, as a
  # YAML list (not set).
  terms_of_use: null


resources:
  # Google Cloud resources needed by this dataset. In principle, every
  # pipeline under the dataset should be able to share them.
  #
  # Keep only the resources the pipelines need and delete the rest. Support
  # for more Google Cloud resources is added over time; if a resource you
  # need is not supported, please file an issue on the repository.

  # BigQuery dataset that namespaces all tables managed by this folder.
  #
  # Required properties:
  #   dataset_id
  #
  # Optional properties:
  #   friendly_name  (user-friendly name of the dataset)
  #   description    (user-friendly description of the dataset)
  #   location       (geographic location where the dataset should reside)
  - type: bigquery_dataset
    dataset_id: google_trends
    description: "The Google Trends dataset will provide critical signals that individual users and businesses alike can leverage to make better data-driven decisions. This dataset simplifies the manual interaction with the existing Google Trends UI by automating and exposing anonymized, aggregated, and indexed search data in BigQuery."
60 changes: 60 additions & 0 deletions datasets/google_trends/top_terms/pipeline.yaml
@@ -0,0 +1,60 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
resources:
  # BigQuery tables declared for this pipeline.

  # Table of the daily top terms.
  - type: bigquery_table
    table_id: top_terms
    description: "Daily top 25 terms in the United States with score, ranking, time, and designated market area"

  # Table of the daily top rising terms.
  - type: bigquery_table
    table_id: top_rising_terms
    description: "Daily top rising terms in the United States with score, ranking, time, and designated market area"

dag:
  # Settings used to construct the DAG object.
  initialize:
    dag_id: top_terms
    default_args:
      owner: "Google"
      depends_on_past: False
      # Quoted so the value stays a string rather than a YAML date.
      start_date: "2021-06-01"
    max_active_runs: 1
    schedule_interval: "@daily"
    catchup: False
    default_view: graph

  # One entry per task. The table names and the service account are Jinja
  # templates that read the `google_trends` JSON Airflow variable.
  tasks:
    - operator: "BigQueryToBigQueryOperator"
      description: "Task to run a BQ to BQ operation"

      args:
        task_id: "fetch_and_load_top_n"
        source_project_dataset_tables:
          - "{{ var.json.google_trends.top_n.source_project_dataset_table }}"
        destination_project_dataset_table: "{{ var.json.google_trends.top_n.destination_project_dataset_table }}"
        impersonation_chain: "{{ var.json.google_trends.service_account }}"
        write_disposition: "WRITE_TRUNCATE"

    - operator: "BigQueryToBigQueryOperator"
      description: "Task to run a BQ to BQ operation"

      args:
        task_id: "fetch_and_load_top_rising"
        source_project_dataset_tables:
          - "{{ var.json.google_trends.top_rising.source_project_dataset_table }}"
        destination_project_dataset_table: "{{ var.json.google_trends.top_rising.destination_project_dataset_table }}"
        impersonation_chain: "{{ var.json.google_trends.service_account }}"
        write_disposition: "WRITE_TRUNCATE"

  # Each path names a single task, so no ordering is declared between them.
  graph_paths:
    - "fetch_and_load_top_n"
    - "fetch_and_load_top_rising"
58 changes: 58 additions & 0 deletions datasets/google_trends/top_terms/top_terms_dag.py
@@ -0,0 +1,58 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from airflow import DAG
from airflow.contrib.operators import bigquery_to_bigquery

# Arguments handed to the DAG as its `default_args`.
default_args = dict(
    owner="Google",
    depends_on_past=False,
    start_date="2021-06-01",
)


# Settings shared by both copy tasks: the service account to impersonate
# (a Jinja template resolved from the `google_trends` Airflow variable) and
# the write disposition applied to the destination tables.
_SERVICE_ACCOUNT = "{{ var.json.google_trends.service_account }}"
_WRITE_DISPOSITION = "WRITE_TRUNCATE"

# NOTE(review): `airflow.contrib` is a deprecated import path in Airflow 2;
# confirm the operator version deployed actually accepts `impersonation_chain`.
_BigQueryCopy = bigquery_to_bigquery.BigQueryToBigQueryOperator

with DAG(
    dag_id="google_trends.top_terms",
    default_args=default_args,
    schedule_interval="@daily",
    catchup=False,
    max_active_runs=1,
    default_view="graph",
) as dag:

    # BigQuery-to-BigQuery task for the `top_n` source and destination tables.
    fetch_and_load_top_n = _BigQueryCopy(
        task_id="fetch_and_load_top_n",
        source_project_dataset_tables=[
            "{{ var.json.google_trends.top_n.source_project_dataset_table }}"
        ],
        destination_project_dataset_table="{{ var.json.google_trends.top_n.destination_project_dataset_table }}",
        impersonation_chain=_SERVICE_ACCOUNT,
        write_disposition=_WRITE_DISPOSITION,
    )

    # BigQuery-to-BigQuery task for the `top_rising` source and destination
    # tables.
    fetch_and_load_top_rising = _BigQueryCopy(
        task_id="fetch_and_load_top_rising",
        source_project_dataset_tables=[
            "{{ var.json.google_trends.top_rising.source_project_dataset_table }}"
        ],
        destination_project_dataset_table="{{ var.json.google_trends.top_rising.destination_project_dataset_table }}",
        impersonation_chain=_SERVICE_ACCOUNT,
        write_disposition=_WRITE_DISPOSITION,
    )

    # Graph paths: each task is listed on its own, so no dependency is set
    # between them.
    fetch_and_load_top_n
    fetch_and_load_top_rising

0 comments on commit df96d1d

Please sign in to comment.