From e68b4f87c19e1c1d1c370c042861fb17d6d89957 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Tue, 8 Jun 2021 19:31:07 -0400 Subject: [PATCH] feat: Onboard COVID-19 Vaccination Access dataset (#74) --- .../covid19_vaccination_access_dataset.tf | 26 + .../_terraform/provider.tf | 28 ++ .../_terraform/vaccination_access_dataset.tf | 25 + .../vaccination_access_to_bq_pipeline.tf | 96 ++++ .../_terraform/variables.tf | 23 + .../covid19_vaccination_access/dataset.yaml | 58 +++ .../vaccination_access_to_bq/pipeline.yaml | 336 +++++++++++++ .../vaccination_access_to_bq_dag.py | 458 ++++++++++++++++++ 8 files changed, 1050 insertions(+) create mode 100644 datasets/covid19_vaccination_access/_terraform/covid19_vaccination_access_dataset.tf create mode 100644 datasets/covid19_vaccination_access/_terraform/provider.tf create mode 100644 datasets/covid19_vaccination_access/_terraform/vaccination_access_dataset.tf create mode 100644 datasets/covid19_vaccination_access/_terraform/vaccination_access_to_bq_pipeline.tf create mode 100644 datasets/covid19_vaccination_access/_terraform/variables.tf create mode 100644 datasets/covid19_vaccination_access/dataset.yaml create mode 100644 datasets/covid19_vaccination_access/vaccination_access_to_bq/pipeline.yaml create mode 100644 datasets/covid19_vaccination_access/vaccination_access_to_bq/vaccination_access_to_bq_dag.py diff --git a/datasets/covid19_vaccination_access/_terraform/covid19_vaccination_access_dataset.tf b/datasets/covid19_vaccination_access/_terraform/covid19_vaccination_access_dataset.tf new file mode 100644 index 000000000..2d848dc6b --- /dev/null +++ b/datasets/covid19_vaccination_access/_terraform/covid19_vaccination_access_dataset.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_dataset" "covid19_vaccination_access" { + dataset_id = "covid19_vaccination_access" + project = var.project_id + description = "The dataset contains catchment areas surrounding COVID-19 vaccination sites (sometimes called facilities). A catchment area represents the area within which a site can be reached within a designated period of time. Each vaccination site has a number of catchment areas, each representing a combination of a typical traveling time (for example, 15 minutes or less) and mode of transport (such as, walking, driving, or public transport)." +} + +output "bigquery_dataset-covid19_vaccination_access-dataset_id" { + value = google_bigquery_dataset.covid19_vaccination_access.dataset_id +} diff --git a/datasets/covid19_vaccination_access/_terraform/provider.tf b/datasets/covid19_vaccination_access/_terraform/provider.tf new file mode 100644 index 000000000..23ab87dcd --- /dev/null +++ b/datasets/covid19_vaccination_access/_terraform/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/covid19_vaccination_access/_terraform/vaccination_access_dataset.tf b/datasets/covid19_vaccination_access/_terraform/vaccination_access_dataset.tf new file mode 100644 index 000000000..bea62a017 --- /dev/null +++ b/datasets/covid19_vaccination_access/_terraform/vaccination_access_dataset.tf @@ -0,0 +1,25 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_dataset" "vaccination_access" { + dataset_id = "vaccination_access" + project = var.project_id +} + +output "bigquery_dataset-vaccination_access-dataset_id" { + value = google_bigquery_dataset.vaccination_access.dataset_id +} diff --git a/datasets/covid19_vaccination_access/_terraform/vaccination_access_to_bq_pipeline.tf b/datasets/covid19_vaccination_access/_terraform/vaccination_access_to_bq_pipeline.tf new file mode 100644 index 000000000..0173b9809 --- /dev/null +++ b/datasets/covid19_vaccination_access/_terraform/vaccination_access_to_bq_pipeline.tf @@ -0,0 +1,96 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "facility_boundary_us_all" { + project = var.project_id + dataset_id = "covid19_vaccination_access" + table_id = "facility_boundary_us_all" + + + + depends_on = [ + google_bigquery_dataset.covid19_vaccination_access + ] +} + +output "bigquery_table-facility_boundary_us_all-table_id" { + value = google_bigquery_table.facility_boundary_us_all.table_id +} + +output "bigquery_table-facility_boundary_us_all-id" { + value = google_bigquery_table.facility_boundary_us_all.id +} + +resource "google_bigquery_table" "facility_boundary_us_drive" { + project = var.project_id + dataset_id = "covid19_vaccination_access" + table_id = "facility_boundary_us_drive" + + + + depends_on = [ + google_bigquery_dataset.covid19_vaccination_access + ] +} + +output "bigquery_table-facility_boundary_us_drive-table_id" { + value = google_bigquery_table.facility_boundary_us_drive.table_id +} + +output "bigquery_table-facility_boundary_us_drive-id" { + value = google_bigquery_table.facility_boundary_us_drive.id +} + +resource "google_bigquery_table" "facility_boundary_us_transit" { + project = var.project_id + dataset_id = "covid19_vaccination_access" + table_id = "facility_boundary_us_transit" + + + + depends_on = [ + google_bigquery_dataset.covid19_vaccination_access + ] +} + +output "bigquery_table-facility_boundary_us_transit-table_id" { + value = google_bigquery_table.facility_boundary_us_transit.table_id +} + +output "bigquery_table-facility_boundary_us_transit-id" { + value = google_bigquery_table.facility_boundary_us_transit.id +} + +resource "google_bigquery_table" "facility_boundary_us_walk" { + project = var.project_id + dataset_id = "covid19_vaccination_access" + table_id = "facility_boundary_us_walk" + + + + depends_on = [ + google_bigquery_dataset.covid19_vaccination_access + ] +} + +output "bigquery_table-facility_boundary_us_walk-table_id" { + value = google_bigquery_table.facility_boundary_us_walk.table_id +} + +output 
"bigquery_table-facility_boundary_us_walk-id" { + value = google_bigquery_table.facility_boundary_us_walk.id +} diff --git a/datasets/covid19_vaccination_access/_terraform/variables.tf b/datasets/covid19_vaccination_access/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/covid19_vaccination_access/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/covid19_vaccination_access/dataset.yaml b/datasets/covid19_vaccination_access/dataset.yaml new file mode 100644 index 000000000..b02e9dbfe --- /dev/null +++ b/datasets/covid19_vaccination_access/dataset.yaml @@ -0,0 +1,58 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +dataset: + # The `dataset` block includes properties for your dataset that will be shown + # to users of your data on the Google Cloud website. + + # Must exactly match the name of the folder that contains this dataset.yaml. + name: covid19_vaccination_access + + # A friendly, human-readable name of the dataset + friendly_name: ~ + + # A short, descriptive summary of the dataset. + description: "The dataset contains catchment areas surrounding COVID-19 vaccination sites (sometimes called facilities). A catchment area represents the area within which a site can be reached within a designated period of time. Each vaccination site has a number of catchment areas, each representing a combination of a typical traveling time (for example, 15 minutes or less) and mode of transport (such as, walking, driving, or public transport)." + + # A list of sources the dataset is derived from, using the YAML list syntax. + dataset_sources: ~ + + # A list of terms and conditions that users of the dataset should agree to, + # using the YAML list syntax. + terms_of_use: ~ + + +resources: + # A list of Google Cloud resources needed by your dataset. In principle, all + # pipelines under a dataset should be able to share these resources. + # + # The currently supported resources are shown below. Use only the resources + # you need, and delete the rest as needed by your pipeline. + # + # We will keep adding to the list below to support more Google Cloud resources + # over time. If a resource you need isn't supported, please file an issue on + # the repository.
+ + - type: bigquery_dataset + # Google BigQuery dataset to namespace all tables managed by this folder + # + # Required Properties: + # dataset_id + # + # Optional Properties: + # friendly_name (A user-friendly name of the dataset) + # description (A user-friendly description of the dataset) + # location (The geographic location where the dataset should reside) + dataset_id: covid19_vaccination_access + description: "The dataset contains catchment areas surrounding COVID-19 vaccination sites (sometimes called facilities). A catchment area represents the area within which a site can be reached within a designated period of time. Each vaccination site has a number of catchment areas, each representing a combination of a typical traveling time (for example, 15 minutes or less) and mode of transport (such as, walking, driving, or public transport)." diff --git a/datasets/covid19_vaccination_access/vaccination_access_to_bq/pipeline.yaml b/datasets/covid19_vaccination_access/vaccination_access_to_bq/pipeline.yaml new file mode 100644 index 000000000..cac89234e --- /dev/null +++ b/datasets/covid19_vaccination_access/vaccination_access_to_bq/pipeline.yaml @@ -0,0 +1,336 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + - type: bigquery_table + table_id: facility_boundary_us_all + description: "This table is formed by combining the data from the individual US tables of drive, transit and walk. 
It represents the boundaries of areas surrounding vaccination facilities from which people can reach the facility within a certain duration. It is computed for three travel modes (drive, transit, and walk) and for predetermined time periods." + - type: bigquery_table + table_id: facility_boundary_us_drive + description: "This table represents the boundaries of areas surrounding vaccination facilities from which people can reach the facility by driving within predetermined time periods." + - type: bigquery_table + table_id: facility_boundary_us_transit + description: "This table represents the boundaries of areas surrounding vaccination facilities from which people can reach the facility by public transit within predetermined time periods." + - type: bigquery_table + table_id: facility_boundary_us_walk + description: "This table represents the boundaries of areas surrounding vaccination facilities from which people can reach the facility by walking within predetermined time periods." + +dag: + initialize: + dag_id: vaccination_access_to_bq + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-05-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV file from covid19-open-data bucket to facility_boundary_us_all" + args: + task_id: "gcs_to_bq_table_us_all" + bucket: "{{ var.json.covid19_vaccination_access.source_bucket }}" + source_objects: ["{{ var.json.covid19_vaccination_access.source_prefix }}/facility-boundary-us-all.csv"] + source_format: "CSV" + destination_project_dataset_table: "covid19_vaccination_access.facility_boundary_us_all" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "facility_place_id" + type: "STRING" + mode: "REQUIRED" + description: "The Google Place ID of the vaccination site. For example, ChIJV3woGFkSK4cRWP9s3-kIFGk."
+ - name: "facility_provider_id" + type: "STRING" + mode: "NULLABLE" + description: "An identifier imported from the provider of the vaccination site information. In the US, we use the ID provided by VaccineFinder when available. For example, 7ede5bd5-44da-4a59-b4d9-b3a49c53472c." + - name: "facility_name" + type: "STRING" + mode: "NULLABLE" + description: "The name of the vaccination site. For example, St. Joseph's Hospital." + - name: "facility_latitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The latitude of the vaccination site. For example, 36.0507" + - name: "facility_longitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The longitude of the vaccination site. For example, 41.4356" + - name: "facility_country_region" + type: "STRING" + mode: "NULLABLE" + description: "The name of the country or region in English. For example, United States." + - name: "facility_country_code" + type: "STRING" + mode: "NULLABLE" + description: "The ISO 3166-1 code for the country or region. For example, US." + - name: "facility_sub_region_1" + type: "STRING" + mode: "NULLABLE" + description: "The name of a region in the country. For example, California." + - name: "facility_sub_region_1_code" + type: "STRING" + mode: "NULLABLE" + description: "A country-specific ISO 3166-2 code for the region. For example, US-CA." + - name: "facility_sub_region_2" + type: "STRING" + mode: "NULLABLE" + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough." + - name: "facility_sub_region_2_code" + type: "STRING" + mode: "NULLABLE" + description: "In the US, the FIPS code for a US county (or equivalent). For example, 06085." + - name: "facility_region_place_id" + type: "STRING" + mode: "NULLABLE" + description: "The Google place ID for the most-specific region, used in Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." 
+ - name: "mode_of_transportation" + type: "STRING" + mode: "NULLABLE" + description: "The mode of transport used to calculate the catchment boundary. For example, driving." + - name: "travel_time_threshold_minutes" + type: "INTEGER" + mode: "NULLABLE" + description: "The maximum travel time, in minutes, used to calculate the catchment boundary. For example, 30." + - name: "facility_catchment_boundary" + type: "GEOGRAPHY" + mode: "NULLABLE" + description: "A GeoJSON representation of the catchment area boundary of the site, for a particular mode of transportation and travel time threshold. Consists of multiple latitude and longitude points." + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV file from covid19-open-data bucket to facility_boundary_us_drive" + args: + task_id: "gcs_to_bq_table_us_drive" + bucket: "{{ var.json.covid19_vaccination_access.source_bucket }}" + source_objects: ["{{ var.json.covid19_vaccination_access.source_prefix }}/facility-boundary-us-drive.csv"] + source_format: "CSV" + destination_project_dataset_table: "covid19_vaccination_access.facility_boundary_us_drive" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "facility_place_id" + type: "STRING" + mode: "REQUIRED" + description: "The Google Place ID of the vaccination site. For example, ChIJV3woGFkSK4cRWP9s3-kIFGk." + - name: "facility_provider_id" + type: "STRING" + mode: "NULLABLE" + description: "An identifier imported from the provider of the vaccination site information. In the US, we use the ID provided by VaccineFinder when available. For example, 7ede5bd5-44da-4a59-b4d9-b3a49c53472c." + - name: "facility_name" + type: "STRING" + mode: "NULLABLE" + description: "The name of the vaccination site. For example, St. Joseph's Hospital." + - name: "facility_latitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The latitude of the vaccination site. 
For example, 36.0507" + - name: "facility_longitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The longitude of the vaccination site. For example, 41.4356" + - name: "facility_country_region" + type: "STRING" + mode: "NULLABLE" + description: "The name of the country or region in English. For example, United States." + - name: "facility_country_code" + type: "STRING" + mode: "NULLABLE" + description: "The ISO 3166-1 code for the country or region. For example, US." + - name: "facility_sub_region_1" + type: "STRING" + mode: "NULLABLE" + description: "The name of a region in the country. For example, California." + - name: "facility_sub_region_1_code" + type: "STRING" + mode: "NULLABLE" + description: "A country-specific ISO 3166-2 code for the region. For example, US-CA." + - name: "facility_sub_region_2" + type: "STRING" + mode: "NULLABLE" + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough." + - name: "facility_sub_region_2_code" + type: "STRING" + mode: "NULLABLE" + description: "In the US, the FIPS code for a US county (or equivalent). For example, 06085." + - name: "facility_region_place_id" + type: "STRING" + mode: "NULLABLE" + description: "The Google place ID for the most-specific region, used in Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." + - name: "mode_of_transportation" + type: "STRING" + mode: "NULLABLE" + description: "The mode of transport used to calculate the catchment boundary. For example, driving." + - name: "travel_time_threshold_minutes" + type: "INTEGER" + mode: "NULLABLE" + description: "The maximum travel time, in minutes, used to calculate the catchment boundary. For example, 30." 
+ - name: "facility_catchment_boundary" + type: "GEOGRAPHY" + mode: "NULLABLE" + description: "A GeoJSON representation of the catchment area boundary of the site, for a particular mode of transportation and travel time threshold. Consists of multiple latitude and longitude points." + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV file from covid19-open-data bucket to facility_boundary_us_transit" + args: + task_id: "gcs_to_bq_table_us_transit" + bucket: "{{ var.json.covid19_vaccination_access.source_bucket }}" + source_objects: ["{{ var.json.covid19_vaccination_access.source_prefix }}/facility-boundary-us-transit.csv"] + source_format: "CSV" + destination_project_dataset_table: "covid19_vaccination_access.facility_boundary_us_transit" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "facility_place_id" + type: "STRING" + mode: "REQUIRED" + description: "The Google Place ID of the vaccination site. For example, ChIJV3woGFkSK4cRWP9s3-kIFGk." + - name: "facility_provider_id" + type: "STRING" + mode: "NULLABLE" + description: "An identifier imported from the provider of the vaccination site information. In the US, we use the ID provided by VaccineFinder when available. For example, 7ede5bd5-44da-4a59-b4d9-b3a49c53472c." + - name: "facility_name" + type: "STRING" + mode: "NULLABLE" + description: "The name of the vaccination site. For example, St. Joseph's Hospital." + - name: "facility_latitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The latitude of the vaccination site. For example, 36.0507" + - name: "facility_longitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The longitude of the vaccination site. For example, 41.4356" + - name: "facility_country_region" + type: "STRING" + mode: "NULLABLE" + description: "The name of the country or region in English. For example, United States." 
+ - name: "facility_country_code" + type: "STRING" + mode: "NULLABLE" + description: "The ISO 3166-1 code for the country or region. For example, US." + - name: "facility_sub_region_1" + type: "STRING" + mode: "NULLABLE" + description: "The name of a region in the country. For example, California." + - name: "facility_sub_region_1_code" + type: "STRING" + mode: "NULLABLE" + description: "A country-specific ISO 3166-2 code for the region. For example, US-CA." + - name: "facility_sub_region_2" + type: "STRING" + mode: "NULLABLE" + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough." + - name: "facility_sub_region_2_code" + type: "STRING" + mode: "NULLABLE" + description: "In the US, the FIPS code for a US county (or equivalent). For example, 06085." + - name: "facility_region_place_id" + type: "STRING" + mode: "NULLABLE" + description: "The Google place ID for the most-specific region, used in Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." + - name: "mode_of_transportation" + type: "STRING" + mode: "NULLABLE" + description: "The mode of transport used to calculate the catchment boundary. For example, driving." + - name: "travel_time_threshold_minutes" + type: "INTEGER" + mode: "NULLABLE" + description: "The maximum travel time, in minutes, used to calculate the catchment boundary. For example, 30." + - name: "facility_catchment_boundary" + type: "GEOGRAPHY" + mode: "NULLABLE" + description: "A GeoJSON representation of the catchment area boundary of the site, for a particular mode of transportation and travel time threshold. Consists of multiple latitude and longitude points." 
+ + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV file from covid19-open-data bucket to facility_boundary_us_walk" + args: + task_id: "gcs_to_bq_table_us_walk" + bucket: "{{ var.json.covid19_vaccination_access.source_bucket }}" + source_objects: ["{{ var.json.covid19_vaccination_access.source_prefix }}/facility-boundary-us-walk.csv"] + source_format: "CSV" + destination_project_dataset_table: "covid19_vaccination_access.facility_boundary_us_walk" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "facility_place_id" + type: "STRING" + mode: "REQUIRED" + description: "The Google Place ID of the vaccination site. For example, ChIJV3woGFkSK4cRWP9s3-kIFGk." + - name: "facility_provider_id" + type: "STRING" + mode: "NULLABLE" + description: "An identifier imported from the provider of the vaccination site information. In the US, we use the ID provided by VaccineFinder when available. For example, 7ede5bd5-44da-4a59-b4d9-b3a49c53472c." + - name: "facility_name" + type: "STRING" + mode: "NULLABLE" + description: "The name of the vaccination site. For example, St. Joseph's Hospital." + - name: "facility_latitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The latitude of the vaccination site. For example, 36.0507" + - name: "facility_longitude" + type: "FLOAT" + mode: "REQUIRED" + description: "The longitude of the vaccination site. For example, 41.4356" + - name: "facility_country_region" + type: "STRING" + mode: "NULLABLE" + description: "The name of the country or region in English. For example, United States." + - name: "facility_country_code" + type: "STRING" + mode: "NULLABLE" + description: "The ISO 3166-1 code for the country or region. For example, US." + - name: "facility_sub_region_1" + type: "STRING" + mode: "NULLABLE" + description: "The name of a region in the country. For example, California." 
+ - name: "facility_sub_region_1_code" + type: "STRING" + mode: "NULLABLE" + description: "A country-specific ISO 3166-2 code for the region. For example, US-CA." + - name: "facility_sub_region_2" + type: "STRING" + mode: "NULLABLE" + description: "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough." + - name: "facility_sub_region_2_code" + type: "STRING" + mode: "NULLABLE" + description: "In the US, the FIPS code for a US county (or equivalent). For example, 06085." + - name: "facility_region_place_id" + type: "STRING" + mode: "NULLABLE" + description: "The Google place ID for the most-specific region, used in Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA." + - name: "mode_of_transportation" + type: "STRING" + mode: "NULLABLE" + description: "The mode of transport used to calculate the catchment boundary. For example, driving." + - name: "travel_time_threshold_minutes" + type: "INTEGER" + mode: "NULLABLE" + description: "The maximum travel time, in minutes, used to calculate the catchment boundary. For example, 30." + - name: "facility_catchment_boundary" + type: "GEOGRAPHY" + mode: "NULLABLE" + description: "A GeoJSON representation of the catchment area boundary of the site, for a particular mode of transportation and travel time threshold. Consists of multiple latitude and longitude points." 
+ + graph_paths: + - "gcs_to_bq_table_us_all" + - "gcs_to_bq_table_us_drive" + - "gcs_to_bq_table_us_transit" + - "gcs_to_bq_table_us_walk" diff --git a/datasets/covid19_vaccination_access/vaccination_access_to_bq/vaccination_access_to_bq_dag.py b/datasets/covid19_vaccination_access/vaccination_access_to_bq/vaccination_access_to_bq_dag.py new file mode 100644 index 000000000..b18b64f35 --- /dev/null +++ b/datasets/covid19_vaccination_access/vaccination_access_to_bq/vaccination_access_to_bq_dag.py @@ -0,0 +1,458 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from airflow import DAG
from airflow.contrib.operators import gcs_to_bq

# Defaults handed to the DAG below via `default_args`.
default_args = {
    "owner": "Google",
    "depends_on_past": False,
    "start_date": "2021-05-01",
}

# Column layout shared by all four facility_boundary_us_* tables, as
# (name, type, mode, description) rows in CSV column order.
#
# NOTE(review): the descriptions of `facility_longitude` and
# `travel_time_threshold_minutes` previously had a raw line break in the middle
# of the string literal (right before "For example, ..."). They are joined with
# a single space here. If this DAG is generated from the sibling pipeline.yaml
# added in the same change, apply the same fix there -- TODO confirm.
_FACILITY_BOUNDARY_COLUMNS = (
    (
        "facility_place_id",
        "STRING",
        "REQUIRED",
        "The Google Place ID of the vaccination site. For example, ChIJV3woGFkSK4cRWP9s3-kIFGk.",
    ),
    (
        "facility_provider_id",
        "STRING",
        "NULLABLE",
        "An identifier imported from the provider of the vaccination site information. In the US, we use the ID provided by VaccineFinder when available. For example, 7ede5bd5-44da-4a59-b4d9-b3a49c53472c.",
    ),
    (
        "facility_name",
        "STRING",
        "NULLABLE",
        "The name of the vaccination site. For example, St. Joseph's Hospital.",
    ),
    (
        "facility_latitude",
        "FLOAT",
        "REQUIRED",
        "The latitude of the vaccination site. For example, 36.0507",
    ),
    (
        "facility_longitude",
        "FLOAT",
        "REQUIRED",
        "The longitude of the vaccination site. For example, 41.4356",
    ),
    (
        "facility_country_region",
        "STRING",
        "NULLABLE",
        "The name of the country or region in English. For example, United States.",
    ),
    (
        "facility_country_code",
        "STRING",
        "NULLABLE",
        "The ISO 3166-1 code for the country or region. For example, US.",
    ),
    (
        "facility_sub_region_1",
        "STRING",
        "NULLABLE",
        "The name of a region in the country. For example, California.",
    ),
    (
        "facility_sub_region_1_code",
        "STRING",
        "NULLABLE",
        "A country-specific ISO 3166-2 code for the region. For example, US-CA.",
    ),
    (
        "facility_sub_region_2",
        "STRING",
        "NULLABLE",
        "The name (or type) of a region in the country. Typically a subdivision of sub_region_1. For example, Santa Clara County or municipal_borough.",
    ),
    (
        "facility_sub_region_2_code",
        "STRING",
        "NULLABLE",
        "In the US, the FIPS code for a US county (or equivalent). For example, 06085.",
    ),
    (
        "facility_region_place_id",
        "STRING",
        "NULLABLE",
        "The Google place ID for the most-specific region, used in Google Places API and on Google Maps. For example, ChIJd_Y0eVIvkIARuQyDN0F1LBA.",
    ),
    (
        "mode_of_transportation",
        "STRING",
        "NULLABLE",
        "The mode of transport used to calculate the catchment boundary. For example, driving.",
    ),
    (
        "travel_time_threshold_minutes",
        "INTEGER",
        "NULLABLE",
        "The maximum travel time, in minutes, used to calculate the catchment boundary. For example, 30.",
    ),
    (
        "facility_catchment_boundary",
        "GEOGRAPHY",
        "NULLABLE",
        "A GeoJSON representation of the catchment area boundary of the site, for a particular mode of transportation and travel time threshold. Consists of multiple latitude and longitude points.",
    ),
)


def _facility_boundary_schema():
    """Build the `schema_fields` list for one facility-boundary load task.

    Returns:
        A new list of new dicts (keys: name, type, mode, description) on every
        call, so no two operators share the same schema objects.
    """
    return [
        {
            "name": name,
            "type": field_type,
            "mode": mode,
            "description": description,
        }
        for name, field_type, mode, description in _FACILITY_BOUNDARY_COLUMNS
    ]


def _load_facility_boundary(variant):
    """Create the GCS-to-BigQuery load task for one facility-boundary CSV.

    Args:
        variant: Suffix selecting the file/table pair; this DAG uses "all",
            "drive", "transit" and "walk".

    Returns:
        The operator that loads `facility-boundary-us-<variant>.csv` into
        `covid19_vaccination_access.facility_boundary_us_<variant>`, skipping
        the header row and truncating the destination table.

    No `dag=` argument is passed, exactly as in the original inline operator
    calls, so call this from inside the `with DAG(...)` block.
    """
    return gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id="gcs_to_bq_table_us_" + variant,
        bucket="{{ var.json.covid19_vaccination_access.source_bucket }}",
        source_objects=[
            "{{ var.json.covid19_vaccination_access.source_prefix }}/facility-boundary-us-"
            + variant
            + ".csv"
        ],
        source_format="CSV",
        destination_project_dataset_table="covid19_vaccination_access.facility_boundary_us_"
        + variant,
        skip_leading_rows=1,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=_facility_boundary_schema(),
    )


with DAG(
    dag_id="covid19_vaccination_access.vaccination_access_to_bq",
    default_args=default_args,
    max_active_runs=1,
    schedule_interval="@daily",
    catchup=False,
    default_view="graph",
) as dag:

    # Task to load CSV file from covid19-open-data bucket to facility_boundary_us_all
    gcs_to_bq_table_us_all = _load_facility_boundary("all")

    # Task to load CSV file from covid19-open-data bucket to facility_boundary_us_drive
    gcs_to_bq_table_us_drive = _load_facility_boundary("drive")

    # Task to load CSV file from covid19-open-data bucket to facility_boundary_us_transit
    gcs_to_bq_table_us_transit = _load_facility_boundary("transit")

    # Task to load CSV file from covid19-open-data bucket to facility_boundary_us_walk
    gcs_to_bq_table_us_walk = _load_facility_boundary("walk")

    # No dependencies are declared between the four load tasks. The original
    # ended with four bare-name statements that had no effect, so they are
    # replaced by this note.