diff --git a/datasets/sunroof/_images/run_csv_transform_kub/Dockerfile b/datasets/sunroof/_images/run_csv_transform_kub/Dockerfile new file mode 100644 index 000000000..748bc3bec --- /dev/null +++ b/datasets/sunroof/_images/run_csv_transform_kub/Dockerfile @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.8 +ENV PYTHONUNBUFFERED True +COPY requirements.txt ./ +RUN python3 -m pip install --no-cache-dir -r requirements.txt +WORKDIR /custom +COPY ./csv_transform.py . +CMD ["python3", "csv_transform.py"] diff --git a/datasets/sunroof/_images/run_csv_transform_kub/csv_transform.py b/datasets/sunroof/_images/run_csv_transform_kub/csv_transform.py new file mode 100644 index 000000000..3b0be4884 --- /dev/null +++ b/datasets/sunroof/_images/run_csv_transform_kub/csv_transform.py @@ -0,0 +1,205 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging
import os
import pathlib
import shutil
import typing

import pandas as pd

# Paths are passed around both as plain strings and as pathlib.Path objects.
PathLike = typing.Union[str, pathlib.Path]

# Count-like columns: a missing value is replaced with 0 and the column is
# converted to integers by remove_nan_cols().
INTEGER_COLUMNS = (
    "count_qualified",
    "existing_installs_count",
    "number_of_panels_n",
    "number_of_panels_s",
    "number_of_panels_e",
    "number_of_panels_w",
    "number_of_panels_f",
    "number_of_panels_median",
    "number_of_panels_total",
)

# Column order of the output CSV (applied by reorder_headers()).
OUTPUT_COLUMNS = (
    "region_name",
    "state_name",
    "lat_max",
    "lat_min",
    "lng_max",
    "lng_min",
    "lat_avg",
    "lng_avg",
    "yearly_sunlight_kwh_kw_threshold_avg",
    "count_qualified",
    "percent_covered",
    "percent_qualified",
    "number_of_panels_n",
    "number_of_panels_s",
    "number_of_panels_e",
    "number_of_panels_w",
    "number_of_panels_f",
    "number_of_panels_median",
    "number_of_panels_total",
    "kw_median",
    "kw_total",
    "yearly_sunlight_kwh_n",
    "yearly_sunlight_kwh_s",
    "yearly_sunlight_kwh_e",
    "yearly_sunlight_kwh_w",
    "yearly_sunlight_kwh_f",
    "yearly_sunlight_kwh_median",
    "yearly_sunlight_kwh_total",
    "install_size_kw_buckets",
    "carbon_offset_metric_tons",
    "existing_installs_count",
    "center_point",
)


def main(
    source_url: str,
    source_file: pathlib.Path,
    target_file: pathlib.Path,
    chunksize: str,
    target_gcs_bucket: str,
    target_gcs_path: str,
) -> None:
    """Download the source CSV, transform it in batches and upload the result.

    Args:
        source_url: ``gs://`` URI of the source CSV.
        source_file: Local path the source CSV is downloaded to.
        target_file: Local path of the combined, transformed CSV.
        chunksize: Number of records per batch, as a string (it is read from
            an environment variable); converted with ``int()``.
        target_gcs_bucket: Name of the bucket the result is uploaded to.
        target_gcs_path: Object path of the uploaded result inside the bucket.
    """
    logging.info("Sunroof solar potential started")

    pathlib.Path("./files").mkdir(parents=True, exist_ok=True)
    download_file_gs(source_url, source_file)

    chunksz = int(chunksize)
    target_path = pathlib.Path(target_file)

    logging.info(f"Opening batch file {source_file}")
    with pd.read_csv(
        source_file,  # path to main source file to load in batches
        engine="python",
        encoding="utf-8",
        quotechar='"',  # string separator, typically double-quotes
        chunksize=chunksz,  # size of batch data, in no. of records
        sep=",",  # data column separator, typically ","
    ) as reader:
        for chunk_number, chunk in enumerate(reader):
            # Derive "<stem>-<n><suffix>" from the file name only, so that a
            # ".csv" occurring elsewhere in the path is never rewritten.
            batch_path = target_path.with_name(
                f"{target_path.stem}-{chunk_number}{target_path.suffix}"
            )
            # Only the first batch keeps its header row (and truncates the
            # target); every later batch is appended without a header.
            process_chunk(chunk, str(batch_path), target_file, chunk_number != 0)

    upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path)

    logging.info("Sunroof solar potential process completed")


def process_chunk(
    df: pd.DataFrame,
    target_file_batch: PathLike,
    target_file: PathLike,
    skip_header: bool,
) -> None:
    """Transform one batch and append it to the combined target file.

    Args:
        df: The batch of source records.
        target_file_batch: Path of the temporary per-batch CSV; it is removed
            again once its content has been appended.
        target_file: Path of the combined output CSV.
        skip_header: ``False`` for the first batch (the target is truncated
            and the header row is kept), ``True`` for all later batches.
    """
    df = rename_headers(df)
    df = remove_nan_cols(df)
    df = generate_location(df)
    df = reorder_headers(df)
    save_to_new_file(df, file_path=str(target_file_batch))
    append_batch_file(target_file_batch, target_file, skip_header, not skip_header)


def append_batch_file(
    batch_file_path: PathLike,
    target_file_path: PathLike,
    skip_header: bool,
    truncate_file: bool,
) -> None:
    """Append a batch CSV to the target file, then delete the batch file.

    Args:
        batch_file_path: Path of the batch CSV to read.
        target_file_path: Path of the file to write to (created if missing).
        skip_header: Drop the first line of the batch file before copying.
        truncate_file: Start the target file from scratch instead of
            appending to its current content.

    Raises:
        FileNotFoundError: If ``batch_file_path`` does not exist.
    """
    target_mode = "w" if truncate_file else "a"
    # newline="" passes line endings through untouched; UTF-8 matches the
    # default encoding DataFrame.to_csv() used to write the batch file.
    with open(batch_file_path, "r", encoding="utf-8", newline="") as data_file, open(
        target_file_path, target_mode, encoding="utf-8", newline=""
    ) as target_file:
        if skip_header:
            logging.info(
                f"Appending batch file {batch_file_path} to {target_file_path} with skip header"
            )
            # The default keeps an empty batch file from raising StopIteration.
            next(data_file, None)
        else:
            logging.info(
                f"Appending batch file {batch_file_path} to {target_file_path}"
            )
        # Stream the copy instead of loading the whole batch into memory.
        shutil.copyfileobj(data_file, target_file)
    if os.path.exists(batch_file_path):
        os.remove(batch_file_path)


def generate_location(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``center_point`` column built from ``lng_avg`` and ``lat_avg``.

    The value has the form ``POINT( <lng> <lat> )``. Rows missing either
    coordinate get a missing ``center_point`` (written as an empty CSV field)
    rather than the invalid text ``POINT( nan nan )``.

    Args:
        df: Frame with ``lng_avg`` and ``lat_avg`` columns; modified in place.

    Returns:
        The same frame, with ``center_point`` added.
    """
    logging.info("Generating location data")
    has_coordinates = df["lng_avg"].notna() & df["lat_avg"].notna()
    points = (
        "POINT( " + df["lng_avg"].map(str) + " " + df["lat_avg"].map(str) + " )"
    )
    df["center_point"] = points.where(has_coordinates)

    return df


def reorder_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` restricted to ``OUTPUT_COLUMNS``, in that order.

    Raises:
        KeyError: If any of the output columns is missing from ``df``.
    """
    logging.info("Reordering headers..")
    return df[list(OUTPUT_COLUMNS)]


def remove_nan(dt_str: typing.Any) -> int:
    """Convert a single cell to ``int``, mapping missing values to 0.

    ``None``, NaN, the empty string, the text ``"nan"`` and any other falsy
    value become 0. Numeric text with a decimal part (for example ``"12.0"``)
    is accepted and truncated towards zero.

    Args:
        dt_str: The cell value (number, string or missing marker).

    Returns:
        The value as an integer.

    Raises:
        ValueError: If the value is text that is not a number.
    """
    # pd.isna() comes first so markers such as pd.NA never reach `not ...`,
    # which raises for them.
    if pd.isna(dt_str) or not dt_str or str(dt_str) == "nan":
        return 0
    try:
        return int(dt_str)
    except ValueError:
        # int() rejects strings such as "12.0"; go through float instead.
        return int(float(dt_str))


def remove_nan_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Replace missing values with 0 and cast to int in ``INTEGER_COLUMNS``.

    Args:
        df: Frame containing every column in ``INTEGER_COLUMNS``; modified in
            place.

    Returns:
        The same frame.
    """
    logging.info("Resolve NaN data")
    for col in INTEGER_COLUMNS:
        df[col] = df[col].apply(remove_nan)

    return df


def rename_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with source column names mapped to output names."""
    logging.info("Renaming columns")
    header_names = {"install_size_kw_buckets_json": "install_size_kw_buckets"}
    df = df.rename(columns=header_names)

    return df


def save_to_new_file(df: pd.DataFrame, file_path: PathLike) -> None:
    """Write ``df`` to ``file_path`` as CSV without the index column."""
    df.to_csv(file_path, index=False)


def download_file_gs(source_url: str, source_file: pathlib.Path) -> None:
    """Download the object at the ``gs://`` URI ``source_url`` to ``source_file``."""
    # Imported here so the pandas-only helpers above can be imported (and
    # unit-tested) without the Cloud Storage client library installed.
    from google.cloud import storage

    with open(source_file, "wb+") as file_obj:
        storage.Client().download_blob_to_file(source_url, file_obj)


def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None:
    """Upload the local file ``file_path`` to ``gs://<gcs_bucket>/<gcs_path>``."""
    # Imported here for the same reason as in download_file_gs().
    from google.cloud import storage

    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket)
    blob = bucket.blob(gcs_path)
    blob.upload_from_filename(file_path)


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    main(
        source_url=os.environ["SOURCE_URL"],
        source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(),
        target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(),
        chunksize=os.environ["CHUNKSIZE"],
        target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"],
        target_gcs_path=os.environ["TARGET_GCS_PATH"],
    )
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/sunroof/_terraform/solar_potential_by_census_tract_pipeline.tf b/datasets/sunroof/_terraform/solar_potential_by_census_tract_pipeline.tf new file mode 100644 index 000000000..b8eecce01 --- /dev/null +++ b/datasets/sunroof/_terraform/solar_potential_by_census_tract_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "solar_potential_by_census_tract" { + project = var.project_id + dataset_id = "sunroof" + table_id = "solar_potential_by_census_tract" + + description = "Sunroof Solar Potential By Census Tract" + + + + + depends_on = [ + google_bigquery_dataset.sunroof + ] +} + +output "bigquery_table-solar_potential_by_census_tract-table_id" { + value = google_bigquery_table.solar_potential_by_census_tract.table_id +} + +output "bigquery_table-solar_potential_by_census_tract-id" { + value = google_bigquery_table.solar_potential_by_census_tract.id +} diff --git a/datasets/sunroof/_terraform/solar_potential_by_postal_code_pipeline.tf b/datasets/sunroof/_terraform/solar_potential_by_postal_code_pipeline.tf new file mode 100644 index 000000000..82040a0b6 --- /dev/null +++ b/datasets/sunroof/_terraform/solar_potential_by_postal_code_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "solar_potential_by_postal_code" { + project = var.project_id + dataset_id = "sunroof" + table_id = "solar_potential_by_postal_code" + + description = "Sunroof Solar Potential By Postal Code" + + + + + depends_on = [ + google_bigquery_dataset.sunroof + ] +} + +output "bigquery_table-solar_potential_by_postal_code-table_id" { + value = google_bigquery_table.solar_potential_by_postal_code.table_id +} + +output "bigquery_table-solar_potential_by_postal_code-id" { + value = google_bigquery_table.solar_potential_by_postal_code.id +} diff --git a/datasets/sunroof/_terraform/sunroof_dataset.tf b/datasets/sunroof/_terraform/sunroof_dataset.tf new file mode 100644 index 000000000..68d241622 --- /dev/null +++ b/datasets/sunroof/_terraform/sunroof_dataset.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_dataset" "sunroof" { + dataset_id = "sunroof" + project = var.project_id + description = "sunroof" +} + +output "bigquery_dataset-sunroof-dataset_id" { + value = google_bigquery_dataset.sunroof.dataset_id +} diff --git a/datasets/sunroof/_terraform/variables.tf b/datasets/sunroof/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/sunroof/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/sunroof/dataset.yaml b/datasets/sunroof/dataset.yaml new file mode 100644 index 000000000..58de08fae --- /dev/null +++ b/datasets/sunroof/dataset.yaml @@ -0,0 +1,26 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +dataset: + name: sunroof + friendly_name: sunroof + description: sunroof + dataset_sources: ~ + terms_of_use: ~ + + +resources: + - type: bigquery_dataset + dataset_id: sunroof + description: sunroof diff --git a/datasets/sunroof/solar_potential_by_census_tract/pipeline.yaml b/datasets/sunroof/solar_potential_by_census_tract/pipeline.yaml new file mode 100644 index 000000000..2a552e2d3 --- /dev/null +++ b/datasets/sunroof/solar_potential_by_census_tract/pipeline.yaml @@ -0,0 +1,210 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "solar_potential_by_census_tract" + description: "Sunroof Solar Potential By Census Tract" + +dag: + airflow_version: 2 + initialize: + dag_id: solar_potential_by_census_tract + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" # run once a day at midnight + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "solar_potential_by_census_tract" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.sunroof.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "gs://project-sunroof/csv/latest/project-sunroof-census_tract.csv" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/sunroof/solar_potential_by_census_tract/data_output.csv" + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/sunroof/solar_potential_by_census_tract/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "sunroof.solar_potential_by_census_tract" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "region_name" + "type": "STRING" + "description": "Census Tract" + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "Name of 
the state containing that region" + "mode": "NULLABLE" + - "name": "lat_max" + "type": "FLOAT" + "description": "maximum latitude for that region" + "mode": "NULLABLE" + - "name": "lat_min" + "type": "FLOAT" + "description": "minimum latitude for that region" + "mode": "NULLABLE" + - "name": "lng_max" + "type": "FLOAT" + "description": "maximum longitude for that region" + "mode": "NULLABLE" + - "name": "lng_min" + "type": "FLOAT" + "description": "minimum longitude for that region" + "mode": "NULLABLE" + - "name": "lat_avg" + "type": "FLOAT" + "description": "average latitude for that region" + "mode": "NULLABLE" + - "name": "lng_avg" + "type": "FLOAT" + "description": "average longitude for that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_kw_threshold_avg" + "type": "FLOAT" + "description": "75% of the optimimum sunlight in the county containing that zip code" + "mode": "NULLABLE" + - "name": "count_qualified" + "type": "INTEGER" + "description": "# of buildings in Google Maps that are suitable for solar" + "mode": "NULLABLE" + - "name": "percent_covered" + "type": "FLOAT" + "description": "% of buildings in Google Maps covered by Project Sunroof" + "mode": "NULLABLE" + - "name": "percent_qualified" + "type": "FLOAT" + "description": "% of buildings covered by Project Sunroof that are suitable for solar" + "mode": "NULLABLE" + - "name": "number_of_panels_n" + "type": "INTEGER" + "description": "# of solar panels potential for north-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_s" + "type": "INTEGER" + "description": "# of solar panels potential for south-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_e" + "type": "INTEGER" + "description": "# of solar panels potential for east-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_w" + "type": 
"INTEGER" + "description": "# of solar panels potential for west-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_f" + "type": "INTEGER" + "description": "# of solar panels potential for flat roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_median" + "type": "INTEGER" + "description": "# of panels that fit on the median roof" + "mode": "NULLABLE" + - "name": "number_of_panels_total" + "type": "INTEGER" + "description": "# of solar panels potential for all roof space in that region, assuming 1.650m 0.992m panels" + "mode": "NULLABLE" + - "name": "kw_median" + "type": "FLOAT" + "description": "kW of solar potential for the median building in that region (assuming 250 watts per panel)" + "mode": "NULLABLE" + - "name": "kw_total" + "type": "FLOAT" + "description": "# of kW of solar potential for all roof types in that region (assuming 250 watts per panel)" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_n" + "type": "FLOAT" + "description": "total solar energy generation potential for north-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_s" + "type": "FLOAT" + "description": "total solar energy generation potential for south-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_e" + "type": "FLOAT" + "description": "total solar energy generation potential for east-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_w" + "type": "FLOAT" + "description": "total solar energy generation potential for west-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_f" + "type": "FLOAT" + "description": "total solar energy generation potential for flat roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_median" + "type": "FLOAT" + "description": "kWh/kw/yr for the median roof, 
in DC (not AC) terms" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_total" + "type": "FLOAT" + "description": "total solar energy generation potential for all roof space in that region" + "mode": "NULLABLE" + - "name": "install_size_kw_buckets" + "type": "STRING" + "description": "# of buildings with potential for various installation size buckets. Format is a JSON array, where each element is a tuple containing (1) lower bound of bucket, in kW, and (2) number of buildings in that bucket." + "mode": "NULLABLE" + - "name": "carbon_offset_metric_tons" + "type": "FLOAT" + "description": "The potential carbon dioxide abatement of the solar capacity that meets the technical potential criteria. The calculation uses eGRID subregion CO2 equivalent non-baseload output emission rates. https://www.epa.gov/sites/production/files/2015-10/documents/egrid2012_summarytables_0.pdf" + "mode": "NULLABLE" + - "name": "existing_installs_count" + "type": "INTEGER" + "description": "# of buildings estimated to have a solar installation, at time of data collection" + "mode": "NULLABLE" + - "name": "center_point" + "type": "GEOGRAPHY" + "description": "" + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/sunroof/solar_potential_by_census_tract/solar_potential_by_census_tract_dag.py b/datasets/sunroof/solar_potential_by_census_tract/solar_potential_by_census_tract_dag.py new file mode 100644 index 000000000..7d211ef49 --- /dev/null +++ b/datasets/sunroof/solar_potential_by_census_tract/solar_potential_by_census_tract_dag.py @@ -0,0 +1,277 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from airflow import DAG
from airflow.providers.cncf.kubernetes.operators import kubernetes_pod
from airflow.providers.google.cloud.transfers import gcs_to_bigquery

default_args = dict(
    owner="Google",
    depends_on_past=False,
    start_date="2021-03-01",
)


def _nullable(name, field_type, description):
    """Build one NULLABLE BigQuery schema field entry."""
    return {
        "name": name,
        "type": field_type,
        "description": description,
        "mode": "NULLABLE",
    }


# (column name, BigQuery type, description), in CSV column order.
_COLUMN_SPECS = (
    ("region_name", "STRING", "Census Tract"),
    ("state_name", "STRING", "Name of the state containing that region"),
    ("lat_max", "FLOAT", "maximum latitude for that region"),
    ("lat_min", "FLOAT", "minimum latitude for that region"),
    ("lng_max", "FLOAT", "maximum longitude for that region"),
    ("lng_min", "FLOAT", "minimum longitude for that region"),
    ("lat_avg", "FLOAT", "average latitude for that region"),
    ("lng_avg", "FLOAT", "average longitude for that region"),
    (
        "yearly_sunlight_kwh_kw_threshold_avg",
        "FLOAT",
        "75% of the optimimum sunlight in the county containing that zip code",
    ),
    (
        "count_qualified",
        "INTEGER",
        "# of buildings in Google Maps that are suitable for solar",
    ),
    (
        "percent_covered",
        "FLOAT",
        "% of buildings in Google Maps covered by Project Sunroof",
    ),
    (
        "percent_qualified",
        "FLOAT",
        "% of buildings covered by Project Sunroof that are suitable for solar",
    ),
    (
        "number_of_panels_n",
        "INTEGER",
        "# of solar panels potential for north-facing roof space in that region, assuming 1.650m x 0.992m panels",
    ),
    (
        "number_of_panels_s",
        "INTEGER",
        "# of solar panels potential for south-facing roof space in that region, assuming 1.650m x 0.992m panels",
    ),
    (
        "number_of_panels_e",
        "INTEGER",
        "# of solar panels potential for east-facing roof space in that region, assuming 1.650m x 0.992m panels",
    ),
    (
        "number_of_panels_w",
        "INTEGER",
        "# of solar panels potential for west-facing roof space in that region, assuming 1.650m x 0.992m panels",
    ),
    (
        "number_of_panels_f",
        "INTEGER",
        "# of solar panels potential for flat roof space in that region, assuming 1.650m x 0.992m panels",
    ),
    (
        "number_of_panels_median",
        "INTEGER",
        "# of panels that fit on the median roof",
    ),
    (
        "number_of_panels_total",
        "INTEGER",
        "# of solar panels potential for all roof space in that region, assuming 1.650m 0.992m panels",
    ),
    (
        "kw_median",
        "FLOAT",
        "kW of solar potential for the median building in that region (assuming 250 watts per panel)",
    ),
    (
        "kw_total",
        "FLOAT",
        "# of kW of solar potential for all roof types in that region (assuming 250 watts per panel)",
    ),
    (
        "yearly_sunlight_kwh_n",
        "FLOAT",
        "total solar energy generation potential for north-facing roof space in that region",
    ),
    (
        "yearly_sunlight_kwh_s",
        "FLOAT",
        "total solar energy generation potential for south-facing roof space in that region",
    ),
    (
        "yearly_sunlight_kwh_e",
        "FLOAT",
        "total solar energy generation potential for east-facing roof space in that region",
    ),
    (
        "yearly_sunlight_kwh_w",
        "FLOAT",
        "total solar energy generation potential for west-facing roof space in that region",
    ),
    (
        "yearly_sunlight_kwh_f",
        "FLOAT",
        "total solar energy generation potential for flat roof space in that region",
    ),
    (
        "yearly_sunlight_kwh_median",
        "FLOAT",
        "kWh/kw/yr for the median roof, in DC (not AC) terms",
    ),
    (
        "yearly_sunlight_kwh_total",
        "FLOAT",
        "total solar energy generation potential for all roof space in that region",
    ),
    (
        "install_size_kw_buckets",
        "STRING",
        "# of buildings with potential for various installation size buckets. Format is a JSON array, where each element is a tuple containing (1) lower bound of bucket, in kW, and (2) number of buildings in that bucket.",
    ),
    (
        "carbon_offset_metric_tons",
        "FLOAT",
        "The potential carbon dioxide abatement of the solar capacity that meets the technical potential criteria. The calculation uses eGRID subregion CO2 equivalent non-baseload output emission rates. https://www.epa.gov/sites/production/files/2015-10/documents/egrid2012_summarytables_0.pdf",
    ),
    (
        "existing_installs_count",
        "INTEGER",
        "# of buildings estimated to have a solar installation, at time of data collection",
    ),
    ("center_point", "GEOGRAPHY", ""),
)

# Pin the transform pod to the "pool-e2-standard-4" GKE node pool.
_NODE_POOL_TERM = {
    "matchExpressions": [
        {
            "key": "cloud.google.com/gke-nodepool",
            "operator": "In",
            "values": ["pool-e2-standard-4"],
        }
    ]
}
_POD_AFFINITY = {
    "nodeAffinity": {
        "requiredDuringSchedulingIgnoredDuringExecution": {
            "nodeSelectorTerms": [_NODE_POOL_TERM]
        }
    }
}

# Environment handed to the transform container.
_TRANSFORM_ENV = {
    "SOURCE_URL": "gs://project-sunroof/csv/latest/project-sunroof-census_tract.csv",
    "SOURCE_FILE": "files/data.csv",
    "TARGET_FILE": "files/data_output.csv",
    "CHUNKSIZE": "750000",
    "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}",
    "TARGET_GCS_PATH": "data/sunroof/solar_potential_by_census_tract/data_output.csv",
}


with DAG(
    dag_id="sunroof.solar_potential_by_census_tract",
    default_args=default_args,
    max_active_runs=1,
    schedule_interval="@daily",
    catchup=False,
    default_view="graph",
) as dag:

    # Run CSV transform within kubernetes pod
    transform_csv = kubernetes_pod.KubernetesPodOperator(
        task_id="transform_csv",
        name="solar_potential_by_census_tract",
        namespace="default",
        affinity=_POD_AFFINITY,
        image_pull_policy="Always",
        image="{{ var.json.sunroof.container_registry.run_csv_transform_kub }}",
        env_vars=_TRANSFORM_ENV,
        resources={"limit_memory": "8G", "limit_cpu": "3"},
    )

    # Task to load CSV data to a BigQuery table
    load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator(
        task_id="load_to_bq",
        bucket="{{ var.value.composer_bucket }}",
        source_objects=["data/sunroof/solar_potential_by_census_tract/data_output.csv"],
        source_format="CSV",
        destination_project_dataset_table="sunroof.solar_potential_by_census_tract",
        skip_leading_rows=1,
        allow_quoted_newlines=True,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=[_nullable(*spec) for spec in _COLUMN_SPECS],
    )

    # The load only runs after the transformed CSV has been produced.
    transform_csv >> load_to_bq
+ +--- +resources: + + - type: bigquery_table + table_id: "solar_potential_by_postal_code" + description: "Sunroof Solar Potential By Postal Code" + +dag: + airflow_version: 2 + initialize: + dag_id: solar_potential_by_postal_code + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" # run once a day at midnight + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "solar_potential_by_postal_code" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.sunroof.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "gs://project-sunroof/csv/latest/project-sunroof-postal_code.csv" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/sunroof/solar_potential_by_postal_code/data_output.csv" + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/sunroof/solar_potential_by_postal_code/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "sunroof.solar_potential_by_postal_code" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "region_name" + "type": "STRING" + "description": "Census Tract" + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "Name of the state 
containing that region" + "mode": "NULLABLE" + - "name": "lat_max" + "type": "FLOAT" + "description": "maximum latitude for that region" + "mode": "NULLABLE" + - "name": "lat_min" + "type": "FLOAT" + "description": "minimum latitude for that region" + "mode": "NULLABLE" + - "name": "lng_max" + "type": "FLOAT" + "description": "maximum longitude for that region" + "mode": "NULLABLE" + - "name": "lng_min" + "type": "FLOAT" + "description": "minimum longitude for that region" + "mode": "NULLABLE" + - "name": "lat_avg" + "type": "FLOAT" + "description": "average latitude for that region" + "mode": "NULLABLE" + - "name": "lng_avg" + "type": "FLOAT" + "description": "average longitude for that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_kw_threshold_avg" + "type": "FLOAT" + "description": "75% of the optimimum sunlight in the county containing that zip code" + "mode": "NULLABLE" + - "name": "count_qualified" + "type": "INTEGER" + "description": "# of buildings in Google Maps that are suitable for solar" + "mode": "NULLABLE" + - "name": "percent_covered" + "type": "FLOAT" + "description": "% of buildings in Google Maps covered by Project Sunroof" + "mode": "NULLABLE" + - "name": "percent_qualified" + "type": "FLOAT" + "description": "% of buildings covered by Project Sunroof that are suitable for solar" + "mode": "NULLABLE" + - "name": "number_of_panels_n" + "type": "INTEGER" + "description": "# of solar panels potential for north-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_s" + "type": "INTEGER" + "description": "# of solar panels potential for south-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_e" + "type": "INTEGER" + "description": "# of solar panels potential for east-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_w" + "type": "INTEGER" 
+ "description": "# of solar panels potential for west-facing roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_f" + "type": "INTEGER" + "description": "# of solar panels potential for flat roof space in that region, assuming 1.650m x 0.992m panels" + "mode": "NULLABLE" + - "name": "number_of_panels_median" + "type": "INTEGER" + "description": "# of panels that fit on the median roof" + "mode": "NULLABLE" + - "name": "number_of_panels_total" + "type": "INTEGER" + "description": "# of solar panels potential for all roof space in that region, assuming 1.650m 0.992m panels" + "mode": "NULLABLE" + - "name": "kw_median" + "type": "FLOAT" + "description": "kW of solar potential for the median building in that region (assuming 250 watts per panel)" + "mode": "NULLABLE" + - "name": "kw_total" + "type": "FLOAT" + "description": "# of kW of solar potential for all roof types in that region (assuming 250 watts per panel)" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_n" + "type": "FLOAT" + "description": "total solar energy generation potential for north-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_s" + "type": "FLOAT" + "description": "total solar energy generation potential for south-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_e" + "type": "FLOAT" + "description": "total solar energy generation potential for east-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_w" + "type": "FLOAT" + "description": "total solar energy generation potential for west-facing roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_f" + "type": "FLOAT" + "description": "total solar energy generation potential for flat roof space in that region" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_median" + "type": "FLOAT" + "description": "kWh/kw/yr for the median roof, in DC 
(not AC) terms" + "mode": "NULLABLE" + - "name": "yearly_sunlight_kwh_total" + "type": "FLOAT" + "description": "total solar energy generation potential for all roof space in that region" + "mode": "NULLABLE" + - "name": "install_size_kw_buckets_json" + "type": "STRING" + "description": "# of buildings with potential for various installation size buckets. Format is a JSON array, where each element is a tuple containing (1) lower bound of bucket, in kW, and (2) number of buildings in that bucket." + "mode": "NULLABLE" + - "name": "carbon_offset_metric_tons" + "type": "FLOAT" + "description": "The potential carbon dioxide abatement of the solar capacity that meets the technical potential criteria. The calculation uses eGRID subregion CO2 equivalent non-baseload output emission rates. https://www.epa.gov/sites/production/files/2015-10/documents/egrid2012_summarytables_0.pdf" + "mode": "NULLABLE" + - "name": "existing_installs_count" + "type": "INTEGER" + "description": "# of buildings estimated to have a solar installation, at time of data collection" + "mode": "NULLABLE" + - "name": "center_point" + "type": "GEOGRAPHY" + "description": "" + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/sunroof/solar_potential_by_postal_code/solar_potential_by_postal_code_dag.py b/datasets/sunroof/solar_potential_by_postal_code/solar_potential_by_postal_code_dag.py new file mode 100644 index 000000000..9fb1625b5 --- /dev/null +++ b/datasets/sunroof/solar_potential_by_postal_code/solar_potential_by_postal_code_dag.py @@ -0,0 +1,277 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="sunroof.solar_potential_by_postal_code", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="solar_potential_by_postal_code", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.sunroof.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "gs://project-sunroof/csv/latest/project-sunroof-postal_code.csv", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/sunroof/solar_potential_by_postal_code/data_output.csv", + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ 
var.value.composer_bucket }}", + source_objects=["data/sunroof/solar_potential_by_postal_code/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="sunroof.solar_potential_by_postal_code", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "region_name", + "type": "STRING", + "description": "Census Tract", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "Name of the state containing that region", + "mode": "NULLABLE", + }, + { + "name": "lat_max", + "type": "FLOAT", + "description": "maximum latitude for that region", + "mode": "NULLABLE", + }, + { + "name": "lat_min", + "type": "FLOAT", + "description": "minimum latitude for that region", + "mode": "NULLABLE", + }, + { + "name": "lng_max", + "type": "FLOAT", + "description": "maximum longitude for that region", + "mode": "NULLABLE", + }, + { + "name": "lng_min", + "type": "FLOAT", + "description": "minimum longitude for that region", + "mode": "NULLABLE", + }, + { + "name": "lat_avg", + "type": "FLOAT", + "description": "average latitude for that region", + "mode": "NULLABLE", + }, + { + "name": "lng_avg", + "type": "FLOAT", + "description": "average longitude for that region", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_kw_threshold_avg", + "type": "FLOAT", + "description": "75% of the optimimum sunlight in the county containing that zip code", + "mode": "NULLABLE", + }, + { + "name": "count_qualified", + "type": "INTEGER", + "description": "# of buildings in Google Maps that are suitable for solar", + "mode": "NULLABLE", + }, + { + "name": "percent_covered", + "type": "FLOAT", + "description": "% of buildings in Google Maps covered by Project Sunroof", + "mode": "NULLABLE", + }, + { + "name": "percent_qualified", + "type": "FLOAT", + "description": "% of buildings covered by Project Sunroof that are suitable for solar", + "mode": "NULLABLE", + }, + { + 
"name": "number_of_panels_n", + "type": "INTEGER", + "description": "# of solar panels potential for north-facing roof space in that region, assuming 1.650m x 0.992m panels", + "mode": "NULLABLE", + }, + { + "name": "number_of_panels_s", + "type": "INTEGER", + "description": "# of solar panels potential for south-facing roof space in that region, assuming 1.650m x 0.992m panels", + "mode": "NULLABLE", + }, + { + "name": "number_of_panels_e", + "type": "INTEGER", + "description": "# of solar panels potential for east-facing roof space in that region, assuming 1.650m x 0.992m panels", + "mode": "NULLABLE", + }, + { + "name": "number_of_panels_w", + "type": "INTEGER", + "description": "# of solar panels potential for west-facing roof space in that region, assuming 1.650m x 0.992m panels", + "mode": "NULLABLE", + }, + { + "name": "number_of_panels_f", + "type": "INTEGER", + "description": "# of solar panels potential for flat roof space in that region, assuming 1.650m x 0.992m panels", + "mode": "NULLABLE", + }, + { + "name": "number_of_panels_median", + "type": "INTEGER", + "description": "# of panels that fit on the median roof", + "mode": "NULLABLE", + }, + { + "name": "number_of_panels_total", + "type": "INTEGER", + "description": "# of solar panels potential for all roof space in that region, assuming 1.650m 0.992m panels", + "mode": "NULLABLE", + }, + { + "name": "kw_median", + "type": "FLOAT", + "description": "kW of solar potential for the median building in that region (assuming 250 watts per panel)", + "mode": "NULLABLE", + }, + { + "name": "kw_total", + "type": "FLOAT", + "description": "# of kW of solar potential for all roof types in that region (assuming 250 watts per panel)", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_n", + "type": "FLOAT", + "description": "total solar energy generation potential for north-facing roof space in that region", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_s", + "type": "FLOAT", + 
"description": "total solar energy generation potential for south-facing roof space in that region", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_e", + "type": "FLOAT", + "description": "total solar energy generation potential for east-facing roof space in that region", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_w", + "type": "FLOAT", + "description": "total solar energy generation potential for west-facing roof space in that region", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_f", + "type": "FLOAT", + "description": "total solar energy generation potential for flat roof space in that region", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_median", + "type": "FLOAT", + "description": "kWh/kw/yr for the median roof, in DC (not AC) terms", + "mode": "NULLABLE", + }, + { + "name": "yearly_sunlight_kwh_total", + "type": "FLOAT", + "description": "total solar energy generation potential for all roof space in that region", + "mode": "NULLABLE", + }, + { + "name": "install_size_kw_buckets_json", + "type": "STRING", + "description": "# of buildings with potential for various installation size buckets. Format is a JSON array, where each element is a tuple containing (1) lower bound of bucket, in kW, and (2) number of buildings in that bucket.", + "mode": "NULLABLE", + }, + { + "name": "carbon_offset_metric_tons", + "type": "FLOAT", + "description": "The potential carbon dioxide abatement of the solar capacity that meets the technical potential criteria. The calculation uses eGRID subregion CO2 equivalent non-baseload output emission rates. 
https://www.epa.gov/sites/production/files/2015-10/documents/egrid2012_summarytables_0.pdf", + "mode": "NULLABLE", + }, + { + "name": "existing_installs_count", + "type": "INTEGER", + "description": "# of buildings estimated to have a solar installation, at time of data collection", + "mode": "NULLABLE", + }, + { + "name": "center_point", + "type": "GEOGRAPHY", + "description": "", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq