From b717e9d6377e393e6c4814321fee6c797fee5d99 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 27 Oct 2021 06:34:51 +0000 Subject: [PATCH 01/26] feat: Added annual_summaries, tested locally. Errors in AF --- .../_images/run_csv_transform_kub/Dockerfile | 21 + .../run_csv_transform_kub/csv_transform.py | 382 ++++++++++++++++ .../run_csv_transform_kub/requirements.txt | 5 + .../_terraform/annual_summaries_pipeline.tf | 39 ++ datasets/epa/_terraform/epa_dataset.tf | 26 ++ datasets/epa/_terraform/provider.tf | 28 ++ datasets/epa/_terraform/variables.tf | 23 + .../annual_summaries/annual_summaries_dag.py | 418 ++++++++++++++++++ datasets/epa/annual_summaries/pipeline.yaml | 306 +++++++++++++ datasets/epa/dataset.yaml | 27 ++ 10 files changed, 1275 insertions(+) create mode 100644 datasets/epa/_images/run_csv_transform_kub/Dockerfile create mode 100644 datasets/epa/_images/run_csv_transform_kub/csv_transform.py create mode 100644 datasets/epa/_images/run_csv_transform_kub/requirements.txt create mode 100644 datasets/epa/_terraform/annual_summaries_pipeline.tf create mode 100644 datasets/epa/_terraform/epa_dataset.tf create mode 100644 datasets/epa/_terraform/provider.tf create mode 100644 datasets/epa/_terraform/variables.tf create mode 100644 datasets/epa/annual_summaries/annual_summaries_dag.py create mode 100644 datasets/epa/annual_summaries/pipeline.yaml create mode 100644 datasets/epa/dataset.yaml diff --git a/datasets/epa/_images/run_csv_transform_kub/Dockerfile b/datasets/epa/_images/run_csv_transform_kub/Dockerfile new file mode 100644 index 000000000..748bc3bec --- /dev/null +++ b/datasets/epa/_images/run_csv_transform_kub/Dockerfile @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.8 +ENV PYTHONUNBUFFERED True +COPY requirements.txt ./ +RUN python3 -m pip install --no-cache-dir -r requirements.txt +WORKDIR /custom +COPY ./csv_transform.py . +CMD ["python3", "csv_transform.py"] diff --git a/datasets/epa/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa/_images/run_csv_transform_kub/csv_transform.py new file mode 100644 index 000000000..b002a4863 --- /dev/null +++ b/datasets/epa/_images/run_csv_transform_kub/csv_transform.py @@ -0,0 +1,382 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Transform step for the EPA "annual summaries" pipeline: downloads the yearly
# zip archives, concatenates the extracted CSVs into one file and rewrites that
# file in batches with normalised datetime columns.

import datetime
import fnmatch
import json
import logging
import os
import pathlib
import shutil
import typing
import zipfile

import pandas as pd

# NOTE: `requests` and `google.cloud.storage` are imported inside the functions
# that need them so the pure file/CSV helpers stay importable (and unit
# testable) in environments without network or GCP client libraries.


def main(
    source_url: str,
    start_year: int,
    source_file: pathlib.Path,
    target_file: pathlib.Path,
    chunksize: str,
    target_gcs_bucket: str,
    target_gcs_path: str,
    data_names: typing.List[str],
    data_dtypes: dict,
) -> None:
    """Download, concatenate and batch-transform the yearly source files.

    Args:
        source_url: URL template containing the ``~year~`` placeholder.
        start_year: First year to download.
        source_file: Path whose directory receives the downloads and whose
            name is the base for the concatenated file.
        target_file: Currently unused (see TODO below).
        chunksize: Number of rows per batch, as a string.
        target_gcs_bucket: Currently unused (see TODO below).
        target_gcs_path: Currently unused (see TODO below).
        data_names: Column names applied to the header-less concatenated CSV.
        data_dtypes: Column name -> dtype mapping handed to ``pandas.read_csv``.
    """
    logging.info("Annual Summaries process started")

    pathlib.Path("./files").mkdir(parents=True, exist_ok=True)
    dest_path = os.path.split(source_file)[0]
    this_year = datetime.datetime.today().year

    # Years up to (this_year - 2) are fetched strictly: a failed download
    # aborts the run.
    download_url_files_from_year_range(
        source_url, start_year, this_year - 2, dest_path, True, False
    )
    # The last two years are fetched best-effort: a failed download is logged
    # and skipped.
    download_url_files_from_year_range(
        source_url, this_year - 1, this_year, dest_path, True, True
    )

    file_group_wildcard = os.path.split(source_url)[1].replace("_~year~.zip", "")
    source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",")
    target = source.replace(".csv", "_output.csv")

    key_list = [
        "state_code",
        "county_code",
        "site_num",
        "sample_duration",
        "pollutant_standard",
        "metric_used",
        "method_name",
        "address",
        "date_of_last_change",
    ]
    process_source_file(
        source, target, data_names, data_dtypes, int(chunksize), key_list
    )

    # TODO: uploading the output to GCS is not wired in yet; `target_file`,
    # `target_gcs_bucket` and `target_gcs_path` are accepted but unused.

    logging.info("Annual Summaries process completed")


def download_url_files_from_year_range(
    source_url: str,
    start_year: int,
    end_year: int,
    dest_path: str,
    remove_file: bool = False,
    continue_on_error: bool = False,
) -> None:
    """Download and unpack one zip per year in ``[start_year, end_year]``.

    Args:
        source_url: URL template containing the ``~year~`` placeholder.
        start_year: First year (inclusive).
        end_year: Last year (inclusive).
        dest_path: Directory receiving the archive and its extracted content.
        remove_file: Delete the downloaded archive after unpacking.
        continue_on_error: Forwarded to ``download_file_http``; when True a
            failed download is skipped instead of raising.
    """
    for yr in range(start_year, end_year + 1):
        src_url = source_url.replace("~year~", str(yr))
        dest_file = os.path.join(dest_path, "source_" + os.path.split(src_url)[1])
        download_file_http(src_url, dest_file, continue_on_error)
        unpack_file(dest_file, dest_path, "zip")
        # The archive is absent when a best-effort download failed.
        if remove_file and os.path.exists(dest_file):
            os.remove(dest_file)


def download_file_http(
    source_url: str, source_file: pathlib.Path, continue_on_error: bool = False
) -> None:
    """Stream ``source_url`` into ``source_file``.

    A non-2xx response counts as a failure, so an HTML error page is never
    saved under the archive name. On failure any partially written file is
    removed.

    Raises:
        requests.RequestException, OSError: when the download fails and
            ``continue_on_error`` is False.
    """
    import requests  # deferred, see module note

    logging.info(f"Downloading {source_url} to {source_file}")
    try:
        with requests.get(source_url, stream=True) as response:
            response.raise_for_status()
            with open(source_file, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
    except (requests.RequestException, OSError):
        if os.path.exists(source_file):
            os.remove(source_file)
        if not continue_on_error:
            logging.error(f"Unable to obtain {source_url}")
            raise
        logging.warning(f"Unable to obtain {source_url}. Continuing execution.")


def process_source_file(
    source_file: str,
    target_file: str,
    names: list,
    dtypes: dict,
    chunksize: int,
    key_list: list,
) -> None:
    """Read the header-less ``source_file`` in batches and build ``target_file``.

    Each batch is written to ``<target>-<n>.csv`` and then appended to
    ``target_file``; only the first batch contributes the header row.
    """
    logging.info(f"Opening batch file {source_file}")
    with pd.read_csv(
        source_file,  # path to main source file to load in batches
        engine="python",
        encoding="utf-8",
        quotechar='"',  # string separator, typically double-quotes
        chunksize=chunksize,  # size of batch data, in no. of records
        sep=",",  # data column separator, typically ","
        header=None,  # use when the data file does not contain a header
        names=names,
        dtype=dtypes,
        keep_default_na=True,
        na_values=[" "],
    ) as reader:
        for chunk_number, chunk in enumerate(reader):
            target_file_batch = str(target_file).replace(
                ".csv", "-" + str(chunk_number) + ".csv"
            )
            # The chunk is used directly: concatenating it onto an empty frame
            # adds nothing and relies on deprecated empty-frame dtype handling.
            process_chunk(
                chunk, target_file_batch, target_file, chunk_number != 0, key_list
            )


def process_chunk(
    df: pd.DataFrame,
    target_file_batch: str,
    target_file: str,
    skip_header: bool,
    key_list: list,
) -> None:
    """Normalise one batch, save it and append it to ``target_file``.

    ``key_list`` is accepted but currently unused (``add_key`` is not called).
    When ``skip_header`` is False the target file is truncated first.
    """
    df = resolve_date_format(df, "%Y-%m-%d %H:%M")
    save_to_new_file(df, file_path=str(target_file_batch), sep=",")
    append_batch_file(target_file_batch, target_file, skip_header, not skip_header)


def add_key(df: pd.DataFrame, key_list: list) -> pd.DataFrame:
    """Add ``key`` and ``key_val`` columns built from ``key_list`` columns.

    The values of the listed columns are stringified and joined with ``-``;
    while the key is still empty the next value replaces it instead of being
    appended. ``df`` is modified in place and returned.
    """
    logging.info(f"Adding key column(s) {key_list}")
    df["key"] = ""
    for key in key_list:
        df["key"] = df.apply(
            lambda x: str(x[key])
            if not str(x["key"])
            else str(x["key"]) + "-" + str(x[key]),
            axis=1,
        )
    df["key_val"] = df["key"]

    return df


def reorder_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` restricted to, and ordered by, a fixed column list.

    NOTE(review): this function is not called anywhere in this module and the
    column names look like they belong to a different (bike-share) dataset;
    confirm before using it in this pipeline.
    """
    logging.info("Reordering headers output file")
    df = df[
        [
            "trip_id",
            "duration_sec",
            "start_date",
            "start_station_name",
            "start_station_id",
            "end_date",
            "end_station_name",
            "end_station_id",
            "bike_number",
            "zip_code",
            "subscriber_type",
            "subscription_type",
            "start_station_latitude",
            "start_station_longitude",
            "end_station_latitude",
            "end_station_longitude",
            "member_birth_year",
            "member_gender",
            "bike_share_for_all_trip",
            "start_station_geom",
            "end_station_geom",
        ]
    ]

    return df


def concatenate_files(
    target_file_path: str,
    dest_path: str,
    file_group_wildcard: str,
    incl_file_source_path: bool = False,
    separator: str = ",",
) -> str:
    """Concatenate every ``*<wildcard>*`` file in ``dest_path`` into one file.

    The first line (header) of each source file is dropped. The output path is
    ``target_file_path`` with ``_<wildcard>`` inserted before ``.csv``; an
    existing output file is deleted first.

    Args:
        target_file_path: Base path of the output file.
        dest_path: Directory scanned for source files.
        file_group_wildcard: Substring identifying the source files.
        incl_file_source_path: Prefix each line with the quoted source file
            name followed by ``separator``.
        separator: Column separator used for that prefix.

    Returns:
        The path of the concatenated file.
    """
    target_file_dir = os.path.split(str(target_file_path))[0]
    target_file_path = str(target_file_path).replace(
        ".csv", "_" + file_group_wildcard + ".csv"
    )
    logging.info(f"Concatenating files {target_file_dir}/*{file_group_wildcard}")
    if os.path.isfile(target_file_path):
        os.unlink(target_file_path)

    # Listed once, before the output file exists, so the output is never picked
    # up as one of its own inputs.
    source_names = sorted(
        fnmatch.filter(os.listdir(dest_path), "*" + file_group_wildcard + "*")
    )
    for source_name in source_names:
        src_file_path = os.path.join(dest_path, source_name)
        with open(src_file_path, "r") as src_file, open(
            target_file_path, "a+"
        ) as target_file:
            # Skip the header; the default keeps an empty file from raising.
            next(src_file, None)
            logging.info(
                f"Reading from file {src_file_path}, writing to file {target_file_path}"
            )
            prefix = (
                '"' + source_name.strip() + '"' + separator
                if incl_file_source_path
                else ""
            )
            for line in src_file:
                target_file.write(prefix + line)

    return target_file_path


def listdirs(rootdir: str) -> list:
    """Return the paths of all directories below ``rootdir``, recursively."""
    rtn_list = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if os.path.isdir(path):
            rtn_list.append(path)
            rtn_list.extend(listdirs(path))
    return rtn_list


def resolve_date_format(df: pd.DataFrame, from_format: str) -> pd.DataFrame:
    """Rewrite every ``datetime64[ns]`` column as formatted strings.

    Each value is stringified and passed through ``convert_dt_format``. ``df``
    is modified in place and returned.
    """
    logging.info("Resolving Date Format")
    for col in df.columns:
        if df[col].dtype == "datetime64[ns]":
            logging.info(f"Resolving datetime on {col}")
            df[col] = df[col].apply(lambda x: convert_dt_format(str(x), from_format))

    return df


def convert_dt_format(dt_str: str, from_format: str) -> str:
    """Convert a date/time string to ``YYYY-MM-DD HH:MM:SS``.

    Rules, applied in order to the stripped value:
      * empty, ``nan`` or ``nat`` (any case)  -> ``""``
      * exactly 10 characters (date only)     -> value + `` 00:00:00``
      * time portion of 8 characters          -> last 3 characters are dropped
        and the rest is parsed with ``from_format``
      * value starts with a 4-digit year while ``from_format`` is
        slash-separated -> parsed as ``%Y-%m-%d`` plus the time part of
        ``from_format``
      * anything else -> parsed with ``from_format``

    In the last two cases a value that cannot be parsed yields ``""``.

    Raises:
        ValueError: if an ``HH:MM:SS`` value does not match ``from_format``
            after its last 3 characters are removed.
    """
    out_format = "%Y-%m-%d %H:%M:%S"
    if not dt_str or str(dt_str).lower() in ("nan", "nat"):
        return ""

    value = str(dt_str).strip()
    if not value:
        return ""
    if len(value) == 10:
        # if there is no time format
        return value + " 00:00:00"

    parts = value.split(" ")
    if len(parts) > 1 and len(parts[1]) == 8:
        # if format of time portion is 00:00:00 then use 00:00 format
        return datetime.datetime.strptime(value[:-3], from_format).strftime(
            out_format
        )

    parse_format = from_format.strip()
    if len(value.split("-")[0]) == 4 and len(parse_format.split("/")[0]) == 2:
        # Data is YYYY-MM-DD but from_format is slash-separated: keep the time
        # part of from_format and switch the date part. This resolves mixed
        # date formats in files.
        format_parts = parse_format.split(" ", 1)
        parse_format = "%Y-%m-%d" + (
            " " + format_parts[1] if len(format_parts) > 1 else ""
        )

    try:
        return datetime.datetime.strptime(value, parse_format).strftime(out_format)
    except ValueError:
        return ""


def append_batch_file(
    batch_file_path: str, target_file_path: str, skip_header: bool, truncate_file: bool
) -> None:
    """Append ``batch_file_path`` to ``target_file_path`` and delete the batch.

    Args:
        batch_file_path: File to append; removed afterwards.
        target_file_path: File being built.
        skip_header: Drop the first line of the batch file.
        truncate_file: Empty the target file before writing.
    """
    mode = "w" if truncate_file else "a"
    with open(batch_file_path, "r") as data_file, open(
        target_file_path, mode
    ) as target_file:
        if skip_header:
            logging.info(
                f"Appending batch file {batch_file_path} to {target_file_path} with skip header"
            )
            next(data_file, None)
        else:
            logging.info(
                f"Appending batch file {batch_file_path} to {target_file_path}"
            )
        # Copied in buffered pieces so a large batch is never held in memory.
        shutil.copyfileobj(data_file, target_file)
    if os.path.exists(batch_file_path):
        os.remove(batch_file_path)


def save_to_new_file(df, file_path, sep="|") -> None:
    """Write ``df`` to ``file_path`` as delimited text without the index."""
    logging.info(f"Saving to file {file_path} separator='{sep}'")
    df.to_csv(file_path, sep=sep, index=False)


def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None:
    """Upload ``file_path`` to ``gcs_path`` in bucket ``gcs_bucket``."""
    from google.cloud import storage  # deferred, see module note

    logging.info(f"Uploading to GCS {gcs_bucket} in {gcs_path}")
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket)
    blob = bucket.blob(gcs_path)
    blob.upload_from_filename(str(file_path))


def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> None:
    """Extract ``infile`` into ``dest_path`` if it exists and is a zip.

    A missing file or an unsupported ``compression_type`` is logged and
    ignored.
    """
    if not os.path.exists(infile):
        logging.info(f"{infile} not unpacked because it does not exist.")
        return
    if compression_type == "zip":
        zip_decompress(infile, dest_path)
    else:
        logging.info(
            f"{infile} ignored as it is not compressed or is of unknown compression"
        )


def zip_decompress(infile: str, dest_path: str) -> None:
    """Extract every member of the zip archive ``infile`` into ``dest_path``."""
    logging.info(f"Unpacking {infile} to {dest_path}")
    with zipfile.ZipFile(infile, mode="r") as zipf:
        zipf.extractall(dest_path)


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    main(
        source_url=os.environ["SOURCE_URL"],
        source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(),
        target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(),
        start_year=int(os.environ["START_YEAR"]),
        chunksize=os.environ["CHUNKSIZE"],
        target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"],
        target_gcs_path=os.environ["TARGET_GCS_PATH"],
        data_names=json.loads(os.environ["DATA_NAMES"]),
        data_dtypes=json.loads(os.environ["DATA_DTYPES"]),
    )

# ---------------------------------------------------------------------------
# Remainder of the patch text that shared the last replaced line. It is not
# part of csv_transform.py and is retained verbatim so no content is lost.
# ---------------------------------------------------------------------------
# diff --git a/datasets/epa/_images/run_csv_transform_kub/requirements.txt b/datasets/epa/_images/run_csv_transform_kub/requirements.txt
# new file mode 100644
# index 000000000..88bfd2aba
# --- /dev/null
# +++ b/datasets/epa/_images/run_csv_transform_kub/requirements.txt
# @@ -0,0 +1,5 @@
# +requests
# +numpy
# +pandas
# +google-cloud-storage
# +gsutil
# diff --git a/datasets/epa/_terraform/annual_summaries_pipeline.tf b/datasets/epa/_terraform/annual_summaries_pipeline.tf
# new file mode 100644
# index 000000000..80eb2bd5b
# --- /dev/null
# +++ b/datasets/epa/_terraform/annual_summaries_pipeline.tf
# @@ -0,0 +1,39 @@
# +/**
# + * Copyright 2021 Google LLC
# + *
# + * Licensed under the Apache License, Version 2.0 (the "License");
# + * you may not use this file except in compliance with the License.
# + * You may obtain a copy of the License at
# + *
# + *      http://www.apache.org/licenses/LICENSE-2.0
# + *
# + * Unless required by applicable law or agreed to in writing, software
# + * distributed under the License is distributed on an "AS IS" BASIS,
# + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# + * See the License for the specific language governing permissions and
# + * limitations under the License.
+ */ + + +resource "google_bigquery_table" "epa_annual_summaries" { + project = var.project_id + dataset_id = "epa" + table_id = "annual_summaries" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa + ] +} + +output "bigquery_table-epa_annual_summaries-table_id" { + value = google_bigquery_table.epa_annual_summaries.table_id +} + +output "bigquery_table-epa_annual_summaries-id" { + value = google_bigquery_table.epa_annual_summaries.id +} diff --git a/datasets/epa/_terraform/epa_dataset.tf b/datasets/epa/_terraform/epa_dataset.tf new file mode 100644 index 000000000..ce6e58582 --- /dev/null +++ b/datasets/epa/_terraform/epa_dataset.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_dataset" "epa" { + dataset_id = "epa" + project = var.project_id + description = "epa" +} + +output "bigquery_dataset-epa-dataset_id" { + value = google_bigquery_dataset.epa.dataset_id +} diff --git a/datasets/epa/_terraform/provider.tf b/datasets/epa/_terraform/provider.tf new file mode 100644 index 000000000..23ab87dcd --- /dev/null +++ b/datasets/epa/_terraform/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/epa/_terraform/variables.tf b/datasets/epa/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/epa/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/epa/annual_summaries/annual_summaries_dag.py b/datasets/epa/annual_summaries/annual_summaries_dag.py new file mode 100644 index 000000000..f25ad59a1 --- /dev/null +++ b/datasets/epa/annual_summaries/annual_summaries_dag.py @@ -0,0 +1,418 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa.annual_summaries", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="annual_summaries", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa.container_registry.run_csv_transform_kub_annual_summaries }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip", + "START_YEAR": "1980", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa/annual_summaries/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", 
"completeness_indicator": "str", "valid_day_count": "int32", "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64", "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64", "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/epa/annual_summaries/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa.container_registry.annual_summaries_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + 
"description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the 'Parameter Occurrence Code' used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). 
For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "metric_used", + "type": "STRING", + "description": "The base metric used in the calculation of the aggregate statistics presented in the remainder of the row. For example, if this is Daily Maximum, then the value in the Mean column is the mean of the daily maximums.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "year", + "type": "INTEGER", + "description": "The year the annual summary data represents.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. 
Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the year.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the year. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "completeness_indicator", + "type": "STRING", + "description": "An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "valid_day_count", + "type": "INTEGER", + "description": "The number of days during the year where the daily monitoring criteria were met, if the calculation of the summaries is based on valid days.", + "mode": "NULLABLE", + }, + { + "name": "required_day_count", + "type": "INTEGER", + "description": "The number of days during the year which the monitor was scheduled to take samples if measurements are required.", + "mode": "NULLABLE", + }, + { + "name": "exceptional_data_count", + "type": "INTEGER", + "description": "The number of data points in the annual data set affected by exceptional air quality events (things outside the norm that affect air quality).", + "mode": "NULLABLE", + }, + { + "name": "null_data_count", + "type": "INTEGER", + "description": "The count of scheduled samples when no data was collected and the reason for no data was reported.", + "mode": "NULLABLE", + }, + { + 
"name": "primary_exceedance_count", + "type": "INTEGER", + "description": "The number of samples during the year that exceeded the primary air quality standard.", + "mode": "NULLABLE", + }, + { + "name": "secondary_exceedance_count", + "type": "INTEGER", + "description": "The number of samples during the year that exceeded the secondary air quality standard.", + "mode": "NULLABLE", + }, + { + "name": "certification_indicator", + "type": "STRING", + "description": "An indication whether the completeness and accuracy of the information on the annual summary record has been certified by the submitter. Certified means the submitter has certified the data (due May 01 the year after collection). Certification not required means that the parameter does not require certification or the deadline has not yet passed. Uncertified (past due) means that certification is required but is overdue. Requested but not yet concurred means the submitter has completed the process, but EPA has not yet acted to certify the data. Requested but denied means the submitter has completed the process, but EPA has denied the request for cause. Was certified but data changed means the data was certified but data was replaced and the process has not been repeated.", + "mode": "NULLABLE", + }, + { + "name": "num_obs_below_mdl", + "type": "INTEGER", + "description": "The number of samples reported during the year that were below the method detection limit (MDL) for the monitoring instrument. 
Sometimes these values are replaced by 1/2 the MDL in summary calculations.", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the year.", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_standard_dev", + "type": "FLOAT", + "description": "The standard deviation about the mean of the values for the year.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "first_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the highest value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "second_max_value", + "type": "FLOAT", + "description": "The second highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "second_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the second highest value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "third_max_value", + "type": "FLOAT", + "description": "The third highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "third_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the third highest value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "fourth_max_value", + "type": "FLOAT", + "description": "The fourth highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "fourth_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the fourth highest value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "first_max_non_overlapping_value", + "type": "FLOAT", + "description": "For 8-hour CO averages, the highest value of the year.", + "mode": 
"NULLABLE", + }, + { + "name": "first_no_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the first maximum non overlapping value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "second_max_non_overlapping_value", + "type": "FLOAT", + "description": "For 8-hour CO averages, the second highest value of the year that does not share any hours with the 8-hour period of the first max non overlapping value.", + "mode": "NULLABLE", + }, + { + "name": "second_no_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the second maximum non overlapping value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "ninety_nine_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 99 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "ninety_eight_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 98 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "ninety_five_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 95 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "ninety_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 90 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "seventy_five_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 75 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "fifty_percentile", + "type": "FLOAT", + "description": "The value 
from this monitor for which 50 per cent of the rest of the measured values for the year are equal to or less than (i.e., the median).", + "mode": "NULLABLE", + }, + { + "name": "ten_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 10 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "date", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa/annual_summaries/pipeline.yaml b/datasets/epa/annual_summaries/pipeline.yaml new file mode 100644 index 000000000..6f1fc108c --- /dev/null +++ b/datasets/epa/annual_summaries/pipeline.yaml @@ -0,0 +1,306 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "annual_summaries" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: annual_summaries + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "annual_summaries" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa.container_registry.run_csv_transform_kub_annual_summaries }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip" + START_YEAR: "1980" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa/annual_summaries/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", 
"second_max_non_overlapping_value", "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64", "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64", "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", 
"date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa/annual_summaries/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa.container_registry.annual_summaries_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the 'Parameter Occurrence Code' used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." 
+ "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "metric_used" + "type": "STRING" + "description": "The base metric used in the calculation of the aggregate statistics presented in the remainder of the row. For example, if this is Daily Maximum, then the value in the Mean column is the mean of the daily maximums." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "year" + "type": "INTEGER" + "description": "The year the annual summary data represents." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the year." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the year. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "completeness_indicator" + "type": "STRING" + "description": "An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter." + "mode": "NULLABLE" + - "name": "valid_day_count" + "type": "INTEGER" + "description": "The number of days during the year where the daily monitoring criteria were met, if the calculation of the summaries is based on valid days." + "mode": "NULLABLE" + - "name": "required_day_count" + "type": "INTEGER" + "description": "The number of days during the year which the monitor was scheduled to take samples if measurements are required." 
+ "mode": "NULLABLE" + - "name": "exceptional_data_count" + "type": "INTEGER" + "description": "The number of data points in the annual data set affected by exceptional air quality events (things outside the norm that affect air quality)." + "mode": "NULLABLE" + - "name": "null_data_count" + "type": "INTEGER" + "description": "The count of scheduled samples when no data was collected and the reason for no data was reported." + "mode": "NULLABLE" + - "name": "primary_exceedance_count" + "type": "INTEGER" + "description": "The number of samples during the year that exceeded the primary air quality standard." + "mode": "NULLABLE" + - "name": "secondary_exceedance_count" + "type": "INTEGER" + "description": "The number of samples during the year that exceeded the secondary air quality standard." + "mode": "NULLABLE" + - "name": "certification_indicator" + "type": "STRING" + "description": "An indication whether the completeness and accuracy of the information on the annual summary record has been certified by the submitter. Certified means the submitter has certified the data (due May 01 the year after collection). Certification not required means that the parameter does not require certification or the deadline has not yet passed. Uncertified (past due) means that certification is required but is overdue. Requested but not yet concurred means the submitter has completed the process, but EPA has not yet acted to certify the data. Requested but denied means the submitter has completed the process, but EPA has denied the request for cause. Was certified but data changed means the data was certified but data was replaced and the process has not been repeated." + "mode": "NULLABLE" + - "name": "num_obs_below_mdl" + "type": "INTEGER" + "description": "The number of samples reported during the year that were below the method detection limit (MDL) for the monitoring instrument. Sometimes these values are replaced by 1/2 the MDL in summary calculations." 
+ "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the year." + "mode": "NULLABLE" + - "name": "arithmetic_standard_dev" + "type": "FLOAT" + "description": "The standard deviation about the mean of the values for the year." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the year." + "mode": "NULLABLE" + - "name": "first_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "second_max_value" + "type": "FLOAT" + "description": "The second highest value for the year." + "mode": "NULLABLE" + - "name": "second_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the second highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "third_max_value" + "type": "FLOAT" + "description": "The third highest value for the year." + "mode": "NULLABLE" + - "name": "third_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the third highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "fourth_max_value" + "type": "FLOAT" + "description": "The fourth highest value for the year." + "mode": "NULLABLE" + - "name": "fourth_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the fourth highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "first_max_non_overlapping_value" + "type": "FLOAT" + "description": "For 8-hour CO averages, the highest value of the year." 
+ "mode": "NULLABLE" + - "name": "first_no_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the first maximum non overlapping value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "second_max_non_overlapping_value" + "type": "FLOAT" + "description": "For 8-hour CO averages, the second highest value of the year that does not share any hours with the 8-hour period of the first max non overlapping value." + "mode": "NULLABLE" + - "name": "second_no_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the second maximum non overlapping value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "ninety_nine_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 99 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "ninety_eight_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 98 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "ninety_five_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 95 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "ninety_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 90 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "seventy_five_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 75 per cent of the rest of the measured values for the year are equal to or less than." 
+ "mode": "NULLABLE" + - "name": "fifty_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 50 per cent of the rest of the measured values for the year are equal to or less than (i.e., the median)." + "mode": "NULLABLE" + - "name": "ten_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 10 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "date" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa/dataset.yaml b/datasets/epa/dataset.yaml new file mode 100644 index 000000000..5afccd868 --- /dev/null +++ b/datasets/epa/dataset.yaml @@ -0,0 +1,27 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dataset: + name: epa + friendly_name: ~ + description: ~ + dataset_sources: ~ + terms_of_use: ~ + + +resources: + + - type: bigquery_dataset + dataset_id: epa + description: epa From bb846aaad1079b890efed245bb963be529a217d0 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 27 Oct 2021 17:02:24 +0000 Subject: [PATCH 02/26] feat: Added co_daily_summaries. 
Not ready for production --- .../run_csv_transform_kub/csv_transform.py | 146 +++------- .../annual_summaries/annual_summaries_dag.py | 4 +- datasets/epa/annual_summaries/pipeline.yaml | 5 +- .../co_daily_summary/co_daily_summary_dag.py | 262 ++++++++++++++++++ datasets/epa/co_daily_summary/pipeline.yaml | 203 ++++++++++++++ 5 files changed, 514 insertions(+), 106 deletions(-) create mode 100644 datasets/epa/co_daily_summary/co_daily_summary_dag.py create mode 100644 datasets/epa/co_daily_summary/pipeline.yaml diff --git a/datasets/epa/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa/_images/run_csv_transform_kub/csv_transform.py index b002a4863..7789d047b 100644 --- a/datasets/epa/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa/_images/run_csv_transform_kub/csv_transform.py @@ -39,7 +39,7 @@ def main( data_dtypes: dict ) -> None: - logging.info("Annual Summaries process started") + logging.info("Pipeline process started") pathlib.Path("./files").mkdir(parents=True, exist_ok=True) dest_path = os.path.split(source_file)[0] @@ -50,81 +50,23 @@ def main( download_url_files_from_year_range(source_url, st_year, end_year, dest_path, True, True) file_group_wildcard = os.path.split(source_url)[1].replace("_~year~.zip", "") source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",") - target = source.replace(".csv", "_output.csv") key_list = ["state_code", "county_code", "site_num", "sample_duration", "pollutant_standard", "metric_used", "method_name", "address", "date_of_last_change"] process_source_file( source, - target, + target_file, data_names, - # {}, data_dtypes, int(chunksize), key_list ) - # trip_data_filepath = str(target_file).replace(".csv", "_trip_data.csv") - # logging.info(f"Opening {trip_data_filepath}") - # df_trip_data = pd.read_csv( - # trip_data_filepath, - # engine="python", - # encoding="utf-8", - # quotechar='"', # string separator, typically double-quotes - # sep="|", # data column separator, 
typically "," - # ) - - # tripdata_filepath = str(target_file).replace(".csv", "_tripdata.csv") - # logging.info(f"Opening {tripdata_filepath}") - # df_tripdata = pd.read_csv( - # tripdata_filepath, - # engine="python", - # encoding="utf-8", - # quotechar='"', # string separator, typically double-quotes - # sep="|", # data column separator, typically "," - # ) - - # logging.info("Dropping duplicate rows") - # df = df_trip_data - # df.drop_duplicates( - # subset=["key_val"], keep="last", inplace=True, ignore_index=False - # ) - # df_tripdata.drop_duplicates( - # subset=["key_val"], keep="last", inplace=True, ignore_index=False - # ) - - # logging.info("Populating empty trip-id values") - # df_tripdata["trip_id"] = df_tripdata["key_val"].str.replace("-", "") - - # logging.info("Creating indexes") - # df.set_index("key", inplace=True) - # df_tripdata.set_index("key", inplace=True) - - # logging.info("Merging data") - # df = df.append(df_tripdata, sort=True) - - # logging.info("Creating subscriber_type_new") - # df["subscriber_type_new"] = df.apply( - # lambda x: str(x.subscription_type) - # if not str(x.subscriber_type) - # else str(x.subscriber_type), - # axis=1, - # ) - # df = df.drop(columns=["subscriber_type"]) - - # logging.info("Resolving datatypes") - # df["member_birth_year"] = df["member_birth_year"].fillna(0).astype(int) - - # df = rename_headers_output_file(df) - # df = reorder_headers(df) - - # save_to_new_file(df, target_file, ",") - # upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) - - logging.info("Annual Summaries process completed") + upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) + + logging.info("Pipeline process completed") def download_url_files_from_year_range(source_url: str, start_year: int, end_year: int, dest_path: str, remove_file: bool=False, continue_on_error: bool=False): - # for yr in range(start_year, (datetime.datetime.today().year - 1), 1): for yr in range(start_year, end_year + 1, 1): src_url 
= source_url.replace("~year~", str(yr)) dest_file = dest_path + "/source_" + os.path.split(src_url)[1] @@ -199,35 +141,35 @@ def add_key(df: pd.DataFrame, key_list: list) -> pd.DataFrame: return df -def reorder_headers(df: pd.DataFrame) -> pd.DataFrame: - logging.info("Reordering headers output file") - df = df[ - [ - "trip_id", - "duration_sec", - "start_date", - "start_station_name", - "start_station_id", - "end_date", - "end_station_name", - "end_station_id", - "bike_number", - "zip_code", - "subscriber_type", - "subscription_type", - "start_station_latitude", - "start_station_longitude", - "end_station_latitude", - "end_station_longitude", - "member_birth_year", - "member_gender", - "bike_share_for_all_trip", - "start_station_geom", - "end_station_geom", - ] - ] - - return df +# def reorder_headers(df: pd.DataFrame) -> pd.DataFrame: +# logging.info("Reordering headers output file") +# df = df[ +# [ +# "trip_id", +# "duration_sec", +# "start_date", +# "start_station_name", +# "start_station_id", +# "end_date", +# "end_station_name", +# "end_station_id", +# "bike_number", +# "zip_code", +# "subscriber_type", +# "subscription_type", +# "start_station_latitude", +# "start_station_longitude", +# "end_station_latitude", +# "end_station_longitude", +# "member_birth_year", +# "member_gender", +# "bike_share_for_all_trip", +# "start_station_geom", +# "end_station_geom", +# ] +# ] + +# return df def concatenate_files( @@ -262,15 +204,15 @@ def concatenate_files( return target_file_path -def listdirs(rootdir: str) -> list: - rtn_list = [] - for file in os.listdir(rootdir): - d = os.path.join(rootdir, file) - if os.path.isdir(d): - rtn_list.append(d) - for elem in listdirs(d): - rtn_list.append(elem) - return rtn_list +# def listdirs(rootdir: str) -> list: +# rtn_list = [] +# for file in os.listdir(rootdir): +# d = os.path.join(rootdir, file) +# if os.path.isdir(d): +# rtn_list.append(d) +# for elem in listdirs(d): +# rtn_list.append(elem) +# return rtn_list def 
resolve_date_format( @@ -337,7 +279,7 @@ def save_to_new_file(df, file_path, sep="|") -> None: def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None: - logging.info("Uploading to GCS {gcs_bucket} in {gcs_path}") + logging.info(f"Uploading to GCS {gcs_bucket} in {gcs_path}") storage_client = storage.Client() bucket = storage_client.bucket(gcs_bucket) blob = bucket.blob(gcs_path) diff --git a/datasets/epa/annual_summaries/annual_summaries_dag.py b/datasets/epa/annual_summaries/annual_summaries_dag.py index f25ad59a1..7d03baaf8 100644 --- a/datasets/epa/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa/annual_summaries/annual_summaries_dag.py @@ -56,7 +56,7 @@ } }, image_pull_policy="Always", - image="{{ var.json.epa.container_registry.run_csv_transform_kub_annual_summaries }}", + image="{{ var.json.epa.container_registry.run_csv_transform_kub }}", env_vars={ "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip", "START_YEAR": "1980", @@ -77,7 +77,7 @@ bucket="{{ var.value.composer_bucket }}", source_objects=["data/epa/annual_summaries/data_output.csv"], source_format="CSV", - destination_project_dataset_table="{{ var.json.epa.container_registry.annual_summaries_destination_table }}", + destination_project_dataset_table="epa_historical_air_quality.air_quality_annual_summary", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa/annual_summaries/pipeline.yaml b/datasets/epa/annual_summaries/pipeline.yaml index 6f1fc108c..a7fc2fbc2 100644 --- a/datasets/epa/annual_summaries/pipeline.yaml +++ b/datasets/epa/annual_summaries/pipeline.yaml @@ -52,7 +52,7 @@ dag: values: - "pool-e2-standard-4" image_pull_policy: "Always" - image: "{{ var.json.epa.container_registry.run_csv_transform_kub_annual_summaries }}" + image: "{{ var.json.epa.container_registry.run_csv_transform_kub }}" env_vars: SOURCE_URL: 
"https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip" START_YEAR: "1980" @@ -77,7 +77,8 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa/annual_summaries/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa.container_registry.annual_summaries_destination_table }}" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.air_quality_annual_summary" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa/co_daily_summary/co_daily_summary_dag.py b/datasets/epa/co_daily_summary/co_daily_summary_dag.py new file mode 100644 index 000000000..9c546bd73 --- /dev/null +++ b/datasets/epa/co_daily_summary/co_daily_summary_dag.py @@ -0,0 +1,262 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa.co_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="co_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42101_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa/co_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": 
"str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "int32", "method_code": "int32", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/epa/annual_summaries/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.co_daily_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "date", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa/co_daily_summary/pipeline.yaml b/datasets/epa/co_daily_summary/pipeline.yaml new file mode 100644 index 000000000..8f560ae5c --- /dev/null +++ b/datasets/epa/co_daily_summary/pipeline.yaml @@ -0,0 +1,203 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "co_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: co_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "co_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42101_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa/co_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", 
"observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "int32", "method_code": "int32", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa/annual_summaries/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.co_daily_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "date" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + graph_paths: + - "transform_csv >> load_to_bq" From 953d1cdd945d6cf7145bd363781236149158e70a Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 27 Oct 2021 19:34:47 +0000 Subject: [PATCH 03/26] feat: Added co_hourly_summary. Not ready for production. 
--- .../run_csv_transform_kub/csv_transform.py | 4 +- .../co_daily_summary/co_daily_summary_dag.py | 4 +- datasets/epa/co_daily_summary/pipeline.yaml | 4 +- .../co_hourly_summary_dag.py | 232 ++++++++++++++++++ datasets/epa/co_hourly_summary/pipeline.yaml | 183 ++++++++++++++ 5 files changed, 422 insertions(+), 5 deletions(-) create mode 100644 datasets/epa/co_hourly_summary/co_hourly_summary_dag.py create mode 100644 datasets/epa/co_hourly_summary/pipeline.yaml diff --git a/datasets/epa/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa/_images/run_csv_transform_kub/csv_transform.py index 7789d047b..f08ad7062 100644 --- a/datasets/epa/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa/_images/run_csv_transform_kub/csv_transform.py @@ -173,7 +173,7 @@ def add_key(df: pd.DataFrame, key_list: list) -> pd.DataFrame: def concatenate_files( - target_file_path: str, dest_path: str, file_group_wildcard: str, incl_file_source_path: bool=False, separator: str="," + target_file_path: str, dest_path: str, file_group_wildcard: str, incl_file_source_path: bool=False, separator: str=",", delete_src_file: bool=True ) -> str: target_file_dir = os.path.split(str(target_file_path))[0] target_file_path = str(target_file_path).replace(".csv", "_" + file_group_wildcard + ".csv") @@ -200,6 +200,8 @@ def concatenate_files( line ) target_file.write(line) + if os.path.isfile(src_file_path) and delete_src_file: + os.unlink(src_file_path) return target_file_path diff --git a/datasets/epa/co_daily_summary/co_daily_summary_dag.py b/datasets/epa/co_daily_summary/co_daily_summary_dag.py index 9c546bd73..17348fa28 100644 --- a/datasets/epa/co_daily_summary/co_daily_summary_dag.py +++ b/datasets/epa/co_daily_summary/co_daily_summary_dag.py @@ -66,7 +66,7 @@ "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/epa/co_daily_summary/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", 
"longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "int32", "method_code": "int32", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, resources={"limit_memory": "8G", "limit_cpu": "3"}, ) @@ -75,7 +75,7 @@ load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ 
var.value.composer_bucket }}", - source_objects=["data/epa/annual_summaries/data_output.csv"], + source_objects=["data/epa/co_daily_summary/data_output.csv"], source_format="CSV", destination_project_dataset_table="epa_historical_air_quality.co_daily_summary", skip_leading_rows=1, diff --git a/datasets/epa/co_daily_summary/pipeline.yaml b/datasets/epa/co_daily_summary/pipeline.yaml index 8f560ae5c..217a17350 100644 --- a/datasets/epa/co_daily_summary/pipeline.yaml +++ b/datasets/epa/co_daily_summary/pipeline.yaml @@ -64,7 +64,7 @@ dag: DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ] DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "int32", "method_code": "int32", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } resources: limit_memory: "8G" limit_cpu: "3" @@ -75,7 +75,7 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa/annual_summaries/data_output.csv"] + source_objects: ["data/epa/co_daily_summary/data_output.csv"] source_format: "CSV" # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" destination_project_dataset_table: "epa_historical_air_quality.co_daily_summary" diff --git a/datasets/epa/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa/co_hourly_summary/co_hourly_summary_dag.py new file mode 100644 index 000000000..2d9756c7d --- /dev/null +++ b/datasets/epa/co_hourly_summary/co_hourly_summary_dag.py @@ -0,0 +1,232 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa.co_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="co_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa/co_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "str", "longitude": "str", "datum": 
"str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/epa/hourly_summaries/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.co_hourly_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa/co_hourly_summary/pipeline.yaml b/datasets/epa/co_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..0c93cae1e --- /dev/null +++ b/datasets/epa/co_hourly_summary/pipeline.yaml @@ -0,0 +1,183 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "co_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: co_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "co_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa/co_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "time_local": "str", "date_gmt": "datetime64[ns]", 
"time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa/hourly_summaries/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.co_hourly_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." 
+ "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + graph_paths: + - "transform_csv >> load_to_bq" From e3966b4e53ec457266ab5b4ddfa05a194010d42b Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 27 Oct 2021 21:44:32 +0000 Subject: [PATCH 04/26] fix: Changed dataset name --- .../_images/run_csv_transform_kub/Dockerfile | 0 .../run_csv_transform_kub/csv_transform.py | 139 +++++++---- .../run_csv_transform_kub/requirements.txt | 0 .../_terraform/annual_summaries_pipeline.tf | 0 .../_terraform/epa_dataset.tf | 0 .../_terraform/provider.tf | 0 .../_terraform/variables.tf | 0 .../annual_summaries/annual_summaries_dag.py | 17 +- .../annual_summaries/pipeline.yaml | 31 ++- .../co_daily_summary/co_daily_summary_dag.py | 14 +- .../co_daily_summary/pipeline.yaml | 21 +- .../co_hourly_summary_dag.py | 14 +- .../co_hourly_summary/pipeline.yaml | 19 +- .../dataset.yaml | 10 +- .../hap_daily_summary_dag.py | 234 ++++++++++++++++++ .../hap_daily_summary/pipeline.yaml | 194 +++++++++++++++ 16 files changed, 608 insertions(+), 85 deletions(-) rename datasets/{epa => epa_historical_air_quality}/_images/run_csv_transform_kub/Dockerfile (100%) rename datasets/{epa => epa_historical_air_quality}/_images/run_csv_transform_kub/csv_transform.py (73%) rename datasets/{epa => epa_historical_air_quality}/_images/run_csv_transform_kub/requirements.txt (100%) rename datasets/{epa => epa_historical_air_quality}/_terraform/annual_summaries_pipeline.tf (100%) rename datasets/{epa => epa_historical_air_quality}/_terraform/epa_dataset.tf (100%) rename datasets/{epa => epa_historical_air_quality}/_terraform/provider.tf (100%) rename datasets/{epa => epa_historical_air_quality}/_terraform/variables.tf (100%) rename datasets/{epa => epa_historical_air_quality}/annual_summaries/annual_summaries_dag.py (86%) rename datasets/{epa => epa_historical_air_quality}/annual_summaries/pipeline.yaml (84%) rename datasets/{epa => epa_historical_air_quality}/co_daily_summary/co_daily_summary_dag.py (89%) rename datasets/{epa => 
epa_historical_air_quality}/co_daily_summary/pipeline.yaml (87%) rename datasets/{epa => epa_historical_air_quality}/co_hourly_summary/co_hourly_summary_dag.py (89%) rename datasets/{epa => epa_historical_air_quality}/co_hourly_summary/pipeline.yaml (87%) rename datasets/{epa => epa_historical_air_quality}/dataset.yaml (74%) create mode 100644 datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml diff --git a/datasets/epa/_images/run_csv_transform_kub/Dockerfile b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/Dockerfile similarity index 100% rename from datasets/epa/_images/run_csv_transform_kub/Dockerfile rename to datasets/epa_historical_air_quality/_images/run_csv_transform_kub/Dockerfile diff --git a/datasets/epa/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py similarity index 73% rename from datasets/epa/_images/run_csv_transform_kub/csv_transform.py rename to datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py index f08ad7062..c5092c1bf 100644 --- a/datasets/epa/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py @@ -36,29 +36,38 @@ def main( target_gcs_bucket: str, target_gcs_path: str, data_names: typing.List[str], - data_dtypes: dict + data_dtypes: dict, ) -> None: logging.info("Pipeline process started") pathlib.Path("./files").mkdir(parents=True, exist_ok=True) dest_path = os.path.split(source_file)[0] - end_year = (datetime.datetime.today().year - 2) - download_url_files_from_year_range(source_url, start_year, end_year, dest_path, True, False) - st_year = (datetime.datetime.today().year - 1) - end_year = (datetime.datetime.today().year) - download_url_files_from_year_range(source_url, st_year, end_year, dest_path, True, True) + 
end_year = datetime.datetime.today().year - 2 + download_url_files_from_year_range( + source_url, start_year, end_year, dest_path, True, False + ) + st_year = datetime.datetime.today().year - 1 + end_year = datetime.datetime.today().year + download_url_files_from_year_range( + source_url, st_year, end_year, dest_path, True, True + ) file_group_wildcard = os.path.split(source_url)[1].replace("_~year~.zip", "") source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",") - key_list = ["state_code", "county_code", "site_num", "sample_duration", "pollutant_standard", "metric_used", "method_name", "address", "date_of_last_change"] + key_list = [ + "state_code", + "county_code", + "site_num", + "sample_duration", + "pollutant_standard", + "metric_used", + "method_name", + "address", + "date_of_last_change", + ] process_source_file( - source, - target_file, - data_names, - data_dtypes, - int(chunksize), - key_list + source, target_file, data_names, data_dtypes, int(chunksize), key_list ) upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) @@ -66,32 +75,55 @@ def main( logging.info("Pipeline process completed") -def download_url_files_from_year_range(source_url: str, start_year: int, end_year: int, dest_path: str, remove_file: bool=False, continue_on_error: bool=False): +def download_url_files_from_year_range( + source_url: str, + start_year: int, + end_year: int, + dest_path: str, + remove_file: bool = False, + continue_on_error: bool = False, +): for yr in range(start_year, end_year + 1, 1): src_url = source_url.replace("~year~", str(yr)) dest_file = dest_path + "/source_" + os.path.split(src_url)[1] download_file_http(src_url, dest_file) unpack_file(dest_file, dest_path, "zip") - if remove_file : + if remove_file: os.remove(dest_file) -def download_file_http(source_url: str, source_file: pathlib.Path, continue_on_error: bool=False) -> None: +def download_file_http( + source_url: str, source_file: pathlib.Path, continue_on_error: 
bool = False +) -> None: logging.info(f"Downloading {source_url} to {source_file}") try: src_file = requests.get(source_url, stream=True) with open(source_file, "wb") as f: for chunk in src_file: f.write(chunk) - except: + except requests.exceptions.RequestException as e: + if e == requests.exceptions.HTTPError: + err_msg = "A HTTP error occurred." + elif e == requests.exceptions.Timeout: + err_msg = "A HTTP timeout error occurred." + elif e == requests.exceptions.TooManyRedirects: + err_msg = "Too Many Redirects occurred." if not continue_on_error: - logging.info(f"Unable to obtain {source_url}") + logging.info(f"{err_msg} Unable to obtain {source_url}") + raise SystemExit(e) else: - logging.info(f"Unable to obtain {source_url}. Continuing execution.") + logging.info( + f"{err_msg} Unable to obtain {source_url}. Continuing execution." + ) def process_source_file( - source_file: str, target_file: str, names: list, dtypes: dict, chunksize: int, key_list: list + source_file: str, + target_file: str, + names: list, + dtypes: dict, + chunksize: int, + key_list: list, ) -> None: logging.info(f"Opening batch file {source_file}") with pd.read_csv( @@ -105,7 +137,7 @@ def process_source_file( names=names, dtype=dtypes, keep_default_na=True, - na_values=[' '] + na_values=[" "] # parse_dates=["start_date", "end_date"], ) as reader: for chunk_number, chunk in enumerate(reader): @@ -114,11 +146,17 @@ def process_source_file( ) df = pd.DataFrame() df = pd.concat([df, chunk]) - process_chunk(df, target_file_batch, target_file, (not chunk_number == 0), key_list) + process_chunk( + df, target_file_batch, target_file, (not chunk_number == 0), key_list + ) def process_chunk( - df: pd.DataFrame, target_file_batch: str, target_file: str, skip_header: bool, key_list: list + df: pd.DataFrame, + target_file_batch: str, + target_file: str, + skip_header: bool, + key_list: list, ) -> None: df = resolve_date_format(df, "%Y-%m-%d %H:%M") # df = add_key(df, key_list) @@ -173,10 +211,17 @@ def 
add_key(df: pd.DataFrame, key_list: list) -> pd.DataFrame: def concatenate_files( - target_file_path: str, dest_path: str, file_group_wildcard: str, incl_file_source_path: bool=False, separator: str=",", delete_src_file: bool=True + target_file_path: str, + dest_path: str, + file_group_wildcard: str, + incl_file_source_path: bool = False, + separator: str = ",", + delete_src_file: bool = True, ) -> str: target_file_dir = os.path.split(str(target_file_path))[0] - target_file_path = str(target_file_path).replace(".csv", "_" + file_group_wildcard + ".csv") + target_file_path = str(target_file_path).replace( + ".csv", "_" + file_group_wildcard + ".csv" + ) logging.info(f"Concatenating files {target_file_dir}/*{file_group_wildcard}") if os.path.isfile(target_file_path): os.unlink(target_file_path) @@ -193,12 +238,14 @@ def concatenate_files( for line in src_file: if incl_file_source_path: line = ( - '"' + os.path.split(src_file_path)[1].strip() + '"' + separator + line + '"' + + os.path.split(src_file_path)[1].strip() + + '"' + + separator + + line ) # include the file source else: - line = ( - line - ) + line = line target_file.write(line) if os.path.isfile(src_file_path) and delete_src_file: os.unlink(src_file_path) @@ -217,38 +264,38 @@ def concatenate_files( # return rtn_list -def resolve_date_format( - df: pd.DataFrame, from_format: str -) -> pd.DataFrame: +def resolve_date_format(df: pd.DataFrame, from_format: str) -> pd.DataFrame: logging.info("Resolving Date Format") for col in df.columns: - if df[col].dtype == 'datetime64[ns]': - logging.info(f"Resolving datetime on {col}") - df[col] = df[col].apply(lambda x: convert_dt_format(str(x), from_format)) + if df[col].dtype == "datetime64[ns]": + logging.info(f"Resolving datetime on {col}") + df[col] = df[col].apply(lambda x: convert_dt_format(str(x), from_format)) return df def convert_dt_format(dt_str: str, from_format: str) -> str: - # rtnval = "" + # rtnval = "" if not dt_str or str(dt_str).lower() == "nan" or 
str(dt_str).lower() == "nat": rtnval = "" elif len(dt_str.strip()) == 10: # if there is no time format - rtnval = dt_str + ' 00:00:00' + rtnval = dt_str + " 00:00:00" elif len(dt_str.strip().split(" ")[1]) == 8: # if format of time portion is 00:00:00 then use 00:00 format dt_str = dt_str[:-3] - rtnval = datetime.datetime.strptime(dt_str, from_format).strftime("%Y-%m-%d %H:%M:%S") + rtnval = datetime.datetime.strptime(dt_str, from_format).strftime( + "%Y-%m-%d %H:%M:%S" + ) elif (len(dt_str.strip().split("-")[0]) == 4) and ( - len(from_format.strip().split("/")[0]) == 2 - ): - # if the format of the date portion of the data is in YYYY-MM-DD format - # and from_format is in MM-DD-YYYY then resolve this by modifying the from_format - # to use the YYYY-MM-DD. This resolves mixed date formats in files - from_format = "%Y-%m-%d " + from_format.strip().split(" ")[1] + len(from_format.strip().split("/")[0]) == 2 + ): + # if the format of the date portion of the data is in YYYY-MM-DD format + # and from_format is in MM-DD-YYYY then resolve this by modifying the from_format + # to use the YYYY-MM-DD. 
This resolves mixed date formats in files + from_format = "%Y-%m-%d " + from_format.strip().split(" ")[1] else: - dt_str = "" + dt_str = "" # return datetime.datetime.strptime(dt_str, from_format).strftime("%Y-%m-%d %H:%M:%S") return rtnval @@ -298,7 +345,7 @@ def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> N else: logging.info( f"{infile} ignored as it is not compressed or is of unknown compression" - ) + ) else: logging.info(f"{infile} not unpacked because it does not exist.") diff --git a/datasets/epa/_images/run_csv_transform_kub/requirements.txt b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt similarity index 100% rename from datasets/epa/_images/run_csv_transform_kub/requirements.txt rename to datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt diff --git a/datasets/epa/_terraform/annual_summaries_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf similarity index 100% rename from datasets/epa/_terraform/annual_summaries_pipeline.tf rename to datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf diff --git a/datasets/epa/_terraform/epa_dataset.tf b/datasets/epa_historical_air_quality/_terraform/epa_dataset.tf similarity index 100% rename from datasets/epa/_terraform/epa_dataset.tf rename to datasets/epa_historical_air_quality/_terraform/epa_dataset.tf diff --git a/datasets/epa/_terraform/provider.tf b/datasets/epa_historical_air_quality/_terraform/provider.tf similarity index 100% rename from datasets/epa/_terraform/provider.tf rename to datasets/epa_historical_air_quality/_terraform/provider.tf diff --git a/datasets/epa/_terraform/variables.tf b/datasets/epa_historical_air_quality/_terraform/variables.tf similarity index 100% rename from datasets/epa/_terraform/variables.tf rename to datasets/epa_historical_air_quality/_terraform/variables.tf diff --git 
a/datasets/epa/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py similarity index 86% rename from datasets/epa/annual_summaries/annual_summaries_dag.py rename to datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 7d03baaf8..48d6de07f 100644 --- a/datasets/epa/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -14,8 +14,9 @@ from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow.providers.google.cloud.transfers import gcs_to_bigquery +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod + default_args = { "owner": "Google", @@ -25,7 +26,7 @@ with DAG( - dag_id="epa.annual_summaries", + dag_id="epa_historical_air_quality.annual_summaries", default_args=default_args, max_active_runs=1, schedule_interval="@daily", @@ -56,7 +57,7 @@ } }, image_pull_policy="Always", - image="{{ var.json.epa.container_registry.run_csv_transform_kub }}", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip", "START_YEAR": "1980", @@ -64,9 +65,9 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa/annual_summaries/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", "certification_indicator", 
"num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64", "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", 
"ninety_five_percentile": "float64", "ninety_percentile": "float64", "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "TARGET_GCS_PATH": "data/epa_historical_air_quality/annual_summaries/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure",\n "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count",\n "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count",\n "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value",\n "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime",\n "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value",\n "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile",\n "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address",\n "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str",\n "event_type": "str", "observation_count": "int32", 
"observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32",\n "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64",\n "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64",\n "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]",\n "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64",\n "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64",\n "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str",\n "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, resources={"limit_memory": "8G", "limit_cpu": "3"}, ) @@ -75,7 +76,9 @@ load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - source_objects=["data/epa/annual_summaries/data_output.csv"], + source_objects=[ + "data/epa_historical_air_quality/annual_summaries/data_output.csv" + ], source_format="CSV", destination_project_dataset_table="epa_historical_air_quality.air_quality_annual_summary", skip_leading_rows=1, diff --git a/datasets/epa/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml similarity index 84% rename from datasets/epa/annual_summaries/pipeline.yaml rename to 
datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml index a7fc2fbc2..e5d6753d6 100644 --- a/datasets/epa/annual_summaries/pipeline.yaml +++ b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml @@ -52,7 +52,7 @@ dag: values: - "pool-e2-standard-4" image_pull_policy: "Always" - image: "{{ var.json.epa.container_registry.run_csv_transform_kub }}" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip" START_YEAR: "1980" @@ -60,11 +60,31 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa/annual_summaries/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/annual_summaries/data_output.csv" DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"] + [ 
"state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", + "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", + "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", + "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", + "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", + "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", + "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", + "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", + "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"] DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64", "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", "first_max_datetime": "datetime64[ns]", 
"second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64", "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", + "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", + "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64", + "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", + "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", + "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", 
"second_max_non_overlapping_value": "float64", + "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64", + "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", + "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } resources: limit_memory: "8G" limit_cpu: "3" @@ -75,7 +95,7 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa/annual_summaries/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/annual_summaries/data_output.csv"] source_format: "CSV" # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" destination_project_dataset_table: "epa_historical_air_quality.air_quality_annual_summary" @@ -303,5 +323,6 @@ dag: "type": "date" "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
"mode": "NULLABLE" + graph_paths: - "transform_csv >> load_to_bq" diff --git a/datasets/epa/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py similarity index 89% rename from datasets/epa/co_daily_summary/co_daily_summary_dag.py rename to datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py index 17348fa28..b40cae448 100644 --- a/datasets/epa/co_daily_summary/co_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py @@ -25,7 +25,7 @@ with DAG( - dag_id="epa.co_daily_summary", + dag_id="epa_historical_air_quality.co_daily_summary", default_args=default_args, max_active_runs=1, schedule_interval="@daily", @@ -56,7 +56,7 @@ } }, image_pull_policy="Always", - image="{{ var.json.epa.container_registry.run_csv_transform_kub }}", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42101_~year~.zip", "START_YEAR": "1990", @@ -64,9 +64,9 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa/co_daily_summary/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", 
"pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, resources={"limit_memory": "8G", "limit_cpu": "3"}, ) @@ -75,7 +75,9 @@ load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - 
source_objects=["data/epa/co_daily_summary/data_output.csv"], + source_objects=[ + "data/epa_historical_air_quality/co_daily_summary/data_output.csv" + ], source_format="CSV", destination_project_dataset_table="epa_historical_air_quality.co_daily_summary", skip_leading_rows=1, diff --git a/datasets/epa/co_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml similarity index 87% rename from datasets/epa/co_daily_summary/pipeline.yaml rename to datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml index 217a17350..93f428104 100644 --- a/datasets/epa/co_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml @@ -52,7 +52,7 @@ dag: values: - "pool-e2-standard-4" image_pull_policy: "Always" - image: "{{ var.json.epa.container_registry.run_csv_transform_kub }}" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42101_~year~.zip" START_YEAR: "1990" @@ -60,11 +60,21 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa/co_daily_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/co_daily_summary/data_output.csv" DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", 
"units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } resources: limit_memory: "8G" limit_cpu: "3" @@ -75,7 +85,7 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa/co_daily_summary/data_output.csv"] + source_objects: 
["data/epa_historical_air_quality/co_daily_summary/data_output.csv"] source_format: "CSV" # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" destination_project_dataset_table: "epa_historical_air_quality.co_daily_summary" @@ -199,5 +209,6 @@ dag: "type": "TIMESTAMP" "description": "The date the last time any numeric values in this record were updated in the AQS data system." "mode": "NULLABLE" + graph_paths: - "transform_csv >> load_to_bq" diff --git a/datasets/epa/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py similarity index 89% rename from datasets/epa/co_hourly_summary/co_hourly_summary_dag.py rename to datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py index 2d9756c7d..0b38261f8 100644 --- a/datasets/epa/co_hourly_summary/co_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py @@ -25,7 +25,7 @@ with DAG( - dag_id="epa.co_hourly_summary", + dag_id="epa_historical_air_quality.co_hourly_summary", default_args=default_args, max_active_runs=1, schedule_interval="@daily", @@ -56,7 +56,7 @@ } }, image_pull_policy="Always", - image="{{ var.json.epa.container_registry.run_csv_transform_kub }}", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_~year~.zip", "START_YEAR": "1990", @@ -64,9 +64,9 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa/co_hourly_summary/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", 
"observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" }', + "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code",\n "method_name", "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str",\n "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str",\n "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" }', }, resources={"limit_memory": "8G", "limit_cpu": "3"}, ) @@ -75,7 +75,9 @@ load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( 
task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - source_objects=["data/epa/hourly_summaries/data_output.csv"], + source_objects=[ + "data/epa_historical_air_quality/co_hourly_summary/data_output.csv" + ], source_format="CSV", destination_project_dataset_table="epa_historical_air_quality.co_hourly_summary", skip_leading_rows=1, diff --git a/datasets/epa/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml similarity index 87% rename from datasets/epa/co_hourly_summary/pipeline.yaml rename to datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml index 0c93cae1e..ab74d1e8e 100644 --- a/datasets/epa/co_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml @@ -52,7 +52,7 @@ dag: values: - "pool-e2-standard-4" image_pull_policy: "Always" - image: "{{ var.json.epa.container_registry.run_csv_transform_kub }}" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_~year~.zip" START_YEAR: "1990" @@ -60,11 +60,19 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa/co_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/co_hourly_summary/data_output.csv" DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", "method_code", "method_name", "local_site_name", "address", "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", 
"datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", + "method_name", "state_name", "county_name", "date_of_last_change" ] DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "date_local": "datetime64[ns]", "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" } + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str", + "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", + "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" } resources: limit_memory: "8G" limit_cpu: "3" @@ -75,7 +83,7 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa/hourly_summaries/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/co_hourly_summary/data_output.csv"] source_format: "CSV" # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" destination_project_dataset_table: "epa_historical_air_quality.co_hourly_summary" @@ -179,5 +187,6 @@ dag: "type": "TIMESTAMP" "description": "The date the last time any
numeric values in this record were updated in the AQS data system." "mode": "NULLABLE" + graph_paths: - "transform_csv >> load_to_bq" diff --git a/datasets/epa/dataset.yaml b/datasets/epa_historical_air_quality/dataset.yaml similarity index 74% rename from datasets/epa/dataset.yaml rename to datasets/epa_historical_air_quality/dataset.yaml index 5afccd868..ab6deb945 100644 --- a/datasets/epa/dataset.yaml +++ b/datasets/epa_historical_air_quality/dataset.yaml @@ -13,9 +13,9 @@ # limitations under the License. dataset: - name: epa - friendly_name: ~ - description: ~ + name: epa_historical_air_quality + friendly_name: epa_historical_air_quality + description: "EPA Historical Air Quality Datasets" dataset_sources: ~ terms_of_use: ~ @@ -23,5 +23,5 @@ dataset: resources: - type: bigquery_dataset - dataset_id: epa - description: epa + dataset_id: epa_historical_air_quality + description: "EPA Historical Air Quality Datasets" diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py new file mode 100644 index 000000000..39f341aa8 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.hap_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="hap_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": 
"int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/hap_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.hap_daily_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", 
+ "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter.
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml new file mode 100644 index 000000000..c8a18a658 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml @@ -0,0 +1,194 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "hap_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: hap_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "hap_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/hap_daily_summary/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.hap_daily_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" From 71a9d267c703ee3cab2b5d1c3ca6c30c01082aad Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 27 Oct 2021 23:32:46 +0000 Subject: [PATCH 05/26] fix: Attempt to resolve AF load_to_bq errors --- .../_terraform/annual_summaries_pipeline.tf | 14 +++---- .../_terraform/co_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/co_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../epa_historical_air_quality_dataset.tf | 26 +++++++++++++ .../_terraform/hap_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../annual_summaries/annual_summaries_dag.py | 2 +- 6 files changed, 151 insertions(+), 8 deletions(-) create mode 100644 datasets/epa_historical_air_quality/_terraform/co_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/co_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/hap_daily_summary_pipeline.tf diff --git a/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf index 80eb2bd5b..7084028c2 100644 --- a/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf +++ b/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf @@ -15,9 +15,9 @@ */ -resource "google_bigquery_table" "epa_annual_summaries" { +resource "google_bigquery_table" "epa_historical_air_quality_annual_summaries" { project = var.project_id - dataset_id = "epa" + dataset_id = "epa_historical_air_quality" table_id = "annual_summaries" description = "epaspc" @@ -26,14 +26,14 @@ resource "google_bigquery_table" "epa_annual_summaries" { depends_on = [ - google_bigquery_dataset.epa + google_bigquery_dataset.epa_historical_air_quality ] } -output "bigquery_table-epa_annual_summaries-table_id" 
{ - value = google_bigquery_table.epa_annual_summaries.table_id +output "bigquery_table-epa_historical_air_quality_annual_summaries-table_id" { + value = google_bigquery_table.epa_historical_air_quality_annual_summaries.table_id } -output "bigquery_table-epa_annual_summaries-id" { - value = google_bigquery_table.epa_annual_summaries.id +output "bigquery_table-epa_historical_air_quality_annual_summaries-id" { + value = google_bigquery_table.epa_historical_air_quality_annual_summaries.id } diff --git a/datasets/epa_historical_air_quality/_terraform/co_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/co_daily_summary_pipeline.tf new file mode 100644 index 000000000..4b475afed --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/co_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_co_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "co_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_co_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_co_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/co_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/co_hourly_summary_pipeline.tf new file mode 100644 index 000000000..96131d79d --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/co_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_co_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "co_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_co_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_co_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf b/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf new file mode 100644 index 000000000..b1917471e --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_dataset" "epa_historical_air_quality" { + dataset_id = "epa_historical_air_quality" + project = var.project_id + description = "EPA Historical Air Quality Datasets" +} + +output "bigquery_dataset-epa_historical_air_quality-dataset_id" { + value = google_bigquery_dataset.epa_historical_air_quality.dataset_id +} diff --git a/datasets/epa_historical_air_quality/_terraform/hap_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/hap_daily_summary_pipeline.tf new file mode 100644 index 000000000..b8aac1e45 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/hap_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_hap_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "hap_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_hap_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_hap_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 48d6de07f..202df8775 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -14,8 +14,8 @@ from airflow import DAG -from airflow.providers.google.cloud.transfers import gcs_to_bigquery from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { From 5efa0b5a1a44fee243f8da1b5a9e1a177dffdb62 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 28 Oct 2021 16:39:50 +0000 Subject: [PATCH 06/26] fix: Resolves issues with AF failure to execute DAG, also, some datatype fixes. 
--- .../annual_summaries/annual_summaries_dag.py | 6 +++--- .../annual_summaries/pipeline.yaml | 4 ++-- .../co_daily_summary/co_daily_summary_dag.py | 2 +- .../co_daily_summary/pipeline.yaml | 2 +- .../co_hourly_summary/co_hourly_summary_dag.py | 2 +- .../co_hourly_summary/pipeline.yaml | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 202df8775..23ce9d9cd 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -13,9 +13,9 @@ # limitations under the License. +from airflow.providers.google.cloud.transfers import gcs_to_bigquery from airflow import DAG from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { @@ -67,7 +67,7 @@ "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/epa_historical_air_quality/annual_summaries/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure",\n "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count",\n "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count",\n "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value",\n "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime",\n "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", 
"second_max_non_overlapping_value",\n "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile",\n "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address",\n "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str",\n "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32",\n "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64",\n "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64",\n "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]",\n "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64",\n "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64",\n "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str",\n "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": 
"str", "date_of_last_change": "datetime64[ns]" }', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str",\n "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32",\n "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str",\n "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64",\n "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]",\n "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64",\n "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64",\n "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str",\n "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, resources={"limit_memory": "8G", "limit_cpu": "3"}, ) @@ -411,7 +411,7 @@ }, { "name": "date_of_last_change", - "type": "date", + "type": "TIMESTAMP", "description": "The date the last time any numeric values in this record were updated in the AQS 
data system.", "mode": "NULLABLE", }, diff --git a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml index e5d6753d6..c63a9eab0 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml +++ b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml @@ -78,7 +78,7 @@ dag: "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", - "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "float64", "secondary_exceedance_count": "float64", + "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str", "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", @@ -320,7 +320,7 @@ dag: "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." 
"mode": "NULLABLE" - "name": "date_of_last_change" - "type": "date" + "type": "TIMESTAMP" "description": "The date the last time any numeric values in this record were updated in the AQS data system." "mode": "NULLABLE" diff --git a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py index b40cae448..8b01a9dea 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py @@ -152,7 +152,7 @@ }, { "name": "date_local", - "type": "date", + "type": "TIMESTAMP", "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", "mode": "NULLABLE", }, diff --git a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml index 93f428104..51a7de6cd 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml @@ -138,7 +138,7 @@ dag: "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" "mode": "NULLABLE" - "name": "date_local" - "type": "date" + "type": "TIMESTAMP" "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." 
"mode": "NULLABLE" - "name": "units_of_measure" diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py index 0b38261f8..b14bd62b5 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py @@ -157,7 +157,7 @@ "mode": "NULLABLE", }, { - "name": "time_local", + "name": "time_gmt", "type": "STRING", "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", "mode": "NULLABLE", diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml index ab74d1e8e..46b2017b3 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml @@ -139,7 +139,7 @@ dag: "type": "TIMESTAMP" "description": "The calendar date of the sample in Greenwich Mean Time." "mode": "NULLABLE" - - "name": "time_local" + - "name": "time_gmt" "type": "STRING" "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." "mode": "NULLABLE" From 7a6440723069331caba47696680b1549e88a147c Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 28 Oct 2021 18:55:56 +0000 Subject: [PATCH 07/26] feat: Added HAP Hourly Summary. Fixed schema issues in HAP Daily Summary. 
--- .../annual_summaries/annual_summaries_dag.py | 3 +- .../hap_daily_summary_dag.py | 82 ++++-- .../hap_daily_summary/pipeline.yaml | 70 ++++-- .../hap_hourly_summary_dag.py | 235 ++++++++++++++++++ .../hap_hourly_summary/pipeline.yaml | 192 ++++++++++++++ 5 files changed, 529 insertions(+), 53 deletions(-) create mode 100644 datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 23ce9d9cd..6e825326e 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -13,10 +13,9 @@ # limitations under the License. -from airflow.providers.google.cloud.transfers import gcs_to_bigquery from airflow import DAG from airflow.providers.cncf.kubernetes.operators import kubernetes_pod - +from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { "owner": "Google", diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py index 39f341aa8..208e9fed7 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py @@ -139,68 +139,74 @@ "mode": "NULLABLE", }, { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). 
So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", "mode": "NULLABLE", }, { - "name": "time_local", + "name": "pollutant_standard", "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", "mode": "NULLABLE", }, { - "name": "date_gmt", + "name": "date_local", "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", "mode": "NULLABLE", }, { - "name": "time_local", + "name": "units_of_measure", "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", "mode": "NULLABLE", }, { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. 
Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", "mode": "NULLABLE", }, { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", "mode": "NULLABLE", }, { - "name": "mdl", + "name": "observation_percent", "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", "mode": "NULLABLE", }, { - "name": "uncertainty", + "name": "arithmetic_mean", "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "description": "The average (arithmetic mean) value for the day.", "mode": "NULLABLE", }, { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", "mode": "NULLABLE", }, { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", "mode": "NULLABLE", }, { "name": "method_code", - "type": "STRING", + "type": "INTEGER", "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", "mode": "NULLABLE", }, @@ -210,6 +216,18 @@ "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", "mode": "NULLABLE", }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, { "name": "state_name", "type": "STRING", @@ -222,6 +240,18 @@ "description": "The name of the county where the monitoring site is located.", "mode": "NULLABLE", }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, { "name": "date_of_last_change", "type": "TIMESTAMP", diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml index c8a18a658..de4725598 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml @@ -129,54 +129,66 @@ dag: "type": "STRING" "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" - "name": "date_local" "type": "TIMESTAMP" "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." 
"mode": "NULLABLE" - - "name": "time_local" + - "name": "units_of_measure" "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." "mode": "NULLABLE" - - "name": "time_local" + - "name": "event_type" "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." "mode": "NULLABLE" - - "name": "mdl" + - "name": "arithmetic_mean" "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "description": "The average (arithmetic mean) value for the day." "mode": "NULLABLE" - - "name": "uncertainty" + - "name": "first_max_value" "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "description": "The highest value for the day." "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." 
+ - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." "mode": "NULLABLE" - "name": "method_code" - "type": "STRING" + "type": "INTEGER" "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." "mode": "NULLABLE" - "name": "method_name" "type": "STRING" "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" - "name": "state_name" "type": "STRING" "description": "The name of the state where the monitoring site is located." @@ -185,6 +197,14 @@ dag: "type": "STRING" "description": "The name of the county where the monitoring site is located." "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" - "name": "date_of_last_change" "type": "TIMESTAMP" "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py new file mode 100644 index 000000000..aa0d39e79 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py @@ -0,0 +1,235 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow import DAG +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.hap_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="hap_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + 
"SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_~year~.zip", + "START_YEAR": "1993", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/hap_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.hap_hourly_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + 
"mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..033dc67b1 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml @@ -0,0 +1,192 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "hap_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: hap_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "hap_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_~year~.zip" + START_YEAR: "1993" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", 
"date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/hap_hourly_summary/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.hap_hourly_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." 
+ "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" From 27d962ea7a1deb78869fb5f9356fd7210c6aac35 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 28 Oct 2021 21:52:09 +0000 Subject: [PATCH 08/26] fix: datatype fixes --- .../annual_summaries/annual_summaries_dag.py | 1 + .../hap_hourly_summary_dag.py | 3 +- .../lead_daily_summary_dag.py | 264 ++++++++++++++++++ .../lead_daily_summary/pipeline.yaml | 214 ++++++++++++++ 4 files changed, 480 insertions(+), 2 deletions(-) create mode 100644 datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 6e825326e..a148c82e7 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -17,6 +17,7 @@ from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow.providers.google.cloud.transfers import gcs_to_bigquery + default_args = { "owner": "Google", "depends_on_past": False, diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py index aa0d39e79..118a09778 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py @@ -13,11 +13,10 @@ # limitations under the License. 
-from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow.providers.google.cloud.transfers import gcs_to_bigquery - default_args = { "owner": "Google", "depends_on_past": False, diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py new file mode 100644 index 000000000..52165e8f8 --- /dev/null +++ b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.lead_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="lead_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/lead_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", 
"parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/lead_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.lead_daily_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter 
at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml new file mode 100644 index 000000000..647a2bdb4 --- /dev/null +++ b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml @@ -0,0 +1,214 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "lead_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: lead_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "lead_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/lead_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/lead_daily_summary/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.lead_daily_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" From 38d4e6a05f27271d58530825653dfc323a4754e2 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 28 Oct 2021 22:27:02 +0000 Subject: [PATCH 09/26] feat: Added no2 daily and hourly. 
--- .../annual_summaries/annual_summaries_dag.py | 1 - .../no2_daily_summary_dag.py | 264 ++++++++++++++++++ .../no2_daily_summary/pipeline.yaml | 214 ++++++++++++++ .../no2_hourly_summary_dag.py | 234 ++++++++++++++++ .../no2_hourly_summary/pipeline.yaml | 192 +++++++++++++ 5 files changed, 904 insertions(+), 1 deletion(-) create mode 100644 datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index a148c82e7..6e825326e 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -17,7 +17,6 @@ from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow.providers.google.cloud.transfers import gcs_to_bigquery - default_args = { "owner": "Google", "depends_on_past": False, diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py new file mode 100644 index 000000000..e9f0e1079 --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.no2_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="no2_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42602_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n 
"pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/no2_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.no2_daily_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within 
the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. 
(See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml new file mode 100644 index 000000000..793207f79 --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml @@ -0,0 +1,214 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "no2_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: no2_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "no2_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42602_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/no2_daily_summary/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.no2_daily_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py new file mode 100644 index 000000000..c430330f5 --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.no2_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="no2_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/no2_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.no2_hourly_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + 
"type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..d417e76e3 --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml @@ -0,0 +1,192 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "no2_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: no2_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "no2_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", 
"date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/no2_hourly_summary/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.no2_hourly_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." 
+ "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" From ac99d563925cbef993f2a898069aed26ca85745b Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 28 Oct 2021 23:48:16 +0000 Subject: [PATCH 10/26] feat: Added NONOxNOy pipelines daily and hourly --- .../nonoxnoy_daily_summary_dag.py | 264 ++++++++++++++++++ .../nonoxnoy_daily_summary/pipeline.yaml | 214 ++++++++++++++ .../nonoxnoy_hourly_summary_dag.py | 234 ++++++++++++++++ .../nonoxnoy_hourly_summary/pipeline.yaml | 192 +++++++++++++ 4 files changed, 904 insertions(+) create mode 100644 datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py new file mode 100644 index 000000000..a2f120fc9 --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.nonoxnoy_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod (TARGET_GCS_PATH is expected to match the object listed in load_to_bq's source_objects) + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="nonoxnoy_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", 
"date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/nonoxnoy_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.nonoxnoy_daily_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": 
"This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml new file mode 100644 index 000000000..e5dfcfd38 --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml @@ -0,0 +1,214 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "nonoxnoy_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: nonoxnoy_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "nonoxnoy_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_nonoxnoy_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": 
"str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/nonoxnoy_daily_summary/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.nonoxnoy_daily_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py new file mode 100644 index 000000000..60fd572d6 --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.nonoxnoy_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="no2_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="epa_historical_air_quality.nonoxnoy_hourly_summary", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + 
"name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..9b5d0a68c --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml @@ -0,0 +1,192 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "nonoxnoy_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: nonoxnoy_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "no2_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": 
"str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv"] + source_format: "CSV" + # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "epa_historical_air_quality.nonoxnoy_hourly_summary" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." 
+ "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" From f4c09c8ee72b53ec1f54cacc3a2e389f5ff61a35 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Fri, 29 Oct 2021 02:57:13 +0000 Subject: [PATCH 11/26] feat: Added multiple pipelines and assigned destination table to a variable value for each. --- .../annual_summaries/annual_summaries_dag.py | 5 +- .../annual_summaries/pipeline.yaml | 3 +- .../co_daily_summary/co_daily_summary_dag.py | 2 +- .../co_daily_summary/pipeline.yaml | 3 +- .../co_hourly_summary_dag.py | 2 +- .../co_hourly_summary/pipeline.yaml | 3 +- .../hap_daily_summary_dag.py | 2 +- .../hap_daily_summary/pipeline.yaml | 3 +- .../hap_hourly_summary_dag.py | 2 +- .../hap_hourly_summary/pipeline.yaml | 3 +- .../lead_daily_summary_dag.py | 2 +- .../lead_daily_summary/pipeline.yaml | 3 +- .../no2_daily_summary_dag.py | 2 +- .../no2_daily_summary/pipeline.yaml | 3 +- .../no2_hourly_summary_dag.py | 2 +- .../no2_hourly_summary/pipeline.yaml | 3 +- .../nonoxnoy_daily_summary_dag.py | 2 +- .../nonoxnoy_daily_summary/pipeline.yaml | 3 +- .../nonoxnoy_hourly_summary_dag.py | 2 +- .../nonoxnoy_hourly_summary/pipeline.yaml | 3 +- .../ozone_daily_summary_dag.py | 264 ++++++++++++++++++ .../ozone_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../ozone_hourly_summary_dag.py | 234 ++++++++++++++++ .../ozone_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../pm10_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../pm10_daily_summary_dag.py | 264 ++++++++++++++++++ .../pm10_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../pm10_hourly_summary_dag.py | 234 ++++++++++++++++ .../pm25_frm_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../pm25_frm_hourly_summary_dag.py | 234 ++++++++++++++++ .../pm25_nonfrm_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../pm25_nonfrm_daily_summary_dag.py | 264 ++++++++++++++++++ .../pm25_nonfrm_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../pm25_nonfrm_hourly_summary_dag.py | 234 ++++++++++++++++ 
.../pipeline.yaml | 213 ++++++++++++++ .../pipeline.yaml | 191 +++++++++++++ .../pressure_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../pressure_daily_summary_dag.py | 264 ++++++++++++++++++ .../pressure_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../pressure_hourly_summary_dag.py | 234 ++++++++++++++++ .../rh_and_dp_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../rh_and_dp_daily_summary_dag.py | 264 ++++++++++++++++++ .../rh_and_dp_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../rh_and_dp_hourly_summary_dag.py | 234 ++++++++++++++++ .../so2_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../so2_daily_summary_dag.py | 264 ++++++++++++++++++ .../so2_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../so2_hourly_summary_dag.py | 234 ++++++++++++++++ .../temperature_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../temperature_daily_summary_dag.py | 264 ++++++++++++++++++ .../temperature_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../temperature_hourly_summary_dag.py | 234 ++++++++++++++++ .../voc_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../voc_daily_summary_dag.py | 264 ++++++++++++++++++ .../voc_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../voc_hourly_summary_dag.py | 116 ++++++++ .../wind_daily_summary/pipeline.yaml | 213 ++++++++++++++ .../wind_daily_summary_dag.py | 264 ++++++++++++++++++ .../wind_hourly_summary/pipeline.yaml | 191 +++++++++++++ .../wind_hourly_summary_dag.py | 234 ++++++++++++++++ 60 files changed, 8851 insertions(+), 31 deletions(-) create mode 100644 datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml create mode 100644 
datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py 
create mode 100644 datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 6e825326e..ae99b87ef 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -13,10 +13,11 @@ # limitations under the License. 
-from airflow import DAG from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow import DAG from airflow.providers.google.cloud.transfers import gcs_to_bigquery + default_args = { "owner": "Google", "depends_on_past": False, @@ -79,7 +80,7 @@ "data/epa_historical_air_quality/annual_summaries/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.air_quality_annual_summary", + destination_project_dataset_table="{{ var.value.container_registry.annual_summaries_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml index c63a9eab0..f35bd98f5 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml +++ b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml @@ -97,8 +97,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/annual_summaries/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.air_quality_annual_summary" + destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py index 8b01a9dea..c57a281f3 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py @@ -79,7 +79,7 @@ 
"data/epa_historical_air_quality/co_daily_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.co_daily_summary", + destination_project_dataset_table="{{ var.value.container_registry.co_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml index 51a7de6cd..290923a5b 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml @@ -87,8 +87,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/co_daily_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.co_daily_summary" + destination_project_dataset_table: "{{ var.value.container_registry.co_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py index b14bd62b5..e3a80cbe3 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/hourly_summaries/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.co_hourly_summary", + destination_project_dataset_table="{{ var.value.container_registry.co_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, 
write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml index 46b2017b3..b8ffb5731 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml @@ -85,8 +85,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/hourly_summaries/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.co_hourly_summary" + destination_project_dataset_table: "{{ var.value.container_registry.co_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py index 208e9fed7..394632df5 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/hap_daily_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.hap_daily_summary", + destination_project_dataset_table="{{ var.value.container_registry.hap_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml index de4725598..02722baf4 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml +++ 
b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml @@ -87,8 +87,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/hap_daily_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.hap_daily_summary" + destination_project_dataset_table: "{{ var.value.container_registry.hap_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py index 118a09778..dc0718fe4 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/hap_hourly_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.hap_hourly_summary", + destination_project_dataset_table="{{ var.value.container_registry.hap_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml index 033dc67b1..3d8e7c0d2 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml @@ -85,8 +85,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/hap_hourly_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ 
var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.hap_hourly_summary" + destination_project_dataset_table: "{{ var.value.container_registry.hap_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py index 52165e8f8..51f28113a 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/lead_daily_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.lead_daily_summary", + destination_project_dataset_table="{{ var.value.container_registry.lead_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml index 647a2bdb4..1b0633c4f 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml @@ -87,8 +87,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/lead_daily_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.lead_daily_summary" + destination_project_dataset_table: "{{ var.value.container_registry.lead_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True 
write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py index e9f0e1079..7362f3cbb 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/no2_daily_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.no2_daily_summary", + destination_project_dataset_table="{{ var.value.container_registry.no2_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml index 793207f79..f11d9e536 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml @@ -87,8 +87,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/no2_daily_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.no2_daily_summary" + destination_project_dataset_table: "{{ var.value.container_registry.no2_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py index c430330f5..352c8b5fd 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py +++ 
b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/no2_hourly_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.no2_hourly_summary", + destination_project_dataset_table="{{ var.value.container_registry.no2_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml index d417e76e3..decb1a139 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml @@ -85,8 +85,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/no2_hourly_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.no2_hourly_summary" + destination_project_dataset_table: "{{ var.value.container_registry.no2_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py index a2f120fc9..7473ced67 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/nonoxnoy_daily_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.nonoxnoy_daily_summary", + 
destination_project_dataset_table="{{ var.value.container_registry.nonoxnoy_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml index e5dfcfd38..c22f73d8d 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml @@ -87,8 +87,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/nonoxnoy_daily_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.nonoxnoy_daily_summary" + destination_project_dataset_table: "{{ var.value.container_registry.nonoxnoy_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py index 60fd572d6..4a0b48151 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py @@ -79,7 +79,7 @@ "data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="epa_historical_air_quality.nonoxnoy_hourly_summary", + destination_project_dataset_table="{{ var.value.container_registry.nonoxnoy_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git 
a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml index 9b5d0a68c..5a5b02582 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml @@ -85,8 +85,7 @@ dag: bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv"] source_format: "CSV" - # destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" - destination_project_dataset_table: "epa_historical_air_quality.nonoxnoy_hourly_summary" + destination_project_dataset_table: "{{ var.value.container_registry.nonoxnoy_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py new file mode 100644 index 000000000..a019de464 --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.ozone_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="ozone_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_44201_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_ozone_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", 
"parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/ozone_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.ozone_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that 
measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml new file mode 100644 index 000000000..262862ad8 --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "ozone_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: ozone_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "ozone_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_44201_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_daily_summary/data_output.csv" # must be the same object that load_to_bq lists in source_objects + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/ozone_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.ozone_daily_summary_destination_table }}" # NOTE(review): assumes the table key lives in the same JSON variable as the image above - confirm + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py new file mode 100644 index 000000000..36180a704 --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.ozone_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="ozone_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.ozone_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": 
"NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..a7800b4ef --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "ozone_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: ozone_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "ozone_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", +
"mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.ozone_hourly_summary_destination_table }}" # NOTE(review): assumes the table key lives in the same JSON variable as the image above - confirm + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml new file mode 100644 index 000000000..7f26fa8c6 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "pm10_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm10_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm10_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_81102_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + 
TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_daily_summary/data_output.csv" # must be the same object that load_to_bq lists in source_objects + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm10_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm10_daily_summary_destination_table }}" # NOTE(review): assumes the table key lives in the same JSON variable as the image above - confirm + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: 
+ - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). 
For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." 
+ "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." + "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py new file mode 100644 index 000000000..5dc301e60 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm10_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm10_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_81102_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", 
"parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm10_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm10_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that 
measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..b5311d5bb --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm10_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm10_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm10_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_81102_~year~.zip" # same AQS parameter code (81102) as the pm10_daily_summary pipeline + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": 
"float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm10_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py new file mode 100644 index 000000000..9ac41b4c6 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm10_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm10_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_81102_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm10_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": 
"NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..512560468 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_frm_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_frm_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_frm_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure":
"str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_frm_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py new file mode 100644 index 000000000..8b4e9ced0 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_frm_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_frm_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str",
"parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_frm_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in 
decimal degrees.", +
"mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml new file mode 100644 index 000000000..bf96e06bc --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_nonfrm_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_nonfrm_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_nonfrm_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_88502_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", +
"pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site."
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py new file mode 100644 index 000000000..e97a42370 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_nonfrm_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_nonfrm_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_88502_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num":
"str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used 
to distinguish
different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..1a44678f5 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+---
+resources:
+
+  - type: bigquery_table
+    table_id: "pm25_nonfrm_hourly_summary"
+    description: "epaspc"
+
+dag:
+  airflow_version: 2
+  initialize:
+    dag_id: pm25_nonfrm_hourly_summary
+    default_args:
+      owner: "Google"
+      depends_on_past: False
+      start_date: '2021-03-01'
+    max_active_runs: 1
+    schedule_interval: "@daily"
+    catchup: False
+    default_view: graph
+
+  tasks:
+
+    - operator: "KubernetesPodOperator"
+      description: "Run CSV transform within kubernetes pod"
+
+      args:
+
+        task_id: "transform_csv"
+        name: "pm25_nonfrm_hourly_summary"
+        namespace: "default"
+        affinity:
+          nodeAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+                - matchExpressions:
+                    - key: cloud.google.com/gke-nodepool
+                      operator: In
+                      values:
+                        - "pool-e2-standard-4"
+        image_pull_policy: "Always"
+        image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}"
+        env_vars:
+          SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_~year~.zip"
+          START_YEAR: "1990"
+          SOURCE_FILE: "files/data.csv"
+          TARGET_FILE: "files/data_output.csv"
+          CHUNKSIZE: "2500000"
+          TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}"
+          TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv"  # must equal load_to_bq source_objects
+          DATA_NAMES: >-
+            [ "state_code", "county_code", "site_num", "parameter_code", "poc",
+              "latitude", "longitude", "datum", "parameter_name", "date_local",
+              "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",
+              "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",
+              "state_name", "county_name", "date_of_last_change" ]
+          DATA_DTYPES: >-
+            { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",
+              "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",
+              "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",
+              "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",
+              "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }
+        resources:
+          limit_memory: "8G"
+          limit_cpu: "3"
+
+    - operator: "GoogleCloudStorageToBigQueryOperator"
+      description: "Task to load CSV data to a BigQuery table"
+
+      args:
+        task_id: "load_to_bq"
+        bucket: "{{ var.value.composer_bucket }}"
+        source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv"]
+        source_format: "CSV"
+        destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_hourly_summary_destination_table }}"  # same JSON variable as the image above
+        skip_leading_rows: 1
+        allow_quoted_newlines: True
+        write_disposition: "WRITE_TRUNCATE"
+        schema_fields:
+          - "name": "state_code"
+            "type": "STRING"
+            "description": "The FIPS code of the state in which the monitor resides."
+            "mode": "NULLABLE"
+          - "name": "county_code"
+            "type": "STRING"
+            "description": "The FIPS code of the county in which the monitor resides."
+            "mode": "NULLABLE"
+          - "name": "site_num"
+            "type": "STRING"
+            "description": "A unique number within the county identifying the site."
+            "mode": "NULLABLE"
+          - "name": "parameter_code"
+            "type": "INTEGER"
+            "description": "The AQS code corresponding to the parameter measured by the monitor."
+            "mode": "NULLABLE"
+          - "name": "poc"
+            "type": "INTEGER"
+            "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site."
+            "mode": "NULLABLE"
+          - "name": "latitude"
+            "type": "FLOAT"
+            "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees."
+            "mode": "NULLABLE"
+          - "name": "longitude"
+            "type": "FLOAT"
+            "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
+            "mode": "NULLABLE"
+          - "name": "datum"
+            "type": "STRING"
+            "description": "The Datum associated with the Latitude and Longitude measures."
+            "mode": "NULLABLE"
+          - "name": "parameter_name"
+            "type": "STRING"
+            "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants."
+            "mode": "NULLABLE"
+          - "name": "date_local"
+            "type": "TIMESTAMP"
+            "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor."
+            "mode": "NULLABLE"
+          - "name": "time_local"
+            "type": "STRING"
+            "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time."
+            "mode": "NULLABLE"
+          - "name": "date_gmt"
+            "type": "TIMESTAMP"
+            "description": "The calendar date of the sample in Greenwich Mean Time."
+            "mode": "NULLABLE"
+          - "name": "time_gmt"
+            "type": "STRING"
+            "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time."
+            "mode": "NULLABLE"
+          - "name": "sample_measurement"
+            "type": "FLOAT"
+            "description": "The measured value in the standard units of measure for the parameter."
+            "mode": "NULLABLE"
+          - "name": "units_of_measure"
+            "type": "STRING"
+            "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations."
+            "mode": "NULLABLE"
+          - "name": "mdl"
+            "type": "FLOAT"
+            "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL."
+            "mode": "NULLABLE"
+          - "name": "uncertainty"
+            "type": "FLOAT"
+            "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency."
+            "mode": "NULLABLE"
+          - "name": "qualifier"
+            "type": "STRING"
+            "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field."
+            "mode": "NULLABLE"
+          - "name": "method_type"
+            "type": "STRING"
+            "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)."
+            "mode": "NULLABLE"
+          - "name": "method_code"
+            "type": "STRING"
+            "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column."
+            "mode": "NULLABLE"
+          - "name": "method_name"
+            "type": "STRING"
+            "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample."
+            "mode": "NULLABLE"
+          - "name": "state_name"
+            "type": "STRING"
+            "description": "The name of the state where the monitoring site is located."
+            "mode": "NULLABLE"
+          - "name": "county_name"
+            "type": "STRING"
+            "description": "The name of the county where the monitoring site is located."
+            "mode": "NULLABLE"
+          - "name": "date_of_last_change"
+            "type": "TIMESTAMP"
+            "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+            "mode": "NULLABLE"
+
+  graph_paths:
+    - "transform_csv >> load_to_bq"
diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py
new file mode 100644
index 000000000..695bc2c1e
--- /dev/null
+++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py
@@ -0,0 +1,234 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from airflow import DAG
+from airflow.providers.cncf.kubernetes.operators import kubernetes_pod
+from airflow.providers.google.cloud.transfers import gcs_to_bigquery
+
+default_args = {
+    "owner": "Google",
+    "depends_on_past": False,
+    "start_date": "2021-03-01",
+}
+
+
+with DAG(
+    dag_id="epa_historical_air_quality.pm25_nonfrm_hourly_summary",
+    default_args=default_args,
+    max_active_runs=1,
+    schedule_interval="@daily",
+    catchup=False,
+    default_view="graph",
+) as dag:
+
+    # Run CSV transform within kubernetes pod. TARGET_GCS_PATH must equal the object listed in load_to_bq's source_objects.
+    transform_csv = kubernetes_pod.KubernetesPodOperator(
+        task_id="transform_csv",
+        name="pm25_nonfrm_hourly_summary",
+        namespace="default",
+        affinity={
+            "nodeAffinity": {
+                "requiredDuringSchedulingIgnoredDuringExecution": {
+                    "nodeSelectorTerms": [
+                        {
+                            "matchExpressions": [
+                                {
+                                    "key": "cloud.google.com/gke-nodepool",
+                                    "operator": "In",
+                                    "values": ["pool-e2-standard-4"],
+                                }
+                            ]
+                        }
+                    ]
+                }
+            }
+        },
+        image_pull_policy="Always",
+        image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}",
+        env_vars={
+            "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_~year~.zip",
+            "START_YEAR": "1990",
+            "SOURCE_FILE": "files/data.csv",
+            "TARGET_FILE": "files/data_output.csv",
+            "CHUNKSIZE": "2500000",
+            "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}",
+            "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv",
+            "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]',
+            "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }',
+        },
+        resources={"limit_memory": "8G", "limit_cpu": "3"},
+    )
+
+    # Task to load CSV data to a BigQuery table. The destination is read from the dataset's JSON Airflow variable (var.json), like the image above.
+    load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator(
+        task_id="load_to_bq",
+        bucket="{{ var.value.composer_bucket }}",
+        source_objects=[
+            "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv"
+        ],
+        source_format="CSV",
+        destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_hourly_summary_destination_table }}",
+        skip_leading_rows=1,
+        allow_quoted_newlines=True,
+        write_disposition="WRITE_TRUNCATE",
+        schema_fields=[
+            {
+                "name": "state_code",
+                "type": "STRING",
+                "description": "The FIPS code of the state in which the monitor resides.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "county_code",
+                "type": "STRING",
+                "description": "The FIPS code of the county in which the monitor resides.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "site_num",
+                "type": "STRING",
+                "description": "A unique number within the county identifying the site.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "parameter_code",
+                "type": "INTEGER",
+                "description": "The AQS code corresponding to the parameter measured by the monitor.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "poc",
+                "type": "INTEGER",
+                "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "latitude",
+                "type": "FLOAT",
+                "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "longitude",
+                "type": "FLOAT",
+                "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "datum",
+                "type": "STRING",
+                "description": "The Datum associated with the Latitude and Longitude measures.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "parameter_name",
+                "type": "STRING",
+                "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "date_local",
+                "type": "TIMESTAMP",
+                "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "time_local",
+                "type": "STRING",
+                "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "date_gmt",
+                "type": "TIMESTAMP",
+                "description": "The calendar date of the sample in Greenwich Mean Time.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "time_gmt",
+                "type": "STRING",
+                "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "sample_measurement",
+                "type": "FLOAT",
+                "description": "The measured value in the standard units of measure for the parameter.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "units_of_measure",
+                "type": "STRING",
+                "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "mdl",
+                "type": "FLOAT",
+                "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "uncertainty",
+                "type": "FLOAT",
+                "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "qualifier",
+                "type": "STRING",
+                "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "method_type",
+                "type": "STRING",
+                "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "method_code",
+                "type": "STRING",
+                "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "method_name",
+                "type": "STRING",
+                "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "state_name",
+                "type": "STRING",
+                "description": "The name of the state where the monitoring site is located.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "county_name",
+                "type": "STRING",
+                "description": "The name of the county where the monitoring site is located.",
+                "mode": "NULLABLE",
+            },
+            {
+                "name": "date_of_last_change",
+                "type": "TIMESTAMP",
+                "description": "The date the last time any numeric values in this record were updated in the AQS data system.",
+                "mode": "NULLABLE",
+            },
+        ],
+    )
+
+    transform_csv >> load_to_bq
diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml
new file mode 100644
index 000000000..66b34521d
--- /dev/null
+++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml
@@ -0,0 +1,213 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_speciation_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_speciation_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_speciation_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_pm25_speciation_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + 
"pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_speciation_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.value.container_registry.pm25_speciation_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..e1bbb32ce --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "pm25_speciation_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_speciation_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_speciation_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: 
"https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_speciation_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.value.container_registry.pm25_speciation_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in 
which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." 
+ "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml new file mode 100644 index 000000000..f8a445b5f --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pressure_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pressure_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pressure_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard":
"str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pressure_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pressure_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py new file mode 100644 index 000000000..87621221b --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pressure_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pressure_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str",
"parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pressure_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pressure_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different
instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..d3a6efa24 --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "pressure_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pressure_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pressure_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": 
"str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.value.container_registry.pressure_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py new file mode 100644 index 000000000..83fd77698 --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pressure_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pressure_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.pressure_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + 
"mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml new file mode 100644 index 000000000..a98d33c80 --- /dev/null +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "rh_and_dp_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: rh_and_dp_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "rh_and_dp_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard":
"str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/rh_and_dp_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.value.container_registry.rh_and_dp_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py new file mode 100644 index 000000000..6ff528bab --- /dev/null +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.rh_and_dp_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="rh_and_dp_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num":
"str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/rh_and_dp_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.rh_and_dp_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different 
instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..d2018e7da --- /dev/null +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "rh_and_dp_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: rh_and_dp_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "rh_and_dp_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", 
"units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py new file mode 100644 index 000000000..f74296ada --- /dev/null +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.rh_and_dp_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="rh_and_dp_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": 
"str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal 
degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml new file mode 100644 index 000000000..c437dd8bf --- /dev/null +++ b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "so2_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: so2_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "so2_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42401_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/so2_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.so2_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py new file mode 100644 index 000000000..f95c6c4e0 --- /dev/null +++ b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.so2_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="so2_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42401_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/so2_daily_summary/data_output.csv" + ], +
source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.so2_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor.
Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. 
Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..203b73405 --- /dev/null +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "so2_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: so2_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "so2_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl":
"float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/so2_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.so2_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py new file mode 100644 index 000000000..3fe752a08 --- /dev/null +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.so2_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="so2_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/so2_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.so2_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + {
+ "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml new file mode 100644 index 000000000..76b38a62e --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "temperature_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: temperature_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "temperature_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value",
"first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/temperature_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.temperature_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site."
+ "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." 
+ "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." + "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py new file mode 100644 index 000000000..62065844b --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.temperature_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="temperature_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_temperature_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": 
"str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/temperature_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.temperature_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish 
different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..a7cc091d0 --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "temperature_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: temperature_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "temperature_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64",
"units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.temperature_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency."
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py new file mode 100644 index 000000000..e3210f09c --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.temperature_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="temperature_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", 
"datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.temperature_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in 
decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml new file mode 100644 index 000000000..b863323bc --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "voc_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: voc_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "voc_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local":
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/voc_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.value.container_registry.voc_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py new file mode 100644 index 000000000..1c7426e17 --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.voc_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="voc_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_daily_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/voc_daily_summary/data_output.csv" + ], +
source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.voc_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. 
Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. 
Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..c668ba288 --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "voc_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: voc_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "voc_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": 
"float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/voc_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.value.container_registry.voc_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py new file mode 100644 index 000000000..a6f041cbd --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py @@ -0,0 +1,116 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.voc_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="voc_hourly_summary", + namespace="default", + affinity={'nodeAffinity': {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [{'matchExpressions': [{'key': 'cloud.google.com/gke-nodepool', 'operator': 'In', 'values': ['pool-e2-standard-4']}]}]}}}, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={'SOURCE_URL': 'https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip', 'START_YEAR': '1990', 'SOURCE_FILE': 'files/data.csv', 'TARGET_FILE': 'files/data_output.csv', 'CHUNKSIZE': '2500000', 'TARGET_GCS_BUCKET': '{{ var.value.composer_bucket }}', 'TARGET_GCS_PATH': 'data/epa_historical_air_quality/voc_hourly_summary/data_output.csv', 'DATA_NAMES': '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', 'DATA_DTYPES': '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }'}, + resources={'limit_memory': '8G', 'limit_cpu': '3'}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=['data/epa_historical_air_quality/voc_hourly_summary/data_output.csv'], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.voc_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[{'name': 'state_code', 'type': 
'STRING', 'description': 'The FIPS code of the state in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'county_code', 'type': 'STRING', 'description': 'The FIPS code of the county in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'site_num', 'type': 'STRING', 'description': 'A unique number within the county identifying the site.', 'mode': 'NULLABLE'}, {'name': 'parameter_code', 'type': 'INTEGER', 'description': 'The AQS code corresponding to the parameter measured by the monitor.', 'mode': 'NULLABLE'}, {'name': 'poc', 'type': 'INTEGER', 'description': 'This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.', 'mode': 'NULLABLE'}, {'name': 'latitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance north of the equator measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'longitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'datum', 'type': 'STRING', 'description': 'The Datum associated with the Latitude and Longitude measures.', 'mode': 'NULLABLE'}, {'name': 'parameter_name', 'type': 'STRING', 'description': 'The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.', 'mode': 'NULLABLE'}, {'name': 'date_local', 'type': 'TIMESTAMP', 'description': 'The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.', 'mode': 'NULLABLE'}, {'name': 'time_local', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Local Standard Time.', 'mode': 'NULLABLE'}, {'name': 'date_gmt', 'type': 'TIMESTAMP', 'description': 'The calendar date of the sample in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'time_gmt', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'sample_measurement', 'type': 'FLOAT', 'description': 'The measured value in the standard units of measure for the parameter.', 'mode': 'NULLABLE'}, {'name': 'units_of_measure', 'type': 'STRING', 'description': 'The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.', 'mode': 'NULLABLE'}, {'name': 'mdl', 'type': 'FLOAT', 'description': 'The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.', 'mode': 'NULLABLE'}, {'name': 'uncertainty', 'type': 'FLOAT', 'description': 'The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.', 'mode': 'NULLABLE'}, {'name': 'qualifier', 'type': 'STRING', 'description': 'Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.', 'mode': 'NULLABLE'}, {'name': 'method_type', 'type': 'STRING', 'description': 'An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).', 'mode': 'NULLABLE'}, {'name': 'method_code', 'type': 'STRING', 'description': 'An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.', 'mode': 'NULLABLE'}, {'name': 'method_name', 'type': 'STRING', 'description': 'A short description of the processes, equipment, and protocols used in gathering and measuring the sample.', 'mode': 'NULLABLE'}, {'name': 'state_name', 'type': 'STRING', 'description': 'The name of the state where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'county_name', 'type': 'STRING', 'description': 'The name of the county where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'date_of_last_change', 'type': 'TIMESTAMP', 'description': 'The date the last time any numeric values in this record were updated in the AQS data system.', 'mode': 'NULLABLE'}], + ) + + transform_csv >> load_to_bq + ample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml new file mode 100644 index 000000000..d2ab4d4f7 --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "wind_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: wind_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "wind_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_daily_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", +
"method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/wind_daily_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.value.container_registry.wind_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." 
+ "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." 
+ "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." + "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py new file mode 100644 index 000000000..432ad5d9d --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.wind_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="wind_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_wind_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", 
"parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/wind_daily_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.wind_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that 
measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..d8bec20d7 --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "wind_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: wind_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "wind_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_~year~.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_hourly_summary/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl":
"float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/wind_hourly_summary/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.wind_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py new file mode 100644 index 000000000..940de2efd --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.wind_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="wind_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/wind_hourly_summary/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.value.container_registry.wind_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": 
"NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq From 9625f2d83d692280f6c3735b686fba40a91638a4 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Fri, 29 Oct 2021 15:45:41 +0000 Subject: [PATCH 12/26] fix: Added terraform files for new pipelines. 
--- .../_terraform/hap_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/lead_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/no2_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/no2_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../nonoxnoy_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../nonoxnoy_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../ozone_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../ozone_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/pm10_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../pm10_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../pm25_frm_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../pm25_nonfrm_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../pm25_nonfrm_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../pm25_speciation_daily_summary_pipeline.tf | 39 +++++++++++++++++++ ...pm25_speciation_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../pressure_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../pressure_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../rh_and_dp_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../rh_and_dp_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/so2_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/so2_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../temperature_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../temperature_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/voc_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/voc_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ .../_terraform/wind_daily_summary_pipeline.tf | 39 +++++++++++++++++++ .../wind_hourly_summary_pipeline.tf | 39 +++++++++++++++++++ 27 files changed, 1053 insertions(+) create mode 100644 datasets/epa_historical_air_quality/_terraform/hap_hourly_summary_pipeline.tf create mode 100644 
datasets/epa_historical_air_quality/_terraform/lead_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/no2_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/no2_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/nonoxnoy_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/nonoxnoy_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/ozone_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/ozone_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pm10_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pm10_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pm25_frm_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pm25_speciation_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pm25_speciation_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pressure_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/pressure_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/rh_and_dp_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/rh_and_dp_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/so2_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/so2_hourly_summary_pipeline.tf 
create mode 100644 datasets/epa_historical_air_quality/_terraform/temperature_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/temperature_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/voc_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/voc_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/wind_daily_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/_terraform/wind_hourly_summary_pipeline.tf diff --git a/datasets/epa_historical_air_quality/_terraform/hap_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/hap_hourly_summary_pipeline.tf new file mode 100644 index 000000000..dd7896ae1 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/hap_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_hap_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "hap_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/lead_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/lead_daily_summary_pipeline.tf new file mode 100644 index 000000000..d7fd58f20 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/lead_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_lead_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "lead_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_lead_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_lead_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/no2_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/no2_daily_summary_pipeline.tf new file mode 100644 index 000000000..448a029f8 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/no2_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_no2_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "no2_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_no2_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_no2_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/no2_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/no2_hourly_summary_pipeline.tf new file mode 100644 index 000000000..2d057b403 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/no2_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_no2_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "no2_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/nonoxnoy_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_daily_summary_pipeline.tf new file mode 100644 index 000000000..92f5294c7 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "nonoxnoy_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/nonoxnoy_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_hourly_summary_pipeline.tf new file mode 100644 index 000000000..4b57e8fba --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "nonoxnoy_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/ozone_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/ozone_daily_summary_pipeline.tf new file mode 100644 index 000000000..19cff7cc2 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/ozone_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_ozone_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "ozone_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/ozone_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/ozone_hourly_summary_pipeline.tf new file mode 100644 index 000000000..517e8127c --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/ozone_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_ozone_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "ozone_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm10_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm10_daily_summary_pipeline.tf new file mode 100644 index 000000000..af38e7681 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm10_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pm10_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm10_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm10_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm10_hourly_summary_pipeline.tf new file mode 100644 index 000000000..d83d38c9e --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm10_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pm10_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm10_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_frm_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_frm_hourly_summary_pipeline.tf new file mode 100644 index 000000000..3d64246b4 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_frm_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_frm_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_frm_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_daily_summary_pipeline.tf new file mode 100644 index 000000000..5faf05f88 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_nonfrm_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_hourly_summary_pipeline.tf new file mode 100644 index 000000000..8cb22a6ac --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_nonfrm_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_speciation_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_daily_summary_pipeline.tf new file mode 100644 index 000000000..c4ce35a13 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_speciation_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_speciation_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_hourly_summary_pipeline.tf new file mode 100644 index 000000000..aa0da3bf7 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_speciation_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pressure_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pressure_daily_summary_pipeline.tf new file mode 100644 index 000000000..f67bfa0eb --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pressure_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pressure_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pressure_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pressure_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pressure_hourly_summary_pipeline.tf new file mode 100644 index 000000000..23fa46310 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pressure_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_pressure_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pressure_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/rh_and_dp_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_daily_summary_pipeline.tf new file mode 100644 index 000000000..7bd465c09 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "rh_and_dp_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/rh_and_dp_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_hourly_summary_pipeline.tf new file mode 100644 index 000000000..f259b3cba --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "rh_and_dp_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/so2_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/so2_daily_summary_pipeline.tf new file mode 100644 index 000000000..c2e5bfa02 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/so2_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_so2_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "so2_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_so2_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_so2_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/so2_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/so2_hourly_summary_pipeline.tf new file mode 100644 index 000000000..5a74e4d45 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/so2_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_so2_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "so2_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_so2_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_so2_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/temperature_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/temperature_daily_summary_pipeline.tf new file mode 100644 index 000000000..98865c34e --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/temperature_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_temperature_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "temperature_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/temperature_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/temperature_hourly_summary_pipeline.tf new file mode 100644 index 000000000..09bd21923 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/temperature_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_temperature_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "temperature_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/voc_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/voc_daily_summary_pipeline.tf new file mode 100644 index 000000000..7348fa307 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/voc_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_voc_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "voc_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_voc_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_voc_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/voc_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/voc_hourly_summary_pipeline.tf new file mode 100644 index 000000000..7a337682d --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/voc_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_voc_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "voc_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/wind_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/wind_daily_summary_pipeline.tf new file mode 100644 index 000000000..90d444049 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/wind_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_wind_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "wind_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_wind_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_wind_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/wind_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/wind_hourly_summary_pipeline.tf new file mode 100644 index 000000000..257bce937 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/wind_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_wind_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "wind_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.id +} From c2fb0257d4acaa7d0c1995a3544a7aa3bd008c2d Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Fri, 29 Oct 2021 16:01:28 +0000 Subject: [PATCH 13/26] fix: Regenerated some dags. --- .../annual_summaries/annual_summaries_dag.py | 3 +- .../voc_hourly_summary_dag.py | 136 ++++++++++++++++-- 2 files changed, 128 insertions(+), 11 deletions(-) diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index ae99b87ef..7d89d06f7 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -13,11 +13,10 @@ # limitations under the License. 
-from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow.providers.google.cloud.transfers import gcs_to_bigquery - default_args = { "owner": "Google", "depends_on_past": False, diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py index a6f041cbd..964b5f5b2 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py @@ -38,28 +38,146 @@ task_id="transform_csv", name="voc_hourly_summary", namespace="default", - affinity={'nodeAffinity': {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [{'matchExpressions': [{'key': 'cloud.google.com/gke-nodepool', 'operator': 'In', 'values': ['pool-e2-standard-4']}]}]}}}, + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={'SOURCE_URL': 'https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip', 'START_YEAR': '1990', 'SOURCE_FILE': 'files/data.csv', 'TARGET_FILE': 'files/data_output.csv', 'CHUNKSIZE': '2500000', 'TARGET_GCS_BUCKET': '{{ var.value.composer_bucket }}', 'TARGET_GCS_PATH': 'data/epa_historical_air_quality/voc_hourly_summary/data_output.csv', 'DATA_NAMES': '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", 
"uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', 'DATA_DTYPES': '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }'}, - resources={'limit_memory': '8G', 'limit_cpu': '3'}, + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_hourly_summary/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", 
"method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, ) # Task to load CSV data to a BigQuery table load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - source_objects=['data/epa_historical_air_quality/voc_hourly_summary/data_output.csv'], + source_objects=[ + "data/epa_historical_air_quality/voc_hourly_summary/data_output.csv" + ], source_format="CSV", destination_project_dataset_table="{{ var.value.container_registry.voc_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", - schema_fields=[{'name': 'state_code', 'type': 'STRING', 'description': 'The FIPS code of the state in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'county_code', 'type': 'STRING', 'description': 'The FIPS code of the county in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'site_num', 'type': 'STRING', 'description': 'A unique number within the county identifying the site.', 'mode': 'NULLABLE'}, {'name': 'parameter_code', 'type': 'INTEGER', 'description': 'The AQS code corresponding to the parameter measured by the monitor.', 'mode': 'NULLABLE'}, {'name': 'poc', 'type': 'INTEGER', 'description': 'This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.', 'mode': 'NULLABLE'}, {'name': 'latitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance north of the equator measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'longitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'datum', 'type': 'STRING', 'description': 'The Datum associated with the Latitude and Longitude measures.', 
'mode': 'NULLABLE'}, {'name': 'parameter_name', 'type': 'STRING', 'description': 'The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.', 'mode': 'NULLABLE'}, {'name': 'date_local', 'type': 'TIMESTAMP', 'description': 'The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.', 'mode': 'NULLABLE'}, {'name': 'time_local', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Local Standard Time.', 'mode': 'NULLABLE'}, {'name': 'date_gmt', 'type': 'TIMESTAMP', 'description': 'The calendar date of the sample in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'time_gmt', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'sample_measurement', 'type': 'FLOAT', 'description': 'The measured value in the standard units of measure for the parameter.', 'mode': 'NULLABLE'}, {'name': 'units_of_measure', 'type': 'STRING', 'description': 'The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.', 'mode': 'NULLABLE'}, {'name': 'mdl', 'type': 'FLOAT', 'description': 'The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.', 'mode': 'NULLABLE'}, {'name': 'uncertainty', 'type': 'FLOAT', 'description': 'The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.', 'mode': 'NULLABLE'}, {'name': 'qualifier', 'type': 'STRING', 'description': 'Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. 
Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.', 'mode': 'NULLABLE'}, {'name': 'method_type', 'type': 'STRING', 'description': 'An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).', 'mode': 'NULLABLE'}, {'name': 'method_code', 'type': 'STRING', 'description': 'An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.', 'mode': 'NULLABLE'}, {'name': 'method_name', 'type': 'STRING', 'description': 'A short description of the processes, equipment, and protocols used in gathering and measuring the sample.', 'mode': 'NULLABLE'}, {'name': 'state_name', 'type': 'STRING', 'description': 'The name of the state where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'county_name', 'type': 'STRING', 'description': 'The name of the county where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'date_of_last_change', 'type': 'TIMESTAMP', 'description': 'The date the last time any numeric values in this record were updated in the AQS data system.', 'mode': 'NULLABLE'}], - ) - - transform_csv >> load_to_bq - ample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", "mode": "NULLABLE", }, { From 9535847642a814f53e945f4b9eaf657e754c99fe Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Mon, 1 Nov 2021 01:23:46 +0000 Subject: [PATCH 14/26] fix: Resolved variable issues in pipeline.yaml files so that they operate as intended in AF --- .../annual_summaries/annual_summaries_dag.py | 6 +- .../annual_summaries/pipeline.yaml | 6 +- .../co_daily_summary/co_daily_summary_dag.py | 6 +- .../co_daily_summary/pipeline.yaml | 6 +- .../co_hourly_summary_dag.py | 6 +- .../co_hourly_summary/pipeline.yaml | 6 +- .../hap_daily_summary_dag.py | 6 +- .../hap_daily_summary/pipeline.yaml | 6 +- .../hap_hourly_summary_dag.py | 6 +- .../hap_hourly_summary/pipeline.yaml | 6 +- .../lead_daily_summary_dag.py | 6 +- .../lead_daily_summary/pipeline.yaml | 6 +- .../no2_daily_summary_dag.py | 6 +- .../no2_daily_summary/pipeline.yaml | 6 +- .../no2_hourly_summary_dag.py | 6 +- .../no2_hourly_summary/pipeline.yaml | 6 +- .../nonoxnoy_daily_summary_dag.py | 6 +- .../nonoxnoy_daily_summary/pipeline.yaml | 6 +- .../nonoxnoy_hourly_summary_dag.py | 6 +- .../nonoxnoy_hourly_summary/pipeline.yaml | 6 +- .../ozone_daily_summary_dag.py | 6 +- .../ozone_daily_summary/pipeline.yaml | 6 +- .../ozone_hourly_summary_dag.py | 138 +-------- .../ozone_hourly_summary/pipeline.yaml | 6 +- .../pm10_daily_summary/pipeline.yaml | 6 +- .../pm10_daily_summary_dag.py | 6 +- .../pm10_hourly_summary/pipeline.yaml | 6 +- .../pm10_hourly_summary_dag.py | 6 +- .../pm25_frm_hourly_summary/pipeline.yaml | 6 +- .../pm25_frm_hourly_summary_dag.py | 6 +- .../pm25_nonfrm_daily_summary/pipeline.yaml | 6 +- .../pm25_nonfrm_daily_summary_dag.py | 6 +- .../pm25_nonfrm_hourly_summary/pipeline.yaml | 6 +- .../pm25_nonfrm_hourly_summary_dag.py | 6 +- .../pipeline.yaml | 6 +- .../pm25_speciation_daily_summary_dag.py | 264 ++++++++++++++++++ .../pipeline.yaml | 6 +- .../pm25_speciation_hourly_summary_dag.py | 
234 ++++++++++++++++ .../pressure_daily_summary/pipeline.yaml | 6 +- .../pressure_daily_summary_dag.py | 6 +- .../pressure_hourly_summary/pipeline.yaml | 6 +- .../pressure_hourly_summary_dag.py | 6 +- .../rh_and_dp_daily_summary/pipeline.yaml | 6 +- .../rh_and_dp_daily_summary_dag.py | 6 +- .../rh_and_dp_hourly_summary/pipeline.yaml | 6 +- .../rh_and_dp_hourly_summary_dag.py | 6 +- .../so2_daily_summary/pipeline.yaml | 6 +- .../so2_daily_summary_dag.py | 6 +- .../so2_hourly_summary/pipeline.yaml | 6 +- .../so2_hourly_summary_dag.py | 6 +- .../temperature_daily_summary/pipeline.yaml | 6 +- .../temperature_daily_summary_dag.py | 6 +- .../temperature_hourly_summary/pipeline.yaml | 6 +- .../temperature_hourly_summary_dag.py | 6 +- .../voc_daily_summary/pipeline.yaml | 6 +- .../voc_daily_summary_dag.py | 6 +- .../voc_hourly_summary/pipeline.yaml | 6 +- .../voc_hourly_summary_dag.py | 186 +----------- .../wind_daily_summary/pipeline.yaml | 6 +- .../wind_daily_summary_dag.py | 6 +- .../wind_hourly_summary/pipeline.yaml | 6 +- .../wind_hourly_summary_dag.py | 6 +- 62 files changed, 689 insertions(+), 481 deletions(-) create mode 100644 datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py create mode 100644 datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 7d89d06f7..96cf50443 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/annual_summaries/data_output.csv", + "TARGET_GCS_PATH": 
"data/epa_historical_air_quality/annual_summaries/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure",\n "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count",\n "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count",\n "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value",\n "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime",\n "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value",\n "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile",\n "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address",\n "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str",\n "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32",\n "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str",\n "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", 
"arithmetic_standard_dev": "float64", "first_max_value": "float64",\n "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]",\n "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64",\n "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64",\n "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str",\n "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/annual_summaries/data_output.csv" + "data/epa_historical_air_quality/annual_summaries/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.annual_summaries_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.annual_summaries_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml index f35bd98f5..23e9a51e7 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml +++ b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - 
TARGET_GCS_PATH: "data/epa_historical_air_quality/annual_summaries/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/annual_summaries/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -95,9 +95,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/annual_summaries/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/annual_summaries/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.annual_summaries_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py index c57a281f3..cae5283ca 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_daily_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", 
"method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/co_daily_summary/data_output.csv" + "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.annual_summaries_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.annual_summaries_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml index 290923a5b..25483b0fa 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: 
"data/epa_historical_air_quality/co_daily_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/co_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/co_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.annual_summaries_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.annual_summaries_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py index e3a80cbe3..6dcf847bb 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code",\n "method_name", "state_name", "county_name", 
"date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str",\n "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str",\n "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/hourly_summaries/data_output.csv" + "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.co_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.co_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml index b8ffb5731..8c5ca6928 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/co_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 
+83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/hourly_summaries/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.co_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.co_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py index 394632df5..6cec05873 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_daily_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/hap_daily_summary/data_output.csv" + "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.hap_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.hap_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml index 02722baf4..4be18a09d 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_daily_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: 
"load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/hap_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.hap_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.hap_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py index dc0718fe4..fc2ee41c0 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", 
"sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/hap_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.hap_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.hap_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml index 3d8e7c0d2..8dd9df12c 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/hap_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv"] source_format: "CSV" - 
destination_project_dataset_table: "{{ var.value.container_registry.hap_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.hap_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py index 51f28113a..e7b7ee022 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/lead_daily_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", 
"first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/lead_daily_summary/data_output.csv" + "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.lead_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.lead_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml index 1b0633c4f..915ec933f 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/lead_daily_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/lead_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv"] source_format: "CSV" - 
destination_project_dataset_table: "{{ var.value.container_registry.lead_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.lead_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py index 7362f3cbb..f69b7a714 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_daily_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", 
"first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/no2_daily_summary/data_output.csv" + "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.no2_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.no2_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml index f11d9e536..85aa4be7d 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_daily_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/no2_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: 
"{{ var.value.container_registry.no2_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.no2_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py index 352c8b5fd..5aa5a7a0d 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ 
task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/no2_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.no2_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.no2_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml index decb1a139..651b03840 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/no2_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.no2_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.no2_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True 
write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py index 7473ced67..d05392a54 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_nonoxnoy_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": 
"datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/nonoxnoy_daily_summary/data_output.csv" + "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.nonoxnoy_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.nonoxnoy_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml index c22f73d8d..1c8700be7 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_nonoxnoy_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/nonoxnoy_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.nonoxnoy_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ 
var.json.epa_historical_air_quality.container_registry.nonoxnoy_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py index 4a0b48151..27b069371 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket 
}}", source_objects=[ - "data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.nonoxnoy_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.nonoxnoy_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml index 5a5b02582..8eca61207 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/nonoxnoy_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.nonoxnoy_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.nonoxnoy_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True 
write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py index a019de464..0d35a6c27 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_ozone_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_ozone_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ 
task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/ozone_daily_summary/data_output.csv" + "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.ozone_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.ozone_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml index 262862ad8..922fa715c 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_ozone_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_ozone_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/ozone_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.ozone_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.ozone_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: 
True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py index 36180a704..2513a448f 100644 --- a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py @@ -38,146 +38,28 @@ task_id="transform_csv", name="ozone_hourly_summary", namespace="default", - affinity={ - "nodeAffinity": { - "requiredDuringSchedulingIgnoredDuringExecution": { - "nodeSelectorTerms": [ - { - "matchExpressions": [ - { - "key": "cloud.google.com/gke-nodepool", - "operator": "In", - "values": ["pool-e2-standard-4"], - } - ] - } - ] - } - } - }, + affinity={'nodeAffinity': {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [{'matchExpressions': [{'key': 'cloud.google.com/gke-nodepool', 'operator': 'In', 'values': ['pool-e2-standard-4']}]}]}}}, image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": 
"int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={"limit_memory": "8G", "limit_cpu": "3"}, + env_vars={'SOURCE_URL': 'https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip', 'START_YEAR': '1990', 'SOURCE_FILE': 'files/data.csv', 'TARGET_FILE': 'files/data_output.csv', 'CHUNKSIZE': '2500000', 'TARGET_GCS_BUCKET': '{{ var.value.composer_bucket }}', 'TARGET_GCS_PATH': 'data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv', 'DATA_NAMES': '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', 'DATA_DTYPES': '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }'}, + resources={'limit_memory': '8G', 'limit_cpu': '3'}, ) # Task to load CSV data to a BigQuery table 
load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv" - ], + source_objects=['data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv'], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.ozone_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.ozone_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude 
and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + schema_fields=[{'name': 'state_code', 'type': 'STRING', 'description': 'The FIPS code of the state in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'county_code', 'type': 'STRING', 'description': 'The FIPS code of the county in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'site_num', 'type': 'STRING', 'description': 'A unique number within the county identifying the site.', 'mode': 'NULLABLE'}, {'name': 'parameter_code', 'type': 'INTEGER', 'description': 'The AQS code corresponding to the parameter measured by the monitor.', 'mode': 'NULLABLE'}, {'name': 'poc', 'type': 'INTEGER', 'description': 'This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.', 'mode': 'NULLABLE'}, {'name': 'latitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance north of the equator measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'longitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'datum', 'type': 'STRING', 'description': 'The Datum associated with the Latitude and Longitude measures.', 'mode': 'NULLABLE'}, {'name': 'parameter_name', 'type': 'STRING', 'description': 'The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.', 'mode': 'NULLABLE'}, {'name': 'date_local', 'type': 'TIMESTAMP', 'description': 'The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.', 'mode': 'NULLABLE'}, {'name': 'time_local', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Local Standard Time.', 'mode': 'NULLABLE'}, {'name': 'date_gmt', 'type': 'TIMESTAMP', 'description': 'The calendar date of the sample in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'time_gmt', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'sample_measurement', 'type': 'FLOAT', 'description': 'The measured value in the standard units of measure for the parameter.', 'mode': 'NULLABLE'}, {'name': 'units_of_measure', 'type': 'STRING', 'description': 'The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.', 'mode': 'NULLABLE'}, {'name': 'mdl', 'type': 'FLOAT', 'description': 'The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.', 'mode': 'NULLABLE'}, {'name': 'uncertainty', 'type': 'FLOAT', 'description': 'The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.', 'mode': 'NULLABLE'}, {'name': 'qualifier', 'type': 'STRING', 'description': 'Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.', 'mode': 'NULLABLE'}, {'name': 'method_type', 'type': 'STRING', 'description': 'An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).', 'mode': 'NULLABLE'}, {'name': 'method_code', 'type': 'STRING', 'description': 'An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.', 'mode': 'NULLABLE'}, {'name': 'method_name', 'type': 'STRING', 'description': 'A short description of the processes, equipment, and protocols used in gathering and measuring the sample.', 'mode': 'NULLABLE'}, {'name': 'state_name', 'type': 'STRING', 'description': 'The name of the state where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'county_name', 'type': 'STRING', 'description': 'The name of the county where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'date_of_last_change', 'type': 'TIMESTAMP', 'description': 'The date the last time any numeric values in this record were updated in the AQS data system.', 'mode': 'NULLABLE'}], + ) + + transform_csv >> load_to_bq + ample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", "mode": "NULLABLE", }, { diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml index a7800b4ef..bd9d1f873 100644 --- a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/ozone_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.ozone_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.ozone_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml index 7f26fa8c6..8730024bf 100644 --- a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" 
TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_pm10_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm10_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pm10_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm10_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py index 5dc301e60..e63fb89e1 100644 --- a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_pm10_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n 
"observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/pm10_daily_summary/data_output.csv" + "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.pm10_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm10_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml index b5311d5bb..fd07e176d 100644 --- a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" 
TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pm10_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm10_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py index 9ac41b4c6..93d6d5865 100644 --- a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", 
"qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/pm10_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.pm10_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm10_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml index 512560468..a521796a0 100644 --- a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv" + TARGET_GCS_PATH: 
"data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pm25_frm_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_frm_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py index 8b4e9ced0..19ea97a9e 100644 --- a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", 
"date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/pm25_frm_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.pm25_frm_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_frm_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml index bf96e06bc..21e239a5d 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_pm25_nonfrm_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", 
"site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pm25_nonfrm_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py index e97a42370..5e93c4f4f 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_pm25_nonfrm_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", 
"cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/data_output.csv" + "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.pm25_nonfrm_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml index 1a44678f5..5f2533213 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: 
"data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pm25_nonfrm_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py index 695bc2c1e..3809645fc 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", 
"units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.pm25_nonfrm_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml index 66b34521d..78e030eaf 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: 
"data/epa_historical_air_quality/daily_pm25_speciation_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_speciation_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pm25_speciation_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_speciation_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py new file mode 100644 index 000000000..1b21a8809 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_speciation_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_speciation_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name",
"city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_speciation_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by 
the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml index e1bbb32ce..39d37872f 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_speciation_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pm25_speciation_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ 
var.json.epa_historical_air_quality.container_registry.pm25_speciation_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py new file mode 100644 index 000000000..417d4d9ae --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_speciation_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_speciation_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": 
"float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_speciation_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml index f8a445b5f..914ac8569 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_pressure_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pressure_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: 
"{{ var.value.container_registry.pressure_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pressure_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py index 87621221b..8a6d00e9f 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_pressure_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", 
"first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/pressure_daily_summary/data_output.csv" + "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.pressure_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pressure_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml index d3a6efa24..85e4d8510 100644 --- a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv"] + source_objects: 
["data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.pressure_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pressure_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py index 83fd77698..2b5be13b5 100644 --- a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": 
"float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/pressure_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.pressure_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pressure_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml index a98d33c80..0af5e2e43 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_rh_and_dp_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/rh_and_dp_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: 
"{{ var.value.container_registry.rh_and_dp_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py index 6ff528bab..eb5447632 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_rh_and_dp_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", 
"first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/rh_and_dp_daily_summary/data_output.csv" + "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.rh_and_dp_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml index d2018e7da..453853343 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv"] + source_objects: 
["data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.rh_and_dp_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py index f74296ada..36d0a46a2 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", 
"uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/rh_and_dp_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.rh_and_dp_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml index c437dd8bf..b72322ff9 100644 --- a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_so2_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/so2_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ 
var.value.container_registry.so2_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.so2_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py index f95c6c4e0..f3ab2629a 100644 --- a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_so2_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n 
"method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/so2_daily_summary/data_output.csv" + "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.so2_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.so2_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml index 203b73405..e3958f8af 100644 --- a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/so2_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ 
var.value.container_registry.so2_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.so2_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py index 3fe752a08..f9ed056cf 100644 --- a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ 
task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/so2_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.so2_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.so2_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml index 76b38a62e..539331c1b 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_temperature_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/temperature_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.temperature_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ 
var.json.epa_historical_air_quality.container_registry.temperature_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py index 62065844b..e7c0ebbe0 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_temperature_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", 
"method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/temperature_daily_summary/data_output.csv" + "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.temperature_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.temperature_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml index a7cc091d0..9618ba4e9 100644 --- a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv"] source_format: "CSV" - 
destination_project_dataset_table: "{{ var.value.container_registry.temperature_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.temperature_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py index e3210f09c..634f7e238 100644 --- a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", 
"method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/temperature_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.temperature_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.temperature_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml index b863323bc..a9f939c49 100644 --- a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_voc_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/voc_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.voc_daily_summary_destination_table }}" + destination_project_dataset_table: "{{ 
var.json.epa_historical_air_quality.container_registry.voc_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py index 1c7426e17..0555a4a73 100644 --- a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_voc_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n 
"county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/voc_daily_summary/data_output.csv" + "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.voc_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.voc_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml index c668ba288..fd2c11e72 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/voc_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.voc_hourly_summary_destination_table }}" + destination_project_dataset_table: "{{ 
var.json.epa_historical_air_quality.container_registry.voc_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py index 964b5f5b2..754c7db72 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py @@ -38,197 +38,25 @@ task_id="transform_csv", name="voc_hourly_summary", namespace="default", - affinity={ - "nodeAffinity": { - "requiredDuringSchedulingIgnoredDuringExecution": { - "nodeSelectorTerms": [ - { - "matchExpressions": [ - { - "key": "cloud.google.com/gke-nodepool", - "operator": "In", - "values": ["pool-e2-standard-4"], - } - ] - } - ] - } - } - }, + affinity={'nodeAffinity': {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [{'matchExpressions': [{'key': 'cloud.google.com/gke-nodepool', 'operator': 'In', 'values': ['pool-e2-standard-4']}]}]}}}, image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_hourly_summary/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", 
"date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={"limit_memory": "8G", "limit_cpu": "3"}, + env_vars={'SOURCE_URL': 'https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip', 'START_YEAR': '1990', 'SOURCE_FILE': 'files/data.csv', 'TARGET_FILE': 'files/data_output.csv', 'CHUNKSIZE': '2500000', 'TARGET_GCS_BUCKET': '{{ var.value.composer_bucket }}', 'TARGET_GCS_PATH': 'data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv', 'DATA_NAMES': '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', 'DATA_DTYPES': '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": 
"datetime64[ns]" }'}, + resources={'limit_memory': '8G', 'limit_cpu': '3'}, ) # Task to load CSV data to a BigQuery table load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/voc_hourly_summary/data_output.csv" - ], + source_objects=['data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv'], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.voc_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.voc_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": 
"NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], + schema_fields=[{'name': 'state_code', 'type': 'STRING', 'description': 'The FIPS code of the state in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'county_code', 'type': 'STRING', 'description': 'The FIPS code of the county in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'site_num', 'type': 'STRING', 'description': 'A unique number within the county identifying the site.', 'mode': 'NULLABLE'}, {'name': 'parameter_code', 'type': 'INTEGER', 'description': 'The AQS code corresponding to the parameter measured by the monitor.', 'mode': 'NULLABLE'}, {'name': 'poc', 'type': 'INTEGER', 'description': 'This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.', 'mode': 'NULLABLE'}, {'name': 'latitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance north of the equator measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'longitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'datum', 'type': 'STRING', 'description': 'The Datum associated with the 
Latitude and Longitude measures.', 'mode': 'NULLABLE'}, {'name': 'parameter_name', 'type': 'STRING', 'description': 'The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.', 'mode': 'NULLABLE'}, {'name': 'date_local', 'type': 'TIMESTAMP', 'description': 'The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.', 'mode': 'NULLABLE'}, {'name': 'time_local', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Local Standard Time.', 'mode': 'NULLABLE'}, {'name': 'date_gmt', 'type': 'TIMESTAMP', 'description': 'The calendar date of the sample in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'time_gmt', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'sample_measurement', 'type': 'FLOAT', 'description': 'The measured value in the standard units of measure for the parameter.', 'mode': 'NULLABLE'}, {'name': 'units_of_measure', 'type': 'STRING', 'description': 'The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.', 'mode': 'NULLABLE'}, {'name': 'mdl', 'type': 'FLOAT', 'description': 'The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.', 'mode': 'NULLABLE'}, {'name': 'uncertainty', 'type': 'FLOAT', 'description': 'The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.', 'mode': 'NULLABLE'}, {'name': 'qualifier', 'type': 'STRING', 'description': 'Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.', 'mode': 'NULLABLE'}, {'name': 'method_type', 'type': 'STRING', 'description': 'An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).', 'mode': 'NULLABLE'}, {'name': 'method_code', 'type': 'STRING', 'description': 'An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.', 'mode': 'NULLABLE'}, {'name': 'method_name', 'type': 'STRING', 'description': 'A short description of the processes, equipment, and protocols used in gathering and measuring the sample.', 'mode': 'NULLABLE'}, {'name': 'state_name', 'type': 'STRING', 'description': 'The name of the state where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'county_name', 'type': 'STRING', 'description': 'The name of the county where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'date_of_last_change', 'type': 'TIMESTAMP', 'description': 'The date the last time any numeric values in this record were updated in the AQS data system.', 'mode': 'NULLABLE'}], ) transform_csv >> load_to_bq + \ No newline at end of file diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml index d2ab4d4f7..a8b0a9d62 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_wind_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", @@ -85,9 +85,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/wind_daily_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.wind_daily_summary_destination_table }}" + 
destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.wind_daily_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py index 432ad5d9d..b746e1bc1 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_wind_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": 
"str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", source_objects=[ - "data/epa_historical_air_quality/wind_daily_summary/data_output.csv" + "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.wind_daily_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.wind_daily_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml index d8bec20d7..834c5e06c 100644 --- a/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_hourly_summary/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "date_local", @@ -83,9 +83,9 @@ dag: args: task_id: "load_to_bq" bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/wind_hourly_summary/data_output.csv"] + source_objects: ["data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv"] source_format: "CSV" - destination_project_dataset_table: "{{ var.value.container_registry.wind_hourly_summary_destination_table }}" + 
destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.wind_hourly_summary_destination_table }}" skip_leading_rows: 1 allow_quoted_newlines: True write_disposition: "WRITE_TRUNCATE" diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py index 940de2efd..7412f4d37 100644 --- a/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_hourly_summary/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', }, @@ -76,10 +76,10 @@ task_id="load_to_bq", bucket="{{ var.value.composer_bucket 
}}", source_objects=[ - "data/epa_historical_air_quality/wind_hourly_summary/data_output.csv" + "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv" ], source_format="CSV", - destination_project_dataset_table="{{ var.value.container_registry.wind_hourly_summary_destination_table }}", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.wind_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", From 54c3d7f011f4b93885982dd977a6708f53d0560c Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Mon, 1 Nov 2021 03:16:03 +0000 Subject: [PATCH 15/26] fix: clean-up code --- .../run_csv_transform_kub/csv_transform.py | 203 ++++++------------ .../ozone_hourly_summary_dag.py | 136 +++++++++++- .../voc_hourly_summary_dag.py | 184 +++++++++++++++- 3 files changed, 370 insertions(+), 153 deletions(-) diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py index c5092c1bf..ef38be082 100644 --- a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py @@ -21,7 +21,6 @@ import typing import zipfile as zip -# import numpy import pandas as pd import requests from google.cloud import storage @@ -55,19 +54,8 @@ def main( file_group_wildcard = os.path.split(source_url)[1].replace("_~year~.zip", "") source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",") - key_list = [ - "state_code", - "county_code", - "site_num", - "sample_duration", - "pollutant_standard", - "metric_used", - "method_name", - "address", - "date_of_last_change", - ] process_source_file( - source, target_file, data_names, data_dtypes, int(chunksize), key_list + source, target_file, data_names, data_dtypes, int(chunksize) ) 
upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) @@ -86,7 +74,7 @@ def download_url_files_from_year_range( for yr in range(start_year, end_year + 1, 1): src_url = source_url.replace("~year~", str(yr)) dest_file = dest_path + "/source_" + os.path.split(src_url)[1] - download_file_http(src_url, dest_file) + download_file_http(src_url, dest_file, continue_on_error) unpack_file(dest_file, dest_path, "zip") if remove_file: os.remove(dest_file) @@ -117,97 +105,26 @@ def download_file_http( ) -def process_source_file( - source_file: str, - target_file: str, - names: list, - dtypes: dict, - chunksize: int, - key_list: list, -) -> None: - logging.info(f"Opening batch file {source_file}") - with pd.read_csv( - source_file, # path to main source file to load in batches - engine="python", - encoding="utf-8", - quotechar='"', # string separator, typically double-quotes - chunksize=chunksize, # size of batch data, in no. of records - sep=",", # data column separator, typically "," - header=None, # use when the data file does not contain a header - names=names, - dtype=dtypes, - keep_default_na=True, - na_values=[" "] - # parse_dates=["start_date", "end_date"], - ) as reader: - for chunk_number, chunk in enumerate(reader): - target_file_batch = str(target_file).replace( - ".csv", "-" + str(chunk_number) + ".csv" - ) - df = pd.DataFrame() - df = pd.concat([df, chunk]) - process_chunk( - df, target_file_batch, target_file, (not chunk_number == 0), key_list +def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> None: + if os.path.exists(infile): + if compression_type == "zip": + logging.info(f"Unpacking {infile} to {dest_path}") + with zip.ZipFile(infile, mode="r") as zipf: + zipf.extractall(dest_path) + zipf.close() + else: + logging.info( + f"{infile} ignored as it is not compressed or is of unknown compression" ) + else: + logging.info(f"{infile} not unpacked because it does not exist.") -def process_chunk( - df: pd.DataFrame, - 
target_file_batch: str, - target_file: str, - skip_header: bool, - key_list: list, -) -> None: - df = resolve_date_format(df, "%Y-%m-%d %H:%M") - # df = add_key(df, key_list) - save_to_new_file(df, file_path=str(target_file_batch), sep=",") - append_batch_file(target_file_batch, target_file, skip_header, not (skip_header)) - - -def add_key(df: pd.DataFrame, key_list: list) -> pd.DataFrame: - logging.info(f"Adding key column(s) {key_list}") - df["key"] = "" - for key in key_list: - df["key"] = df.apply( - lambda x: str(x[key]) - if not str(x["key"]) - else str(x["key"]) + "-" + str(x[key]), - axis=1, - ) - df["key_val"] = df["key"] - - return df - - -# def reorder_headers(df: pd.DataFrame) -> pd.DataFrame: -# logging.info("Reordering headers output file") -# df = df[ -# [ -# "trip_id", -# "duration_sec", -# "start_date", -# "start_station_name", -# "start_station_id", -# "end_date", -# "end_station_name", -# "end_station_id", -# "bike_number", -# "zip_code", -# "subscriber_type", -# "subscription_type", -# "start_station_latitude", -# "start_station_longitude", -# "end_station_latitude", -# "end_station_longitude", -# "member_birth_year", -# "member_gender", -# "bike_share_for_all_trip", -# "start_station_geom", -# "end_station_geom", -# ] -# ] - -# return df +def zip_decompress(infile: str, dest_path: str) -> None: + logging.info(f"Unpacking {infile} to {dest_path}") + with zip.ZipFile(infile, mode="r") as zipf: + zipf.extractall(dest_path) + zipf.close() def concatenate_files( @@ -253,15 +170,47 @@ def concatenate_files( return target_file_path -# def listdirs(rootdir: str) -> list: -# rtn_list = [] -# for file in os.listdir(rootdir): -# d = os.path.join(rootdir, file) -# if os.path.isdir(d): -# rtn_list.append(d) -# for elem in listdirs(d): -# rtn_list.append(elem) -# return rtn_list +def process_source_file( + source_file: str, + target_file: str, + names: list, + dtypes: dict, + chunksize: int +) -> None: + logging.info(f"Opening batch file {source_file}") + 
with pd.read_csv( + source_file, # path to main source file to load in batches + engine="python", + encoding="utf-8", + quotechar='"', # string separator, typically double-quotes + chunksize=chunksize, # size of batch data, in no. of records + sep=",", # data column separator, typically "," + header=None, # use when the data file does not contain a header + names=names, + dtype=dtypes, + keep_default_na=True, + na_values=[" "] + ) as reader: + for chunk_number, chunk in enumerate(reader): + target_file_batch = str(target_file).replace( + ".csv", "-" + str(chunk_number) + ".csv" + ) + df = pd.DataFrame() + df = pd.concat([df, chunk]) + process_chunk( + df, target_file_batch, target_file, (not chunk_number == 0) + ) + + +def process_chunk( + df: pd.DataFrame, + target_file_batch: str, + target_file: str, + skip_header: bool, +) -> None: + df = resolve_date_format(df, "%Y-%m-%d %H:%M") + save_to_new_file(df, file_path=str(target_file_batch), sep=",") + append_batch_file(target_file_batch, target_file, skip_header, not (skip_header)) def resolve_date_format(df: pd.DataFrame, from_format: str) -> pd.DataFrame: @@ -301,6 +250,11 @@ def convert_dt_format(dt_str: str, from_format: str) -> str: return rtnval +def save_to_new_file(df, file_path, sep="|") -> None: + logging.info(f"Saving to file {file_path} separator='{sep}'") + df.to_csv(file_path, sep=sep, index=False) + + def append_batch_file( batch_file_path: str, target_file_path: str, skip_header: bool, truncate_file: bool ) -> None: @@ -322,11 +276,6 @@ def append_batch_file( os.remove(batch_file_path) -def save_to_new_file(df, file_path, sep="|") -> None: - logging.info(f"Saving to file {file_path} separator='{sep}'") - df.to_csv(file_path, sep=sep, index=False) - - def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None: logging.info(f"Uploading to GCS {gcs_bucket} in {gcs_path}") storage_client = storage.Client() @@ -335,28 +284,6 @@ def upload_file_to_gcs(file_path: pathlib.Path, 
gcs_bucket: str, gcs_path: str) blob.upload_from_filename(file_path) -def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> None: - if os.path.exists(infile): - if compression_type == "zip": - logging.info(f"Unpacking {infile} to {dest_path}") - with zip.ZipFile(infile, mode="r") as zipf: - zipf.extractall(dest_path) - zipf.close() - else: - logging.info( - f"{infile} ignored as it is not compressed or is of unknown compression" - ) - else: - logging.info(f"{infile} not unpacked because it does not exist.") - - -def zip_decompress(infile: str, dest_path: str) -> None: - logging.info(f"Unpacking {infile} to {dest_path}") - with zip.ZipFile(infile, mode="r") as zipf: - zipf.extractall(dest_path) - zipf.close() - - if __name__ == "__main__": logging.getLogger().setLevel(logging.INFO) diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py index 2513a448f..ba09d52b0 100644 --- a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py @@ -38,28 +38,146 @@ task_id="transform_csv", name="ozone_hourly_summary", namespace="default", - affinity={'nodeAffinity': {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [{'matchExpressions': [{'key': 'cloud.google.com/gke-nodepool', 'operator': 'In', 'values': ['pool-e2-standard-4']}]}]}}}, + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={'SOURCE_URL': 
'https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip', 'START_YEAR': '1990', 'SOURCE_FILE': 'files/data.csv', 'TARGET_FILE': 'files/data_output.csv', 'CHUNKSIZE': '2500000', 'TARGET_GCS_BUCKET': '{{ var.value.composer_bucket }}', 'TARGET_GCS_PATH': 'data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv', 'DATA_NAMES': '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', 'DATA_DTYPES': '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }'}, - resources={'limit_memory': '8G', 'limit_cpu': '3'}, + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", 
"qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, ) # Task to load CSV data to a BigQuery table load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - source_objects=['data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv'], + source_objects=[ + "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv" + ], source_format="CSV", destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.ozone_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", - schema_fields=[{'name': 'state_code', 'type': 'STRING', 'description': 'The FIPS code of the state in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'county_code', 'type': 'STRING', 'description': 'The FIPS code of the county in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'site_num', 'type': 'STRING', 'description': 'A unique number within the county identifying the site.', 'mode': 'NULLABLE'}, {'name': 'parameter_code', 'type': 'INTEGER', 'description': 'The AQS code corresponding to the parameter measured by the monitor.', 'mode': 'NULLABLE'}, {'name': 'poc', 'type': 'INTEGER', 
'description': 'This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.', 'mode': 'NULLABLE'}, {'name': 'latitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance north of the equator measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'longitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'datum', 'type': 'STRING', 'description': 'The Datum associated with the Latitude and Longitude measures.', 'mode': 'NULLABLE'}, {'name': 'parameter_name', 'type': 'STRING', 'description': 'The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.', 'mode': 'NULLABLE'}, {'name': 'date_local', 'type': 'TIMESTAMP', 'description': 'The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.', 'mode': 'NULLABLE'}, {'name': 'time_local', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Local Standard Time.', 'mode': 'NULLABLE'}, {'name': 'date_gmt', 'type': 'TIMESTAMP', 'description': 'The calendar date of the sample in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'time_gmt', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'sample_measurement', 'type': 'FLOAT', 'description': 'The measured value in the standard units of measure for the parameter.', 'mode': 'NULLABLE'}, {'name': 'units_of_measure', 'type': 'STRING', 'description': 'The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.', 'mode': 'NULLABLE'}, {'name': 'mdl', 'type': 'FLOAT', 'description': 'The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.', 'mode': 'NULLABLE'}, {'name': 'uncertainty', 'type': 'FLOAT', 'description': 'The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.', 'mode': 'NULLABLE'}, {'name': 'qualifier', 'type': 'STRING', 'description': 'Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.', 'mode': 'NULLABLE'}, {'name': 'method_type', 'type': 'STRING', 'description': 'An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).', 'mode': 'NULLABLE'}, {'name': 'method_code', 'type': 'STRING', 'description': 'An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.', 'mode': 'NULLABLE'}, {'name': 'method_name', 'type': 'STRING', 'description': 'A short description of the processes, equipment, and protocols used in gathering and measuring the sample.', 'mode': 'NULLABLE'}, {'name': 'state_name', 'type': 'STRING', 'description': 'The name of the state where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'county_name', 'type': 'STRING', 'description': 'The name of the county where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'date_of_last_change', 'type': 'TIMESTAMP', 'description': 'The date the last time any numeric values in this record were updated in the AQS data system.', 'mode': 'NULLABLE'}], - ) - - transform_csv >> load_to_bq - ample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", 
+ "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", "mode": "NULLABLE", }, { diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py index 754c7db72..6e480569e 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py @@ -38,25 +38,197 @@ task_id="transform_csv", name="voc_hourly_summary", namespace="default", - affinity={'nodeAffinity': {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [{'matchExpressions': [{'key': 'cloud.google.com/gke-nodepool', 'operator': 'In', 'values': ['pool-e2-standard-4']}]}]}}}, + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={'SOURCE_URL': 'https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip', 'START_YEAR': '1990', 'SOURCE_FILE': 'files/data.csv', 'TARGET_FILE': 'files/data_output.csv', 'CHUNKSIZE': '2500000', 'TARGET_GCS_BUCKET': '{{ var.value.composer_bucket }}', 'TARGET_GCS_PATH': 'data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv', 'DATA_NAMES': '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', 'DATA_DTYPES': '{ "state_code": "str", "county_code": 
"str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }'}, - resources={'limit_memory': '8G', 'limit_cpu': '3'}, + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", 
"limit_cpu": "3"}, ) # Task to load CSV data to a BigQuery table load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_to_bq", bucket="{{ var.value.composer_bucket }}", - source_objects=['data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv'], + source_objects=[ + "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv" + ], source_format="CSV", destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.voc_hourly_summary_destination_table }}", skip_leading_rows=1, allow_quoted_newlines=True, write_disposition="WRITE_TRUNCATE", - schema_fields=[{'name': 'state_code', 'type': 'STRING', 'description': 'The FIPS code of the state in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'county_code', 'type': 'STRING', 'description': 'The FIPS code of the county in which the monitor resides.', 'mode': 'NULLABLE'}, {'name': 'site_num', 'type': 'STRING', 'description': 'A unique number within the county identifying the site.', 'mode': 'NULLABLE'}, {'name': 'parameter_code', 'type': 'INTEGER', 'description': 'The AQS code corresponding to the parameter measured by the monitor.', 'mode': 'NULLABLE'}, {'name': 'poc', 'type': 'INTEGER', 'description': 'This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.', 'mode': 'NULLABLE'}, {'name': 'latitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance north of the equator measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'longitude', 'type': 'FLOAT', 'description': 'The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.', 'mode': 'NULLABLE'}, {'name': 'datum', 'type': 'STRING', 'description': 'The Datum associated with the Latitude and Longitude measures.', 'mode': 'NULLABLE'}, {'name': 'parameter_name', 'type': 'STRING', 'description': 'The name or description assigned in AQS to the parameter 
measured by the monitor. Parameters may be pollutants or non-pollutants.', 'mode': 'NULLABLE'}, {'name': 'date_local', 'type': 'TIMESTAMP', 'description': 'The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.', 'mode': 'NULLABLE'}, {'name': 'time_local', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Local Standard Time.', 'mode': 'NULLABLE'}, {'name': 'date_gmt', 'type': 'TIMESTAMP', 'description': 'The calendar date of the sample in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'time_gmt', 'type': 'STRING', 'description': 'The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.', 'mode': 'NULLABLE'}, {'name': 'sample_measurement', 'type': 'FLOAT', 'description': 'The measured value in the standard units of measure for the parameter.', 'mode': 'NULLABLE'}, {'name': 'units_of_measure', 'type': 'STRING', 'description': 'The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.', 'mode': 'NULLABLE'}, {'name': 'mdl', 'type': 'FLOAT', 'description': 'The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.', 'mode': 'NULLABLE'}, {'name': 'uncertainty', 'type': 'FLOAT', 'description': 'The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.', 'mode': 'NULLABLE'}, {'name': 'qualifier', 'type': 'STRING', 'description': 'Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.', 'mode': 'NULLABLE'}, {'name': 'method_type', 'type': 'STRING', 'description': 'An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).', 'mode': 'NULLABLE'}, {'name': 'method_code', 'type': 'STRING', 'description': 'An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.', 'mode': 'NULLABLE'}, {'name': 'method_name', 'type': 'STRING', 'description': 'A short description of the processes, equipment, and protocols used in gathering and measuring the sample.', 'mode': 'NULLABLE'}, {'name': 'state_name', 'type': 'STRING', 'description': 'The name of the state where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'county_name', 'type': 'STRING', 'description': 'The name of the county where the monitoring site is located.', 'mode': 'NULLABLE'}, {'name': 'date_of_last_change', 'type': 'TIMESTAMP', 'description': 'The date the last time any numeric values in this record were updated in the AQS data system.', 'mode': 'NULLABLE'}], + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": 
"This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], ) transform_csv >> load_to_bq - \ No newline at end of file From 7d93681ac85ada269578962d0d83b11796ff55d5 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Mon, 1 Nov 2021 15:47:48 +0000 Subject: [PATCH 16/26] fix: Reduced CHUNKSIZE to prevent out-of-memory failures in AF that kept some pipelines from completing successfully.
--- .../annual_summaries/annual_summaries_dag.py | 3 ++- .../nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py | 2 +- .../nonoxnoy_daily_summary/pipeline.yaml | 2 +- .../ozone_daily_summary/ozone_daily_summary_dag.py | 2 +- .../ozone_daily_summary/pipeline.yaml | 2 +- .../temperature_daily_summary/pipeline.yaml | 2 +- .../temperature_daily_summary/temperature_daily_summary_dag.py | 2 +- .../wind_daily_summary/pipeline.yaml | 2 +- .../wind_daily_summary/wind_daily_summary_dag.py | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 96cf50443..cd9b83159 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -14,8 +14,9 @@ from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod from airflow.providers.google.cloud.transfers import gcs_to_bigquery +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod + default_args = { "owner": "Google", diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py index d05392a54..0832f15c1 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py @@ -62,7 +62,7 @@ "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", + "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", 
"parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml index 1c8700be7..49dcfbf8c 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml @@ -58,7 +58,7 @@ dag: START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" + CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv" DATA_NAMES: >- diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py index 0d35a6c27..8468002a3 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py @@ -62,7 +62,7 @@ "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", + "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_ozone_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", 
"units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml index 922fa715c..41ffeae6a 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml @@ -58,7 +58,7 @@ dag: START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" + CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_ozone_summary/files/data_output.csv" DATA_NAMES: >- diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml index 539331c1b..f396323fe 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml @@ -58,7 +58,7 @@ dag: START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" + CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv" DATA_NAMES: >- diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py index e7c0ebbe0..b3f1daded 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py +++ 
b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py @@ -62,7 +62,7 @@ "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", + "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml index a8b0a9d62..f3b8e243e 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml @@ -58,7 +58,7 @@ dag: START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" + CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv" DATA_NAMES: >- diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py index b746e1bc1..76caf60cb 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py @@ -62,7 +62,7 @@ "START_YEAR": "1990", "SOURCE_FILE": 
"files/data.csv", "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", + "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', From 7d0f21e49c9f7b930ab67edbb8f7d795275af25a Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Mon, 1 Nov 2021 16:26:23 +0000 Subject: [PATCH 17/26] fix: Resolved incorrect path entry in ozone daily summary pipeline.yaml; Fixed flake8 issue in csv_transform --- .../run_csv_transform_kub/csv_transform.py | 16 ++++------------ .../annual_summaries/annual_summaries_dag.py | 3 +-- .../ozone_daily_summary_dag.py | 2 +- .../ozone_daily_summary/pipeline.yaml | 2 +- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py index ef38be082..216fe229a 100644 --- a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py @@ -54,9 +54,7 @@ def main( file_group_wildcard = os.path.split(source_url)[1].replace("_~year~.zip", "") source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",") - process_source_file( - source, target_file, data_names, data_dtypes, int(chunksize) - ) + process_source_file(source, target_file, data_names, data_dtypes, 
int(chunksize)) upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) @@ -171,11 +169,7 @@ def concatenate_files( def process_source_file( - source_file: str, - target_file: str, - names: list, - dtypes: dict, - chunksize: int + source_file: str, target_file: str, names: list, dtypes: dict, chunksize: int ) -> None: logging.info(f"Opening batch file {source_file}") with pd.read_csv( @@ -189,7 +183,7 @@ def process_source_file( names=names, dtype=dtypes, keep_default_na=True, - na_values=[" "] + na_values=[" "], ) as reader: for chunk_number, chunk in enumerate(reader): target_file_batch = str(target_file).replace( @@ -197,9 +191,7 @@ def process_source_file( ) df = pd.DataFrame() df = pd.concat([df, chunk]) - process_chunk( - df, target_file_batch, target_file, (not chunk_number == 0) - ) + process_chunk(df, target_file_batch, target_file, (not chunk_number == 0)) def process_chunk( diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index cd9b83159..96cf50443 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -14,9 +14,8 @@ from airflow import DAG -from airflow.providers.google.cloud.transfers import gcs_to_bigquery from airflow.providers.cncf.kubernetes.operators import kubernetes_pod - +from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { "owner": "Google", diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py index 8468002a3..b35e06fb7 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": 
"files/data_output.csv", "CHUNKSIZE": "750000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/daily_ozone_summary/files/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml index 41ffeae6a..01279cd92 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "750000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - 
TARGET_GCS_PATH: "data/epa_historical_air_quality/daily_ozone_summary/files/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", From 3869b2ddb1d81ef4c189bcf6f0712c4aa76c3553 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 3 Nov 2021 20:38:39 +0000 Subject: [PATCH 18/26] fix: Requested changes as per PR code review --- .../run_csv_transform_kub/csv_transform.py | 35 +++++++++---------- .../run_csv_transform_kub/requirements.txt | 2 -- .../annual_summaries/annual_summaries_dag.py | 2 +- .../annual_summaries/pipeline.yaml | 2 +- .../co_daily_summary/co_daily_summary_dag.py | 2 +- .../co_daily_summary/pipeline.yaml | 2 +- .../co_hourly_summary_dag.py | 2 +- .../co_hourly_summary/pipeline.yaml | 2 +- .../hap_daily_summary_dag.py | 2 +- .../hap_daily_summary/pipeline.yaml | 2 +- .../hap_hourly_summary_dag.py | 2 +- .../hap_hourly_summary/pipeline.yaml | 2 +- .../lead_daily_summary_dag.py | 2 +- .../lead_daily_summary/pipeline.yaml | 2 +- .../no2_daily_summary_dag.py | 2 +- .../no2_daily_summary/pipeline.yaml | 2 +- .../no2_hourly_summary_dag.py | 2 +- .../no2_hourly_summary/pipeline.yaml | 2 +- .../nonoxnoy_daily_summary_dag.py | 2 +- .../nonoxnoy_daily_summary/pipeline.yaml | 2 +- .../nonoxnoy_hourly_summary_dag.py | 2 +- .../nonoxnoy_hourly_summary/pipeline.yaml | 2 +- .../ozone_daily_summary_dag.py | 2 +- .../ozone_daily_summary/pipeline.yaml | 2 +- .../ozone_hourly_summary_dag.py | 2 +- .../ozone_hourly_summary/pipeline.yaml | 2 +- .../pm10_daily_summary/pipeline.yaml | 2 +- .../pm10_daily_summary_dag.py | 2 +- .../pm10_hourly_summary/pipeline.yaml | 2 +- .../pm10_hourly_summary_dag.py | 2 +- .../pm25_frm_hourly_summary/pipeline.yaml | 2 +- .../pm25_frm_hourly_summary_dag.py | 2 +- .../pm25_nonfrm_daily_summary/pipeline.yaml | 2 +- 
.../pm25_nonfrm_daily_summary_dag.py | 2 +- .../pm25_nonfrm_hourly_summary/pipeline.yaml | 2 +- .../pm25_nonfrm_hourly_summary_dag.py | 2 +- .../pipeline.yaml | 2 +- .../pm25_speciation_daily_summary_dag.py | 2 +- .../pipeline.yaml | 2 +- .../pm25_speciation_hourly_summary_dag.py | 2 +- .../pressure_daily_summary/pipeline.yaml | 2 +- .../pressure_daily_summary_dag.py | 2 +- .../pressure_hourly_summary/pipeline.yaml | 2 +- .../pressure_hourly_summary_dag.py | 2 +- .../rh_and_dp_daily_summary/pipeline.yaml | 2 +- .../rh_and_dp_daily_summary_dag.py | 2 +- .../rh_and_dp_hourly_summary/pipeline.yaml | 2 +- .../rh_and_dp_hourly_summary_dag.py | 2 +- .../so2_daily_summary/pipeline.yaml | 2 +- .../so2_daily_summary_dag.py | 2 +- .../so2_hourly_summary/pipeline.yaml | 2 +- .../so2_hourly_summary_dag.py | 2 +- .../temperature_daily_summary/pipeline.yaml | 2 +- .../temperature_daily_summary_dag.py | 2 +- .../temperature_hourly_summary/pipeline.yaml | 2 +- .../temperature_hourly_summary_dag.py | 2 +- .../voc_daily_summary/pipeline.yaml | 2 +- .../voc_daily_summary_dag.py | 2 +- .../voc_hourly_summary/pipeline.yaml | 2 +- .../voc_hourly_summary_dag.py | 2 +- .../wind_daily_summary/pipeline.yaml | 2 +- .../wind_daily_summary_dag.py | 2 +- .../wind_hourly_summary/pipeline.yaml | 2 +- .../wind_hourly_summary_dag.py | 2 +- 64 files changed, 78 insertions(+), 83 deletions(-) diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py index 216fe229a..db75ce450 100644 --- a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py @@ -51,7 +51,7 @@ def main( download_url_files_from_year_range( source_url, st_year, end_year, dest_path, True, True ) - file_group_wildcard = os.path.split(source_url)[1].replace("_~year~.zip", "") + file_group_wildcard = 
os.path.split(source_url)[1].replace("_YEAR_ITERATOR.zip", "") source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",") process_source_file(source, target_file, data_names, data_dtypes, int(chunksize)) @@ -70,7 +70,7 @@ def download_url_files_from_year_range( continue_on_error: bool = False, ): for yr in range(start_year, end_year + 1, 1): - src_url = source_url.replace("~year~", str(yr)) + src_url = source_url.replace("YEAR_ITERATOR", str(yr)) dest_file = dest_path + "/source_" + os.path.split(src_url)[1] download_file_http(src_url, dest_file, continue_on_error) unpack_file(dest_file, dest_path, "zip") @@ -216,7 +216,6 @@ def resolve_date_format(df: pd.DataFrame, from_format: str) -> pd.DataFrame: def convert_dt_format(dt_str: str, from_format: str) -> str: - # rtnval = "" if not dt_str or str(dt_str).lower() == "nan" or str(dt_str).lower() == "nat": rtnval = "" elif len(dt_str.strip()) == 10: @@ -250,22 +249,20 @@ def save_to_new_file(df, file_path, sep="|") -> None: def append_batch_file( batch_file_path: str, target_file_path: str, skip_header: bool, truncate_file: bool ) -> None: - data_file = open(batch_file_path, "r") - if truncate_file: - target_file = open(target_file_path, "w+").close() - target_file = open(target_file_path, "a+") - if skip_header: - logging.info( - f"Appending batch file {batch_file_path} to {target_file_path} with skip header" - ) - next(data_file) - else: - logging.info(f"Appending batch file {batch_file_path} to {target_file_path}") - target_file.write(data_file.read()) - data_file.close() - target_file.close() - if os.path.exists(batch_file_path): - os.remove(batch_file_path) + with open(batch_file_path, "r") as data_file: + if truncate_file: + target_file = open(target_file_path, "w+").close() + with open(target_file_path, "a+") as target_file: + if skip_header: + logging.info( + f"Appending batch file {batch_file_path} to {target_file_path} with skip header" + ) + next(data_file) + else: + 
logging.info(f"Appending batch file {batch_file_path} to {target_file_path}") + target_file.write(data_file.read()) + if os.path.exists(batch_file_path): + os.remove(batch_file_path) def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None: diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt index 88bfd2aba..f36704793 100644 --- a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt +++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt @@ -1,5 +1,3 @@ requests -numpy pandas google-cloud-storage -gsutil diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index 96cf50443..ab756ae60 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_YEAR_ITERATOR.zip", "START_YEAR": "1980", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml index 23e9a51e7..64dd24d2a 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml +++ b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub 
}}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_YEAR_ITERATOR.zip" START_YEAR: "1980" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py index cae5283ca..36a730539 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42101_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42101_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml index 25483b0fa..c9faa5417 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42101_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42101_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py index 6dcf847bb..110e22cb8 100644 --- 
a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml index 8c5ca6928..f2550f211 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py index 6cec05873..82f1a1380 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_YEAR_ITERATOR.zip", 
"START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml index 4be18a09d..2aeb82d7d 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py index fc2ee41c0..fdde73355 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_YEAR_ITERATOR.zip", "START_YEAR": "1993", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml index 8dd9df12c..69ed033eb 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" 
image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_YEAR_ITERATOR.zip" START_YEAR: "1993" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py index e7b7ee022..31d834d8a 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml index 915ec933f..b14e87a76 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py 
b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py index f69b7a714..8bd75aa4c 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42602_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42602_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml index 85aa4be7d..9e441b41e 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42602_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42602_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py index 5aa5a7a0d..35ce8d0b1 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": 
"https://aqs.epa.gov/aqsweb/airdata/hourly_42602_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml index 651b03840..ac0ba1b0d 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py index 0832f15c1..cbedbaa53 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml index 49dcfbf8c..ac4bd27ca 100644 
--- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py index 27b069371..23c79567f 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml index 8eca61207..a94067766 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_~year~.zip" + SOURCE_URL: 
"https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py index b35e06fb7..c9854b8ee 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_44201_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_44201_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml index 01279cd92..7bf792c24 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_44201_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_44201_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py index ba09d52b0..0166c20a8 100644 --- a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py 
+++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml index bd9d1f873..49d53f1b4 100644 --- a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml index 8730024bf..0bd08f5bc 100644 --- a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_81102_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_81102_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git 
a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py index e63fb89e1..481790157 100644 --- a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_81102_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_81102_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml index fd07e176d..c7753ac99 100644 --- a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py index 93d6d5865..bb19bcc5d 100644 --- a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml index a521796a0..edbbef7b0 100644 --- a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py index 19ea97a9e..e3be71dd5 100644 --- a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git 
a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml index 21e239a5d..67427fc17 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_88502_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_88502_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py index 5e93c4f4f..aa80378ed 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_88502_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_88502_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml index 5f2533213..ec2a4e4b6 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: 
image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py index 3809645fc..ed7d4b8d4 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml index 78e030eaf..5f8bc25fb 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: 
"files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py index 1b21a8809..75b044294 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml index 39d37872f..b5a11fcf9 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py index 417d4d9ae..1f677a749 100644 --- 
a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml index 914ac8569..8f2988c0c 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py index 8a6d00e9f..5275e9cfc 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": 
"https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml index 85e4d8510..ae67eedb6 100644 --- a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py index 2b5be13b5..1987ac61a 100644 --- a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml index 
0af5e2e43..508cb00fe 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py index eb5447632..ac9ac395f 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml index 453853343..796972c7c 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: 
"https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py index 36d0a46a2..70f4f066c 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml index b72322ff9..6733ade79 100644 --- a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42401_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42401_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py index f3ab2629a..503ebb78f 
100644 --- a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42401_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42401_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml index e3958f8af..6c0311ea6 100644 --- a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py index f9ed056cf..3687aea9d 100644 --- a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_~year~.zip", + "SOURCE_URL": 
"https://aqs.epa.gov/aqsweb/airdata/hourly_42401_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml index f396323fe..681ee0a76 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py index b3f1daded..aa95ebada 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml index 9618ba4e9..77bd98f12 100644 --- 
a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py index 634f7e238..45d763b2d 100644 --- a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml index a9f939c49..f27eb21e4 100644 --- a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_~year~.zip" + SOURCE_URL: 
"https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py index 0555a4a73..c19249ab3 100644 --- a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml index fd2c11e72..9ffecbd06 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py index 6e480569e..0f85d0172 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py +++ 
b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml index f3b8e243e..4a683203a 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py index 76caf60cb..7f1ca9ec8 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": 
"files/data_output.csv", diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml index 834c5e06c..8247bf2f7 100644 --- a/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml @@ -54,7 +54,7 @@ dag: image_pull_policy: "Always" image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_~year~.zip" + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_YEAR_ITERATOR.zip" START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py index 7412f4d37..60af123f9 100644 --- a/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py @@ -58,7 +58,7 @@ image_pull_policy="Always", image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_~year~.zip", + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_YEAR_ITERATOR.zip", "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", From 9caa6870c2a199b149a04fa959405c2a745c6efe Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 3 Nov 2021 20:51:34 +0000 Subject: [PATCH 19/26] fix: Resolved black hook issue --- .../_images/run_csv_transform_kub/csv_transform.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py 
b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py index db75ce450..ff894fd20 100644 --- a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py @@ -259,7 +259,9 @@ def append_batch_file( ) next(data_file) else: - logging.info(f"Appending batch file {batch_file_path} to {target_file_path}") + logging.info( + f"Appending batch file {batch_file_path} to {target_file_path}" + ) target_file.write(data_file.read()) if os.path.exists(batch_file_path): os.remove(batch_file_path) From 642ab67a0e635d811377d0642967c04df3501ead Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 3 Nov 2021 21:13:45 +0000 Subject: [PATCH 20/26] fix: Reduced resources used in both lead daily summary and pressure daily summary --- .../lead_daily_summary/lead_daily_summary_dag.py | 2 +- .../lead_daily_summary/pipeline.yaml | 4 ++-- .../pressure_daily_summary/pipeline.yaml | 4 ++-- .../pressure_daily_summary/pressure_daily_summary_dag.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py index 31d834d8a..446c42eb1 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py @@ -68,7 +68,7 @@ "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", 
"date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, - resources={"limit_memory": "8G", "limit_cpu": "3"}, + resources={"limit_memory": "4G", "limit_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml index b14e87a76..808e6072d 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml @@ -76,8 +76,8 @@ dag: "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } resources: - limit_memory: "8G" - limit_cpu: "3" + limit_memory: "4G" + limit_cpu: "1" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml index 8f2988c0c..afee47002 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml +++ 
b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml @@ -76,8 +76,8 @@ dag: "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } resources: - limit_memory: "8G" - limit_cpu: "3" + limit_memory: "4G" + limit_cpu: "1" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py index 5275e9cfc..c01ea5fdf 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py @@ -68,7 +68,7 @@ "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", 
"local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, - resources={"limit_memory": "8G", "limit_cpu": "3"}, + resources={"limit_memory": "4G", "limit_cpu": "1"}, ) # Task to load CSV data to a BigQuery table From 4636273b819b2b1f2ad410cca1890a22202cb7db Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Wed, 3 Nov 2021 21:36:58 +0000 Subject: [PATCH 21/26] fix: Tiered start time for each DAG by converting start time to chron expression --- .../annual_summaries/annual_summaries_dag.py | 2 +- .../epa_historical_air_quality/annual_summaries/pipeline.yaml | 2 +- .../co_daily_summary/co_daily_summary_dag.py | 2 +- .../epa_historical_air_quality/co_daily_summary/pipeline.yaml | 2 +- .../co_hourly_summary/co_hourly_summary_dag.py | 2 +- .../epa_historical_air_quality/co_hourly_summary/pipeline.yaml | 2 +- .../hap_daily_summary/hap_daily_summary_dag.py | 2 +- .../epa_historical_air_quality/hap_daily_summary/pipeline.yaml | 2 +- .../hap_hourly_summary/hap_hourly_summary_dag.py | 2 +- .../epa_historical_air_quality/hap_hourly_summary/pipeline.yaml | 2 +- .../lead_daily_summary/lead_daily_summary_dag.py | 2 +- .../epa_historical_air_quality/lead_daily_summary/pipeline.yaml | 2 +- .../no2_daily_summary/no2_daily_summary_dag.py | 2 +- .../epa_historical_air_quality/no2_daily_summary/pipeline.yaml | 2 +- .../no2_hourly_summary/no2_hourly_summary_dag.py | 2 +- .../epa_historical_air_quality/no2_hourly_summary/pipeline.yaml | 2 +- .../nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py | 2 +- .../nonoxnoy_daily_summary/pipeline.yaml | 2 +- .../nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py | 2 +- .../nonoxnoy_hourly_summary/pipeline.yaml | 2 +- .../ozone_daily_summary/ozone_daily_summary_dag.py | 2 +- .../ozone_daily_summary/pipeline.yaml | 2 +- .../ozone_hourly_summary/ozone_hourly_summary_dag.py | 2 +- .../ozone_hourly_summary/pipeline.yaml | 2 +- 
.../epa_historical_air_quality/pm10_daily_summary/pipeline.yaml | 2 +- .../pm10_daily_summary/pm10_daily_summary_dag.py | 2 +- .../pm10_hourly_summary/pipeline.yaml | 2 +- .../pm10_hourly_summary/pm10_hourly_summary_dag.py | 2 +- .../pm25_frm_hourly_summary/pipeline.yaml | 2 +- .../pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py | 2 +- .../pm25_nonfrm_daily_summary/pipeline.yaml | 2 +- .../pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py | 2 +- .../pm25_nonfrm_hourly_summary/pipeline.yaml | 2 +- .../pm25_nonfrm_hourly_summary_dag.py | 2 +- .../pm25_speciation_daily_summary/pipeline.yaml | 2 +- .../pm25_speciation_daily_summary_dag.py | 2 +- .../pm25_speciation_hourly_summary/pipeline.yaml | 2 +- .../pm25_speciation_hourly_summary_dag.py | 2 +- .../pressure_daily_summary/pipeline.yaml | 2 +- .../pressure_daily_summary/pressure_daily_summary_dag.py | 2 +- .../pressure_hourly_summary/pipeline.yaml | 2 +- .../pressure_hourly_summary/pressure_hourly_summary_dag.py | 2 +- .../rh_and_dp_daily_summary/pipeline.yaml | 2 +- .../rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py | 2 +- .../rh_and_dp_hourly_summary/pipeline.yaml | 2 +- .../rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py | 2 +- .../epa_historical_air_quality/so2_daily_summary/pipeline.yaml | 2 +- .../so2_daily_summary/so2_daily_summary_dag.py | 2 +- .../epa_historical_air_quality/so2_hourly_summary/pipeline.yaml | 2 +- .../so2_hourly_summary/so2_hourly_summary_dag.py | 2 +- .../temperature_daily_summary/pipeline.yaml | 2 +- .../temperature_daily_summary/temperature_daily_summary_dag.py | 2 +- .../temperature_hourly_summary/pipeline.yaml | 2 +- .../temperature_hourly_summary_dag.py | 2 +- .../epa_historical_air_quality/voc_daily_summary/pipeline.yaml | 2 +- .../voc_daily_summary/voc_daily_summary_dag.py | 2 +- .../epa_historical_air_quality/voc_hourly_summary/pipeline.yaml | 2 +- .../voc_hourly_summary/voc_hourly_summary_dag.py | 2 +- 
.../epa_historical_air_quality/wind_daily_summary/pipeline.yaml | 2 +- .../wind_daily_summary/wind_daily_summary_dag.py | 2 +- 60 files changed, 60 insertions(+), 60 deletions(-) diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py index ab756ae60..ab8494e2f 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.annual_summaries", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 0 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml index 64dd24d2a..9c0f8bb03 100644 --- a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml +++ b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 0 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py index 36a730539..17b6c4c89 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.co_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 0 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml 
b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml index c9faa5417..e41d91b53 100644 --- a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 0 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py index 110e22cb8..fe18b714f 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.co_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 1 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml index f2550f211..6523b582e 100644 --- a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 1 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py index 82f1a1380..f63b35b45 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py @@ -28,7 +28,7 @@ 
dag_id="epa_historical_air_quality.hap_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 1 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml index 2aeb82d7d..b06b3acc2 100644 --- a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 1 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py index fdde73355..2049296d3 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.hap_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 2 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml index 69ed033eb..ed44524a0 100644 --- a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 2 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py 
b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py index 446c42eb1..99ecaba21 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.lead_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 2 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml index 808e6072d..a0fb46080 100644 --- a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 2 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py index 8bd75aa4c..12793e677 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.no2_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 3 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml index 9e441b41e..e342bf07e 100644 --- a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml @@ -28,7 
+28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 3 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py index 35ce8d0b1..c4bf58df8 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.no2_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 3 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml index ac0ba1b0d..c867ca87d 100644 --- a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 3 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py index cbedbaa53..909bcb0dc 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.nonoxnoy_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 4 * * *", catchup=False, default_view="graph", ) as dag: diff --git 
a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml index ac4bd27ca..efa9d91c8 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 4 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py index 23c79567f..bb136bdd2 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.nonoxnoy_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 4 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml index a94067766..a3697c022 100644 --- a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 4 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py index c9854b8ee..adc12d601 100644 --- 
a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.ozone_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 5 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml index 7bf792c24..606c6aefa 100644 --- a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 5 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py index 0166c20a8..a6508591d 100644 --- a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.ozone_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 5 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml index 49d53f1b4..2e302bd74 100644 --- a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' 
max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 5 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml index 0bd08f5bc..ea22b406f 100644 --- a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 6 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py index 481790157..ec5d38ec7 100644 --- a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pm10_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 6 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml index c7753ac99..5ec784da8 100644 --- a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 6 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py index bb19bcc5d..e0b441cf0 
100644 --- a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pm10_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 6 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml index edbbef7b0..a65c60116 100644 --- a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 7 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py index e3be71dd5..05964ff80 100644 --- a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pm25_frm_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 7 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml index 67427fc17..785463927 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml @@ -28,7 
+28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 7 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py index aa80378ed..7705307a4 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pm25_nonfrm_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 7 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml index ec2a4e4b6..36775c9b3 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 8 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py index ed7d4b8d4..635612e04 100644 --- a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pm25_nonfrm_hourly_summary", default_args=default_args, max_active_runs=1, - 
schedule_interval="@daily", + schedule_interval="0 8 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml index 5f8bc25fb..a4a093875 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 8 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py index 75b044294..433ff6d75 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pm25_speciation_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 8 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml index b5a11fcf9..cbb4c1c93 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 9 * * *" catchup: False default_view: graph diff --git 
a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py index 1f677a749..50362adab 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pm25_speciation_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 9 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml index afee47002..db6ad5d00 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 9 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py index c01ea5fdf..10d22e16f 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pressure_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 9 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml 
b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml index ae67eedb6..79e46c0e8 100644 --- a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 10 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py index 1987ac61a..f1a5f7477 100644 --- a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.pressure_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 10 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml index 508cb00fe..3bf0a3c2a 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 10 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py index ac9ac395f..8bda3f59b 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py +++ 
b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.rh_and_dp_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 10 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml index 796972c7c..ce3f8628b 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 11 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py index 70f4f066c..fde00a884 100644 --- a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.rh_and_dp_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 11 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml index 6733ade79..400f068c4 100644 --- a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: 
"@daily" + schedule_interval: "30 11 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py index 503ebb78f..6e654d455 100644 --- a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.so2_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 11 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml index 6c0311ea6..66713d76d 100644 --- a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 12 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py index 3687aea9d..f3b54b1f4 100644 --- a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.so2_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 12 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml 
index 681ee0a76..eca99a645 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 12 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py index aa95ebada..3d62bd329 100644 --- a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.temperature_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 12 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml index 77bd98f12..005d49fac 100644 --- a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 13 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py index 45d763b2d..35ee19165 100644 --- a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py +++ 
b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.temperature_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 13 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml index f27eb21e4..1c2efde87 100644 --- a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "30 13 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py index c19249ab3..a1dc95991 100644 --- a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.voc_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="30 13 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml index 9ffecbd06..ff13171f6 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 14 * * *" catchup: False default_view: graph diff 
--git a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py index 0f85d0172..9d6e061b9 100644 --- a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.voc_hourly_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 14 * * *", catchup=False, default_view="graph", ) as dag: diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml index 4a683203a..02abb84d7 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml @@ -28,7 +28,7 @@ dag: depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 - schedule_interval: "@daily" + schedule_interval: "0 15 * * *" catchup: False default_view: graph diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py index 7f1ca9ec8..6dd0792ee 100644 --- a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py @@ -28,7 +28,7 @@ dag_id="epa_historical_air_quality.wind_daily_summary", default_args=default_args, max_active_runs=1, - schedule_interval="@daily", + schedule_interval="0 15 * * *", catchup=False, default_view="graph", ) as dag: From 6a5b3f18db2eff8c367296f937dd9e414bcb0860 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 4 Nov 2021 00:30:53 +0000 Subject: [PATCH 22/26] fix: Resolved invalid folder path in pipeline.yaml --- 
.../pm25_speciation_daily_summary/pipeline.yaml | 2 +- .../pm25_speciation_daily_summary_dag.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml index a4a093875..7f5c1e916 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml @@ -60,7 +60,7 @@ dag: TARGET_FILE: "files/data_output.csv" CHUNKSIZE: "2500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_daily_speciation_summary/files/data_output.csv" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv" DATA_NAMES: >- [ "state_code", "county_code", "site_num", "parameter_code", "poc", "latitude", "longitude", "datum", "parameter_name", "sample_duration", diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py index 433ff6d75..9fd91fb3d 100644 --- a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py @@ -64,7 +64,7 @@ "TARGET_FILE": "files/data_output.csv", "CHUNKSIZE": "2500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_daily_speciation_summary/files/data_output.csv", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", 
"sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, From 9d7c7535b8a3f190cf423792805012d443ca3b60 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 4 Nov 2021 16:08:43 +0000 Subject: [PATCH 23/26] fix: Removed out of date terraform file --- .../_terraform/epa_dataset.tf | 26 ------------------- 1 file changed, 26 deletions(-) delete mode 100644 datasets/epa_historical_air_quality/_terraform/epa_dataset.tf diff --git a/datasets/epa_historical_air_quality/_terraform/epa_dataset.tf b/datasets/epa_historical_air_quality/_terraform/epa_dataset.tf deleted file mode 100644 index ce6e58582..000000000 --- a/datasets/epa_historical_air_quality/_terraform/epa_dataset.tf +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -resource "google_bigquery_dataset" "epa" { - dataset_id = "epa" - project = var.project_id - description = "epa" -} - -output "bigquery_dataset-epa-dataset_id" { - value = google_bigquery_dataset.epa.dataset_id -} From fcb1503fa10bab3c5581051fe57bc18e1365caf9 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 4 Nov 2021 17:10:56 +0000 Subject: [PATCH 24/26] fix: attempting to resolve code check issues. --- .../_terraform/epa_historical_air_quality_dataset.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf b/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf index b1917471e..0e76d4064 100644 --- a/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf +++ b/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf @@ -21,6 +21,11 @@ resource "google_bigquery_dataset" "epa_historical_air_quality" { description = "EPA Historical Air Quality Datasets" } +resource "google_storage_bucket" "epa_historical_air_quality" { + name = "${var.bucket_name_prefix}-epa_historical_air_quality" + force_destroy = true +} + output "bigquery_dataset-epa_historical_air_quality-dataset_id" { value = google_bigquery_dataset.epa_historical_air_quality.dataset_id } From 34d67fa6eecc57b81069fb135b08a30d4e33af02 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Thu, 4 Nov 2021 22:41:58 +0000 Subject: [PATCH 25/26] fix: Missed one change specified in code review. 
--- .../_images/run_csv_transform_kub/csv_transform.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py index ff894fd20..bd56ca0b6 100644 --- a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py @@ -237,7 +237,6 @@ def convert_dt_format(dt_str: str, from_format: str) -> str: else: dt_str = "" - # return datetime.datetime.strptime(dt_str, from_format).strftime("%Y-%m-%d %H:%M:%S") return rtnval From 5bf30622d1841899ece0427fbe750efbc451ed40 Mon Sep 17 00:00:00 2001 From: nlarge-google Date: Mon, 29 Nov 2021 19:35:03 +0000 Subject: [PATCH 26/26] fix: Increase memory and CPU for pressure_daily_summary and reduced batch size in order to resolve memory pressure resulting in DAG failure. --- .../pressure_daily_summary/pipeline.yaml | 6 +++--- .../pressure_daily_summary/pressure_daily_summary_dag.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml index db6ad5d00..ab61ca317 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml @@ -58,7 +58,7 @@ dag: START_YEAR: "1990" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" + CHUNKSIZE: "500000" TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv" DATA_NAMES: >- @@ -76,8 +76,8 @@ dag: "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", "county_name": "str", "city_name": "str", 
"cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } resources: - limit_memory: "4G" - limit_cpu: "1" + limit_memory: "8G" + limit_cpu: "3" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py index 10d22e16f..63e902c5e 100644 --- a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py @@ -62,13 +62,13 @@ "START_YEAR": "1990", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", + "CHUNKSIZE": "500000", "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv", "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n 
"method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', }, - resources={"limit_memory": "4G", "limit_cpu": "1"}, + resources={"limit_memory": "8G", "limit_cpu": "3"}, ) # Task to load CSV data to a BigQuery table