diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/Dockerfile b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/Dockerfile
new file mode 100644
index 000000000..748bc3bec
--- /dev/null
+++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/Dockerfile
@@ -0,0 +1,21 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM python:3.8
+ENV PYTHONUNBUFFERED True
+COPY requirements.txt ./
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+WORKDIR /custom
+COPY ./csv_transform.py .
+CMD ["python3", "csv_transform.py"]
diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py
new file mode 100644
index 000000000..bd56ca0b6
--- /dev/null
+++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/csv_transform.py
@@ -0,0 +1,329 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import fnmatch
+import json
+import logging
+import os
+import pathlib
+import typing
+import zipfile
+
+import pandas as pd
+import requests
+from google.cloud import storage
+
+
+def main(
+    source_url: str,
+    start_year: int,
+    source_file: pathlib.Path,
+    target_file: pathlib.Path,
+    chunksize: str,
+    target_gcs_bucket: str,
+    target_gcs_path: str,
+    data_names: typing.List[str],
+    data_dtypes: dict,
+) -> None:
+    """Download the yearly archives, merge and transform them, then upload.
+
+    Archives for ``start_year`` up to two years ago are mandatory; archives
+    for the last two calendar years are optional (a failed download is
+    logged and skipped).
+    """
+    logging.info("Pipeline process started")
+
+    pathlib.Path("./files").mkdir(parents=True, exist_ok=True)
+    dest_path = os.path.split(source_file)[0]
+    this_year = datetime.datetime.today().year
+    download_url_files_from_year_range(
+        source_url, start_year, this_year - 2, dest_path, True, False
+    )
+    download_url_files_from_year_range(
+        source_url, this_year - 1, this_year, dest_path, True, True
+    )
+    file_group_wildcard = os.path.split(source_url)[1].replace("_YEAR_ITERATOR.zip", "")
+    source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",")
+
+    process_source_file(source, target_file, data_names, data_dtypes, int(chunksize))
+
+    upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path)
+
+    logging.info("Pipeline process completed")
+
+
+def download_url_files_from_year_range(
+    source_url: str,
+    start_year: int,
+    end_year: int,
+    dest_path: str,
+    remove_file: bool = False,
+    continue_on_error: bool = False,
+) -> None:
+    """Download and unpack one archive per year from start_year to end_year.
+
+    ``YEAR_ITERATOR`` in ``source_url`` is replaced by each year in turn.
+    """
+    for yr in range(start_year, end_year + 1):
+        src_url = source_url.replace("YEAR_ITERATOR", str(yr))
+        dest_file = dest_path + "/source_" + os.path.split(src_url)[1]
+        download_file_http(src_url, dest_file, continue_on_error)
+        unpack_file(dest_file, dest_path, "zip")
+        # A skipped download leaves no archive behind, so only remove what
+        # is actually there.
+        if remove_file and os.path.exists(dest_file):
+            os.remove(dest_file)
+
+
+def download_file_http(
+    source_url: str, source_file: pathlib.Path, continue_on_error: bool = False
+) -> None:
+    """Stream ``source_url`` into ``source_file``.
+
+    A request failure, including an HTTP error status, is logged and any
+    partial file is removed. The failure is then either swallowed
+    (``continue_on_error``) or re-raised as ``SystemExit``.
+    """
+    logging.info(f"Downloading {source_url} to {source_file}")
+    try:
+        with requests.get(source_url, stream=True) as response:
+            # requests does not raise on 4xx/5xx by itself; without this an
+            # error page would be saved as if it were the archive.
+            response.raise_for_status()
+            with open(source_file, "wb") as f:
+                for chunk in response.iter_content(chunk_size=1024 * 1024):
+                    f.write(chunk)
+    except requests.exceptions.RequestException as e:
+        # Do not leave a truncated archive for the unpack step to choke on.
+        if os.path.exists(source_file):
+            os.remove(source_file)
+        # e is an exception instance, so it is matched with isinstance().
+        if isinstance(e, requests.exceptions.HTTPError):
+            err_msg = "A HTTP error occurred."
+        elif isinstance(e, requests.exceptions.Timeout):
+            err_msg = "A HTTP timeout error occurred."
+        elif isinstance(e, requests.exceptions.TooManyRedirects):
+            err_msg = "Too Many Redirects occurred."
+        else:
+            err_msg = "A request error occurred."
+        if not continue_on_error:
+            logging.info(f"{err_msg} Unable to obtain {source_url}")
+            raise SystemExit(e)
+        else:
+            logging.info(
+                f"{err_msg} Unable to obtain {source_url}. Continuing execution."
+            )
+
+
+def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> None:
+    """Unpack ``infile`` into ``dest_path`` if it exists and is a zip archive."""
+    if not os.path.exists(infile):
+        logging.info(f"{infile} not unpacked because it does not exist.")
+    elif compression_type == "zip":
+        zip_decompress(infile, dest_path)
+    else:
+        logging.info(
+            f"{infile} ignored as it is not compressed or is of unknown compression"
+        )
+
+
+def zip_decompress(infile: str, dest_path: str) -> None:
+    """Extract every member of the zip archive ``infile`` into ``dest_path``."""
+    logging.info(f"Unpacking {infile} to {dest_path}")
+    # The context manager closes the archive; no explicit close() is needed.
+    with zipfile.ZipFile(infile, mode="r") as zipf:
+        zipf.extractall(dest_path)
+
+
+def concatenate_files(
+    target_file_path: str,
+    dest_path: str,
+    file_group_wildcard: str,
+    incl_file_source_path: bool = False,
+    separator: str = ",",
+    delete_src_file: bool = True,
+) -> str:
+    """Merge the rows of every matching file in ``dest_path`` into one file.
+
+    The first line of each source file is skipped. The output path is
+    ``target_file_path`` with ``_<file_group_wildcard>`` inserted before
+    ``.csv``; that path is returned.
+    """
+    target_file_dir = os.path.split(str(target_file_path))[0]
+    target_file_path = str(target_file_path).replace(
+        ".csv", "_" + file_group_wildcard + ".csv"
+    )
+    logging.info(f"Concatenating files {target_file_dir}/*{file_group_wildcard}")
+    if os.path.isfile(target_file_path):
+        os.unlink(target_file_path)
+    for src_file_name in sorted(
+        fnmatch.filter(os.listdir(dest_path), "*" + file_group_wildcard + "*")
+    ):
+        src_file_path = dest_path + "/" + src_file_name
+        with open(src_file_path, "r") as src_file, open(
+            target_file_path, "a+"
+        ) as target_file:
+            # Skip the first line; the default keeps an empty file from
+            # raising StopIteration.
+            next(src_file, None)
+            logging.info(
+                f"Reading from file {src_file_path}, writing to file {target_file_path}"
+            )
+            for line in src_file:
+                if incl_file_source_path:
+                    # include the file source
+                    line = '"' + src_file_name.strip() + '"' + separator + line
+                target_file.write(line)
+        if delete_src_file and os.path.isfile(src_file_path):
+            os.unlink(src_file_path)
+
+    return target_file_path
+
+
+def process_source_file(
+    source_file: str, target_file: str, names: list, dtypes: dict, chunksize: int
+) -> None:
+    """Read ``source_file`` in chunks and append each one to ``target_file``."""
+    logging.info(f"Opening batch file {source_file}")
+    with pd.read_csv(
+        source_file,  # path to main source file to load in batches
+        engine="python",
+        encoding="utf-8",
+        quotechar='"',  # string separator, typically double-quotes
+        chunksize=chunksize,  # size of batch data, in no. of records
+        sep=",",  # data column separator, typically ","
+        header=None,  # use when the data file does not contain a header
+        names=names,
+        dtype=dtypes,
+        keep_default_na=True,
+        na_values=[" "],
+    ) as reader:
+        for chunk_number, chunk in enumerate(reader):
+            target_file_batch = str(target_file).replace(
+                ".csv", "-" + str(chunk_number) + ".csv"
+            )
+            # Only the first chunk keeps its header row in the target file.
+            process_chunk(chunk, target_file_batch, target_file, chunk_number != 0)
+
+
+def process_chunk(
+    df: pd.DataFrame,
+    target_file_batch: str,
+    target_file: str,
+    skip_header: bool,
+) -> None:
+    """Transform one chunk and append it to ``target_file`` via a batch file."""
+    df = resolve_date_format(df, "%Y-%m-%d %H:%M")
+    save_to_new_file(df, file_path=str(target_file_batch), sep=",")
+    # The chunk that keeps its header is the first one, so it also truncates.
+    append_batch_file(target_file_batch, target_file, skip_header, not skip_header)
+
+
+def resolve_date_format(df: pd.DataFrame, from_format: str) -> pd.DataFrame:
+    """Rewrite every ``datetime64[ns]`` column as formatted date strings."""
+    logging.info("Resolving Date Format")
+    for col in df.columns:
+        if df[col].dtype == "datetime64[ns]":
+            logging.info(f"Resolving datetime on {col}")
+            df[col] = df[col].apply(lambda x: convert_dt_format(str(x), from_format))
+
+    return df
+
+
+def convert_dt_format(dt_str: str, from_format: str) -> str:
+    """Normalise a date/time string to ``YYYY-MM-DD HH:MM:SS``.
+
+    Returns an empty string for empty, NaN or NaT input and for values
+    whose layout is not recognised.
+    """
+    if not dt_str or str(dt_str).lower() in ("nan", "nat"):
+        return ""
+    dt_str = dt_str.strip()
+    if len(dt_str) == 10:
+        # if there is no time format
+        return dt_str + " 00:00:00"
+    date_part, _, time_part = dt_str.partition(" ")
+    if len(time_part) == 8:
+        # if format of time portion is 00:00:00 then use 00:00 format
+        dt_str = dt_str[:-3]
+    elif (len(date_part.split("-")[0]) == 4) and (
+        len(from_format.strip().split("/")[0]) == 2
+    ):
+        # if the format of the date portion of the data is in YYYY-MM-DD format
+        # and from_format is in MM/DD/YYYY then resolve this by modifying the
+        # from_format to use the YYYY-MM-DD. This resolves mixed date formats
+        from_format = "%Y-%m-%d " + from_format.strip().split(" ")[1]
+    else:
+        return ""
+    return datetime.datetime.strptime(dt_str, from_format).strftime(
+        "%Y-%m-%d %H:%M:%S"
+    )
+
+
+def save_to_new_file(df: pd.DataFrame, file_path: str, sep: str = "|") -> None:
+    """Write ``df`` to ``file_path`` as delimited text without the index."""
+    logging.info(f"Saving to file {file_path} separator='{sep}'")
+    df.to_csv(file_path, sep=sep, index=False)
+
+
+def append_batch_file(
+    batch_file_path: str, target_file_path: str, skip_header: bool, truncate_file: bool
+) -> None:
+    """Append the batch file to the target file, then delete the batch file."""
+    # "w" starts the target afresh; "a" keeps what earlier batches wrote.
+    target_mode = "w" if truncate_file else "a"
+    with open(batch_file_path, "r") as data_file, open(
+        target_file_path, target_mode
+    ) as target_file:
+        if skip_header:
+            logging.info(
+                f"Appending batch file {batch_file_path} to {target_file_path} with skip header"
+            )
+            # The default keeps an empty batch from raising StopIteration.
+            next(data_file, None)
+        else:
+            logging.info(
+                f"Appending batch file {batch_file_path} to {target_file_path}"
+            )
+        # Copy line by line rather than loading the whole batch into memory.
+        target_file.writelines(data_file)
+    # Delete the batch file only once its handle has been closed.
+    if os.path.exists(batch_file_path):
+        os.remove(batch_file_path)
+
+
+def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None:
+    """Upload the local ``file_path`` to ``gcs_path`` in bucket ``gcs_bucket``."""
+    logging.info(f"Uploading to GCS {gcs_bucket} in {gcs_path}")
+    storage_client = storage.Client()
+    bucket = storage_client.bucket(gcs_bucket)
+    blob = bucket.blob(gcs_path)
+    blob.upload_from_filename(file_path)
+
+
+if __name__ == "__main__":
+    logging.getLogger().setLevel(logging.INFO)
+
+    main(
+        source_url=os.environ["SOURCE_URL"],
+        source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(),
+        target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(),
+        start_year=int(os.environ["START_YEAR"]),
+        chunksize=os.environ["CHUNKSIZE"],
+        target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"],
+        target_gcs_path=os.environ["TARGET_GCS_PATH"],
+        data_names=json.loads(os.environ["DATA_NAMES"]),
+        data_dtypes=json.loads(os.environ["DATA_DTYPES"]),
+    )
diff --git a/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt
new file mode 100644
index 000000000..f36704793
--- /dev/null
+++ b/datasets/epa_historical_air_quality/_images/run_csv_transform_kub/requirements.txt
@@ -0,0 +1,3 @@
+requests
+pandas
+google-cloud-storage
diff --git a/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf
new file mode 100644
index 000000000..7084028c2
--- /dev/null
+++ b/datasets/epa_historical_air_quality/_terraform/annual_summaries_pipeline.tf
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_annual_summaries" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "annual_summaries" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_annual_summaries-table_id" { + value = google_bigquery_table.epa_historical_air_quality_annual_summaries.table_id +} + +output "bigquery_table-epa_historical_air_quality_annual_summaries-id" { + value = google_bigquery_table.epa_historical_air_quality_annual_summaries.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/co_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/co_daily_summary_pipeline.tf new file mode 100644 index 000000000..4b475afed --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/co_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_co_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "co_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_co_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_co_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/co_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/co_hourly_summary_pipeline.tf new file mode 100644 index 000000000..96131d79d --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/co_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_co_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "co_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_co_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_co_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf b/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf new file mode 100644 index 000000000..0e76d4064 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/epa_historical_air_quality_dataset.tf @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_dataset" "epa_historical_air_quality" { + dataset_id = "epa_historical_air_quality" + project = var.project_id + description = "EPA Historical Air Quality Datasets" +} + +resource "google_storage_bucket" "epa_historical_air_quality" { + name = "${var.bucket_name_prefix}-epa_historical_air_quality" + force_destroy = true +} + +output "bigquery_dataset-epa_historical_air_quality-dataset_id" { + value = google_bigquery_dataset.epa_historical_air_quality.dataset_id +} diff --git a/datasets/epa_historical_air_quality/_terraform/hap_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/hap_daily_summary_pipeline.tf new file mode 100644 index 000000000..b8aac1e45 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/hap_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_hap_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "hap_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_hap_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_hap_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/hap_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/hap_hourly_summary_pipeline.tf new file mode 100644 index 000000000..dd7896ae1 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/hap_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_hap_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "hap_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/lead_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/lead_daily_summary_pipeline.tf new file mode 100644 index 000000000..d7fd58f20 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/lead_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_lead_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "lead_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_lead_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_lead_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/no2_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/no2_daily_summary_pipeline.tf new file mode 100644 index 000000000..448a029f8 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/no2_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_no2_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "no2_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_no2_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_no2_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/no2_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/no2_hourly_summary_pipeline.tf new file mode 100644 index 000000000..2d057b403 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/no2_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_no2_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "no2_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/nonoxnoy_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_daily_summary_pipeline.tf new file mode 100644 index 000000000..92f5294c7 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "nonoxnoy_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/nonoxnoy_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_hourly_summary_pipeline.tf new file mode 100644 index 000000000..4b57e8fba --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/nonoxnoy_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "nonoxnoy_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/ozone_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/ozone_daily_summary_pipeline.tf new file mode 100644 index 000000000..19cff7cc2 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/ozone_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_ozone_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "ozone_daily_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/ozone_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/ozone_hourly_summary_pipeline.tf new file mode 100644 index 000000000..517e8127c --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/ozone_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_ozone_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "ozone_hourly_summary" + + description = "epaspc" + + + + + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm10_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm10_daily_summary_pipeline.tf new file mode 100644 index 000000000..af38e7681 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm10_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pm10_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pm10_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm10_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm10_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm10_hourly_summary_pipeline.tf new file mode 100644 index 000000000..d83d38c9e --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm10_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pm10_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pm10_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm10_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_frm_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_frm_hourly_summary_pipeline.tf new file mode 100644 index 000000000..3d64246b4 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_frm_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pm25_frm_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pm25_frm_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_frm_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_daily_summary_pipeline.tf new file mode 100644 index 000000000..5faf05f88 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pm25_nonfrm_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_nonfrm_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_hourly_summary_pipeline.tf new file mode 100644 index 000000000..8cb22a6ac --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_nonfrm_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pm25_nonfrm_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_nonfrm_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_speciation_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_daily_summary_pipeline.tf new file mode 100644 index 000000000..c4ce35a13 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pm25_speciation_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_speciation_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pm25_speciation_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_hourly_summary_pipeline.tf new file mode 100644 index 000000000..aa0da3bf7 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pm25_speciation_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pm25_speciation_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_speciation_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pressure_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pressure_daily_summary_pipeline.tf new file mode 100644 index 000000000..f67bfa0eb --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pressure_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pressure_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pressure_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pressure_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/pressure_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/pressure_hourly_summary_pipeline.tf new file mode 100644 index 000000000..23fa46310 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/pressure_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table pressure_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_pressure_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pressure_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/provider.tf b/datasets/epa_historical_air_quality/_terraform/provider.tf new file mode 100644 index 000000000..23ab87dcd --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* Google provider configuration: project, impersonated service account and region are all taken from input variables. */ +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} +/* Userinfo data source; its email attribute is read by the output that follows. */ +data "google_client_openid_userinfo" "me" {} +/* Exports the email attribute of the userinfo data source under the name impersonating-account. */ +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/epa_historical_air_quality/_terraform/rh_and_dp_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_daily_summary_pipeline.tf new file mode 100644 index 000000000..7bd465c09 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table rh_and_dp_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "rh_and_dp_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/rh_and_dp_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_hourly_summary_pipeline.tf new file mode 100644 index 000000000..f259b3cba --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/rh_and_dp_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table rh_and_dp_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "rh_and_dp_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/so2_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/so2_daily_summary_pipeline.tf new file mode 100644 index 000000000..c2e5bfa02 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/so2_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table so2_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_so2_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "so2_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_so2_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_so2_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/so2_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/so2_hourly_summary_pipeline.tf new file mode 100644 index 000000000..5a74e4d45 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/so2_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table so2_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_so2_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "so2_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_so2_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_so2_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/temperature_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/temperature_daily_summary_pipeline.tf new file mode 100644 index 000000000..98865c34e --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/temperature_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table temperature_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_temperature_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "temperature_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/temperature_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/temperature_hourly_summary_pipeline.tf new file mode 100644 index 000000000..09bd21923 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/temperature_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table temperature_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_temperature_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "temperature_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/variables.tf b/datasets/epa_historical_air_quality/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* Input variables for this configuration, each declared with an empty body (no type, description or default). NOTE(review): no reference to bucket_name_prefix or env is visible in this part of the patch; confirm where they are consumed. */ +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/epa_historical_air_quality/_terraform/voc_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/voc_daily_summary_pipeline.tf new file mode 100644 index 000000000..7348fa307 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/voc_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table voc_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_voc_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "voc_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_voc_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_voc_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/voc_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/voc_hourly_summary_pipeline.tf new file mode 100644 index 000000000..7a337682d --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/voc_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table voc_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_voc_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "voc_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/wind_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/wind_daily_summary_pipeline.tf new file mode 100644 index 000000000..90d444049 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/wind_daily_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table wind_daily_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_wind_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "wind_daily_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_wind_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_wind_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.id +} diff --git a/datasets/epa_historical_air_quality/_terraform/wind_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/_terraform/wind_hourly_summary_pipeline.tf new file mode 100644 index 000000000..257bce937 --- /dev/null +++ b/datasets/epa_historical_air_quality/_terraform/wind_hourly_summary_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* BigQuery table wind_hourly_summary in dataset epa_historical_air_quality of project var.project_id; no schema is declared in this block. */ +resource "google_bigquery_table" "epa_historical_air_quality_wind_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "wind_hourly_summary" + + description = "epaspc" +/* NOTE(review): the description value epaspc does not describe this table and looks like a placeholder; confirm the intended text. */ + + +/* Explicit dependency: Terraform creates this table only after the referenced dataset resource. */ + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} +/* Exports the table_id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.table_id +} +/* Exports the id attribute of the table resource above. */ +output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py new file mode 100644 index 000000000..ab8494e2f --- /dev/null +++ b/datasets/epa_historical_air_quality/annual_summaries/annual_summaries_dag.py @@ -0,0 +1,420 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.annual_summaries", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 0 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="annual_summaries", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_YEAR_ITERATOR.zip", + "START_YEAR": "1980", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/annual_summaries/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure",\n "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count",\n "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count",\n "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", 
"arithmetic_standard_dev", "first_max_value",\n "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime",\n "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value",\n "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile",\n "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address",\n "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str",\n "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32",\n "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str",\n "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64",\n "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]",\n "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64",\n "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", 
"ninety_percentile": "float64",\n "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str",\n "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/annual_summaries/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.annual_summaries_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the 'Parameter Occurrence Code' used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The 
monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "metric_used", + "type": "STRING", + "description": "The base metric used in the calculation of the aggregate statistics presented in the remainder of the row. 
For example, if this is Daily Maximum, then the value in the Mean column is the mean of the daily maximums.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "year", + "type": "INTEGER", + "description": "The year the annual summary data represents.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the year.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the year. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "completeness_indicator", + "type": "STRING", + "description": "An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "valid_day_count", + "type": "INTEGER", + "description": "The number of days during the year where the daily monitoring criteria were met, if the calculation of the summaries is based on valid days.", + "mode": "NULLABLE", + }, + { + "name": "required_day_count", + "type": "INTEGER", + "description": "The number of days during the year which the monitor was scheduled to take samples if measurements are required.", + "mode": "NULLABLE", + }, + { + "name": "exceptional_data_count", + "type": "INTEGER", + "description": "The number of data points in the annual data set affected by exceptional air quality events (things outside the norm that affect air quality).", + "mode": "NULLABLE", + }, + { + "name": "null_data_count", + "type": "INTEGER", + "description": "The count of scheduled samples when no data was collected and the reason for no data was reported.", + "mode": "NULLABLE", + }, + { + "name": "primary_exceedance_count", + "type": "INTEGER", + "description": "The number of samples during the year that exceeded the primary air quality standard.", + "mode": "NULLABLE", + }, + { + "name": "secondary_exceedance_count", + "type": "INTEGER", + "description": "The number of samples during the year that exceeded the secondary air quality standard.", + "mode": "NULLABLE", + }, + { + "name": "certification_indicator", + "type": "STRING", + "description": "An indication whether the completeness and accuracy of the information on the annual summary record has been certified by the submitter. 
Certified means the submitter has certified the data (due May 01 the year after collection). Certification not required means that the parameter does not require certification or the deadline has not yet passed. Uncertified (past due) means that certification is required but is overdue. Requested but not yet concurred means the submitter has completed the process, but EPA has not yet acted to certify the data. Requested but denied means the submitter has completed the process, but EPA has denied the request for cause. Was certified but data changed means the data was certified but data was replaced and the process has not been repeated.", + "mode": "NULLABLE", + }, + { + "name": "num_obs_below_mdl", + "type": "INTEGER", + "description": "The number of samples reported during the year that were below the method detection limit (MDL) for the monitoring instrument. Sometimes these values are replaced by 1/2 the MDL in summary calculations.", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the year.", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_standard_dev", + "type": "FLOAT", + "description": "The standard deviation about the mean of the values for the year.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "first_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the highest value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "second_max_value", + "type": "FLOAT", + "description": "The second highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "second_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the second highest value for the year (the previous field) was taken.", + "mode": 
"NULLABLE", + }, + { + "name": "third_max_value", + "type": "FLOAT", + "description": "The third highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "third_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the third highest value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "fourth_max_value", + "type": "FLOAT", + "description": "The fourth highest value for the year.", + "mode": "NULLABLE", + }, + { + "name": "fourth_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the fourth highest value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "first_max_non_overlapping_value", + "type": "FLOAT", + "description": "For 8-hour CO averages, the highest value of the year.", + "mode": "NULLABLE", + }, + { + "name": "first_no_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the first maximum non overlapping value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "second_max_non_overlapping_value", + "type": "FLOAT", + "description": "For 8-hour CO averages, the second highest value of the year that does not share any hours with the 8-hour period of the first max non overlapping value.", + "mode": "NULLABLE", + }, + { + "name": "second_no_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the second maximum non overlapping value for the year (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "ninety_nine_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 99 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "ninety_eight_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 
98 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "ninety_five_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 95 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "ninety_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 90 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "seventy_five_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 75 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "fifty_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 50 per cent of the rest of the measured values for the year are equal to or less than (i.e., the median).", + "mode": "NULLABLE", + }, + { + "name": "ten_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 10 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + 
"description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml new file mode 100644 index 000000000..9c0f8bb03 --- /dev/null +++ b/datasets/epa_historical_air_quality/annual_summaries/pipeline.yaml @@ -0,0 +1,327 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "annual_summaries" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: annual_summaries + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 0 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "annual_summaries" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_YEAR_ITERATOR.zip" + START_YEAR: "1980" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/annual_summaries/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", + "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", + "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", + "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", + "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", + "fourth_max_value", "fourth_max_datetime", 
"first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", + "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", + "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", + "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", + "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", + "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str", + "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", + "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", + "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", + "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64", + "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", + "state_name": "str", 
"county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/annual_summaries/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.annual_summaries_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the 'Parameter Occurrence Code' used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." 
+ "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "metric_used" + "type": "STRING" + "description": "The base metric used in the calculation of the aggregate statistics presented in the remainder of the row. For example, if this is Daily Maximum, then the value in the Mean column is the mean of the daily maximums." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "year" + "type": "INTEGER" + "description": "The year the annual summary data represents." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
+ "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the year." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the year. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "completeness_indicator" + "type": "STRING" + "description": "An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter." + "mode": "NULLABLE" + - "name": "valid_day_count" + "type": "INTEGER" + "description": "The number of days during the year where the daily monitoring criteria were met, if the calculation of the summaries is based on valid days." + "mode": "NULLABLE" + - "name": "required_day_count" + "type": "INTEGER" + "description": "The number of days during the year which the monitor was scheduled to take samples if measurements are required." 
+ "mode": "NULLABLE" + - "name": "exceptional_data_count" + "type": "INTEGER" + "description": "The number of data points in the annual data set affected by exceptional air quality events (things outside the norm that affect air quality)." + "mode": "NULLABLE" + - "name": "null_data_count" + "type": "INTEGER" + "description": "The count of scheduled samples when no data was collected and the reason for no data was reported." + "mode": "NULLABLE" + - "name": "primary_exceedance_count" + "type": "INTEGER" + "description": "The number of samples during the year that exceeded the primary air quality standard." + "mode": "NULLABLE" + - "name": "secondary_exceedance_count" + "type": "INTEGER" + "description": "The number of samples during the year that exceeded the secondary air quality standard." + "mode": "NULLABLE" + - "name": "certification_indicator" + "type": "STRING" + "description": "An indication whether the completeness and accuracy of the information on the annual summary record has been certified by the submitter. Certified means the submitter has certified the data (due May 01 the year after collection). Certification not required means that the parameter does not require certification or the deadline has not yet passed. Uncertified (past due) means that certification is required but is overdue. Requested but not yet concurred means the submitter has completed the process, but EPA has not yet acted to certify the data. Requested but denied means the submitter has completed the process, but EPA has denied the request for cause. Was certified but data changed means the data was certified but data was replaced and the process has not been repeated." + "mode": "NULLABLE" + - "name": "num_obs_below_mdl" + "type": "INTEGER" + "description": "The number of samples reported during the year that were below the method detection limit (MDL) for the monitoring instrument. Sometimes these values are replaced by 1/2 the MDL in summary calculations." 
+ "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the year." + "mode": "NULLABLE" + - "name": "arithmetic_standard_dev" + "type": "FLOAT" + "description": "The standard deviation about the mean of the values for the year." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the year." + "mode": "NULLABLE" + - "name": "first_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "second_max_value" + "type": "FLOAT" + "description": "The second highest value for the year." + "mode": "NULLABLE" + - "name": "second_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the second highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "third_max_value" + "type": "FLOAT" + "description": "The third highest value for the year." + "mode": "NULLABLE" + - "name": "third_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the third highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "fourth_max_value" + "type": "FLOAT" + "description": "The fourth highest value for the year." + "mode": "NULLABLE" + - "name": "fourth_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the fourth highest value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "first_max_non_overlapping_value" + "type": "FLOAT" + "description": "For 8-hour CO averages, the highest value of the year." 
+ "mode": "NULLABLE" + - "name": "first_no_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the first maximum non overlapping value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "second_max_non_overlapping_value" + "type": "FLOAT" + "description": "For 8-hour CO averages, the second highest value of the year that does not share any hours with the 8-hour period of the first max non overlapping value." + "mode": "NULLABLE" + - "name": "second_no_max_datetime" + "type": "TIMESTAMP" + "description": "The date and time (on a 24-hour clock) when the second maximum non overlapping value for the year (the previous field) was taken." + "mode": "NULLABLE" + - "name": "ninety_nine_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 99 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "ninety_eight_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 98 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "ninety_five_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 95 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "ninety_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 90 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "seventy_five_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 75 per cent of the rest of the measured values for the year are equal to or less than." 
+ "mode": "NULLABLE" + - "name": "fifty_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 50 per cent of the rest of the measured values for the year are equal to or less than (i.e., the median)." + "mode": "NULLABLE" + - "name": "ten_percentile" + "type": "FLOAT" + "description": "The value from this monitor for which 10 per cent of the rest of the measured values for the year are equal to or less than." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py new file mode 100644 index 000000000..17b6c4c89 --- /dev/null +++ b/datasets/epa_historical_air_quality/co_daily_summary/co_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.co_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 0 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="co_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42101_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + 
"data/epa_historical_air_quality/co_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.annual_summaries_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. 
Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. 
Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml new file mode 100644 index 000000000..e41d91b53 --- /dev/null +++ b/datasets/epa_historical_air_quality/co_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "co_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: co_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 0 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "co_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42101_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/co_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.co_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site."
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py new file mode 100644 index 000000000..fe18b714f --- /dev/null +++ b/datasets/epa_historical_air_quality/co_hourly_summary/co_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.co_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 1 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="co_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code",\n "method_name", "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str",\n "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str",\n "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.co_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + 
{ + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..6523b582e --- /dev/null +++ b/datasets/epa_historical_air_quality/co_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "co_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: co_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 1 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "co_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", + "method_name", "state_name", 
"county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str", + "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", + "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.co_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/dataset.yaml b/datasets/epa_historical_air_quality/dataset.yaml new file mode 100644 index 000000000..ab6deb945 --- /dev/null +++ b/datasets/epa_historical_air_quality/dataset.yaml @@ -0,0 +1,27 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dataset: + name: epa_historical_air_quality + friendly_name: epa_historical_air_quality + description: "EPA Historical Air Quality Datasets" + dataset_sources: ~ + terms_of_use: ~ + + +resources: + + - type: bigquery_dataset + dataset_id: epa_historical_air_quality + description: "EPA Historical Air Quality Datasets" diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py new file mode 100644 index 000000000..f63b35b45 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_daily_summary/hap_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.hap_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 1 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="hap_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", 
"sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.hap_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + 
"name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). 
For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. 
If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml new file mode 100644 index 000000000..b06b3acc2 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "hap_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: hap_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 1 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "hap_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.hap_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py new file mode 100644 index 000000000..2049296d3 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/hap_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.hap_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 2 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="hap_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_YEAR_ITERATOR.zip", + "START_YEAR": "1993", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.hap_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + 
write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..ed44524a0 --- /dev/null +++ b/datasets/epa_historical_air_quality/hap_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "hap_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: hap_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 2 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "hap_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_YEAR_ITERATOR.zip" + START_YEAR: "1993" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", 
"county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.hap_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." 
+ "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py new file mode 100644 index 000000000..99ecaba21 --- /dev/null +++ b/datasets/epa_historical_air_quality/lead_daily_summary/lead_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.lead_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 2 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="lead_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": 
"str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "4G", "limit_cpu": "1"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.lead_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to 
distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml new file mode 100644 index 000000000..a0fb46080 --- /dev/null +++ b/datasets/epa_historical_air_quality/lead_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "lead_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: lead_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 2 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "lead_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": 
"str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "4G" + limit_cpu: "1" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.lead_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py new file mode 100644 index 000000000..12793e677 --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_daily_summary/no2_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.no2_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 3 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="no2_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42602_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + 
"data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.no2_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. 
Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. 
Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml new file mode 100644 index 000000000..e342bf07e --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "no2_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: no2_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 3 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "no2_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42602_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.no2_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py new file mode 100644 index 000000000..c4bf58df8 --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/no2_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.no2_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 3 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="no2_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.no2_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + 
write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..c867ca87d --- /dev/null +++ b/datasets/epa_historical_air_quality/no2_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "no2_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: no2_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 3 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "no2_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", 
"county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.no2_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." 
+ "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py new file mode 100644 index 000000000..909bcb0dc --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.nonoxnoy_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 4 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="nonoxnoy_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", 
"site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.nonoxnoy_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter 
Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml new file mode 100644 index 000000000..efa9d91c8 --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "nonoxnoy_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: nonoxnoy_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 4 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "nonoxnoy_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + 
"pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.nonoxnoy_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py new file mode 100644 index 000000000..bb136bdd2 --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.nonoxnoy_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 4 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="no2_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", 
"datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.nonoxnoy_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of 
the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..a3697c022 --- /dev/null +++ b/datasets/epa_historical_air_quality/nonoxnoy_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "nonoxnoy_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: nonoxnoy_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 4 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "nonoxnoy_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64",
"units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.nonoxnoy_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py new file mode 100644 index 000000000..adc12d601 --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/ozone_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.ozone_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 5 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="ozone_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_44201_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": 
"str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.ozone_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to 
distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml new file mode 100644 index 000000000..606c6aefa --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "ozone_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: ozone_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 5 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "ozone_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_44201_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": 
"str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.ozone_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py new file mode 100644 index 000000000..a6508591d --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/ozone_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.ozone_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 5 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="ozone_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", 
"datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.ozone_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the 
equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..2e302bd74 --- /dev/null +++ b/datasets/epa_historical_air_quality/ozone_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "ozone_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: ozone_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 5 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "ozone_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", 
"units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.ozone_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Local Standard Time at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency."
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml new file mode 100644 index 000000000..ea22b406f --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "pm10_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm10_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 6 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm10_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_81102_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + 
TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm10_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + 
write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). 
For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day."
+ "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." + "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py new file mode 100644 index 000000000..ec5d38ec7 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_daily_summary/pm10_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm10_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 6 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm10_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_81102_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": 
"str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm10_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to 
distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..5ec784da8 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "pm10_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm10_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 6 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm10_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_81102_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure":
"str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm10_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py new file mode 100644 index 000000000..e0b441cf0 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm10_hourly_summary/pm10_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm10_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 6 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm10_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_81102_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum":
"str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm10_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator 
measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..a65c60116 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_frm_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_frm_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 7 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_frm_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", 
"units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_frm_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py new file mode 100644 index 000000000..05964ff80 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_frm_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 7 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_frm_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", 
"datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_frm_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of 
the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml new file mode 100644 index 000000000..785463927 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_nonfrm_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_nonfrm_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 7 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_nonfrm_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_88502_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", 
+ "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py new file mode 100644 index 000000000..7705307a4 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_nonfrm_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 7 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_nonfrm_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_88502_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": 
"str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the 
“Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..36775c9b3 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_nonfrm_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_nonfrm_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 8 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_nonfrm_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": 
"float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py new file mode 100644 index 000000000..635612e04 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_nonfrm_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 8 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_nonfrm_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": 
"float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_nonfrm_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular 
distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml new file mode 100644 index 000000000..7f5c1e916 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_speciation_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_speciation_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 8 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_speciation_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", 
"sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_speciation_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py new file mode 100644 index 000000000..9fd91fb3d --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_speciation_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 8 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_speciation_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", 
"county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_speciation_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + 
"description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..cbb4c1c93 --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pm25_speciation_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pm25_speciation_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 9 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pm25_speciation_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", 
"sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pm25_speciation_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Local Standard Time at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." 
+ "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." + "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py new file mode 100644 index 000000000..50362adab --- /dev/null +++ b/datasets/epa_historical_air_quality/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pm25_speciation_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 9 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pm25_speciation_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", 
"longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pm25_speciation_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The 
monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Local Standard Time at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml new file mode 100644 index 000000000..ab61ca317 --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "pressure_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pressure_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 9 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pressure_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + 
"pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pressure_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." 
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py new file mode 100644 index 000000000..63e902c5e --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_daily_summary/pressure_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pressure_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 9 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pressure_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", 
"site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pressure_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter 
Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..79e46c0e8 --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "pressure_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: pressure_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 10 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "pressure_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", 
"units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.pressure_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py new file mode 100644 index 000000000..f1a5f7477 --- /dev/null +++ b/datasets/epa_historical_air_quality/pressure_hourly_summary/pressure_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.pressure_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 10 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="pressure_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": 
"float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.pressure_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance 
north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml new file mode 100644 index 000000000..3bf0a3c2a --- /dev/null +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
---
# Pipeline definition for the EPA historical air quality table
# "rh_and_dp_daily_summary".
#
# It declares one BigQuery table and a two-task DAG:
#   transform_csv - runs the CSV transform image in a Kubernetes pod; the pod
#                   is configured through env_vars and uploads its output to
#                   TARGET_GCS_PATH in TARGET_GCS_BUCKET.
#   load_to_bq    - loads that CSV from the same bucket/path into BigQuery.
resources:

  # Destination table for the load_to_bq task.
  - type: bigquery_table
    table_id: "rh_and_dp_daily_summary"
    description: "epaspc"

dag:
  airflow_version: 2
  initialize:
    dag_id: rh_and_dp_daily_summary
    default_args:
      owner: "Google"
      depends_on_past: False
      start_date: '2021-03-01'
    max_active_runs: 1  # at most one run of this DAG at a time
    schedule_interval: "30 10 * * *"  # cron: every day at 10:30
    catchup: False  # do not backfill the runs between start_date and now
    default_view: graph

  tasks:

    - operator: "KubernetesPodOperator"
      description: "Run CSV transform within kubernetes pod"

      args:

        task_id: "transform_csv"
        name: "rh_and_dp_daily_summary"
        namespace: "default"
        # Schedule the pod only on nodes of the "pool-e2-standard-4" node pool.
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: cloud.google.com/gke-nodepool
                      operator: In
                      values:
                        - "pool-e2-standard-4"
        image_pull_policy: "Always"
        image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}"
        env_vars:
          # YEAR_ITERATOR is a placeholder; csv_transform.py substitutes each
          # year of the download range into it to build the real URLs.
          SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_YEAR_ITERATOR.zip"
          START_YEAR: "1990"
          SOURCE_FILE: "files/data.csv"
          TARGET_FILE: "files/data_output.csv"
          CHUNKSIZE: "2500000"
          TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}"
          # Must stay identical to source_objects of the load_to_bq task below.
          TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv"
          # JSON list of output column names, in the same order as
          # schema_fields of the load_to_bq task.
          DATA_NAMES: >-
            [ "state_code", "county_code", "site_num", "parameter_code", "poc",
              "latitude", "longitude", "datum", "parameter_name", "sample_duration",
              "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",
              "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",
              "method_code", "method_name", "local_site_name", "address", "state_name",
              "county_name", "city_name", "cbsa_name", "date_of_last_change" ]
          # JSON object mapping each column to a dtype name (presumably handed
          # to pandas by csv_transform.py - confirm against the transform).
          # NOTE(review): "aqi" and "method_code" are typed "str" here but are
          # declared INTEGER in schema_fields below - confirm this is intended.
          DATA_DTYPES: >-
            { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",
              "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",
              "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",
              "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",
              "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",
              "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }
        # Container resource limits for the transform pod.
        resources:
          limit_memory: "8G"
          limit_cpu: "3"

    - operator: "GoogleCloudStorageToBigQueryOperator"
      description: "Task to load CSV data to a BigQuery table"

      args:
        task_id: "load_to_bq"
        bucket: "{{ var.value.composer_bucket }}"
        source_objects: ["data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv"]
        source_format: "CSV"
        # NOTE(review): the destination table name is read from the
        # "container_registry" section of the Airflow variable, next to the
        # image name - confirm the variable is really laid out that way.
        destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_daily_summary_destination_table }}"
        skip_leading_rows: 1  # the CSV starts with a header row
        allow_quoted_newlines: True
        write_disposition: "WRITE_TRUNCATE"  # every run replaces the table contents
        # One entry per CSV column, in file order; all columns are NULLABLE.
        schema_fields:
          - "name": "state_code"
            "type": "STRING"
            "description": "The FIPS code of the state in which the monitor resides."
            "mode": "NULLABLE"
          - "name": "county_code"
            "type": "STRING"
            "description": "The FIPS code of the county in which the monitor resides."
            "mode": "NULLABLE"
          - "name": "site_num"
            "type": "STRING"
            "description": "A unique number within the county identifying the site."
            "mode": "NULLABLE"
          - "name": "parameter_code"
            "type": "INTEGER"
            "description": "The AQS code corresponding to the parameter measured by the monitor."
            "mode": "NULLABLE"
          - "name": "poc"
            "type": "INTEGER"
            "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site."
            "mode": "NULLABLE"
          - "name": "latitude"
            "type": "FLOAT"
            "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees."
            "mode": "NULLABLE"
          - "name": "longitude"
            "type": "FLOAT"
            "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
            "mode": "NULLABLE"
          - "name": "datum"
            "type": "STRING"
            "description": "The Datum associated with the Latitude and Longitude measures."
            "mode": "NULLABLE"
          - "name": "parameter_name"
            "type": "STRING"
            "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants."
            "mode": "NULLABLE"
          - "name": "sample_duration"
            "type": "STRING"
            "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)."
            "mode": "NULLABLE"
          - "name": "pollutant_standard"
            "type": "STRING"
            "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)"
            "mode": "NULLABLE"
          - "name": "date_local"
            "type": "TIMESTAMP"
            "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor."
            "mode": "NULLABLE"
          - "name": "units_of_measure"
            "type": "STRING"
            "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations."
            "mode": "NULLABLE"
          - "name": "event_type"
            "type": "STRING"
            "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor."
            "mode": "NULLABLE"
          - "name": "observation_count"
            "type": "INTEGER"
            "description": "The number of observations (samples) taken during the day."
            "mode": "NULLABLE"
          - "name": "observation_percent"
            "type": "FLOAT"
            "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)."
            "mode": "NULLABLE"
          - "name": "arithmetic_mean"
            "type": "FLOAT"
            "description": "The average (arithmetic mean) value for the day."
            "mode": "NULLABLE"
          - "name": "first_max_value"
            "type": "FLOAT"
            "description": "The highest value for the day."
            "mode": "NULLABLE"
          - "name": "first_max_hour"
            "type": "INTEGER"
            "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken."
            "mode": "NULLABLE"
          - "name": "aqi"
            "type": "INTEGER"
            "description": "The Air Quality Index for the day for the pollutant, if applicable."
            "mode": "NULLABLE"
          - "name": "method_code"
            "type": "INTEGER"
            "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column."
            "mode": "NULLABLE"
          - "name": "method_name"
            "type": "STRING"
            "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample."
            "mode": "NULLABLE"
          - "name": "local_site_name"
            "type": "STRING"
            "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it."
            "mode": "NULLABLE"
          - "name": "address"
            "type": "STRING"
            "description": "The approximate street address of the monitoring site."
            "mode": "NULLABLE"
          - "name": "state_name"
            "type": "STRING"
            "description": "The name of the state where the monitoring site is located."
            "mode": "NULLABLE"
          - "name": "county_name"
            "type": "STRING"
            "description": "The name of the county where the monitoring site is located."
            "mode": "NULLABLE"
          - "name": "city_name"
            "type": "STRING"
            "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas."
            "mode": "NULLABLE"
          - "name": "cbsa_name"
            "type": "STRING"
            "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located."
            "mode": "NULLABLE"
          - "name": "date_of_last_change"
            "type": "TIMESTAMP"
            "description": "The date the last time any numeric values in this record were updated in the AQS data system."
            "mode": "NULLABLE"

  # Task ordering: the load only starts after the transform has finished.
  graph_paths:
    - "transform_csv >> load_to_bq"

# ---------------------------------------------------------------------------
# The source line that ends this file also carries the patch header and the
# license banner of the next file in the diff. That text is preserved below,
# each patch line prefixed with "# " so that this file stays valid YAML.
# ---------------------------------------------------------------------------
# diff --git a/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py
# new file mode 100644
# index 000000000..8bda3f59b
# --- /dev/null
# +++ b/datasets/epa_historical_air_quality/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py
# @@ -0,0 +1,264 @@
# +# Copyright 2021 Google LLC
# +#
# +# Licensed under the Apache License, Version 2.0 (the "License");
# +# you may not use this file except in compliance with the License.
# +# You may obtain a copy of the License at
# +#
# +# http://www.apache.org/licenses/LICENSE-2.0
# +#
# +# Unless required by applicable law or agreed to in writing, software
# +# distributed under the License is distributed on an "AS IS" BASIS,
# +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# +# See the License for the specific language governing permissions and
# +# limitations under the License.
# Airflow DAG "epa_historical_air_quality.rh_and_dp_daily_summary".
#
# Two tasks run in order (see the last statement of the module):
#   1. transform_csv - starts a Kubernetes pod from the run_csv_transform_kub
#      image. The pod is configured entirely through env_vars and uploads its
#      output CSV to TARGET_GCS_PATH in TARGET_GCS_BUCKET.
#   2. load_to_bq    - loads that CSV from the same bucket/path into BigQuery,
#      replacing the table contents (WRITE_TRUNCATE).
#
# NOTE(review): the values here appear to mirror
# rh_and_dp_daily_summary/pipeline.yaml - keep the two files in sync.

from airflow import DAG
from airflow.providers.cncf.kubernetes.operators import kubernetes_pod
from airflow.providers.google.cloud.transfers import gcs_to_bigquery

default_args = {
    "owner": "Google",
    "depends_on_past": False,
    "start_date": "2021-03-01",
}


with DAG(
    dag_id="epa_historical_air_quality.rh_and_dp_daily_summary",
    default_args=default_args,
    max_active_runs=1,  # at most one run of this DAG at a time
    schedule_interval="30 10 * * *",  # cron: every day at 10:30
    catchup=False,  # do not backfill the runs between start_date and now
    default_view="graph",
) as dag:

    # Run CSV transform within kubernetes pod
    transform_csv = kubernetes_pod.KubernetesPodOperator(
        task_id="transform_csv",
        name="rh_and_dp_daily_summary",
        namespace="default",
        # Schedule the pod only on nodes of the "pool-e2-standard-4" node pool.
        affinity={
            "nodeAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": {
                    "nodeSelectorTerms": [
                        {
                            "matchExpressions": [
                                {
                                    "key": "cloud.google.com/gke-nodepool",
                                    "operator": "In",
                                    "values": ["pool-e2-standard-4"],
                                }
                            ]
                        }
                    ]
                }
            }
        },
        image_pull_policy="Always",
        image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}",
        env_vars={
            # YEAR_ITERATOR is a placeholder; csv_transform.py substitutes each
            # year of the download range into it to build the real URLs.
            "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_YEAR_ITERATOR.zip",
            "START_YEAR": "1990",
            "SOURCE_FILE": "files/data.csv",
            "TARGET_FILE": "files/data_output.csv",
            "CHUNKSIZE": "2500000",
            "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}",
            # Must stay identical to source_objects of load_to_bq below.
            "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv",
            # JSON list of output column names, in schema_fields order.
            "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]',
            # JSON object mapping each column to a dtype name (presumably handed
            # to pandas by csv_transform.py - confirm against the transform).
            # NOTE(review): "aqi" and "method_code" are typed "str" here but are
            # declared INTEGER in schema_fields below - confirm this is intended.
            "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }',
        },
        # Container resource limits for the transform pod.
        resources={"limit_memory": "8G", "limit_cpu": "3"},
    )

    # Task to load CSV data to a BigQuery table
    load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator(
        task_id="load_to_bq",
        bucket="{{ var.value.composer_bucket }}",
        source_objects=[
            "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv"
        ],
        source_format="CSV",
        # NOTE(review): the destination table name is read from the
        # "container_registry" section of the Airflow variable, next to the
        # image name - confirm the variable is really laid out that way.
        destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_daily_summary_destination_table }}",
        skip_leading_rows=1,  # the CSV starts with a header row
        allow_quoted_newlines=True,
        write_disposition="WRITE_TRUNCATE",  # every run replaces the table contents
        # One entry per CSV column, in file order; all columns are NULLABLE.
        schema_fields=[
            {
                "name": "state_code",
                "type": "STRING",
                "description": "The FIPS code of the state in which the monitor resides.",
                "mode": "NULLABLE",
            },
            {
                "name": "county_code",
                "type": "STRING",
                "description": "The FIPS code of the county in which the monitor resides.",
                "mode": "NULLABLE",
            },
            {
                "name": "site_num",
                "type": "STRING",
                "description": "A unique number within the county identifying the site.",
                "mode": "NULLABLE",
            },
            {
                "name": "parameter_code",
                "type": "INTEGER",
                "description": "The AQS code corresponding to the parameter measured by the monitor.",
                "mode": "NULLABLE",
            },
            {
                "name": "poc",
                "type": "INTEGER",
                "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.",
                "mode": "NULLABLE",
            },
            {
                "name": "latitude",
                "type": "FLOAT",
                "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.",
                "mode": "NULLABLE",
            },
            {
                "name": "longitude",
                "type": "FLOAT",
                "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.",
                "mode": "NULLABLE",
            },
            {
                "name": "datum",
                "type": "STRING",
                "description": "The Datum associated with the Latitude and Longitude measures.",
                "mode": "NULLABLE",
            },
            {
                "name": "parameter_name",
                "type": "STRING",
                "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.",
                "mode": "NULLABLE",
            },
            {
                "name": "sample_duration",
                "type": "STRING",
                "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).",
                "mode": "NULLABLE",
            },
            {
                "name": "pollutant_standard",
                "type": "STRING",
                "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)",
                "mode": "NULLABLE",
            },
            {
                "name": "date_local",
                "type": "TIMESTAMP",
                "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.",
                "mode": "NULLABLE",
            },
            {
                "name": "units_of_measure",
                "type": "STRING",
                "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.",
                "mode": "NULLABLE",
            },
            {
                "name": "event_type",
                "type": "STRING",
                "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.",
                "mode": "NULLABLE",
            },
            {
                "name": "observation_count",
                "type": "INTEGER",
                "description": "The number of observations (samples) taken during the day.",
                "mode": "NULLABLE",
            },
            {
                "name": "observation_percent",
                "type": "FLOAT",
                "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).",
                "mode": "NULLABLE",
            },
            {
                "name": "arithmetic_mean",
                "type": "FLOAT",
                "description": "The average (arithmetic mean) value for the day.",
                "mode": "NULLABLE",
            },
            {
                "name": "first_max_value",
                "type": "FLOAT",
                "description": "The highest value for the day.",
                "mode": "NULLABLE",
            },
            {
                "name": "first_max_hour",
                "type": "INTEGER",
                "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.",
                "mode": "NULLABLE",
            },
            {
                "name": "aqi",
                "type": "INTEGER",
                "description": "The Air Quality Index for the day for the pollutant, if applicable.",
                "mode": "NULLABLE",
            },
            {
                "name": "method_code",
                "type": "INTEGER",
                "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.",
                "mode": "NULLABLE",
            },
            {
                "name": "method_name",
                "type": "STRING",
                "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.",
                "mode": "NULLABLE",
            },
            {
                "name": "local_site_name",
                "type": "STRING",
                "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.",
                "mode": "NULLABLE",
            },
            {
                "name": "address",
                "type": "STRING",
                "description": "The approximate street address of the monitoring site.",
                "mode": "NULLABLE",
            },
            {
                "name": "state_name",
                "type": "STRING",
                "description": "The name of the state where the monitoring site is located.",
                "mode": "NULLABLE",
            },
            {
                "name": "county_name",
                "type": "STRING",
                "description": "The name of the county where the monitoring site is located.",
                "mode": "NULLABLE",
            },
            {
                "name": "city_name",
                "type": "STRING",
                "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.",
                "mode": "NULLABLE",
            },
            {
                "name": "cbsa_name",
                "type": "STRING",
                "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.",
                "mode": "NULLABLE",
            },
            {
                "name": "date_of_last_change",
                "type": "TIMESTAMP",
                "description": "The date the last time any numeric values in this record were updated in the AQS data system.",
                "mode": "NULLABLE",
            },
        ],
    )

    # The load only starts after the transform has finished.
    transform_csv >> load_to_bq

# ---------------------------------------------------------------------------
# The source line that ends this module also carries the patch header and the
# license banner of the next file in the diff. That text is preserved below,
# each patch line prefixed with "# " so that this module stays valid Python.
# ---------------------------------------------------------------------------
# diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml
# new file mode 100644
# index 000000000..ce3f8628b
# --- /dev/null
# +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/pipeline.yaml
# @@ -0,0 +1,191 @@
# +# Copyright 2021 Google LLC
# +#
# +# Licensed under the Apache License, Version 2.0 (the "License");
# +# you may not use this file except in compliance with the License.
# +# You may obtain a copy of the License at
# +#
# +# http://www.apache.org/licenses/LICENSE-2.0
# +#
# +# Unless required by applicable law or agreed to in writing, software
# +# distributed under the License is distributed on an "AS IS" BASIS,
# +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# +# See the License for the specific language governing permissions and
# +# limitations under the License.
---
# Pipeline definition for the EPA historical air quality table
# "rh_and_dp_hourly_summary".
#
# It declares one BigQuery table and a two-task DAG:
#   transform_csv - runs the CSV transform image in a Kubernetes pod; the pod
#                   is configured through env_vars and uploads its output to
#                   TARGET_GCS_PATH in TARGET_GCS_BUCKET.
#   load_to_bq    - loads that CSV from the same bucket/path into BigQuery.
resources:

  # Destination table for the load_to_bq task.
  - type: bigquery_table
    table_id: "rh_and_dp_hourly_summary"
    description: "epaspc"

dag:
  airflow_version: 2
  initialize:
    dag_id: rh_and_dp_hourly_summary
    default_args:
      owner: "Google"
      depends_on_past: False
      start_date: '2021-03-01'
    max_active_runs: 1  # at most one run of this DAG at a time
    schedule_interval: "0 11 * * *"  # cron: every day at 11:00
    catchup: False  # do not backfill the runs between start_date and now
    default_view: graph

  tasks:

    - operator: "KubernetesPodOperator"
      description: "Run CSV transform within kubernetes pod"

      args:

        task_id: "transform_csv"
        name: "rh_and_dp_hourly_summary"
        namespace: "default"
        # Schedule the pod only on nodes of the "pool-e2-standard-4" node pool.
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: cloud.google.com/gke-nodepool
                      operator: In
                      values:
                        - "pool-e2-standard-4"
        image_pull_policy: "Always"
        image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}"
        env_vars:
          # YEAR_ITERATOR is a placeholder; csv_transform.py substitutes each
          # year of the download range into it to build the real URLs.
          SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_YEAR_ITERATOR.zip"
          START_YEAR: "1990"
          SOURCE_FILE: "files/data.csv"
          TARGET_FILE: "files/data_output.csv"
          CHUNKSIZE: "2500000"
          TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}"
          # Must stay identical to source_objects of the load_to_bq task below.
          TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv"
          # JSON list of output column names, in the same order as
          # schema_fields of the load_to_bq task.
          DATA_NAMES: >-
            [ "state_code", "county_code", "site_num", "parameter_code", "poc",
              "latitude", "longitude", "datum", "parameter_name", "date_local",
              "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",
              "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",
              "state_name", "county_name", "date_of_last_change" ]
          # JSON object mapping each column to a dtype name (presumably handed
          # to pandas by csv_transform.py - confirm against the transform).
          # NOTE(review): "method_code" is typed "int32" here but is declared
          # STRING in schema_fields below - confirm which one is intended.
          DATA_DTYPES: >-
            { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",
              "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",
              "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",
              "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",
              "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }
        # Container resource limits for the transform pod.
        resources:
          limit_memory: "8G"
          limit_cpu: "3"

    - operator: "GoogleCloudStorageToBigQueryOperator"
      description: "Task to load CSV data to a BigQuery table"

      args:
        task_id: "load_to_bq"
        bucket: "{{ var.value.composer_bucket }}"
        source_objects: ["data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv"]
        source_format: "CSV"
        # NOTE(review): the destination table name is read from the
        # "container_registry" section of the Airflow variable, next to the
        # image name - confirm the variable is really laid out that way.
        destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_hourly_summary_destination_table }}"
        skip_leading_rows: 1  # the CSV starts with a header row
        allow_quoted_newlines: True
        write_disposition: "WRITE_TRUNCATE"  # every run replaces the table contents
        # One entry per CSV column, in file order; all columns are NULLABLE.
        schema_fields:
          - "name": "state_code"
            "type": "STRING"
            "description": "The FIPS code of the state in which the monitor resides."
            "mode": "NULLABLE"
          - "name": "county_code"
            "type": "STRING"
            "description": "The FIPS code of the county in which the monitor resides."
            "mode": "NULLABLE"
          - "name": "site_num"
            "type": "STRING"
            "description": "A unique number within the county identifying the site."
            "mode": "NULLABLE"
          - "name": "parameter_code"
            "type": "INTEGER"
            "description": "The AQS code corresponding to the parameter measured by the monitor."
            "mode": "NULLABLE"
          - "name": "poc"
            "type": "INTEGER"
            "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site."
            "mode": "NULLABLE"
          - "name": "latitude"
            "type": "FLOAT"
            "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees."
            "mode": "NULLABLE"
          - "name": "longitude"
            "type": "FLOAT"
            "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees."
            "mode": "NULLABLE"
          - "name": "datum"
            "type": "STRING"
            "description": "The Datum associated with the Latitude and Longitude measures."
            "mode": "NULLABLE"
          - "name": "parameter_name"
            "type": "STRING"
            "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants."
            "mode": "NULLABLE"
          # Hourly rows are individual samples, so this describes the sample
          # date rather than a daily summary.
          - "name": "date_local"
            "type": "TIMESTAMP"
            "description": "The calendar date of the sample in Local Standard Time at the monitor."
            "mode": "NULLABLE"
          - "name": "time_local"
            "type": "STRING"
            "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time."
            "mode": "NULLABLE"
          - "name": "date_gmt"
            "type": "TIMESTAMP"
            "description": "The calendar date of the sample in Greenwich Mean Time."
            "mode": "NULLABLE"
          - "name": "time_gmt"
            "type": "STRING"
            "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time."
            "mode": "NULLABLE"
          - "name": "sample_measurement"
            "type": "FLOAT"
            "description": "The measured value in the standard units of measure for the parameter."
            "mode": "NULLABLE"
          - "name": "units_of_measure"
            "type": "STRING"
            "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations."
            "mode": "NULLABLE"
          - "name": "mdl"
            "type": "FLOAT"
            "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL."
            "mode": "NULLABLE"
          - "name": "uncertainty"
            "type": "FLOAT"
            "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency."
            "mode": "NULLABLE"
          - "name": "qualifier"
            "type": "STRING"
            "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field."
            "mode": "NULLABLE"
          - "name": "method_type"
            "type": "STRING"
            "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)."
            "mode": "NULLABLE"
          - "name": "method_code"
            "type": "STRING"
            "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column."
            "mode": "NULLABLE"
          - "name": "method_name"
            "type": "STRING"
            "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample."
            "mode": "NULLABLE"
          - "name": "state_name"
            "type": "STRING"
            "description": "The name of the state where the monitoring site is located."
            "mode": "NULLABLE"
          - "name": "county_name"
            "type": "STRING"
            "description": "The name of the county where the monitoring site is located."
            "mode": "NULLABLE"
          - "name": "date_of_last_change"
            "type": "TIMESTAMP"
            "description": "The date the last time any numeric values in this record were updated in the AQS data system."
            "mode": "NULLABLE"

  # Task ordering: the load only starts after the transform has finished.
  graph_paths:
    - "transform_csv >> load_to_bq"

# ---------------------------------------------------------------------------
# The source line that ends this file also carries the patch header and the
# license banner of the next file in the diff. That text is preserved below,
# each patch line prefixed with "# " so that this file stays valid YAML.
# ---------------------------------------------------------------------------
# diff --git a/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py
# new file mode 100644
# index 000000000..fde00a884
# --- /dev/null
# +++ b/datasets/epa_historical_air_quality/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py
# @@ -0,0 +1,234 @@
# +# Copyright 2021 Google LLC
# +#
# +# Licensed under the Apache License, Version 2.0 (the "License");
# +# you may not use this file except in compliance with the License.
# +# You may obtain a copy of the License at
# +#
# +# http://www.apache.org/licenses/LICENSE-2.0
# +#
# +# Unless required by applicable law or agreed to in writing, software
# +# distributed under the License is distributed on an "AS IS" BASIS,
# +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# +# See the License for the specific language governing permissions and
# +# limitations under the License.
# Airflow DAG "epa_historical_air_quality.rh_and_dp_hourly_summary".
#
# Two tasks run in order (see the last statement of the module):
#   1. transform_csv - starts a Kubernetes pod from the run_csv_transform_kub
#      image. The pod is configured entirely through env_vars and uploads its
#      output CSV to TARGET_GCS_PATH in TARGET_GCS_BUCKET.
#   2. load_to_bq    - loads that CSV from the same bucket/path into BigQuery,
#      replacing the table contents (WRITE_TRUNCATE).
#
# NOTE(review): the values here appear to mirror
# rh_and_dp_hourly_summary/pipeline.yaml - keep the two files in sync.

from airflow import DAG
from airflow.providers.cncf.kubernetes.operators import kubernetes_pod
from airflow.providers.google.cloud.transfers import gcs_to_bigquery

default_args = {
    "owner": "Google",
    "depends_on_past": False,
    "start_date": "2021-03-01",
}


with DAG(
    dag_id="epa_historical_air_quality.rh_and_dp_hourly_summary",
    default_args=default_args,
    max_active_runs=1,  # at most one run of this DAG at a time
    schedule_interval="0 11 * * *",  # cron: every day at 11:00
    catchup=False,  # do not backfill the runs between start_date and now
    default_view="graph",
) as dag:

    # Run CSV transform within kubernetes pod
    transform_csv = kubernetes_pod.KubernetesPodOperator(
        task_id="transform_csv",
        name="rh_and_dp_hourly_summary",
        namespace="default",
        # Schedule the pod only on nodes of the "pool-e2-standard-4" node pool.
        affinity={
            "nodeAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": {
                    "nodeSelectorTerms": [
                        {
                            "matchExpressions": [
                                {
                                    "key": "cloud.google.com/gke-nodepool",
                                    "operator": "In",
                                    "values": ["pool-e2-standard-4"],
                                }
                            ]
                        }
                    ]
                }
            }
        },
        image_pull_policy="Always",
        image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}",
        env_vars={
            # YEAR_ITERATOR is a placeholder; csv_transform.py substitutes each
            # year of the download range into it to build the real URLs.
            "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_YEAR_ITERATOR.zip",
            "START_YEAR": "1990",
            "SOURCE_FILE": "files/data.csv",
            "TARGET_FILE": "files/data_output.csv",
            "CHUNKSIZE": "2500000",
            "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}",
            # Must stay identical to source_objects of load_to_bq below.
            "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv",
            # JSON list of output column names, in schema_fields order.
            "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]',
            # JSON object mapping each column to a dtype name (presumably handed
            # to pandas by csv_transform.py - confirm against the transform).
            # NOTE(review): "method_code" is typed "int32" here but is declared
            # STRING in schema_fields below - confirm which one is intended.
            "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }',
        },
        # Container resource limits for the transform pod.
        resources={"limit_memory": "8G", "limit_cpu": "3"},
    )

    # Task to load CSV data to a BigQuery table
    load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator(
        task_id="load_to_bq",
        bucket="{{ var.value.composer_bucket }}",
        source_objects=[
            "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv"
        ],
        source_format="CSV",
        # NOTE(review): the destination table name is read from the
        # "container_registry" section of the Airflow variable, next to the
        # image name - confirm the variable is really laid out that way.
        destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.rh_and_dp_hourly_summary_destination_table }}",
        skip_leading_rows=1,  # the CSV starts with a header row
        allow_quoted_newlines=True,
        write_disposition="WRITE_TRUNCATE",  # every run replaces the table contents
        # One entry per CSV column, in file order; all columns are NULLABLE.
        schema_fields=[
            {
                "name": "state_code",
                "type": "STRING",
                "description": "The FIPS code of the state in which the monitor resides.",
                "mode": "NULLABLE",
            },
            {
                "name": "county_code",
                "type": "STRING",
                "description": "The FIPS code of the county in which the monitor resides.",
                "mode": "NULLABLE",
            },
            {
                "name": "site_num",
                "type": "STRING",
                "description": "A unique number within the county identifying the site.",
                "mode": "NULLABLE",
            },
            {
                "name": "parameter_code",
                "type": "INTEGER",
                "description": "The AQS code corresponding to the parameter measured by the monitor.",
                "mode": "NULLABLE",
            },
            {
                "name": "poc",
                "type": "INTEGER",
                "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.",
                "mode": "NULLABLE",
            },
            {
                "name": "latitude",
                "type": "FLOAT",
                "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.",
                "mode": "NULLABLE",
            },
            {
                "name": "longitude",
                "type": "FLOAT",
                "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.",
                "mode": "NULLABLE",
            },
            {
                "name": "datum",
                "type": "STRING",
                "description": "The Datum associated with the Latitude and Longitude measures.",
                "mode": "NULLABLE",
            },
            {
                "name": "parameter_name",
                "type": "STRING",
                "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.",
                "mode": "NULLABLE",
            },
            # Hourly rows are individual samples, so this describes the sample
            # date rather than a daily summary.
            {
                "name": "date_local",
                "type": "TIMESTAMP",
                "description": "The calendar date of the sample in Local Standard Time at the monitor.",
                "mode": "NULLABLE",
            },
            {
                "name": "time_local",
                "type": "STRING",
                "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.",
                "mode": "NULLABLE",
            },
            {
                "name": "date_gmt",
                "type": "TIMESTAMP",
                "description": "The calendar date of the sample in Greenwich Mean Time.",
                "mode": "NULLABLE",
            },
            {
                "name": "time_gmt",
                "type": "STRING",
                "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.",
                "mode": "NULLABLE",
            },
            {
                "name": "sample_measurement",
                "type": "FLOAT",
                "description": "The measured value in the standard units of measure for the parameter.",
                "mode": "NULLABLE",
            },
            {
                "name": "units_of_measure",
                "type": "STRING",
                "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.",
                "mode": "NULLABLE",
            },
            {
                "name": "mdl",
                "type": "FLOAT",
                "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.",
                "mode": "NULLABLE",
            },
            {
                "name": "uncertainty",
                "type": "FLOAT",
                "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.",
                "mode": "NULLABLE",
            },
            {
                "name": "qualifier",
                "type": "STRING",
                "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.",
                "mode": "NULLABLE",
            },
            {
                "name": "method_type",
                "type": "STRING",
                "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).",
                "mode": "NULLABLE",
            },
            {
                "name": "method_code",
                "type": "STRING",
                "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.",
                "mode": "NULLABLE",
            },
            {
                "name": "method_name",
                "type": "STRING",
                "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.",
                "mode": "NULLABLE",
            },
            {
                "name": "state_name",
                "type": "STRING",
                "description": "The name of the state where the monitoring site is located.",
                "mode": "NULLABLE",
            },
            {
                "name": "county_name",
                "type": "STRING",
                "description": "The name of the county where the monitoring site is located.",
                "mode": "NULLABLE",
            },
            {
                "name": "date_of_last_change",
                "type": "TIMESTAMP",
                "description": "The date the last time any numeric values in this record were updated in the AQS data system.",
                "mode": "NULLABLE",
            },
        ],
    )

    # The load only starts after the transform has finished.
    transform_csv >> load_to_bq

# ---------------------------------------------------------------------------
# The source line that ends this module also carries the patch header and the
# license banner of the next file in the diff. That text is preserved below,
# each patch line prefixed with "# " so that this module stays valid Python.
# ---------------------------------------------------------------------------
# diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml
# new file mode 100644
# index 000000000..400f068c4
# --- /dev/null
# +++ b/datasets/epa_historical_air_quality/so2_daily_summary/pipeline.yaml
# @@ -0,0 +1,213 @@
# +# Copyright 2021 Google LLC
# +#
# +# Licensed under the Apache License, Version 2.0 (the "License");
# +# you may not use this file except in compliance with the License.
# +# You may obtain a copy of the License at
# +#
# +# http://www.apache.org/licenses/LICENSE-2.0
# +#
# +# Unless required by applicable law or agreed to in writing, software
# +# distributed under the License is distributed on an "AS IS" BASIS,
# +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# +# See the License for the specific language governing permissions and
# +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "so2_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: so2_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 11 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "so2_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42401_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.so2_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py new file mode 100644 index 000000000..6e654d455 --- /dev/null +++ b/datasets/epa_historical_air_quality/so2_daily_summary/so2_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.so2_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 11 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="so2_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42401_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + 
"data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.so2_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. 
Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary.
Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..66713d76d --- /dev/null +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "so2_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: so2_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 12 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "so2_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": 
"str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.so2_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py new file mode 100644 index 000000000..f3b54b1f4 --- /dev/null +++ b/datasets/epa_historical_air_quality/so2_hourly_summary/so2_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.so2_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 12 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="so2_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.so2_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + 
write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. 
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml new file mode 100644 index 000000000..eca99a645 --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "temperature_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: temperature_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 12 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "temperature_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", 
"first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.temperature_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." 
+ "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." 
+ "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." + "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." 
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py new file mode 100644 index 000000000..3d62bd329 --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_daily_summary/temperature_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.temperature_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 12 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="temperature_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": 
"str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.temperature_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the 
“Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..005d49fac --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "temperature_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: temperature_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 13 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "temperature_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": 
"float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.temperature_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Local Standard Time at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency."
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py new file mode 100644 index 000000000..35ee19165 --- /dev/null +++ b/datasets/epa_historical_air_quality/temperature_hourly_summary/temperature_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.temperature_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 13 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="temperature_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": 
"float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.temperature_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular 
distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml new file mode 100644 index 000000000..1c2efde87 --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "voc_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: voc_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "30 13 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "voc_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.voc_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." 
+ "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day." + "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable."
+ "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system."
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py new file mode 100644 index 000000000..a1dc95991 --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_daily_summary/voc_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.voc_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="30 13 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="voc_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + 
"data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.voc_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. 
Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. 
Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..ff13171f6 --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +--- +resources: + + - type: bigquery_table + table_id: "voc_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: voc_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 14 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "voc_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", 
+ "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.voc_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Local Standard Time at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency."
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py new file mode 100644 index 000000000..9d6e061b9 --- /dev/null +++ b/datasets/epa_historical_air_quality/voc_hourly_summary/voc_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.voc_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 14 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="voc_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ 
var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.voc_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + 
write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Local Standard Time
at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance.
The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml new file mode 100644 index 000000000..02abb84d7 --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_daily_summary/pipeline.yaml @@ -0,0 +1,213 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: "wind_daily_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: wind_daily_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 15 * * *" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "wind_daily_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "750000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", 
+ "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.wind_daily_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." 
+ "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." + "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "sample_duration" + "type": "STRING" + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." + "mode": "NULLABLE" + - "name": "pollutant_standard" + "type": "STRING" + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. 
All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "event_type" + "type": "STRING" + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." + "mode": "NULLABLE" + - "name": "observation_count" + "type": "INTEGER" + "description": "The number of observations (samples) taken during the day." + "mode": "NULLABLE" + - "name": "observation_percent" + "type": "FLOAT" + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." + "mode": "NULLABLE" + - "name": "arithmetic_mean" + "type": "FLOAT" + "description": "The average (arithmetic mean) value for the day." + "mode": "NULLABLE" + - "name": "first_max_value" + "type": "FLOAT" + "description": "The highest value for the day."
+ "mode": "NULLABLE" + - "name": "first_max_hour" + "type": "INTEGER" + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." + "mode": "NULLABLE" + - "name": "aqi" + "type": "INTEGER" + "description": "The Air Quality Index for the day for the pollutant, if applicable." + "mode": "NULLABLE" + - "name": "method_code" + "type": "INTEGER" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "local_site_name" + "type": "STRING" + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." + "mode": "NULLABLE" + - "name": "address" + "type": "STRING" + "description": "The approximate street address of the monitoring site." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "city_name" + "type": "STRING" + "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." + "mode": "NULLABLE" + - "name": "cbsa_name" + "type": "STRING" + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located."
+ "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." + "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py new file mode 100644 index 000000000..6dd0792ee --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_daily_summary/wind_daily_summary_dag.py @@ -0,0 +1,264 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.wind_daily_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 15 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="wind_daily_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "750000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": 
"str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.wind_daily_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to 
distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE", + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE", + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE", + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE", + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE", + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE", + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE", + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE", + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml new file mode 100644 index 000000000..8247bf2f7 --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/pipeline.yaml @@ -0,0 +1,191 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + + - type: bigquery_table + table_id: "wind_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: wind_hourly_summary + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + + args: + + task_id: "transform_csv" + name: "wind_hourly_summary" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_YEAR_ITERATOR.zip" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + TARGET_FILE: "files/data_output.csv" + CHUNKSIZE: "2500000" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv" + DATA_NAMES: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", 
+ "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + resources: + limit_memory: "8G" + limit_cpu: "3" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + + args: + task_id: "load_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.container_registry.wind_hourly_summary_destination_table }}" + skip_leading_rows: 1 + allow_quoted_newlines: True + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - "name": "state_code" + "type": "STRING" + "description": "The FIPS code of the state in which the monitor resides." + "mode": "NULLABLE" + - "name": "county_code" + "type": "STRING" + "description": "The FIPS code of the county in which the monitor resides." + "mode": "NULLABLE" + - "name": "site_num" + "type": "STRING" + "description": "A unique number within the county identifying the site." + "mode": "NULLABLE" + - "name": "parameter_code" + "type": "INTEGER" + "description": "The AQS code corresponding to the parameter measured by the monitor." + "mode": "NULLABLE" + - "name": "poc" + "type": "INTEGER" + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." + "mode": "NULLABLE" + - "name": "latitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." + "mode": "NULLABLE" + - "name": "longitude" + "type": "FLOAT" + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
+ "mode": "NULLABLE" + - "name": "datum" + "type": "STRING" + "description": "The Datum associated with the Latitude and Longitude measures." + "mode": "NULLABLE" + - "name": "parameter_name" + "type": "STRING" + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." + "mode": "NULLABLE" + - "name": "date_local" + "type": "TIMESTAMP" + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." + "mode": "NULLABLE" + - "name": "time_local" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." + "mode": "NULLABLE" + - "name": "date_gmt" + "type": "TIMESTAMP" + "description": "The calendar date of the sample in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "time_gmt" + "type": "STRING" + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." + "mode": "NULLABLE" + - "name": "sample_measurement" + "type": "FLOAT" + "description": "The measured value in the standard units of measure for the parameter." + "mode": "NULLABLE" + - "name": "units_of_measure" + "type": "STRING" + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." + "mode": "NULLABLE" + - "name": "mdl" + "type": "FLOAT" + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." + "mode": "NULLABLE" + - "name": "uncertainty" + "type": "FLOAT" + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
+ "mode": "NULLABLE" + - "name": "qualifier" + "type": "STRING" + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." + "mode": "NULLABLE" + - "name": "method_type" + "type": "STRING" + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." + "mode": "NULLABLE" + - "name": "method_code" + "type": "STRING" + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." + "mode": "NULLABLE" + - "name": "method_name" + "type": "STRING" + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." + "mode": "NULLABLE" + - "name": "state_name" + "type": "STRING" + "description": "The name of the state where the monitoring site is located." + "mode": "NULLABLE" + - "name": "county_name" + "type": "STRING" + "description": "The name of the county where the monitoring site is located." + "mode": "NULLABLE" + - "name": "date_of_last_change" + "type": "TIMESTAMP" + "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
+ "mode": "NULLABLE" + + graph_paths: + - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py new file mode 100644 index 000000000..60af123f9 --- /dev/null +++ b/datasets/epa_historical_air_quality/wind_hourly_summary/wind_hourly_summary_dag.py @@ -0,0 +1,234 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.wind_hourly_summary", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Run CSV transform within kubernetes pod + transform_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_csv", + name="wind_hourly_summary", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_YEAR_ITERATOR.zip", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "TARGET_FILE": "files/data_output.csv", + "CHUNKSIZE": "2500000", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv", + "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + }, + resources={"limit_memory": "8G", "limit_cpu": "3"}, + ) + + # Task to load CSV data to a BigQuery table + load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="{{ var.json.epa_historical_air_quality.container_registry.wind_hourly_summary_destination_table }}", + skip_leading_rows=1, + allow_quoted_newlines=True, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE", + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE", + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE", + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE", + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in 
decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE", + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE", + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE", + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE", + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE", + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE", + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE", + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE", + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. 
The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE", + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE", + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE", + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE", + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE", + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE", + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE", + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE", + }, + ], + ) + + transform_csv >> load_to_bq