From b02f47a6d617a6864c78f56ce8247ccaceee8695 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Thu, 16 Sep 2021 17:14:48 -0400 Subject: [PATCH] chore: Use Airflow's `AIRFLOW_VAR_*` feature for shared variables (#183) --- README.md | 26 +++++++++-------- .../bikeshare_stations_dag.py | 4 +-- .../bikeshare_stations/pipeline.yaml | 4 +-- .../data_by_province/data_by_province_dag.py | 4 +-- .../data_by_province/pipeline.yaml | 4 +-- .../data_by_region/data_by_region_dag.py | 4 +-- .../data_by_region/pipeline.yaml | 4 +-- .../national_trends/national_trends_dag.py | 4 +-- .../national_trends/pipeline.yaml | 4 +-- .../city_level_cases_and_deaths_dag.py | 8 +++--- .../city_level_cases_and_deaths/pipeline.yaml | 8 +++--- .../covid_racial_data_tracker_dag.py | 8 +++--- .../covid_racial_data_tracker/pipeline.yaml | 8 +++--- .../national_testing_and_outcomes_dag.py | 6 ++-- .../pipeline.yaml | 6 ++-- .../pipeline.yaml | 8 +++--- ...state_facility_level_long_term_care_dag.py | 8 +++--- .../pipeline.yaml | 8 +++--- ...tate_level_aggregate_long_term_care_dag.py | 8 +++--- .../pipeline.yaml | 8 +++--- ...ate_level_cumulative_long_term_care_dag.py | 8 +++--- .../pipeline.yaml | 8 +++--- ...ate_level_cumulative_long_term_care_dag.py | 8 +++--- ...vel_current_outbreak_long_term_care_dag.py | 8 +++--- .../state_screenshots/pipeline.yaml | 10 +++---- .../state_screenshots_dag.py | 10 +++---- .../state_testing_and_outcomes/pipeline.yaml | 6 ++-- .../state_testing_and_outcomes_dag.py | 6 ++-- .../advertiser_declared_stats_dag.py | 4 +-- .../advertiser_declared_stats/pipeline.yaml | 4 +-- .../advertiser_geo_spend_dag.py | 4 +-- .../advertiser_geo_spend/pipeline.yaml | 4 +-- .../advertiser_stats/advertiser_stats_dag.py | 4 +-- .../advertiser_stats/pipeline.yaml | 4 +-- .../advertiser_weekly_spend_dag.py | 4 +-- .../advertiser_weekly_spend/pipeline.yaml | 4 +-- .../campaign_targeting_dag.py | 4 +-- .../campaign_targeting/pipeline.yaml | 4 +-- .../creative_stats/creative_stats_dag.py | 4 +-- .../creative_stats/pipeline.yaml | 4 +-- .../geo_spend/geo_spend_dag.py | 4 +-- .../geo_spend/pipeline.yaml | 4 +-- .../last_updated/last_updated_dag.py | 4 +-- .../last_updated/pipeline.yaml | 4 +-- .../top_keywords_history/pipeline.yaml | 4 +-- .../top_keywords_history_dag.py | 4 +-- .../irs_990/irs_990_2014/irs_990_2014_dag.py | 4 +-- datasets/irs_990/irs_990_2014/pipeline.yaml | 4 +-- .../irs_990/irs_990_2015/irs_990_2015_dag.py | 4 +-- datasets/irs_990/irs_990_2015/pipeline.yaml | 4 +-- .../irs_990/irs_990_2016/irs_990_2016_dag.py | 4 +-- datasets/irs_990/irs_990_2016/pipeline.yaml | 4 +-- .../irs_990/irs_990_2017/irs_990_2017_dag.py | 4 +-- datasets/irs_990/irs_990_2017/pipeline.yaml | 4 +-- .../irs_990_ez_2014/irs_990_ez_2014_dag.py | 4 +-- .../irs_990/irs_990_ez_2014/pipeline.yaml | 4 +-- .../irs_990_ez_2015/irs_990_ez_2015_dag.py | 4 +-- .../irs_990/irs_990_ez_2015/pipeline.yaml | 4 +-- .../irs_990_ez_2016/irs_990_ez_2016_dag.py | 4 +-- .../irs_990/irs_990_ez_2016/pipeline.yaml | 4 +-- .../irs_990_ez_2017/irs_990_ez_2017_dag.py | 4 +-- .../irs_990/irs_990_ez_2017/pipeline.yaml | 4 +-- .../irs_990_pf_2014/irs_990_pf_2014_dag.py | 4 +-- .../irs_990/irs_990_pf_2014/pipeline.yaml | 4 +-- .../irs_990_pf_2015/irs_990_pf_2015_dag.py | 4 +-- .../irs_990/irs_990_pf_2015/pipeline.yaml | 4 +-- .../irs_990_pf_2016/irs_990_pf_2016_dag.py | 4 +-- .../irs_990/irs_990_pf_2016/pipeline.yaml | 4 +-- .../noaa/gsod_stations/gsod_stations_dag.py | 4 +-- datasets/noaa/gsod_stations/pipeline.yaml | 4 +-- .../lightning_strikes_by_year_dag.py 
| 4 +-- .../lightning_strikes_by_year/pipeline.yaml | 4 +-- .../usa_names/usa_1910_current/pipeline.yaml | 2 +- .../usa_1910_current/usa_1910_current_dag.py | 2 +- samples/pipeline.airflow1.yaml | 6 ++-- samples/pipeline.yaml | 28 +++++++++---------- 76 files changed, 212 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index b52712eec..ad15e43e9 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Use only underscores and alpha-numeric characters for the names. If you created a new dataset directory above, you need to create a `datasets/DATASET/dataset.yaml` config file. See this [section](https://github.com/GoogleCloudPlatform/public-datasets-pipelines/blob/main/README.md#yaml-config-reference) for the `dataset.yaml` reference. -Create a `datasets/DATASET/PIPELINE/pipeline.yaml` config file for your pipeline. See [here](https://github.com/GoogleCloudPlatform/public-datasets-pipelines/blob/main/samples/) for the `pipeline.yaml` references. +Create a `datasets/DATASET/PIPELINE/pipeline.yaml` config file for your pipeline. See [here](https://github.com/GoogleCloudPlatform/public-datasets-pipelines/blob/main/samples/) for the `pipeline.yaml` references. For a YAML config template using Airflow 1.10 operators, see [`samples/pipeline.airflow1.yaml`](https://github.com/GoogleCloudPlatform/public-datasets-pipelines/blob/main/samples/pipeline.airflow1.yaml). @@ -151,9 +151,12 @@ Docker images will be built and pushed to GCR by default whenever the command ab Running the command in the previous step will parse your pipeline config and inform you about the Airflow variables that your pipeline expects to use and must be defined. -If your pipeline doesn't use any Airflow variables, you can skip this step. +If your pipeline doesn't use any Airflow variables, you can skip this step. -There are two types of variables that pipelines can use: **shared** and **dataset-specific**. Shared variables are those that can be reused by other pipelines in the same Airflow or Cloud Composer environment. These are variables that stay constant from pipeline to pipeline. Examples of shared variables include your Cloud Composer environment name and bucket, your GCP project ID, and paths to the Airflow DAG and data folders. To prevent duplication, specify your shared variables in one place: +There are two types of variables that pipelines can use: **shared** and **dataset-specific**. Shared variables can be reused by other pipelines in the same Airflow or Cloud Composer environment and have the same values for every pipeline. Examples of shared variables include your Cloud Composer environment name and bucket, your GCP project ID, and paths to the Airflow DAG and data folders (e.g. `/home/airflow/gcs/data`). To specify your shared variables, use either of the following: + +* (Preferred) Store the variables as Cloud Composer environment variables, [using Airflow's built-in `AIRFLOW_VAR_*` behavior](https://airflow.apache.org/docs/apache-airflow/stable/howto/variable.html#storing-variables-in-environment-variables). +* Or, use a single `shared_variables.json` file by creating it under ``` [.dev|.test]/datasets/shared_variables.json ``` and inside the file, nest the variables under a common parent key. For example: } ```
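As a quick sketch of the preferred option: in Airflow 1.10.10 and later, any environment variable named `AIRFLOW_VAR_<NAME>` is surfaced as the Airflow variable `<name>`, with no entry needed in Airflow's metadata database. A minimal illustration, assuming a made-up bucket name (on Cloud Composer you would set the environment variable on the environment itself, not in code):

```python
import os

from airflow.models import Variable

# Hypothetical value for illustration only; in practice this is set as a
# Cloud Composer environment variable rather than via os.environ.
os.environ["AIRFLOW_VAR_COMPOSER_BUCKET"] = "us-central1-example-bucket"

# Airflow falls back to AIRFLOW_VAR_COMPOSER_BUCKET when resolving the
# variable named "composer_bucket":
print(Variable.get("composer_bucket"))  # us-central1-example-bucket

# The equivalent lookup in a templated DAG field is:
#   "{{ var.value.composer_bucket }}"
```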
-For dataset-specific variables, create the following file +The other type is dataset-specific variables. To use them, create the following file ``` [.dev|.test]/datasets/{DATASET}/{DATASET}_variables.json ``` In general, pipelines use the JSON dot notation to access Airflow variables. Make sure to define and nest your variables under a parent key when writing to the JSON files above. We recommend using your dataset's name as the parent key, to mirror the folder hierarchy in the Composer environment's GCS bucket. Airflow variables are globally accessible to any pipeline, so nesting helps avoid name collisions. For example, if you're using the following variables in your pipeline config: -- `{{ var.json.shared.composer_bucket }}` -- `{{ var.json.parent.nested }}` -- `{{ var.json.parent.another_nested }}` +- `{{ var.json.namespace.nested }}` +- `{{ var.json.namespace.some_key.nested_twice }}` then your variables JSON file should look like this: ```json { - "parent": { + "namespace": { "nested": "some value", - "another_nested": "another value" + "some_key": { + "nested_twice": "another value" + } } }
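As a sketch of how the dot notation maps onto that file (assuming the `namespace` variable above has already been imported into Airflow): `var.json.<name>` deserializes the JSON value of the variable `<name>`, and each further dot walks one level down the resulting object.

```python
from airflow.models import Variable

# Fetch the Airflow variable named "namespace" and parse its JSON value
# into a Python dict.
namespace = Variable.get("namespace", deserialize_json=True)

print(namespace["nested"])                    # some value
print(namespace["some_key"]["nested_twice"])  # another value

# The equivalent Jinja lookups used in pipeline configs:
#   {{ var.json.namespace.nested }}
#   {{ var.json.namespace.some_key.nested_twice }}
```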
@@ -234,7 +238,7 @@ The `samples` folder contains references for the YAML config files, complete wit # Best Practices - When your tabular data contains percentage values, represent them as floats between 0 and 1. - To represent hierarchical data in BigQuery, use either: + To represent hierarchical data in BigQuery, use either: - (Recommended) Nested columns in BigQuery. For more info, see [the documentation on nested and repeated columns](https://cloud.google.com/bigquery/docs/nested-repeated). - Or, represent each level as a separate column. For example, if you have the following hierarchy: `chapter > section > subsection`, then represent them as diff --git a/datasets/austin_bikeshare/bikeshare_stations/bikeshare_stations_dag.py b/datasets/austin_bikeshare/bikeshare_stations/bikeshare_stations_dag.py index 16852b3fb..a64468721 100644 --- a/datasets/austin_bikeshare/bikeshare_stations/bikeshare_stations_dag.py +++ b/datasets/austin_bikeshare/bikeshare_stations/bikeshare_stations_dag.py @@ -60,7 +60,7 @@ "SOURCE_URL": "https://data.austintexas.gov/api/views/qd73-bsdg/rows.csv", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/austin_bikeshare/bikeshare_stations/data_output.csv", "PIPELINE_NAME": "bikeshare_stations", "CSV_HEADERS": '["station_id","name","status","address","alternate_name","city_asset_number","property_type","number_of_docks","power_type","footprint_length","footprint_width","notes","council_district","modified_date"]', @@ -73,7 +73,7 @@ load_austin_bikeshare_stations_to_bq = ( gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_austin_bikeshare_stations_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/austin_bikeshare/bikeshare_stations/data_output.csv"], source_format="CSV", destination_project_dataset_table="austin_bikeshare.bikeshare_stations", diff --git a/datasets/austin_bikeshare/bikeshare_stations/pipeline.yaml b/datasets/austin_bikeshare/bikeshare_stations/pipeline.yaml index 345c53ebe..beca7aeea 100644 --- a/datasets/austin_bikeshare/bikeshare_stations/pipeline.yaml +++ b/datasets/austin_bikeshare/bikeshare_stations/pipeline.yaml @@ -73,7 +73,7 @@ dag: SOURCE_URL: "https://data.austintexas.gov/api/views/qd73-bsdg/rows.csv" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/austin_bikeshare/bikeshare_stations/data_output.csv" PIPELINE_NAME: "bikeshare_stations" CSV_HEADERS: >- @@ -93,7 +93,7 @@ dag: task_id: "load_austin_bikeshare_stations_to_bq" # The GCS bucket where the CSV file is located in.
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/austin_bikeshare/bikeshare_stations/data_output.csv"] diff --git a/datasets/covid19_italy/data_by_province/data_by_province_dag.py b/datasets/covid19_italy/data_by_province/data_by_province_dag.py index 8bcdefa0b..ac5fc6263 100644 --- a/datasets/covid19_italy/data_by_province/data_by_province_dag.py +++ b/datasets/covid19_italy/data_by_province/data_by_province_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-province/dpc-covid19-ita-province.csv", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/covid19_italy/data_by_province/data_output.csv", "CSV_HEADERS": '["date","country","region_code","region_name","province_code","province_name","province_abbreviation","latitude","longitude","location_geom","confirmed_cases","note"]', "RENAME_MAPPINGS": '{"data": "date","stato": "country","codice_regione": "region_code","denominazione_regione": "region_name","lat": "latitude","long": "longitude","codice_provincia": "province_code","denominazione_provincia": "province_name","sigla_provincia": "province_abbreviation","totale_casi": "confirmed_cases","note": "note"}', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_data_by_province_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_data_by_province_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/covid19_italy/data_by_province/data_output.csv"], source_format="CSV", destination_project_dataset_table="covid19_italy.data_by_province", diff --git a/datasets/covid19_italy/data_by_province/pipeline.yaml b/datasets/covid19_italy/data_by_province/pipeline.yaml index 1682fd40a..039621191 100644 --- a/datasets/covid19_italy/data_by_province/pipeline.yaml +++ b/datasets/covid19_italy/data_by_province/pipeline.yaml @@ -78,7 +78,7 @@ dag: SOURCE_URL: "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-province/dpc-covid19-ita-province.csv" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/covid19_italy/data_by_province/data_output.csv" CSV_HEADERS: >- ["date","country","region_code","region_name","province_code","province_name","province_abbreviation","latitude","longitude","location_geom","confirmed_cases","note"] @@ -98,7 +98,7 @@ dag: task_id: "load_data_by_province_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/covid19_italy/data_by_province/data_output.csv"] diff --git a/datasets/covid19_italy/data_by_region/data_by_region_dag.py b/datasets/covid19_italy/data_by_region/data_by_region_dag.py index ce274e9fe..7291d7d24 100644 --- a/datasets/covid19_italy/data_by_region/data_by_region_dag.py +++ b/datasets/covid19_italy/data_by_region/data_by_region_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/covid19_italy/data_by_region/data_output.csv", "CSV_HEADERS": '["date","country","region_code","region_name","latitude","longitude","location_geom","hospitalized_patients_symptoms","hospitalized_patients_intensive_care","total_hospitalized_patients","home_confinement_cases","total_current_confirmed_cases","new_current_confirmed_cases","new_total_confirmed_cases","recovered","deaths","total_confirmed_cases","tests_performed","note"]', "RENAME_MAPPINGS": '{"data": "date","stato": "country","codice_regione": "region_code","denominazione_regione": "region_name","lat": "latitude","long": "longitude","ricoverati_con_sintomi": "hospitalized_patients_symptoms","terapia_intensiva": "hospitalized_patients_intensive_care","totale_ospedalizzati": "total_hospitalized_patients","isolamento_domiciliare": "home_confinement_cases","totale_positivi": "total_current_confirmed_cases","variazione_totale_positivi": "new_current_confirmed_cases","nuovi_positivi": "new_total_confirmed_cases","note": "note","dimessi_guariti": "recovered","totale_casi": "total_confirmed_cases","tamponi": "tests_performed","deceduti": "deaths"}', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_data_by_region_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_data_by_region_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/covid19_italy/data_by_region/data_output.csv"], source_format="CSV", destination_project_dataset_table="covid19_italy.data_by_region", diff --git a/datasets/covid19_italy/data_by_region/pipeline.yaml b/datasets/covid19_italy/data_by_region/pipeline.yaml index 13adaece3..2ada936b1 100644 --- a/datasets/covid19_italy/data_by_region/pipeline.yaml +++ b/datasets/covid19_italy/data_by_region/pipeline.yaml @@ -77,7 +77,7 @@ dag: SOURCE_URL: "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/covid19_italy/data_by_region/data_output.csv" CSV_HEADERS: >- ["date","country","region_code","region_name","latitude","longitude","location_geom","hospitalized_patients_symptoms","hospitalized_patients_intensive_care","total_hospitalized_patients","home_confinement_cases","total_current_confirmed_cases","new_current_confirmed_cases","new_total_confirmed_cases","recovered","deaths","total_confirmed_cases","tests_performed","note"] @@ -97,7 +97,7 @@ dag: task_id: "load_data_by_region_to_bq" # The GCS bucket where the CSV file is 
located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/covid19_italy/data_by_region/data_output.csv"] diff --git a/datasets/covid19_italy/national_trends/national_trends_dag.py b/datasets/covid19_italy/national_trends/national_trends_dag.py index 7e562c8e2..edc430f4c 100644 --- a/datasets/covid19_italy/national_trends/national_trends_dag.py +++ b/datasets/covid19_italy/national_trends/national_trends_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/covid19_italy/national_trends/data_output.csv", "CSV_HEADERS": '["date","country","hospitalized_patients_symptoms","hospitalized_patients_intensive_care","total_hospitalized_patients","home_confinement_cases","total_current_confirmed_cases","new_current_confirmed_cases","new_total_confirmed_cases","recovered","deaths","total_confirmed_cases","tests_performed","note"]', "RENAME_MAPPINGS": '{"data": "date","stato": "country","ricoverati_con_sintomi": "hospitalized_patients_symptoms","terapia_intensiva": "hospitalized_patients_intensive_care","totale_ospedalizzati": "total_hospitalized_patients","isolamento_domiciliare": "home_confinement_cases","totale_positivi": "total_current_confirmed_cases","variazione_totale_positivi": "new_current_confirmed_cases","nuovi_positivi": "new_total_confirmed_cases","dimessi_guariti": "recovered","deceduti": "deaths","totale_casi": "total_confirmed_cases","tamponi": "tests_performed","note": "note"}', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_national_trends_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_national_trends_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/covid19_italy/national_trends/data_output.csv"], source_format="CSV", destination_project_dataset_table="covid19_italy.national_trends", diff --git a/datasets/covid19_italy/national_trends/pipeline.yaml b/datasets/covid19_italy/national_trends/pipeline.yaml index b7a0ffc50..733d10338 100644 --- a/datasets/covid19_italy/national_trends/pipeline.yaml +++ b/datasets/covid19_italy/national_trends/pipeline.yaml @@ -78,7 +78,7 @@ dag: SOURCE_URL: "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/covid19_italy/national_trends/data_output.csv" CSV_HEADERS: >- ["date","country","hospitalized_patients_symptoms","hospitalized_patients_intensive_care","total_hospitalized_patients","home_confinement_cases","total_current_confirmed_cases","new_current_confirmed_cases","new_total_confirmed_cases","recovered","deaths","total_confirmed_cases","tests_performed","note"] @@ -98,7 +98,7 @@ dag: task_id: "load_national_trends_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/covid19_italy/national_trends/data_output.csv"] diff --git a/datasets/covid19_tracking/city_level_cases_and_deaths/city_level_cases_and_deaths_dag.py b/datasets/covid19_tracking/city_level_cases_and_deaths/city_level_cases_and_deaths_dag.py index f03b46e9f..1aacf2c65 100644 --- a/datasets/covid19_tracking/city_level_cases_and_deaths/city_level_cases_and_deaths_dag.py +++ b/datasets/covid19_tracking/city_level_cases_and_deaths/city_level_cases_and_deaths_dag.py @@ -39,7 +39,7 @@ bash_command="mkdir -p $airflow_data_folder/covid19_tracking/city_level_cases_and_deaths/{{ ds }}\ncurl -o $airflow_data_folder/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/raw-data.csv -L $csv_source_url\n", env={ "csv_source_url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vRg-dB5Pjt-zN38BZNoCdOk_RJ_MyYFAl3QIkK5fKSddUy44DUgJwZuhjCz8KPMpiFKRwhoIwfs0NbZ/pub?gid=0&single=true&output=csv", - "airflow_data_folder": "{{ var.json.shared.airflow_data_folder }}", + "airflow_data_folder": "{{ var.value.airflow_data_folder }}", }, ) @@ -48,7 +48,7 @@ task_id="process_raw_csv_file", bash_command="SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/raw-data.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/data.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py\n", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "city_level_cases_and_deaths", }, @@ -57,7 +57,7 @@ # Task to load the data from Airflow data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/data.csv" ], @@ -167,7 +167,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/*", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/", diff --git a/datasets/covid19_tracking/city_level_cases_and_deaths/pipeline.yaml b/datasets/covid19_tracking/city_level_cases_and_deaths/pipeline.yaml index f0572bd94..08e004dc8 100644 --- a/datasets/covid19_tracking/city_level_cases_and_deaths/pipeline.yaml +++ b/datasets/covid19_tracking/city_level_cases_and_deaths/pipeline.yaml @@ -42,7 +42,7 @@ dag: curl -o $airflow_data_folder/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/raw-data.csv -L $csv_source_url env: csv_source_url: "https://docs.google.com/spreadsheets/d/e/2PACX-1vRg-dB5Pjt-zN38BZNoCdOk_RJ_MyYFAl3QIkK5fKSddUy44DUgJwZuhjCz8KPMpiFKRwhoIwfs0NbZ/pub?gid=0&single=true&output=csv" - airflow_data_folder: "{{ var.json.shared.airflow_data_folder }}" + airflow_data_folder: "{{ var.value.airflow_data_folder }}" - operator: "BashOperator" description: "Run the custom/csv_transform.py script to process the raw CSV contents into a BigQuery friendly format" @@ -51,7 +51,7 @@ dag: 
bash_command: | SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/raw-data.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/data.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: "covid19_tracking" pipeline: "city_level_cases_and_deaths" @@ -59,7 +59,7 @@ dag: description: "Task to load the data from Airflow data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/data.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.city_level_cases_and_deaths" @@ -197,7 +197,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_file_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/*" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/city_level_cases_and_deaths/{{ ds }}/" diff --git a/datasets/covid19_tracking/covid_racial_data_tracker/covid_racial_data_tracker_dag.py b/datasets/covid19_tracking/covid_racial_data_tracker/covid_racial_data_tracker_dag.py index e17d084cf..712f892b0 100644 --- a/datasets/covid19_tracking/covid_racial_data_tracker/covid_racial_data_tracker_dag.py +++ b/datasets/covid19_tracking/covid_racial_data_tracker/covid_racial_data_tracker_dag.py @@ -39,7 +39,7 @@ bash_command="mkdir -p $airflow_home/data/covid19_tracking/covid_racial_data_tracker\ncurl -o $airflow_home/data/covid19_tracking/covid_racial_data_tracker/raw-crdt-data-{{ ds }}.csv -L $csv_source_url\n", env={ "csv_source_url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vS8SzaERcKJOD_EzrtCDK1dX1zkoMochlA9iHoHg_RSw3V8bkpfk1mpw4pfL5RdtSOyx_oScsUtyXyk/pub?gid=43720681&single=true&output=csv", - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", }, ) @@ -48,7 +48,7 @@ task_id="process_raw_csv_file", bash_command="SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/raw-crdt-data-{{ ds }}.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/crdt-data-{{ ds }}.csv python $airflow_home/dags/$dataset/$pipeline/custom/transform_dates.py\n", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "covid_racial_data_tracker", }, @@ -57,7 +57,7 @@ # Task to load the data from Airflow data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/covid_racial_data_tracker/crdt-data-{{ ds }}.csv" ], @@ -161,7 +161,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/covid_racial_data_tracker/crdt-data-{{ ds }}.csv", destination_bucket="{{ 
var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/covid_racial_data_tracker/crdt-data-{{ ds }}.csv", diff --git a/datasets/covid19_tracking/covid_racial_data_tracker/pipeline.yaml b/datasets/covid19_tracking/covid_racial_data_tracker/pipeline.yaml index 4b1a06ce6..d098ded29 100644 --- a/datasets/covid19_tracking/covid_racial_data_tracker/pipeline.yaml +++ b/datasets/covid19_tracking/covid_racial_data_tracker/pipeline.yaml @@ -42,7 +42,7 @@ dag: curl -o $airflow_home/data/covid19_tracking/covid_racial_data_tracker/raw-crdt-data-{{ ds }}.csv -L $csv_source_url env: csv_source_url: "https://docs.google.com/spreadsheets/d/e/2PACX-1vS8SzaERcKJOD_EzrtCDK1dX1zkoMochlA9iHoHg_RSw3V8bkpfk1mpw4pfL5RdtSOyx_oScsUtyXyk/pub?gid=43720681&single=true&output=csv" - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" - operator: "BashOperator" description: "Run a custom/*.py script to process the raw CSV contents into a BigQuery friendly format" @@ -51,7 +51,7 @@ dag: bash_command: | SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/raw-crdt-data-{{ ds }}.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/crdt-data-{{ ds }}.csv python $airflow_home/dags/$dataset/$pipeline/custom/transform_dates.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: "covid19_tracking" pipeline: "covid_racial_data_tracker" @@ -59,7 +59,7 @@ dag: description: "Task to load the data from Airflow data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/covid_racial_data_tracker/crdt-data-{{ ds }}.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.covid_racial_data_tracker" @@ -236,7 +236,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_file_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/covid_racial_data_tracker/crdt-data-{{ ds }}.csv" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/covid_racial_data_tracker/crdt-data-{{ ds }}.csv" diff --git a/datasets/covid19_tracking/national_testing_and_outcomes/national_testing_and_outcomes_dag.py b/datasets/covid19_tracking/national_testing_and_outcomes/national_testing_and_outcomes_dag.py index 39b087eec..37168f956 100644 --- a/datasets/covid19_tracking/national_testing_and_outcomes/national_testing_and_outcomes_dag.py +++ b/datasets/covid19_tracking/national_testing_and_outcomes/national_testing_and_outcomes_dag.py @@ -39,14 +39,14 @@ bash_command="echo $airflow_data_folder\necho $csv_source_url\nmkdir -p $airflow_data_folder/covid19_tracking/national_testing_and_outcomes\ncurl -o $airflow_data_folder/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv -L $csv_source_url\n", env={ "csv_source_url": "https://covidtracking.com/data/download/national-history.csv", - "airflow_data_folder": "{{ var.json.shared.airflow_data_folder }}", + "airflow_data_folder": "{{ var.value.airflow_data_folder }}", }, ) # Task to load the data from Airflow data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ 
var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv" ], @@ -163,7 +163,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv", diff --git a/datasets/covid19_tracking/national_testing_and_outcomes/pipeline.yaml b/datasets/covid19_tracking/national_testing_and_outcomes/pipeline.yaml index b76787fc8..9dd5130f5 100644 --- a/datasets/covid19_tracking/national_testing_and_outcomes/pipeline.yaml +++ b/datasets/covid19_tracking/national_testing_and_outcomes/pipeline.yaml @@ -44,13 +44,13 @@ dag: curl -o $airflow_data_folder/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv -L $csv_source_url env: csv_source_url: "https://covidtracking.com/data/download/national-history.csv" - airflow_data_folder: "{{ var.json.shared.airflow_data_folder }}" + airflow_data_folder: "{{ var.value.airflow_data_folder }}" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load the data from Airflow data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.national_testing_and_outcomes" @@ -130,7 +130,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_file_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/national_testing_and_outcomes/national-history-{{ ds }}.csv" diff --git a/datasets/covid19_tracking/state_facility_level_long_term_care/pipeline.yaml b/datasets/covid19_tracking/state_facility_level_long_term_care/pipeline.yaml index 9753a329a..25d307b44 100644 --- a/datasets/covid19_tracking/state_facility_level_long_term_care/pipeline.yaml +++ b/datasets/covid19_tracking/state_facility_level_long_term_care/pipeline.yaml @@ -38,7 +38,7 @@ dag: args: task_id: "download_raw_csv_files" env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: covid19_tracking pipeline: state_facility_level_long_term_care bash_command: | @@ -90,7 +90,7 @@ dag: bash_command: | WORKING_DIR=$airflow_home/data/covid19_tracking/state_facility_level_long_term_care python $airflow_home/dags/$dataset/$pipeline/custom/multi_csv_transform.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: "covid19_tracking" pipeline: "state_facility_level_long_term_care" @@ -98,7 +98,7 @@ dag: description: "Task 
to load the CSV from the pipeline's data folder to BigQuery" args: task_id: "load_csv_files_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: [ "data/covid19_tracking/state_facility_level_long_term_care/facilities-ar.csv", "data/covid19_tracking/state_facility_level_long_term_care/facilities-ga.csv", @@ -278,7 +278,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_files_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/state_facility_level_long_term_care/*" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/state_facility_level_long_term_care/{{ ds }}/" diff --git a/datasets/covid19_tracking/state_facility_level_long_term_care/state_facility_level_long_term_care_dag.py b/datasets/covid19_tracking/state_facility_level_long_term_care/state_facility_level_long_term_care_dag.py index 76aaa2653..5ca57784c 100644 --- a/datasets/covid19_tracking/state_facility_level_long_term_care/state_facility_level_long_term_care_dag.py +++ b/datasets/covid19_tracking/state_facility_level_long_term_care/state_facility_level_long_term_care_dag.py @@ -37,7 +37,7 @@ download_raw_csv_files = bash_operator.BashOperator( task_id="download_raw_csv_files", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "state_facility_level_long_term_care", }, @@ -49,7 +49,7 @@ task_id="process_raw_csv_files", bash_command="WORKING_DIR=$airflow_home/data/covid19_tracking/state_facility_level_long_term_care python $airflow_home/dags/$dataset/$pipeline/custom/multi_csv_transform.py\n", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "state_facility_level_long_term_care", }, @@ -58,7 +58,7 @@ # Task to load the CSV from the pipeline's data folder to BigQuery load_csv_files_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_files_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/state_facility_level_long_term_care/facilities-ar.csv", "data/covid19_tracking/state_facility_level_long_term_care/facilities-ga.csv", @@ -221,7 +221,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_files_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_files_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/state_facility_level_long_term_care/*", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/state_facility_level_long_term_care/{{ ds }}/", diff --git a/datasets/covid19_tracking/state_level_aggregate_long_term_care/pipeline.yaml b/datasets/covid19_tracking/state_level_aggregate_long_term_care/pipeline.yaml index 4c2661800..d79f98b8b 100644 --- a/datasets/covid19_tracking/state_level_aggregate_long_term_care/pipeline.yaml +++ b/datasets/covid19_tracking/state_level_aggregate_long_term_care/pipeline.yaml @@ -42,7 +42,7 @@ dag: curl -o 
$airflow_home/data/covid19_tracking/state_level_aggregate_long_term_care/raw-aggregated-data-{{ ds }}.csv -L $csv_source_url env: csv_source_url: "https://docs.google.com/spreadsheets/d/e/2PACX-1vRa9HnmEl83YXHfbgSPpt0fJe4SyuYLc0GuBAglF4yMYaoKSPRCyXASaWXMrTu1WEYp1oeJZIYHpj7t/pub?gid=827060758&single=true&output=csv" - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" - operator: "BashOperator" description: "Run the custom/csv_transform.py script to process the raw CSV contents into a BigQuery friendly format" @@ -51,7 +51,7 @@ dag: bash_command: | SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/raw-aggregated-data-{{ ds }}.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/aggregated-data-{{ ds }}.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: "covid19_tracking" pipeline: "state_level_aggregate_long_term_care" @@ -59,7 +59,7 @@ dag: description: "Task to load the CSV from the pipeline's data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/state_level_aggregate_long_term_care/aggregated-data-{{ ds }}.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.state_level_aggregate_long_term_care" @@ -238,7 +238,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_file_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/state_level_aggregate_long_term_care/*-data-{{ ds }}.csv" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/state_level_aggregate_long_term_care/{{ ds }}/" diff --git a/datasets/covid19_tracking/state_level_aggregate_long_term_care/state_level_aggregate_long_term_care_dag.py b/datasets/covid19_tracking/state_level_aggregate_long_term_care/state_level_aggregate_long_term_care_dag.py index 94cea9236..027284bd9 100644 --- a/datasets/covid19_tracking/state_level_aggregate_long_term_care/state_level_aggregate_long_term_care_dag.py +++ b/datasets/covid19_tracking/state_level_aggregate_long_term_care/state_level_aggregate_long_term_care_dag.py @@ -39,7 +39,7 @@ bash_command="mkdir -p $airflow_home/data/covid19_tracking/state_level_aggregate_long_term_care\ncurl -o $airflow_home/data/covid19_tracking/state_level_aggregate_long_term_care/raw-aggregated-data-{{ ds }}.csv -L $csv_source_url\n", env={ "csv_source_url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vRa9HnmEl83YXHfbgSPpt0fJe4SyuYLc0GuBAglF4yMYaoKSPRCyXASaWXMrTu1WEYp1oeJZIYHpj7t/pub?gid=827060758&single=true&output=csv", - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", }, ) @@ -48,7 +48,7 @@ task_id="process_raw_csv_file", bash_command="SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/raw-aggregated-data-{{ ds }}.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/aggregated-data-{{ ds }}.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py\n", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "state_level_aggregate_long_term_care", }, @@ -57,7 +57,7 
@@ # Task to load the CSV from the pipeline's data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/state_level_aggregate_long_term_care/aggregated-data-{{ ds }}.csv" ], @@ -345,7 +345,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/state_level_aggregate_long_term_care/*-data-{{ ds }}.csv", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/state_level_aggregate_long_term_care/{{ ds }}/", diff --git a/datasets/covid19_tracking/state_level_cumulative_long_term_care/pipeline.yaml b/datasets/covid19_tracking/state_level_cumulative_long_term_care/pipeline.yaml index ef896b7b4..c649c2f17 100644 --- a/datasets/covid19_tracking/state_level_cumulative_long_term_care/pipeline.yaml +++ b/datasets/covid19_tracking/state_level_cumulative_long_term_care/pipeline.yaml @@ -42,7 +42,7 @@ dag: curl -o $airflow_home/data/covid19_tracking/state_level_cumulative_long_term_care/raw-cumulative-data-{{ ds }}.csv -L $csv_source_url env: csv_source_url: "https://docs.google.com/spreadsheets/d/e/2PACX-1vRa9HnmEl83YXHfbgSPpt0fJe4SyuYLc0GuBAglF4yMYaoKSPRCyXASaWXMrTu1WEYp1oeJZIYHpj7t/pub?gid=467018747&single=true&output=csv" - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" - operator: "BashOperator" @@ -52,7 +52,7 @@ dag: bash_command: | SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/raw-cumulative-data-{{ ds }}.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/cumulative-data-{{ ds }}.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: "covid19_tracking" pipeline: "state_level_cumulative_long_term_care" @@ -60,7 +60,7 @@ dag: description: "Task to load the CSV from the pipeline's data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/state_level_cumulative_long_term_care/cumulative-data-{{ ds }}.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.state_level_cumulative_long_term_care" @@ -242,7 +242,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_file_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/state_level_cumulative_long_term_care/*-data-{{ ds }}.csv" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/state_level_cumulative_long_term_care/{{ ds }}/" diff --git a/datasets/covid19_tracking/state_level_cumulative_long_term_care/state_level_cumulative_long_term_care_dag.py b/datasets/covid19_tracking/state_level_cumulative_long_term_care/state_level_cumulative_long_term_care_dag.py index cc102b344..bf21b562e 100644 --- 
a/datasets/covid19_tracking/state_level_cumulative_long_term_care/state_level_cumulative_long_term_care_dag.py +++ b/datasets/covid19_tracking/state_level_cumulative_long_term_care/state_level_cumulative_long_term_care_dag.py @@ -39,7 +39,7 @@ bash_command="mkdir -p $airflow_home/data/covid19_tracking/state_level_cumulative_long_term_care\ncurl -o $airflow_home/data/covid19_tracking/state_level_cumulative_long_term_care/raw-cumulative-data-{{ ds }}.csv -L $csv_source_url\n", env={ "csv_source_url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vRa9HnmEl83YXHfbgSPpt0fJe4SyuYLc0GuBAglF4yMYaoKSPRCyXASaWXMrTu1WEYp1oeJZIYHpj7t/pub?gid=467018747&single=true&output=csv", - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", }, ) @@ -48,7 +48,7 @@ task_id="process_raw_csv_file", bash_command="SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/raw-cumulative-data-{{ ds }}.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/cumulative-data-{{ ds }}.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py\n", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "state_level_cumulative_long_term_care", }, @@ -57,7 +57,7 @@ # Task to load the CSV from the pipeline's data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/state_level_cumulative_long_term_care/cumulative-data-{{ ds }}.csv" ], @@ -346,7 +346,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/state_level_cumulative_long_term_care/*-data-{{ ds }}.csv", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/state_level_cumulative_long_term_care/{{ ds }}/", diff --git a/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/pipeline.yaml b/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/pipeline.yaml index c0ef7294c..9db0b0241 100644 --- a/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/pipeline.yaml +++ b/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/pipeline.yaml @@ -42,7 +42,7 @@ dag: curl -o $airflow_home/data/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/raw-data.csv -L $csv_source_url env: csv_source_url: "https://docs.google.com/spreadsheets/d/e/2PACX-1vRa9HnmEl83YXHfbgSPpt0fJe4SyuYLc0GuBAglF4yMYaoKSPRCyXASaWXMrTu1WEYp1oeJZIYHpj7t/pub?gid=467018747&single=true&output=csv" - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" - operator: "BashOperator" @@ -52,7 +52,7 @@ dag: bash_command: | SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/raw-data.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/data.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: "covid19_tracking" pipeline: 
"state_level_current_outbreak_long_term_care" @@ -60,7 +60,7 @@ dag: description: "Task to load the CSV from the pipeline's data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/data.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.state_level_current_outbreak_long_term_care" @@ -242,7 +242,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_file_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/*" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/" diff --git a/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_cumulative_long_term_care_dag.py b/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_cumulative_long_term_care_dag.py index 9a5c6ae52..08504f290 100644 --- a/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_cumulative_long_term_care_dag.py +++ b/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_cumulative_long_term_care_dag.py @@ -42,7 +42,7 @@ bash_command="mkdir -p $airflow_home/data/covid19_tracking/state_level_cumulative_long_term_care\ncurl -o $airflow_home/data/covid19_tracking/state_level_cumulative_long_term_care/raw-cumulative-data-{{ ds }}.csv -L $csv_source_url\n", env={ "csv_source_url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vRa9HnmEl83YXHfbgSPpt0fJe4SyuYLc0GuBAglF4yMYaoKSPRCyXASaWXMrTu1WEYp1oeJZIYHpj7t/pub?gid=467018747&single=true&output=csv", - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", }, ) @@ -51,7 +51,7 @@ task_id="process_raw_csv_file", bash_command="SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/raw-cumulative-data-{{ ds }}.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/cumulative-data-{{ ds }}.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py\n", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "state_level_cumulative_long_term_care", }, @@ -60,7 +60,7 @@ # Task to load the CSV from the pipeline's data folder to BigQuery load_csv_file_to_bq_table = GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/state_level_cumulative_long_term_care/cumulative-data-{{ ds }}.csv" ], @@ -339,7 +339,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/state_level_cumulative_long_term_care/*-data-{{ ds }}.csv", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", 
destination_object="datasets/covid19_tracking/state_level_cumulative_long_term_care/{{ ds }}/", diff --git a/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_current_outbreak_long_term_care_dag.py b/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_current_outbreak_long_term_care_dag.py index 32f44a5a2..c799149a4 100644 --- a/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_current_outbreak_long_term_care_dag.py +++ b/datasets/covid19_tracking/state_level_current_outbreak_long_term_care/state_level_current_outbreak_long_term_care_dag.py @@ -39,7 +39,7 @@ bash_command="mkdir -p $airflow_home/data/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}\ncurl -o $airflow_home/data/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/raw-data.csv -L $csv_source_url\n", env={ "csv_source_url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vRa9HnmEl83YXHfbgSPpt0fJe4SyuYLc0GuBAglF4yMYaoKSPRCyXASaWXMrTu1WEYp1oeJZIYHpj7t/pub?gid=467018747&single=true&output=csv", - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", }, ) @@ -48,7 +48,7 @@ task_id="process_raw_csv_file", bash_command="SOURCE_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/raw-data.csv TARGET_CSV=$airflow_home/data/$dataset/$pipeline/{{ ds }}/data.csv python $airflow_home/dags/$dataset/$pipeline/custom/csv_transform.py\n", env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "state_level_current_outbreak_long_term_care", }, @@ -57,7 +57,7 @@ # Task to load the CSV from the pipeline's data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/data.csv" ], @@ -346,7 +346,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/*", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/state_level_current_outbreak_long_term_care/{{ ds }}/", diff --git a/datasets/covid19_tracking/state_screenshots/pipeline.yaml b/datasets/covid19_tracking/state_screenshots/pipeline.yaml index 038119e5c..7361d9936 100644 --- a/datasets/covid19_tracking/state_screenshots/pipeline.yaml +++ b/datasets/covid19_tracking/state_screenshots/pipeline.yaml @@ -41,7 +41,7 @@ dag: mkdir -p $airflow_home/data/$dataset/$pipeline/run_date={{ ds }} SOURCE_URL=$source_url CSV_OUTPUT_PATH=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }}/data.csv GCS_PATH_PREFIX="gs://$destination_bucket/datasets/$dataset/$pipeline/run_date={{ ds }}/screenshots" python $airflow_home/dags/$dataset/$pipeline/custom/web_scrape_and_generate_csv.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" source_url: 
"https://screenshots.covidtracking.com" dataset: "covid19_tracking" @@ -57,7 +57,7 @@ dag: DOWNLOAD_PREFIX=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }} \ python $airflow_home/dags/$dataset/$pipeline/custom/download_screenshots.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: "covid19_tracking" pipeline: "state_screenshots" @@ -65,7 +65,7 @@ dag: description: "Upload all downloaded screenshots to the destination bucket" args: task_id: "upload_screenshots_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/state_screenshots/run_date={{ ds }}/*" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/state_screenshots/run_date={{ ds }}/" @@ -75,7 +75,7 @@ dag: description: "Task to load the data from Airflow data folder to BigQuery" args: task_id: "load_screenshots_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/state_screenshots/run_date={{ ds }}/data.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.state_screenshots" @@ -115,7 +115,7 @@ dag: description: "Delete downloaded screenshots from the Cloud Composer bucket" args: task_id: "delete_screenshots_from_composer_bucket" - bucket_name: "{{ var.json.shared.composer_bucket }}" + bucket_name: "{{ var.value.composer_bucket }}" prefix: "data/covid19_tracking/state_screenshots/run_date={{ ds }}" graph_paths: diff --git a/datasets/covid19_tracking/state_screenshots/state_screenshots_dag.py b/datasets/covid19_tracking/state_screenshots/state_screenshots_dag.py index 2d10399bf..dd0f2be3c 100644 --- a/datasets/covid19_tracking/state_screenshots/state_screenshots_dag.py +++ b/datasets/covid19_tracking/state_screenshots/state_screenshots_dag.py @@ -38,7 +38,7 @@ task_id="generate_csv_data_from_web_scraping", bash_command='mkdir -p $airflow_home/data/$dataset/$pipeline/run_date={{ ds }}\nSOURCE_URL=$source_url CSV_OUTPUT_PATH=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }}/data.csv GCS_PATH_PREFIX="gs://$destination_bucket/datasets/$dataset/$pipeline/run_date={{ ds }}/screenshots" python $airflow_home/dags/$dataset/$pipeline/custom/web_scrape_and_generate_csv.py\n', env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "destination_bucket": "{{ var.json.covid19_tracking.destination_bucket }}", "source_url": "https://screenshots.covidtracking.com", "dataset": "covid19_tracking", @@ -51,7 +51,7 @@ task_id="download_screenshots", bash_command='CSV_PATH=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }}/data.csv \\\nSOURCE_COLUMN="source_url" \\\nDOWNLOAD_PREFIX=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }} \\\npython $airflow_home/dags/$dataset/$pipeline/custom/download_screenshots.py\n', env={ - "airflow_home": "{{ var.json.shared.airflow_home }}", + "airflow_home": "{{ var.value.airflow_home }}", "dataset": "covid19_tracking", "pipeline": "state_screenshots", }, @@ -60,7 +60,7 @@ # Upload all downloaded screenshots to the destination bucket upload_screenshots_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="upload_screenshots_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ 
var.value.composer_bucket }}", source_object="data/covid19_tracking/state_screenshots/run_date={{ ds }}/*", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/state_screenshots/run_date={{ ds }}/", @@ -70,7 +70,7 @@ # Task to load the data from Airflow data folder to BigQuery load_screenshots_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_screenshots_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/state_screenshots/run_date={{ ds }}/data.csv" ], @@ -128,7 +128,7 @@ delete_screenshots_from_composer_bucket = ( gcs_delete_operator.GoogleCloudStorageDeleteOperator( task_id="delete_screenshots_from_composer_bucket", - bucket_name="{{ var.json.shared.composer_bucket }}", + bucket_name="{{ var.value.composer_bucket }}", prefix="data/covid19_tracking/state_screenshots/run_date={{ ds }}", ) ) diff --git a/datasets/covid19_tracking/state_testing_and_outcomes/pipeline.yaml b/datasets/covid19_tracking/state_testing_and_outcomes/pipeline.yaml index b925ba3f4..56741acda 100644 --- a/datasets/covid19_tracking/state_testing_and_outcomes/pipeline.yaml +++ b/datasets/covid19_tracking/state_testing_and_outcomes/pipeline.yaml @@ -44,13 +44,13 @@ dag: curl -o $airflow_data_folder/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv -L $csv_source_url env: csv_source_url: "https://covidtracking.com/data/download/all-states-history.csv" - airflow_data_folder: "{{ var.json.shared.airflow_data_folder }}" + airflow_data_folder: "{{ var.value.airflow_data_folder }}" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load the data from Airflow data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv"] source_format: "CSV" destination_project_dataset_table: "covid19_tracking.state_testing_and_outcomes" @@ -225,7 +225,7 @@ dag: description: "Task to archive the CSV file in the destination bucket" args: task_id: "archive_csv_file_to_destination_bucket" - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" source_object: "data/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv" destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}" destination_object: "datasets/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv" diff --git a/datasets/covid19_tracking/state_testing_and_outcomes/state_testing_and_outcomes_dag.py b/datasets/covid19_tracking/state_testing_and_outcomes/state_testing_and_outcomes_dag.py index e46cc4f42..3fe6d5984 100644 --- a/datasets/covid19_tracking/state_testing_and_outcomes/state_testing_and_outcomes_dag.py +++ b/datasets/covid19_tracking/state_testing_and_outcomes/state_testing_and_outcomes_dag.py @@ -39,14 +39,14 @@ bash_command="echo $airflow_data_folder\necho $csv_source_url\nmkdir -p $airflow_data_folder/covid19_tracking/state_testing_and_outcomes\ncurl -o $airflow_data_folder/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv -L $csv_source_url\n", env={ "csv_source_url": "https://covidtracking.com/data/download/all-states-history.csv", - "airflow_data_folder": "{{ var.json.shared.airflow_data_folder }}", + 
"airflow_data_folder": "{{ var.value.airflow_data_folder }}", }, ) # Task to load the data from Airflow data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv" ], @@ -302,7 +302,7 @@ # Task to archive the CSV file in the destination bucket archive_csv_file_to_destination_bucket = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator( task_id="archive_csv_file_to_destination_bucket", - source_bucket="{{ var.json.shared.composer_bucket }}", + source_bucket="{{ var.value.composer_bucket }}", source_object="data/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv", destination_bucket="{{ var.json.covid19_tracking.destination_bucket }}", destination_object="datasets/covid19_tracking/state_testing_and_outcomes/all-states-history-{{ ds }}.csv", diff --git a/datasets/google_political_ads/advertiser_declared_stats/advertiser_declared_stats_dag.py b/datasets/google_political_ads/advertiser_declared_stats/advertiser_declared_stats_dag.py index c175cd4c6..3fc495bca 100644 --- a/datasets/google_political_ads/advertiser_declared_stats/advertiser_declared_stats_dag.py +++ b/datasets/google_political_ads/advertiser_declared_stats/advertiser_declared_stats_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/*advertiser-declared-stats*", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/advertiser_declared_stats/data_output.csv", "PIPELINE_NAME": "advertiser_declared_stats", "CSV_HEADERS": '["advertiser_id","advertiser_declared_name","advertiser_declared_regulatory_id","advertiser_declared_scope","advertiser_declared_promoter_name","advertiser_declared_promoter_address"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_advertiser_declared_stats_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_advertiser_declared_stats_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/google_political_ads/advertiser_declared_stats/data_output.csv" ], diff --git a/datasets/google_political_ads/advertiser_declared_stats/pipeline.yaml b/datasets/google_political_ads/advertiser_declared_stats/pipeline.yaml index e40c68a45..3250215cd 100644 --- a/datasets/google_political_ads/advertiser_declared_stats/pipeline.yaml +++ b/datasets/google_political_ads/advertiser_declared_stats/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/*advertiser-declared-stats*" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/advertiser_declared_stats/data_output.csv" PIPELINE_NAME: "advertiser_declared_stats" CSV_HEADERS: >- @@ -97,7 +97,7 @@ dag: task_id: "load_advertiser_declared_stats_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/advertiser_declared_stats/data_output.csv"] diff --git a/datasets/google_political_ads/advertiser_geo_spend/advertiser_geo_spend_dag.py b/datasets/google_political_ads/advertiser_geo_spend/advertiser_geo_spend_dag.py index c118b6beb..1ff9914b8 100644 --- a/datasets/google_political_ads/advertiser_geo_spend/advertiser_geo_spend_dag.py +++ b/datasets/google_political_ads/advertiser_geo_spend/advertiser_geo_spend_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-advertiser-geo-spend.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/advertiser_geo_spend/data_output.csv", "PIPELINE_NAME": "advertiser_geo_spend", "CSV_HEADERS": '["advertiser_id","advertiser_name","country","country_subdivision_primary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_advertiser_geo_spend_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_advertiser_geo_spend_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/google_political_ads/advertiser_geo_spend/data_output.csv" ], diff --git a/datasets/google_political_ads/advertiser_geo_spend/pipeline.yaml b/datasets/google_political_ads/advertiser_geo_spend/pipeline.yaml index 9e791aaa5..115c47697 100644 --- a/datasets/google_political_ads/advertiser_geo_spend/pipeline.yaml +++ b/datasets/google_political_ads/advertiser_geo_spend/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-advertiser-geo-spend.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/advertiser_geo_spend/data_output.csv" PIPELINE_NAME: "advertiser_geo_spend" CSV_HEADERS: >- @@ -97,7 +97,7 @@ dag: task_id: "load_advertiser_geo_spend_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/advertiser_geo_spend/data_output.csv"] diff --git a/datasets/google_political_ads/advertiser_stats/advertiser_stats_dag.py b/datasets/google_political_ads/advertiser_stats/advertiser_stats_dag.py index 2c5d7cfc1..257711ea4 100644 --- a/datasets/google_political_ads/advertiser_stats/advertiser_stats_dag.py +++ b/datasets/google_political_ads/advertiser_stats/advertiser_stats_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-advertiser-stats.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/advertiser_stats/data_output.csv", "PIPELINE_NAME": "advertiser_stats", "CSV_HEADERS": '["advertiser_id","advertiser_name","public_ids_list","regions","elections","total_creatives","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_advertiser_stats_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_advertiser_stats_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/google_political_ads/advertiser_stats/data_output.csv"], source_format="CSV", destination_project_dataset_table="google_political_ads.advertiser_stats", diff --git a/datasets/google_political_ads/advertiser_stats/pipeline.yaml b/datasets/google_political_ads/advertiser_stats/pipeline.yaml index b429bea98..0310d58b5 100644 --- a/datasets/google_political_ads/advertiser_stats/pipeline.yaml +++ b/datasets/google_political_ads/advertiser_stats/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-advertiser-stats.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/advertiser_stats/data_output.csv" PIPELINE_NAME: "advertiser_stats" CSV_HEADERS: >- @@ -97,7 +97,7 @@ dag: task_id: "load_advertiser_stats_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/advertiser_stats/data_output.csv"] diff --git a/datasets/google_political_ads/advertiser_weekly_spend/advertiser_weekly_spend_dag.py b/datasets/google_political_ads/advertiser_weekly_spend/advertiser_weekly_spend_dag.py index 4c065bc43..5cd2127fa 100644 --- a/datasets/google_political_ads/advertiser_weekly_spend/advertiser_weekly_spend_dag.py +++ b/datasets/google_political_ads/advertiser_weekly_spend/advertiser_weekly_spend_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-advertiser-weekly-spend.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/advertiser_weekly_spend/data_output.csv", "PIPELINE_NAME": "advertiser_weekly_spend", "CSV_HEADERS": '["advertiser_id","advertiser_name","election_cycle","week_start_date","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_advertiser_weekly_spend_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_advertiser_weekly_spend_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/google_political_ads/advertiser_weekly_spend/data_output.csv" ], diff --git a/datasets/google_political_ads/advertiser_weekly_spend/pipeline.yaml b/datasets/google_political_ads/advertiser_weekly_spend/pipeline.yaml index 92a1eb651..6ebae050d 100644 --- a/datasets/google_political_ads/advertiser_weekly_spend/pipeline.yaml +++ b/datasets/google_political_ads/advertiser_weekly_spend/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-advertiser-weekly-spend.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/advertiser_weekly_spend/data_output.csv" PIPELINE_NAME: "advertiser_weekly_spend" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_advertiser_weekly_spend_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/advertiser_weekly_spend/data_output.csv"] diff --git a/datasets/google_political_ads/campaign_targeting/campaign_targeting_dag.py b/datasets/google_political_ads/campaign_targeting/campaign_targeting_dag.py index 52dfa8f55..c6fe58303 100644 --- a/datasets/google_political_ads/campaign_targeting/campaign_targeting_dag.py +++ b/datasets/google_political_ads/campaign_targeting/campaign_targeting_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-campaign-targeting.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/campaign_targeting/data_output.csv", "PIPELINE_NAME": "campaign_targeting", "CSV_HEADERS": '["campaign_id","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","start_date","end_date","ads_list","advertiser_id","advertiser_name"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_campaign_targeting_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_campaign_targeting_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/google_political_ads/campaign_targeting/data_output.csv"], source_format="CSV", destination_project_dataset_table="google_political_ads.campaign_targeting", diff --git a/datasets/google_political_ads/campaign_targeting/pipeline.yaml b/datasets/google_political_ads/campaign_targeting/pipeline.yaml index acd08ce55..ed245dfc2 100644 --- a/datasets/google_political_ads/campaign_targeting/pipeline.yaml +++ b/datasets/google_political_ads/campaign_targeting/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-campaign-targeting.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/campaign_targeting/data_output.csv" PIPELINE_NAME: "campaign_targeting" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_campaign_targeting_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/campaign_targeting/data_output.csv"] diff --git a/datasets/google_political_ads/creative_stats/creative_stats_dag.py b/datasets/google_political_ads/creative_stats/creative_stats_dag.py index 1f7c03616..671a58d52 100644 --- a/datasets/google_political_ads/creative_stats/creative_stats_dag.py +++ b/datasets/google_political_ads/creative_stats/creative_stats_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-creative-stats.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/creative_stats/data_output.csv", "PIPELINE_NAME": "creative_stats", "CSV_HEADERS": '["ad_id","ad_url","ad_type","regions","advertiser_id","advertiser_name","ad_campaigns_list","date_range_start","date_range_end","num_of_days","impressions","spend_usd","first_served_timestamp","last_served_timestamp","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","spend_range_min_usd","spend_range_max_usd","spend_range_min_eur","spend_range_max_eur","spend_range_min_inr","spend_range_max_inr","spend_range_min_bgn","spend_range_max_bgn","spend_range_min_hrk","spend_range_max_hrk","spend_range_min_czk","spend_range_max_czk","spend_range_min_dkk","spend_range_max_dkk","spend_range_min_huf","spend_range_max_huf","spend_range_min_pln","spend_range_max_pln","spend_range_min_ron","spend_range_max_ron","spend_range_min_sek","spend_range_max_sek","spend_range_min_gbp","spend_range_max_gbp","spend_range_min_nzd","spend_range_max_nzd"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_creative_stats_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_creative_stats_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/google_political_ads/creative_stats/data_output.csv"], source_format="CSV", destination_project_dataset_table="google_political_ads.creative_stats", diff --git a/datasets/google_political_ads/creative_stats/pipeline.yaml b/datasets/google_political_ads/creative_stats/pipeline.yaml index f4113a401..f33650973 100644 --- a/datasets/google_political_ads/creative_stats/pipeline.yaml +++ b/datasets/google_political_ads/creative_stats/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-creative-stats.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/creative_stats/data_output.csv" PIPELINE_NAME: "creative_stats" CSV_HEADERS: >- @@ -95,7 +95,7 @@ dag: task_id: "load_creative_stats_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/creative_stats/data_output.csv"] diff --git a/datasets/google_political_ads/geo_spend/geo_spend_dag.py b/datasets/google_political_ads/geo_spend/geo_spend_dag.py index 193df9a9d..078ac958a 100644 --- a/datasets/google_political_ads/geo_spend/geo_spend_dag.py +++ b/datasets/google_political_ads/geo_spend/geo_spend_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-geo-spend.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/geo_spend/data_output.csv", "PIPELINE_NAME": "geo_spend", "CSV_HEADERS": '["country","country_subdivision_primary","country_subdivision_secondary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_geo_spend_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_geo_spend_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/google_political_ads/geo_spend/data_output.csv"], source_format="CSV", destination_project_dataset_table="google_political_ads.geo_spend", diff --git a/datasets/google_political_ads/geo_spend/pipeline.yaml b/datasets/google_political_ads/geo_spend/pipeline.yaml index 6ae04731f..b71339bca 100644 --- a/datasets/google_political_ads/geo_spend/pipeline.yaml +++ b/datasets/google_political_ads/geo_spend/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-geo-spend.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/geo_spend/data_output.csv" PIPELINE_NAME: "geo_spend" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_geo_spend_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/geo_spend/data_output.csv"] diff --git a/datasets/google_political_ads/last_updated/last_updated_dag.py b/datasets/google_political_ads/last_updated/last_updated_dag.py index 824aabf22..52d679fa3 100644 --- a/datasets/google_political_ads/last_updated/last_updated_dag.py +++ b/datasets/google_political_ads/last_updated/last_updated_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-updated*", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/last_updated/data_output.csv", "PIPELINE_NAME": "last_updated", "CSV_HEADERS": '["report_data_updated_date"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_last_updated_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_last_updated_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/google_political_ads/last_updated/data_output.csv"], source_format="CSV", destination_project_dataset_table="google_political_ads.last_updated", diff --git a/datasets/google_political_ads/last_updated/pipeline.yaml b/datasets/google_political_ads/last_updated/pipeline.yaml index 6fc8b1d72..52b1483d8 100644 --- a/datasets/google_political_ads/last_updated/pipeline.yaml +++ b/datasets/google_political_ads/last_updated/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-updated*" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/last_updated/data_output.csv" PIPELINE_NAME: "last_updated" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_last_updated_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/last_updated/data_output.csv"] diff --git a/datasets/google_political_ads/top_keywords_history/pipeline.yaml b/datasets/google_political_ads/top_keywords_history/pipeline.yaml index 77d1f283e..b6d208787 100644 --- a/datasets/google_political_ads/top_keywords_history/pipeline.yaml +++ b/datasets/google_political_ads/top_keywords_history/pipeline.yaml @@ -76,7 +76,7 @@ dag: SOURCE_FILE: "files/data.zip" FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-top-keywords-history.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/google_political_ads/top_keywords_history/data_output.csv" PIPELINE_NAME: "top_keywords_history" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_top_keywords_history_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/google_political_ads/top_keywords_history/data_output.csv"] diff --git a/datasets/google_political_ads/top_keywords_history/top_keywords_history_dag.py b/datasets/google_political_ads/top_keywords_history/top_keywords_history_dag.py index 950c48bf7..97d0e1e16 100644 --- a/datasets/google_political_ads/top_keywords_history/top_keywords_history_dag.py +++ b/datasets/google_political_ads/top_keywords_history/top_keywords_history_dag.py @@ -62,7 +62,7 @@ "SOURCE_FILE": "files/data.zip", "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-top-keywords-history.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/google_political_ads/top_keywords_history/data_output.csv", "PIPELINE_NAME": "top_keywords_history", "CSV_HEADERS": '["election_cycle","report_date","keyword_1","spend_usd_1","keyword_2","spend_usd_2","keyword_3","spend_usd_3","keyword_4","spend_usd_4","keyword_5","spend_usd_5","keyword_6","spend_usd_6","region","elections"]', @@ -74,7 +74,7 @@ # Task to load CSV data to a BigQuery table load_top_keywords_history_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_top_keywords_history_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=[ "data/google_political_ads/top_keywords_history/data_output.csv" ], diff --git a/datasets/irs_990/irs_990_2014/irs_990_2014_dag.py b/datasets/irs_990/irs_990_2014/irs_990_2014_dag.py index c0332b475..4a073c615 100644 --- a/datasets/irs_990/irs_990_2014/irs_990_2014_dag.py +++ b/datasets/irs_990/irs_990_2014/irs_990_2014_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/14eofinextract990.zip", "SOURCE_FILE": "files/data.zip", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_2014/data_output.csv", "PIPELINE_NAME": "irs_990_2015", "CSV_HEADERS": 
'["ein","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","g
ftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_2014/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_2014", diff --git a/datasets/irs_990/irs_990_2014/pipeline.yaml b/datasets/irs_990/irs_990_2014/pipeline.yaml index f92492a1a..cfaf32f7c 100644 --- a/datasets/irs_990/irs_990_2014/pipeline.yaml +++ b/datasets/irs_990/irs_990_2014/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/14eofinextract990.zip" SOURCE_FILE: "files/data.zip" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_2014/data_output.csv" PIPELINE_NAME: "irs_990_2015" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_irs_990_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2014/data_output.csv"] diff --git a/datasets/irs_990/irs_990_2015/irs_990_2015_dag.py b/datasets/irs_990/irs_990_2015/irs_990_2015_dag.py index 4decd1e43..c85075311 100644 --- a/datasets/irs_990/irs_990_2015/irs_990_2015_dag.py +++ b/datasets/irs_990/irs_990_2015/irs_990_2015_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/15eofinextract990.dat.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_2015/data_output.csv", "PIPELINE_NAME": "irs_990_2015", "CSV_HEADERS": 
'["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsuppo
rt","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_2015/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_2015", diff --git a/datasets/irs_990/irs_990_2015/pipeline.yaml b/datasets/irs_990/irs_990_2015/pipeline.yaml index 90ce8fbab..586e6f0a0 100644 --- a/datasets/irs_990/irs_990_2015/pipeline.yaml +++ b/datasets/irs_990/irs_990_2015/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/15eofinextract990.dat.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_2015/data_output.csv" PIPELINE_NAME: "irs_990_2015" CSV_HEADERS: >- @@ -95,7 +95,7 @@ dag: task_id: "load_irs_990_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2015/data_output.csv"] diff --git a/datasets/irs_990/irs_990_2016/irs_990_2016_dag.py b/datasets/irs_990/irs_990_2016/irs_990_2016_dag.py index 7d8efe159..5f3daee8b 100644 --- a/datasets/irs_990/irs_990_2016/irs_990_2016_dag.py +++ b/datasets/irs_990/irs_990_2016/irs_990_2016_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/16eofinextract990.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_2016/data_output.csv", "PIPELINE_NAME": "irs_990_2016", "CSV_HEADERS": 
'["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsuppo
rt","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_2016_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_2016_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_2016/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_2016", diff --git a/datasets/irs_990/irs_990_2016/pipeline.yaml b/datasets/irs_990/irs_990_2016/pipeline.yaml index 96dda309c..425027f4f 100644 --- a/datasets/irs_990/irs_990_2016/pipeline.yaml +++ b/datasets/irs_990/irs_990_2016/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/16eofinextract990.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_2016/data_output.csv" PIPELINE_NAME: "irs_990_2016" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_irs_990_2016_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2016/data_output.csv"] diff --git a/datasets/irs_990/irs_990_2017/irs_990_2017_dag.py b/datasets/irs_990/irs_990_2017/irs_990_2017_dag.py index 8f5bf67a9..48385a922 100644 --- a/datasets/irs_990/irs_990_2017/irs_990_2017_dag.py +++ b/datasets/irs_990/irs_990_2017/irs_990_2017_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/17eofinextract990.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_2017/data_output.csv", "PIPELINE_NAME": "irs_990_2017", "CSV_HEADERS": 
'["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsuppo
rt","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_2017_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_2017_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_2017/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_2017", diff --git a/datasets/irs_990/irs_990_2017/pipeline.yaml b/datasets/irs_990/irs_990_2017/pipeline.yaml index 1c88867b0..25495f0a4 100644 --- a/datasets/irs_990/irs_990_2017/pipeline.yaml +++ b/datasets/irs_990/irs_990_2017/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/17eofinextract990.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_2017/data_output.csv" PIPELINE_NAME: "irs_990_2017" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_irs_990_2017_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2017/data_output.csv"] diff --git a/datasets/irs_990/irs_990_ez_2014/irs_990_ez_2014_dag.py b/datasets/irs_990/irs_990_ez_2014/irs_990_ez_2014_dag.py index 39d081e92..1590cb24d 100644 --- a/datasets/irs_990/irs_990_ez_2014/irs_990_ez_2014_dag.py +++ b/datasets/irs_990/irs_990_ez_2014/irs_990_ez_2014_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/14eofinextract990ez.zip", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_ez_2014/data_output.csv", "PIPELINE_NAME": "irs_990_ez_2014", "CSV_HEADERS": 
'["ein","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_ez_2014_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_ez_2014_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_ez_2014/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_ez_2014", diff --git a/datasets/irs_990/irs_990_ez_2014/pipeline.yaml b/datasets/irs_990/irs_990_ez_2014/pipeline.yaml index 3038f8c52..5d96bce5f 100644 --- a/datasets/irs_990/irs_990_ez_2014/pipeline.yaml +++ b/datasets/irs_990/irs_990_ez_2014/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/14eofinextract990ez.zip" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_ez_2014/data_output.csv" PIPELINE_NAME: "irs_990_ez_2014" CSV_HEADERS: >- @@ -97,7 +97,7 @@ dag: task_id: "load_irs_990_ez_2014_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2014/data_output.csv"] diff --git a/datasets/irs_990/irs_990_ez_2015/irs_990_ez_2015_dag.py b/datasets/irs_990/irs_990_ez_2015/irs_990_ez_2015_dag.py index d34114af8..b53c5b263 100644 --- a/datasets/irs_990/irs_990_ez_2015/irs_990_ez_2015_dag.py +++ b/datasets/irs_990/irs_990_ez_2015/irs_990_ez_2015_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/15eofinextractEZ.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_ez_2015/data_output.csv", "PIPELINE_NAME": "irs_990_ez_2015", "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_ez_2015_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_ez_2015_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_ez_2015/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_ez_2015", diff --git a/datasets/irs_990/irs_990_ez_2015/pipeline.yaml b/datasets/irs_990/irs_990_ez_2015/pipeline.yaml index b884f9fed..ebca122ad 100644 --- a/datasets/irs_990/irs_990_ez_2015/pipeline.yaml +++ b/datasets/irs_990/irs_990_ez_2015/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/15eofinextractEZ.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_ez_2015/data_output.csv" PIPELINE_NAME: "irs_990_ez_2015" CSV_HEADERS: >- @@ -97,7 +97,7 @@ dag: task_id: "load_irs_990_ez_2015_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2015/data_output.csv"] diff --git a/datasets/irs_990/irs_990_ez_2016/irs_990_ez_2016_dag.py b/datasets/irs_990/irs_990_ez_2016/irs_990_ez_2016_dag.py index b5b106864..5fb8cf1f3 100644 --- a/datasets/irs_990/irs_990_ez_2016/irs_990_ez_2016_dag.py +++ b/datasets/irs_990/irs_990_ez_2016/irs_990_ez_2016_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/16eofinextractez.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_ez_2016/data_output.csv", "PIPELINE_NAME": "irs_990_ez_2016", "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_ez_2016_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_ez_2016_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_ez_2016/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_ez_2016", diff --git a/datasets/irs_990/irs_990_ez_2016/pipeline.yaml b/datasets/irs_990/irs_990_ez_2016/pipeline.yaml index 0e375d39f..0d41943ca 100644 --- a/datasets/irs_990/irs_990_ez_2016/pipeline.yaml +++ b/datasets/irs_990/irs_990_ez_2016/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/16eofinextractez.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_ez_2016/data_output.csv" PIPELINE_NAME: "irs_990_ez_2016" CSV_HEADERS: >- @@ -97,7 +97,7 @@ dag: task_id: "load_irs_990_ez_2016_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2016/data_output.csv"] diff --git a/datasets/irs_990/irs_990_ez_2017/irs_990_ez_2017_dag.py b/datasets/irs_990/irs_990_ez_2017/irs_990_ez_2017_dag.py index 16d1cfa52..4a9d5ca92 100644 --- a/datasets/irs_990/irs_990_ez_2017/irs_990_ez_2017_dag.py +++ b/datasets/irs_990/irs_990_ez_2017/irs_990_ez_2017_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/17eofinextractEZ.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_ez_2017/data_output.csv", "PIPELINE_NAME": "irs_990_ez_2017", "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_ez_2017_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_ez_2017_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_ez_2017/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_ez_2017", diff --git a/datasets/irs_990/irs_990_ez_2017/pipeline.yaml b/datasets/irs_990/irs_990_ez_2017/pipeline.yaml index ef3fa9818..cd00bd0b6 100644 --- a/datasets/irs_990/irs_990_ez_2017/pipeline.yaml +++ b/datasets/irs_990/irs_990_ez_2017/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/17eofinextractEZ.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_ez_2017/data_output.csv" PIPELINE_NAME: "irs_990_ez_2017" CSV_HEADERS: >- @@ -97,7 +97,7 @@ dag: task_id: "load_irs_990_ez_2017_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2017/data_output.csv"] diff --git a/datasets/irs_990/irs_990_pf_2014/irs_990_pf_2014_dag.py b/datasets/irs_990/irs_990_pf_2014/irs_990_pf_2014_dag.py index 8f19dbcde..057bd6a73 100644 --- a/datasets/irs_990/irs_990_pf_2014/irs_990_pf_2014_dag.py +++ b/datasets/irs_990/irs_990_pf_2014/irs_990_pf_2014_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/14eofinextract990pf.zip", "SOURCE_FILE": "files/data.zip", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_pf_2014/data_output.csv", "PIPELINE_NAME": "irs_990_pf_2014", "CSV_HEADERS": '["ein","tax_prd","eostatus","tax_yr","operatingcd","subcd","fairmrktvalamt","grscontrgifts","schedbind","intrstrvnue","dividndsamt","grsrents","grsslspramt","costsold","grsprofitbus","otherincamt","totrcptperbks","compofficers","pensplemplbenf","legalfeesamt","accountingfees","interestamt","depreciationamt","occupancyamt","travlconfmtngs","printingpubl","topradmnexpnsa","contrpdpbks","totexpnspbks","excessrcpts","totrcptnetinc","topradmnexpnsb","totexpnsnetinc","netinvstinc","trcptadjnetinc","totexpnsadjnet","adjnetinc","topradmnexpnsd","totexpnsexempt","othrcashamt","invstgovtoblig","invstcorpstk","invstcorpbnd","totinvstsec","mrtgloans","othrinvstend","othrassetseoy","totassetsend","mrtgnotespay","othrliabltseoy","totliabend","tfundnworth","fairmrktvaleoy","totexcapgnls","totexcapgn","totexcapls","invstexcisetx","sec4940notxcd","sec4940redtxcd","sect511tx","subtitleatx","totaxpyr","esttaxcr","txwithldsrc","txpaidf2758","erronbkupwthld","estpnlty","taxdue","overpay","crelamt","infleg","actnotpr","chgnprvrptcd","filedf990tcd","contractncd","furnishcpycd","claimstatcd","cntrbtrstxyrcd","acqdrindrintcd","orgcmplypubcd","filedlf1041ind","propexchcd","brwlndmnycd","furngoodscd","paidcmpncd","transfercd","agremkpaycd","exceptactsind","prioractvcd","undistrinccd","applyprovind","dirindirintcd","excesshldcd","invstjexmptcd","prevjexmptcd","propgndacd","ipubelectcd","grntindivcd","nchrtygrntcd","nreligiouscd","excptransind","rfprsnlbnftind","pyprsnlbnftind","tfairmrktunuse","valncharitassets","cmpmininvstret","distribamt","undistribincyr","adjnetinccola","adjnetinccolb","adjnetinccolc","adjnetinccold","adjnetinctot","qlfydistriba","qlfydistribb","qlfydistribc","qlfydistribd","qlfydistribtot","valassetscola","valassetscolb","valassetscolc","valassetscold","valassetstot","qlfyasseta","qlfyassetb","qlfyassetc","qlfyassetd","qlfyassettot","endwmntscola","endwmntscolb","endwmntscolc","endwmntscold","endwmntstot","totsuprtcola","totsuprtcolb","totsuprtcolc","totsuprtcold","totsuprttot","pubsuprtcola","pubsuprtcolb","pubsuprtcolc","pubsuprtcold","pubsuprttot","grsinvstinca","grsinvstincb","grsinvstincc","grsinvstincd","grsinvstinctot","grntapprvfut","progsrvcacold","progsrvcacole","progsrvcbcold","progsrvcbcole","progsrvcccold","progsrvcccole","progsrvcdcold","progsrvcdcole","progsrvcecold","progsrvcecole","progsrvcfcold","progsrvcfcole","progsrvcgcold","progsrvcgcole","membershpduesd","membershpduese","intonsvngsd","intonsvngse","dvdndsintd","dvdndsinte","trnsfrcashcd","trnsothasstscd","salesasstscd","prchsasstscd","rentlsfacltscd","reimbrsmntscd","loansguarcd","perfservicescd","sharngasstscd"]', @@ -73,7 +73,7 @@ # Task to load CSV 
data to a BigQuery table load_irs_990_pf_2014_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_pf_2014_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_pf_2014/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_pf_2014", diff --git a/datasets/irs_990/irs_990_pf_2014/pipeline.yaml b/datasets/irs_990/irs_990_pf_2014/pipeline.yaml index eb3fbc1ff..cd07da6d0 100644 --- a/datasets/irs_990/irs_990_pf_2014/pipeline.yaml +++ b/datasets/irs_990/irs_990_pf_2014/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/14eofinextract990pf.zip" SOURCE_FILE: "files/data.zip" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_pf_2014/data_output.csv" PIPELINE_NAME: "irs_990_pf_2014" CSV_HEADERS: >- @@ -95,7 +95,7 @@ dag: task_id: "load_irs_990_pf_2014_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_pf_2014/data_output.csv"] diff --git a/datasets/irs_990/irs_990_pf_2015/irs_990_pf_2015_dag.py b/datasets/irs_990/irs_990_pf_2015/irs_990_pf_2015_dag.py index 858232e7f..e6f0011cf 100644 --- a/datasets/irs_990/irs_990_pf_2015/irs_990_pf_2015_dag.py +++ b/datasets/irs_990/irs_990_pf_2015/irs_990_pf_2015_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/15eofinextract990pf.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_pf_2015/data_output.csv", "PIPELINE_NAME": "irs_990_pf_2015", "CSV_HEADERS": 
'["ein","elf","tax_prd","eostatus","tax_yr","operatingcd","subcd","fairmrktvalamt","grscontrgifts","schedbind","intrstrvnue","dividndsamt","grsrents","grsslspramt","costsold","grsprofitbus","otherincamt","totrcptperbks","compofficers","pensplemplbenf","legalfeesamt","accountingfees","interestamt","depreciationamt","occupancyamt","travlconfmtngs","printingpubl","topradmnexpnsa","contrpdpbks","totexpnspbks","excessrcpts","totrcptnetinc","topradmnexpnsb","totexpnsnetinc","netinvstinc","trcptadjnetinc","totexpnsadjnet","adjnetinc","topradmnexpnsd","totexpnsexempt","othrcashamt","invstgovtoblig","invstcorpstk","invstcorpbnd","totinvstsec","mrtgloans","othrinvstend","othrassetseoy","totassetsend","mrtgnotespay","othrliabltseoy","totliabend","tfundnworth","fairmrktvaleoy","totexcapgnls","totexcapgn","totexcapls","invstexcisetx","sec4940notxcd","sec4940redtxcd","sect511tx","subtitleatx","totaxpyr","esttaxcr","txwithldsrc","txpaidf2758","erronbkupwthld","estpnlty","taxdue","overpay","crelamt","infleg","actnotpr","chgnprvrptcd","filedf990tcd","contractncd","furnishcpycd","claimstatcd","cntrbtrstxyrcd","distribdafcd","orgcmplypubcd","filedlf1041ind","propexchcd","brwlndmnycd","furngoodscd","paidcmpncd","transfercd","agremkpaycd","exceptactsind","prioractvcd","undistrinccd","applyprovind","dirindirintcd","excesshldcd","invstjexmptcd","prevjexmptcd","propgndacd","ipubelectcd","grntindivcd","nchrtygrntcd","nreligiouscd","excptransind","rfprsnlbnftind","pyprsnlbnftind","tfairmrktunuse","valncharitassets","cmpmininvstret","distribamt","undistribincyr","adjnetinccola","adjnetinccolb","adjnetinccolc","adjnetinccold","adjnetinctot","qlfydistriba","qlfydistribb","qlfydistribc","qlfydistribd","qlfydistribtot","valassetscola","valassetscolb","valassetscolc","valassetscold","valassetstot","qlfyasseta","qlfyassetb","qlfyassetc","qlfyassetd","qlfyassettot","endwmntscola","endwmntscolb","endwmntscolc","endwmntscold","endwmntstot","totsuprtcola","totsuprtcolb","totsuprtcolc","totsuprtcold","totsuprttot","pubsuprtcola","pubsuprtcolb","pubsuprtcolc","pubsuprtcold","pubsuprttot","grsinvstinca","grsinvstincb","grsinvstincc","grsinvstincd","grsinvstinctot","grntapprvfut","progsrvcacold","progsrvcacole","progsrvcbcold","progsrvcbcole","progsrvcccold","progsrvcccole","progsrvcdcold","progsrvcdcole","progsrvcecold","progsrvcecole","progsrvcfcold","progsrvcfcole","progsrvcgcold","progsrvcgcole","membershpduesd","membershpduese","intonsvngsd","intonsvngse","dvdndsintd","dvdndsinte","trnsfrcashcd","trnsothasstscd","salesasstscd","prchsasstscd","rentlsfacltscd","reimbrsmntscd","loansguarcd","perfservicescd","sharngasstscd"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_pf_2015_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_pf_2015_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_pf_2015/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_pf_2015", diff --git a/datasets/irs_990/irs_990_pf_2015/pipeline.yaml b/datasets/irs_990/irs_990_pf_2015/pipeline.yaml index 8e623a050..25ef8bc50 100644 --- a/datasets/irs_990/irs_990_pf_2015/pipeline.yaml +++ b/datasets/irs_990/irs_990_pf_2015/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/15eofinextract990pf.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ 
var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_pf_2015/data_output.csv" PIPELINE_NAME: "irs_990_pf_2015" CSV_HEADERS: >- @@ -95,7 +95,7 @@ dag: task_id: "load_irs_990_pf_2015_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_pf_2015/data_output.csv"] diff --git a/datasets/irs_990/irs_990_pf_2016/irs_990_pf_2016_dag.py b/datasets/irs_990/irs_990_pf_2016/irs_990_pf_2016_dag.py index d81fd4779..40a2201cd 100644 --- a/datasets/irs_990/irs_990_pf_2016/irs_990_pf_2016_dag.py +++ b/datasets/irs_990/irs_990_pf_2016/irs_990_pf_2016_dag.py @@ -61,7 +61,7 @@ "SOURCE_URL": "https://www.irs.gov/pub/irs-soi/16eofinextract990pf.dat", "SOURCE_FILE": "files/data.dat", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/irs_990/irs_990_pf_2016/data_output.csv", "PIPELINE_NAME": "irs_990_pf_2016", "CSV_HEADERS": '["ein","elf","tax_prd","eostatus","tax_yr","operatingcd","subcd","fairmrktvalamt","grscontrgifts","schedbind","intrstrvnue","dividndsamt","grsrents","grsslspramt","costsold","grsprofitbus","otherincamt","totrcptperbks","compofficers","pensplemplbenf","legalfeesamt","accountingfees","interestamt","depreciationamt","occupancyamt","travlconfmtngs","printingpubl","topradmnexpnsa","contrpdpbks","totexpnspbks","excessrcpts","totrcptnetinc","topradmnexpnsb","totexpnsnetinc","netinvstinc","trcptadjnetinc","totexpnsadjnet","adjnetinc","topradmnexpnsd","totexpnsexempt","othrcashamt","invstgovtoblig","invstcorpstk","invstcorpbnd","totinvstsec","mrtgloans","othrinvstend","othrassetseoy","totassetsend","mrtgnotespay","othrliabltseoy","totliabend","tfundnworth","fairmrktvaleoy","totexcapgnls","totexcapgn","totexcapls","invstexcisetx","sec4940notxcd","sec4940redtxcd","sect511tx","subtitleatx","totaxpyr","esttaxcr","txwithldsrc","txpaidf2758","erronbkupwthld","estpnlty","taxdue","overpay","crelamt","infleg","actnotpr","chgnprvrptcd","filedf990tcd","contractncd","furnishcpycd","claimstatcd","cntrbtrstxyrcd","distribdafcd","orgcmplypubcd","filedlf1041ind","propexchcd","brwlndmnycd","furngoodscd","paidcmpncd","transfercd","agremkpaycd","exceptactsind","prioractvcd","undistrinccd","applyprovind","dirindirintcd","excesshldcd","invstjexmptcd","prevjexmptcd","propgndacd","ipubelectcd","grntindivcd","nchrtygrntcd","nreligiouscd","excptransind","rfprsnlbnftind","pyprsnlbnftind","tfairmrktunuse","valncharitassets","cmpmininvstret","distribamt","undistribincyr","adjnetinccola","adjnetinccolb","adjnetinccolc","adjnetinccold","adjnetinctot","qlfydistriba","qlfydistribb","qlfydistribc","qlfydistribd","qlfydistribtot","valassetscola","valassetscolb","valassetscolc","valassetscold","valassetstot","qlfyasseta","qlfyassetb","qlfyassetc","qlfyassetd","qlfyassettot","endwmntscola","endwmntscolb","endwmntscolc","endwmntscold","endwmntstot","totsuprtcola","totsuprtcolb","totsuprtcolc","totsuprtcold","totsuprttot","pubsuprtcola","pubsuprtcolb","pubsuprtcolc","pubsuprtcold","pubsuprttot","grsinvstinca","grsinvstincb","grsinvstincc","grsinvstincd","grsinvstinctot","grntapprvfut","progsrvcacold","progsrvcacole","progsrvcbcold","progsrvcbcole","progsrvcccold","progsrvcccole","progsrvcdcold","progsrvcdcole","progsrvcecold","progsrvcecole","progsrvcfcold","progsrvcfcole","progsrvcgcold","progsrvcgcole","membershpd
uesd","membershpduese","intonsvngsd","intonsvngse","dvdndsintd","dvdndsinte","trnsfrcashcd","trnsothasstscd","salesasstscd","prchsasstscd","rentlsfacltscd","reimbrsmntscd","loansguarcd","perfservicescd","sharngasstscd"]', @@ -73,7 +73,7 @@ # Task to load CSV data to a BigQuery table load_irs_990_pf_2016_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_irs_990_pf_2016_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/irs_990/irs_990_pf_2016/data_output.csv"], source_format="CSV", destination_project_dataset_table="irs_990.irs_990_pf_2016", diff --git a/datasets/irs_990/irs_990_pf_2016/pipeline.yaml b/datasets/irs_990/irs_990_pf_2016/pipeline.yaml index 2d69b4dbc..ece4ccfae 100644 --- a/datasets/irs_990/irs_990_pf_2016/pipeline.yaml +++ b/datasets/irs_990/irs_990_pf_2016/pipeline.yaml @@ -75,7 +75,7 @@ dag: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/16eofinextract990pf.dat" SOURCE_FILE: "files/data.dat" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/irs_990/irs_990_pf_2016/data_output.csv" PIPELINE_NAME: "irs_990_pf_2016" CSV_HEADERS: >- @@ -96,7 +96,7 @@ dag: task_id: "load_irs_990_pf_2016_to_bq" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_pf_2016/data_output.csv"] diff --git a/datasets/noaa/gsod_stations/gsod_stations_dag.py b/datasets/noaa/gsod_stations/gsod_stations_dag.py index 752f9b6d8..cdf7777d8 100644 --- a/datasets/noaa/gsod_stations/gsod_stations_dag.py +++ b/datasets/noaa/gsod_stations/gsod_stations_dag.py @@ -63,7 +63,7 @@ "FTP_FILENAME": "isd-history.txt", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/noaa/gsod_stations/data_output.csv", }, resources={"limit_memory": "2G", "limit_cpu": "1"}, @@ -72,7 +72,7 @@ # Task to load CSV data to a BigQuery table load_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/noaa/gsod_stations/data_output.csv"], source_format="CSV", destination_project_dataset_table="noaa.gsod_stations", diff --git a/datasets/noaa/gsod_stations/pipeline.yaml b/datasets/noaa/gsod_stations/pipeline.yaml index 3d4aabb6c..72c8a939f 100644 --- a/datasets/noaa/gsod_stations/pipeline.yaml +++ b/datasets/noaa/gsod_stations/pipeline.yaml @@ -77,7 +77,7 @@ dag: FTP_FILENAME: "isd-history.txt" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/noaa/gsod_stations/data_output.csv" @@ -93,7 +93,7 @@ dag: task_id: "load_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/noaa/gsod_stations/data_output.csv"] diff --git a/datasets/noaa/lightning_strikes_by_year/lightning_strikes_by_year_dag.py b/datasets/noaa/lightning_strikes_by_year/lightning_strikes_by_year_dag.py index ff02bc581..b113ed4c5 100644 --- a/datasets/noaa/lightning_strikes_by_year/lightning_strikes_by_year_dag.py +++ b/datasets/noaa/lightning_strikes_by_year/lightning_strikes_by_year_dag.py @@ -60,7 +60,7 @@ "SOURCE_URL": "https://www1.ncdc.noaa.gov/pub/data/swdi/database-csv/v2/nldn-tiles-{{ macros.ds_format(macros.ds_add(ds, -365), '%Y-%m-%d', '%Y') }}.csv.gz", "SOURCE_FILE": "files/data.csv", "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.json.shared.composer_bucket }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", "TARGET_GCS_PATH": "data/noaa/lightning_strikes_by_year/data_output.csv", }, resources={"limit_memory": "2G", "limit_cpu": "1"}, @@ -69,7 +69,7 @@ # Task to load CSV data to a BigQuery table load_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id="load_to_bq", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/noaa/lightning_strikes_by_year/data_output.csv"], source_format="CSV", destination_project_dataset_table="noaa.lightning_strikes_{{ macros.ds_format(macros.ds_add(ds, -365), \u0027%Y-%m-%d\u0027, \u0027%Y\u0027) }}", diff --git a/datasets/noaa/lightning_strikes_by_year/pipeline.yaml b/datasets/noaa/lightning_strikes_by_year/pipeline.yaml index ddb58e766..13b657583 100644 --- a/datasets/noaa/lightning_strikes_by_year/pipeline.yaml +++ b/datasets/noaa/lightning_strikes_by_year/pipeline.yaml @@ -74,7 +74,7 @@ dag: SOURCE_URL: "https://www1.ncdc.noaa.gov/pub/data/swdi/database-csv/v2/nldn-tiles-{{ macros.ds_format(macros.ds_add(ds, -365), '%Y-%m-%d', '%Y') }}.csv.gz" SOURCE_FILE: "files/data.csv" TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.json.shared.composer_bucket }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" TARGET_GCS_PATH: "data/noaa/lightning_strikes_by_year/data_output.csv" @@ -90,7 +90,7 @@ dag: task_id: "load_to_bq" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/noaa/lightning_strikes_by_year/data_output.csv"] diff --git a/datasets/usa_names/usa_1910_current/pipeline.yaml b/datasets/usa_names/usa_1910_current/pipeline.yaml index 53153ffca..cc4766f53 100644 --- a/datasets/usa_names/usa_1910_current/pipeline.yaml +++ b/datasets/usa_names/usa_1910_current/pipeline.yaml @@ -52,7 +52,7 @@ dag: description: "Task to load the data from Airflow data folder to BigQuery" args: task_id: "load_csv_file_to_bq_table" - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" source_objects: ["data/usa_names/usa_1910_current/{{ ds }}/data.csv"] source_format: "CSV" destination_project_dataset_table: "usa_names.usa_1910_current" diff --git a/datasets/usa_names/usa_1910_current/usa_1910_current_dag.py b/datasets/usa_names/usa_1910_current/usa_1910_current_dag.py index eb00496a7..49cd07e26 100644 --- a/datasets/usa_names/usa_1910_current/usa_1910_current_dag.py +++ b/datasets/usa_names/usa_1910_current/usa_1910_current_dag.py @@ -46,7 +46,7 @@ # Task to load the data from Airflow data folder to BigQuery load_csv_file_to_bq_table = gcs_to_bigquery.GCSToBigQueryOperator( task_id="load_csv_file_to_bq_table", - bucket="{{ var.json.shared.composer_bucket }}", + bucket="{{ var.value.composer_bucket }}", source_objects=["data/usa_names/usa_1910_current/{{ ds }}/data.csv"], source_format="CSV", destination_project_dataset_table="usa_names.usa_1910_current", diff --git a/samples/pipeline.airflow1.yaml b/samples/pipeline.airflow1.yaml index 30261d145..8a8f797ab 100644 --- a/samples/pipeline.airflow1.yaml +++ b/samples/pipeline.airflow1.yaml @@ -120,7 +120,7 @@ dag: mkdir -p $airflow_home/data/$dataset/$pipeline/run_date={{ ds }} CUSTOM_ENV_VAR=$custom_env_var python $airflow_home/dags/$dataset/$pipeline/custom/some_script.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: DATASET_FOLDER_NAME pipeline: PIPELINE_FOLDER_NAME custom_env_var: "some value that your custom script needs" @@ -139,7 +139,7 @@ dag: task_id: "sample_gcs_to_bq_task" # The GCS bucket where the CSV file is located in. - bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/DATASET_FOLDER_NAME/PIPELINE_FOLDER_NAME/run_date={{ ds }}/data.csv"] @@ -185,7 +185,7 @@ dag: task_id: "sample_gcs_to_gcs_task" # The GCS bucket to copy the object/s from - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" # Use a trailing "/*" if you want to copy all objects under that path. source_object: "data/DATASET_FOLDER_NAME/PIPELINE_FOLDER_NAME/run_date={{ ds }}/*" diff --git a/samples/pipeline.yaml b/samples/pipeline.yaml index b9b522ee7..f6119bb9c 100644 --- a/samples/pipeline.yaml +++ b/samples/pipeline.yaml @@ -112,7 +112,7 @@ dag: mkdir -p $airflow_home/data/$dataset/$pipeline/run_date={{ ds }} CUSTOM_ENV_VAR=$custom_env_var python $airflow_home/dags/$dataset/$pipeline/custom/some_script.py env: - airflow_home: "{{ var.json.shared.airflow_home }}" + airflow_home: "{{ var.value.airflow_home }}" dataset: DATASET_FOLDER_NAME pipeline: PIPELINE_FOLDER_NAME custom_env_var: "some value that your custom script needs" @@ -130,7 +130,7 @@ dag: task_id: "sample_gcs_to_bq_task" # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.json.shared.composer_bucket }}" + bucket: "{{ var.value.composer_bucket }}" # The GCS object path for the CSV file source_objects: ["data/DATASET_FOLDER_NAME/PIPELINE_FOLDER_NAME/run_date={{ ds }}/data.csv"] @@ -176,7 +176,7 @@ dag: task_id: "sample_gcs_to_gcs_task" # The GCS bucket to copy the object/s from - source_bucket: "{{ var.json.shared.composer_bucket }}" + source_bucket: "{{ var.value.composer_bucket }}" # Use a trailing "/*" if you want to copy all objects under that path. source_object: "data/DATASET_FOLDER_NAME/PIPELINE_FOLDER_NAME/run_date={{ ds }}/*" @@ -360,8 +360,8 @@ dag: args: task_id: "gke_start_pod_task" - project_id: "{{ var.json.shared.gcp_project_id }}" - location: "{{ var.json.shared.gcp_location }}" + project_id: "{{ var.value.gcp_project_id }}" + location: "{{ var.value.gcp_location }}" # The name of the Google Kubernetes Engine cluster the pod should be spawned in cluster_name: "GKE_CLUSTER_NAME" @@ -390,8 +390,8 @@ dag: args: task_id: "gke_delete_cluster_task" - project_id: "{{ var.json.shared.gcp_project_id }}" - location: "{{ var.json.shared.gcp_location }}" + project_id: "{{ var.value.gcp_project_id }}" + location: "{{ var.value.gcp_location }}" # The GKE cluster name name: "sample-gke-cluster" @@ -404,8 +404,8 @@ dag: args: task_id: "gke_create_cluster_task" - project_id: "{{ var.json.shared.gcp_project_id }}" - location: "{{ var.json.shared.gcp_location }}" + project_id: "{{ var.value.gcp_project_id }}" + location: "{{ var.value.gcp_location }}" # The cluster definition to create, # see https://googleapis.dev/python/container/latest/container_v1/types.html#google.cloud.container_v1.types.Cluster @@ -440,7 +440,7 @@ dag: # Path to the Python script containing the Apache Beam Pipeline definition. # Place the script in your pipeline's `custom` folder, preferrably namespaced # under a folder named after the job name (templated as `DATAFLOW_JOB_NAME`). - py_file: "{{ var.json.shared.airflow_dags_folder }}/DATASET_NAME/PIPELINE_NAME/custom/DATAFLOW_JOB_NAME/main.py" + py_file: "{{ var.value.airflow_dags_folder }}/DATASET_NAME/PIPELINE_NAME/custom/DATAFLOW_JOB_NAME/main.py" # Python version of the Beam pipeline. If None, this defaults to python3. py_interpreter: "python3" @@ -455,22 +455,22 @@ dag: dataflow_default_options: # Your Google Cloud project ID - project: "{{ var.json.shared.gcp_project }}" + project: "{{ var.value.gcp_project }}" # [Required] Path for temporary files. # You can use the Composer bucket (preferred) or your own bucket. # When using the Cloud Composer bucket, use the templated path below to prevent collisions. - temp_location: "{{ var.json.shared.composer_bucket }}/data/DATASET_NAME/PIPELINE_NAME/dataflow/tmp/DATAFLOW_JOB_NAME/" + temp_location: "{{ var.value.composer_bucket }}/data/DATASET_NAME/PIPELINE_NAME/dataflow/tmp/DATAFLOW_JOB_NAME/" # The GCS bucket path to store staging binary files used by Apache Beam # You can use the Composer bucket (preferred) or your own bucket. # When using the Cloud Composer bucket, use the templated path below to prevent collisions. - staging_location: "{{ var.json.shared.composer_bucket }}/data/DATASET_NAME/PIPELINE_NAME/dataflow/staging/" + staging_location: "{{ var.value.composer_bucket }}/data/DATASET_NAME/PIPELINE_NAME/dataflow/staging/" # [Optional] If your pipeline uses public packages from the PyPI, make # these packages available via a requirements.txt file. Create the file # in the `custom` folder, in the same subfolder as your `main.py`. 
- requirements_file: "{{ var.json.shared.airflow_dags_folder }}/DATASET_NAME/PIPELINE_NAME/custom/DATAFLOW_JOB_NAME/requirements.txt" + requirements_file: "{{ var.value.airflow_dags_folder }}/DATASET_NAME/PIPELINE_NAME/custom/DATAFLOW_JOB_NAME/requirements.txt" # [Required] The pipeline runner to use. Must be set to "DataflowRunner" to run on # Google Cloud Dataflow. Keep this unchanged.