datasets/covid19_tracking/state_screenshots/pipeline.yaml

# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
resources:
  - type: bigquery_table
    table_id: state_screenshots

dag:
  airflow_version: 1
  initialize:
    dag_id: "state_screenshots"
    default_args:
      owner: "Google"

      # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded
      depends_on_past: False
      start_date: '2021-03-01'
    max_active_runs: 1
    schedule_interval: "@once"  # run once a week at Sunday 12am
    catchup: False
    default_view: graph

  tasks:
    - operator: "BashOperator"
      description: "Run the custom/generate_csv.py script to scrape the webpage and generate a CSV file of the state screenshots"
      args:
        task_id: "generate_csv_data_from_web_scraping"
        bash_command: |
          mkdir -p $airflow_home/data/$dataset/$pipeline/run_date={{ ds }}
          SOURCE_URL=$source_url CSV_OUTPUT_PATH=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }}/data.csv GCS_PATH_PREFIX="gs://$destination_bucket/datasets/$dataset/$pipeline/run_date={{ ds }}/screenshots" python $airflow_home/dags/$dataset/$pipeline/custom/web_scrape_and_generate_csv.py
        env:
          airflow_home: "{{ var.json.shared.airflow_home }}"
          destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}"
          source_url: "https://screenshots.covidtracking.com"
          dataset: "covid19_tracking"
          pipeline: "state_screenshots"

    - operator: "BashOperator"
      description: "Run the custom/download_screenshots.py script to download all the screenshots to the local file system (mounted GCS)"
      args:
        task_id: "download_screenshots"
        bash_command: |
          CSV_PATH=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }}/data.csv \
          SOURCE_COLUMN="source_url" \
          DOWNLOAD_PREFIX=$airflow_home/data/$dataset/$pipeline/run_date={{ ds }} \
          python $airflow_home/dags/$dataset/$pipeline/custom/download_screenshots.py
        env:
          airflow_home: "{{ var.json.shared.airflow_home }}"
          dataset: "covid19_tracking"
          pipeline: "state_screenshots"

    - operator: "GoogleCloudStorageToGoogleCloudStorageOperator"
      description: "Upload all downloaded screenshots to the destination bucket"
      args:
        task_id: "upload_screenshots_to_destination_bucket"
        source_bucket: "{{ var.json.shared.composer_bucket }}"
        source_object: "data/covid19_tracking/state_screenshots/run_date={{ ds }}/*"
        destination_bucket: "{{ var.json.covid19_tracking.destination_bucket }}"
        destination_object: "datasets/covid19_tracking/state_screenshots/run_date={{ ds }}/"
        move_object: False

    - operator: "GoogleCloudStorageToBigQueryOperator"
      description: "Task to load the data from Airflow data folder to BigQuery"
      args:
        task_id: "load_screenshots_to_bq_table"
        bucket: "{{ var.json.shared.composer_bucket }}"
        source_objects: ["data/covid19_tracking/state_screenshots/run_date={{ ds }}/data.csv"]
        source_format: "CSV"
        destination_project_dataset_table: "covid19_tracking.state_screenshots"
        skip_leading_rows: 1
        write_disposition: "WRITE_TRUNCATE"
        schema_fields:
          - name: "state"
            type: "STRING"
            mode: "REQUIRED"
            description: "2-letter postal abbreviation for the state"
          - name: "state_name"
            type: "STRING"
            mode: "REQUIRED"
            description: "The full state name"
          - name: "date"
            type: "DATE"
            mode: "REQUIRED"
            description: "Date of the observations"
          - name: "source_type"
            type: "STRING"
            mode: "REQUIRED"
            description: "How far from the original event or phenomenon the information source is created. Can be one of primary, secondary, tertiary, quaternary"
          - name: "time_of_day"
            type: "STRING"
            mode: "REQUIRED"
            description: "The time of day, based on a 12-hour clock in Eastern Time"
          - name: "source_url"
            type: "STRING"
            mode: "REQUIRED"
            description: "The URL where the screenshot or file was originally found"
          - name: "google_cloud_storage_uri"
            type: "STRING"
            mode: "REQUIRED"
            description: "The GCS location where the screenshot or file was copied to"

    - operator: "GoogleCloudStorageDeleteOperator"
      description: "Delete downloaded screenshots from the Cloud Composer bucket"
      args:
        task_id: "delete_screenshots_from_composer_bucket"
        bucket_name: "{{ var.json.shared.composer_bucket }}"
        prefix: "data/covid19_tracking/state_screenshots/run_date={{ ds }}"

  graph_paths:
    - "generate_csv_data_from_web_scraping >> download_screenshots"
    - "download_screenshots >> upload_screenshots_to_destination_bucket"
    - "upload_screenshots_to_destination_bucket >> load_screenshots_to_bq_table"
    - "load_screenshots_to_bq_table >> delete_screenshots_from_composer_bucket"