name: Update combined datasets

# Use a concurrency group to make sure we don't try to have multiple workflows
# run with the hosted runner at the same time.
concurrency: gce-runner

on:
  # 2023/01/17: We no longer have a run scheduled via github actions and instead rely on the prefect
  # scrapers to kick off a build after NYT updates each day.
  # https://github.com/covid-projections/can-scrapers/blob/643819f7a42d0ae227453b1de24a317a54c6cde8/services/prefect/flows/scheduled_nyt_and_parquet_updater.py#L37
  #
  # schedule:
  #   # Run the job every day at 5:00 am EST.
  #   - cron: '0 10 * * *'
  workflow_dispatch:
    inputs:
      trigger_api_build:
        description: 'If "true", an API snapshot build will be triggered after the dataset update.'
        default: 'true'
      refresh_datasets:
        description: 'Set to "false" to skip downloading / re-combining the latest datasets.'
        default: 'true'
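  # repository_dispatch lets an external service trigger this workflow through the GitHub API;
  # presumably this is how the prefect scrapers mentioned above kick off the daily run.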
  repository_dispatch:

env:
  # Used by python code that reports errors to Sentry.
  SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
  SENTRY_ENVIRONMENT: 'production'
  # Use a webhook to write to the dev-alerts Slack channel for QA.
  SLACK_DEV_ALERTS_WEBHOOK: ${{ secrets.SLACK_DEV_ALERTS_WEBHOOK }}
  DATA_AVAILABILITY_SHEET_NAME: "Data Availability"
  GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA: ${{ secrets.GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA }}
  # Use trigger_api_build if specified, else use true. This automatically triggers
  # the API build on scheduled runs where trigger_api_build is not defined.
  # https://github.community/t/how-can-you-use-expressions-as-the-workflow-dispatch-input-default/141454/4
  TRIGGER_API_BUILD: ${{ github.event.inputs.trigger_api_build || 'true' }}
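  # Map the refresh_datasets input onto the flag that `./run.py data update` expects.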
  REFRESH_DATASETS_ARG: ${{ (github.event.inputs.refresh_datasets == 'true') && '--refresh-datasets' || '--no-refresh-datasets' }}
  # The GCE instance to start / stop before / after running the job.
  GCE_ZONE: "us-west1-b"
  GCE_INSTANCE: "can-actions-runner"

jobs:
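  # Boot the GCE VM that hosts the self-hosted "gce-runner" used by the dataset update job below.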
  start-runner:
    runs-on: ubuntu-latest
    steps:
      - id: "auth"
        uses: "google-github-actions/auth@v1"
        with:
          credentials_json: "${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}"
      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v1"
      - name: "Start ${{env.GCE_INSTANCE}} VM."
        run: "gcloud compute instances start --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}"
  update-and-promote-datasets:
    needs: "start-runner"
    runs-on: gce-runner
    steps:
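      # GITHUB_REF_NAME is the branch (or tag) this workflow was run from; record it so the
      # matching branch of covid-data-model is checked out below.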
      - name: Parse covid data model branch name and set env variable
        run: |
          echo "COVID_DATA_MODEL_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV
      - name: Checkout covid-data-model
        uses: actions/checkout@v2
        with:
          repository: act-now-coalition/covid-data-model
          path: covid-data-model
          ref: '${{env.COVID_DATA_MODEL_REF}}'
          lfs: true
      - name: Update NYTimes anomalies file
        working-directory: ./covid-data-model
        run: |
          curl https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/anomalies.csv --output data/nyt_anomalies.csv
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.10'
      - name: Cache Pip
        uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          # requirements.txt lives inside the covid-data-model checkout, so hash it there.
          key: ${{ runner.os }}-pip-${{ hashFiles('covid-data-model/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
            ${{ runner.os }}-
      - name: Install Dependencies
        working-directory: ./covid-data-model
        run: pip install -r requirements.txt
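      # git lfs prune deletes old, unreferenced LFS files from the local checkout, which keeps
      # disk usage down on the persistent self-hosted runner.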
      - name: Prune covid-data-model
        working-directory: ./covid-data-model
        run: git lfs prune
      - name: Update and Promote dataset
        working-directory: ./covid-data-model
        run: |
          ./run.py data update ${{env.REFRESH_DATASETS_ARG}}
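      # tools/push-data-update.sh commits and pushes the regenerated dataset files
      # (see the script for details).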
      - name: Create Update Commit
        working-directory: ./covid-data-model
        run: ./tools/push-data-update.sh
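      # Only runs when TRIGGER_API_BUILD is 'true'; uses the CAN robot token to start an API
      # snapshot build against main.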
      - name: Maybe Trigger API build
        if: env.TRIGGER_API_BUILD == 'true'
        working-directory: ./covid-data-model
        env:
          GITHUB_TOKEN: ${{ secrets.CAN_ROBOT_PERSONAL_ACCESS_TOKEN }}
        run: |
          ./tools/build-snapshot.sh main
      - name: Slack notification (API build started)
        if: env.TRIGGER_API_BUILD == 'true'
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
        uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
        with:
          args: 'Started new API build from dataset updater action.'
      # TODO(https://trello.com/c/4dFFtQiH/1239-fix-data-availability-report-or-officially-replace-it-with-toms-dashboard)
      # - name: Update Data Availability Sheet
      #   working-directory: ./covid-data-model
      #   run: |
      #     ./run.py data update-availability-report
      - name: Slack notification (failure)
        if: failure()
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
          STATUS: ${{job.status}}
        uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
        with:
          args: 'update-dataset-snapshot failed'
      - name: Slack notification (success)
        if: success()
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
          STATUS: ${{job.status}}
          DATA_AVAILABILITY_URL: http://tiny.cc/can-data
        uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
        with:
          args: 'update-dataset-snapshot succeeded. View Data Availability Report at {{DATA_AVAILABILITY_URL}}'
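
  # Always stop the GCE VM, even if the dataset update failed, so the instance isn't left
  # running between workflow runs.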
  stop-runner:
    if: ${{ always() }}
    needs: ["start-runner", "update-and-promote-datasets"]
    runs-on: ubuntu-latest
    steps:
      - id: "auth"
        uses: "google-github-actions/auth@v1"
        with:
          credentials_json: "${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}"
      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v1"
      - name: "Stop ${{env.GCE_INSTANCE}} VM."
        run: "gcloud compute instances stop --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}"