# datasets/ml_datasets/penguins/pipeline.yaml

# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
resources:
  # GCP resources that are unique and specific to your pipeline.
  #
  # The currently supported resources are shown below. Use only the resources
  # needed by your pipeline, and delete the rest of the examples.
  #
  # More Google Cloud resources will be added to the list below over time. If
  # a resource you need isn't supported, please file an issue on the
  # repository.

  - type: bigquery_table
    # A Google BigQuery table to store your data. Requires a `bigquery_dataset`
    # to be specified in the config (i.e. `dataset.yaml`) for the dataset that
    # this pipeline belongs to.
    #
    # Required properties:
    #   table_id
    table_id: penguins

dag:
  # DAG stands for directed acyclic graph. This block describes your data
  # pipeline, along with every property and configuration it needs to onboard
  # your data.
  # NOTE(review): presumably the Airflow major version this DAG is generated
  # for - confirm against the DAG generator.
  airflow_version: 1
  initialize:
    dag_id: penguins
    default_args:
      owner: "Google"

      # When set to true, keeps a task from getting triggered if the previous
      # schedule for the task hasn't succeeded.
      depends_on_past: false

      # Quoted so the value stays a string instead of being parsed as a date.
      start_date: "2021-03-01"
    max_active_runs: 1

    # Quoted because `@` is a reserved indicator at the start of a plain
    # YAML scalar.
    schedule_interval: "@once"
    catchup: false
    default_view: graph

  tasks:
    # This is where you specify the tasks (a.k.a. processes) that your data
    # pipeline will run to onboard the data.
    #
    # As the example below shows, every task must be represented by an
    # Airflow operator. The supported operators are listed in
    #
    #   scripts/dag_imports.json
    #
    # If an operator you need isn't supported, please file an issue on the
    # repository.
    #
    # Use the YAML list syntax in this block to specify every task for your
    # pipeline.

    - operator: "GoogleCloudStorageToBigQueryOperator"
      # Initializes the GCS to BQ task for the DAG. This operator is used to
      # load CSV files from GCS into a BigQuery table.

      # Task description
      description: "Task to load CSV data to a BigQuery table"

      args:
        # Arguments supported by this operator:
        # http://airflow.apache.org/docs/apache-airflow/1.10.14/howto/operator/gcp/gcs.html#googlecloudstoragetobigqueryoperator

        task_id: "penguins_gcs_to_bq_task"

        # The GCS bucket where the CSV files are located.
        bucket: "cloud-samples-data"

        # The GCS object paths of the CSV files to load.
        source_objects:
          - "ai-platform/penguins/penguins.data.csv"
          - "ai-platform/penguins/penguins.test.csv"
        source_format: "CSV"
        destination_project_dataset_table: "ml_datasets.penguins"

        # Number of rows to skip at the top of the CSV data. Use this if your
        # CSV files contain a header row.
        skip_leading_rows: 1

        # How to write data to the table: overwrite, append, or write if empty
        # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition
        write_disposition: "WRITE_TRUNCATE"

        # The BigQuery table schema based on the CSV files. For more info, see
        # https://cloud.google.com/bigquery/docs/schemas.
        # Always use snake_case and lowercase for column names, and be explicit,
        # i.e. specify modes for all columns.
        schema_fields:
          - name: "species"
            type: "STRING"
            mode: "REQUIRED"
          - name: "island"
            type: "STRING"
            mode: "NULLABLE"
          - name: "culmen_length_mm"
            type: "FLOAT"
            mode: "NULLABLE"
          - name: "culmen_depth_mm"
            type: "FLOAT"
            mode: "NULLABLE"
          - name: "flipper_length_mm"
            type: "FLOAT"
            mode: "NULLABLE"
          - name: "body_mass_g"
            type: "FLOAT"
            mode: "NULLABLE"
          - name: "sex"
            type: "STRING"
            mode: "NULLABLE"

  graph_paths:
    # This is where you specify the relationships (i.e. directed paths/edges)
    # among the tasks specified above. Use the bitshift operator (e.g. `>>`)
    # to define the relationships and each task's `task_id` value to
    # represent it.
    #
    # This pipeline has a single task, so the only path is that task's ID.
    #
    # For more info, see
    # https://airflow.apache.org/docs/apache-airflow/stable/tutorial.html#setting-up-dependencies
    - "penguins_gcs_to_bq_task"