Commit

Google Cloud - Vertex_AI - AutoML - Tables - Create_dataset - Added the "From_GCS" and "From_BigQuery" components
Ark-kun committed Aug 3, 2021
1 parent 204f65b commit eab9868
Showing 4 changed files with 530 additions and 0 deletions.
@@ -0,0 +1,80 @@
from typing import NamedTuple

from kfp.components import create_component_from_func

def create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(
data_uri: 'GoogleCloudBigQueryUri',
display_name: str = None,
encryption_spec_key_name: str = None,
project: str = None,
location: str = 'us-central1',
) -> NamedTuple('Outputs', [
('dataset_name', 'GoogleCloudVertexAiTabularDatasetName'),
('dataset_dict', dict),
]):
'''Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_uri: Google Cloud BigQuery URI pointing to the table that should be imported into the dataset,
for example 'bq://project.dataset.table'.
display_name: Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
encryption_spec_key_name (Optional[str]):
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
project: Google Cloud project ID. If not set, the default one will be used.
location: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: Dataset name (fully-qualified)
dataset_dict: Dataset object in JSON format
'''

import datetime
import json
import logging

from google.cloud import aiplatform
from google.protobuf import json_format

logging.getLogger().setLevel(logging.INFO)

if not display_name:
display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

aiplatform.init(
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
)
dataset = aiplatform.TabularDataset.create(
display_name=display_name,
bq_source=data_uri,
)
(_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
logging.info(f'Created dataset {dataset.name}.')
logging.info(f'Link: {dataset_web_url}')
dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
print(dataset_json)
return (dataset.resource_name, dataset_json, dataset_web_url)


if __name__ == '__main__':
create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI_op = create_component_from_func(
create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI,
base_image='python:3.9',
packages_to_install=['google-cloud-aiplatform==1.1.1'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml",
},
)
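
A minimal usage sketch, not part of this commit: it loads the generated from_BigQuery component from the canonical_location URL declared in the annotations above and wires it into a one-step KFP pipeline. It assumes the KFP v1 SDK; the project ID and the 'bq://...' table URI are placeholders.

import kfp
from kfp import compiler, dsl
from kfp.components import load_component_from_url

# Component YAML published at the canonical_location declared in this commit.
create_dataset_from_bq_op = load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/'
    'components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml'
)

@dsl.pipeline(name='create-tabular-dataset-from-bigquery')
def dataset_from_bigquery_pipeline():
    # Placeholder project and table; replace with real values.
    create_dataset_from_bq_op(
        data_uri='bq://my-project.my_dataset.my_table',
        display_name='My_tabular_dataset',
        project='my-project',
    )

if __name__ == '__main__':
    compiler.Compiler().compile(dataset_from_bigquery_pipeline, 'pipeline.yaml')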
@@ -0,0 +1,177 @@
name: Create tabular dataset from BigQuery for Google Cloud Vertex AI
description: Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.
metadata:
annotations: {author: Alexey Volkov <alexey.volkov@ark-kun.com>, canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml'}
inputs:
- name: data_uri
type: GoogleCloudBigQueryUri
description: |-
Google Cloud BigQuery URI pointing to the table that should be imported into the dataset,
for example 'bq://project.dataset.table'.
- name: display_name
type: String
description: |-
Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
optional: true
- name: encryption_spec_key_name
type: String
description: |-
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
optional: true
- {name: project, type: String, description: 'Google Cloud project ID. If not set,
the default one will be used.', optional: true}
- {name: location, type: String, description: Google Cloud region. AutoML Tables only
supports us-central1., default: us-central1, optional: true}
outputs:
- {name: dataset_name, type: GoogleCloudVertexAiTabularDatasetName}
- {name: dataset_dict, type: JsonObject}
implementation:
container:
image: python:3.9
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-aiplatform==1.1.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3
-m pip install --quiet --no-warn-script-location 'google-cloud-aiplatform==1.1.1'
--user) && "$0" "$@"
- sh
- -ec
- |
program_path=$(mktemp)
printf "%s" "$0" > "$program_path"
python3 -u "$program_path" "$@"
- |
def create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(
data_uri,
display_name = None,
encryption_spec_key_name = None,
project = None,
location = 'us-central1',
):
'''Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_uri: Google Cloud BigQuery URI pointing to the table that should be imported into the dataset,
for example 'bq://project.dataset.table'.
display_name: Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
encryption_spec_key_name (Optional[str]):
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
project: Google Cloud project ID. If not set, the default one will be used.
location: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: Dataset name (fully-qualified)
dataset_dict: Dataset object in JSON format
'''
import datetime
import json
import logging
from google.cloud import aiplatform
from google.protobuf import json_format
logging.getLogger().setLevel(logging.INFO)
if not display_name:
display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
aiplatform.init(
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
)
dataset = aiplatform.TabularDataset.create(
display_name=display_name,
bq_source=data_uri,
)
(_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
logging.info(f'Created dataset {dataset.name}.')
logging.info(f'Link: {dataset_web_url}')
dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
print(dataset_json)
return (dataset.resource_name, dataset_json, dataset_web_url)
def _serialize_json(obj) -> str:
if isinstance(obj, str):
return obj
import json
def default_serializer(obj):
if hasattr(obj, 'to_struct'):
return obj.to_struct()
else:
raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
return json.dumps(obj, default=default_serializer, sort_keys=True)
import argparse
_parser = argparse.ArgumentParser(prog='Create tabular dataset from BigQuery for Google Cloud Vertex AI', description='Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.')
_parser.add_argument("--data-uri", dest="data_uri", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--display-name", dest="display_name", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--encryption-spec-key-name", dest="encryption_spec_key_name", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--project", dest="project", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--location", dest="location", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(**_parsed_args)
_output_serializers = [
str,
_serialize_json,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --data-uri
- {inputValue: data_uri}
- if:
cond: {isPresent: display_name}
then:
- --display-name
- {inputValue: display_name}
- if:
cond: {isPresent: encryption_spec_key_name}
then:
- --encryption-spec-key-name
- {inputValue: encryption_spec_key_name}
- if:
cond: {isPresent: project}
then:
- --project
- {inputValue: project}
- if:
cond: {isPresent: location}
then:
- --location
- {inputValue: location}
- '----output-paths'
- {outputPath: dataset_name}
- {outputPath: dataset_dict}
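
A short sketch, again an assumption rather than part of the commit, of loading this generated component.yaml from a local checkout of the repository and submitting a run with the KFP v1 client; the endpoint, file path, and parameter values are placeholders.

import kfp
from kfp.components import load_component_from_file

# Path assumes a local checkout of the repository; adjust as needed.
create_dataset_op = load_component_from_file(
    'components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml')

def dataset_pipeline(data_uri: str = 'bq://my-project.my_dataset.my_table'):
    create_dataset_op(data_uri=data_uri, display_name='My_tabular_dataset')

if __name__ == '__main__':
    # Placeholder KFP endpoint; replace with a real Kubeflow Pipelines host.
    client = kfp.Client(host='https://my-kfp-endpoint.example.com')
    client.create_run_from_pipeline_func(dataset_pipeline, arguments={})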
@@ -0,0 +1,88 @@
from typing import NamedTuple

from kfp.components import create_component_from_func

def create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI(
data_uri: 'GoogleCloudStorageUri', # data_type: "CSV"
display_name: str = None,
encryption_spec_key_name: str = None,
project: str = None,
location: str = 'us-central1',
) -> NamedTuple('Outputs', [
('dataset_name', 'GoogleCloudVertexAiTabularDatasetName'),
('dataset_dict', dict),
]):
'''Creates Google Cloud Vertex AI Tabular Dataset from CSV data stored in GCS.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_uri: Google Cloud Storage URI pointing to the data in CSV format that should be imported into the dataset.
The bucket must be a regional bucket in the us-central1 region.
The file name must have a (case-insensitive) '.CSV' file extension.
display_name: Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
encryption_spec_key_name (Optional[str]):
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
project: Google Cloud project ID. If not set, the default one will be used.
location: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: Dataset name (fully-qualified)
dataset_dict: Dataset object in JSON format
'''

import datetime
import json
import logging

from google.cloud import aiplatform
from google.protobuf import json_format

logging.getLogger().setLevel(logging.INFO)

if not display_name:
display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

# Hack to enable passing multiple URIs.
# Creating another component or adding another input seems like too much hassle for now.
# An alternative would have been to accept comma-delimited or semicolon-delimited URIs.
if data_uri.startswith("["):
data_uris = json.loads(data_uri)
else:
data_uris = [data_uri]

aiplatform.init(
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
)
dataset = aiplatform.TabularDataset.create(
display_name=display_name,
gcs_source=data_uris,
)
(_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
logging.info(f'Created dataset {dataset.name}.')
logging.info(f'Link: {dataset_web_url}')
dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
print(dataset_json)
return (dataset.resource_name, dataset_json, dataset_web_url)


if __name__ == '__main__':
create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI_op = create_component_from_func(
create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI,
base_image='python:3.9',
packages_to_install=['google-cloud-aiplatform==1.1.1'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml",
},
)
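
A minimal sketch of the multi-URI "hack" noted in the code above, again an assumption rather than part of the commit: the data_uri argument accepts either a single GCS URI or a JSON-encoded list of URIs passed as one string. Bucket and object names are placeholders; the component URL is the canonical_location declared above.

import json
from kfp.components import load_component_from_url

create_dataset_from_gcs_op = load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/'
    'components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml'
)

def dataset_from_gcs_pipeline():
    # Several CSV shards of the same table; the component detects the leading
    # '[' and json.loads the string back into a list of URIs.
    uris = [
        'gs://my-bucket/data/part-0001.csv',
        'gs://my-bucket/data/part-0002.csv',
    ]
    create_dataset_from_gcs_op(
        data_uri=json.dumps(uris),
        display_name='My_sharded_dataset',
    )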
