Commit

Google Cloud - Vertex_AI - AutoML - Tables - Create_dataset - Added the "From_GCS" and "From_BigQuery" components
Ark-kun committed Aug 3, 2021
1 parent 204f65b commit eab9868
Showing 4 changed files with 530 additions and 0 deletions.
@@ -0,0 +1,80 @@
from typing import NamedTuple

from kfp.components import create_component_from_func

def create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(
data_uri: 'GoogleCloudBigQueryUri',
display_name: str = None,
encryption_spec_key_name: str = None,
project: str = None,
location: str = 'us-central1',
) -> NamedTuple('Outputs', [
('dataset_name', 'GoogleCloudVertexAiTabularDatasetName'),
('dataset_dict', dict),
]):
'''Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_uri: Google Cloud BigQuery URI pointing to the table that should be imported into the dataset,
for example 'bq://project.dataset.table'.
display_name: Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
encryption_spec_key_name (Optional[str]):
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
project: Google Cloud project ID. If not set, the default one will be used.
location: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: Dataset name (fully-qualified)
dataset_dict: Dataset object in JSON format
'''

import datetime
import json
import logging

from google.cloud import aiplatform
from google.protobuf import json_format

logging.getLogger().setLevel(logging.INFO)

if not display_name:
display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

aiplatform.init(
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
)
dataset = aiplatform.TabularDataset.create(
display_name=display_name,
bq_source=data_uri,
)
(_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
logging.info(f'Created dataset {dataset.name}.')
logging.info(f'Link: {dataset_web_url}')
dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
print(dataset_json)
return (dataset.resource_name, dataset_json, dataset_web_url)


if __name__ == '__main__':
create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI_op = create_component_from_func(
create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI,
base_image='python:3.9',
packages_to_install=['google-cloud-aiplatform==1.1.1'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml",
},
)
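
A minimal usage sketch, not part of this commit: it loads the generated from_BigQuery component from the canonical_location URL declared in the annotations above and wires it into a one-step KFP pipeline. It assumes the KFP v1 SDK; the project ID and the 'bq://...' table URI are placeholders.

import kfp
from kfp import compiler, dsl
from kfp.components import load_component_from_url

# Component YAML published at the canonical_location declared in this commit.
create_dataset_from_bq_op = load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/'
    'components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml'
)

@dsl.pipeline(name='create-tabular-dataset-from-bigquery')
def dataset_from_bigquery_pipeline():
    # Placeholder project and table; replace with real values.
    create_dataset_from_bq_op(
        data_uri='bq://my-project.my_dataset.my_table',
        display_name='My_tabular_dataset',
        project='my-project',
    )

if __name__ == '__main__':
    compiler.Compiler().compile(dataset_from_bigquery_pipeline, 'pipeline.yaml')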
@@ -0,0 +1,177 @@
name: Create tabular dataset from BigQuery for Google Cloud Vertex AI
description: Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.
metadata:
annotations: {author: Alexey Volkov <alexey.volkov@ark-kun.com>, canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml'}
inputs:
- name: data_uri
type: GoogleCloudBigQueryUri
description: |-
Google Cloud BigQuery URI pointing to the table that should be imported into the dataset,
for example 'bq://project.dataset.table'.
- name: display_name
type: String
description: |-
Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
optional: true
- name: encryption_spec_key_name
type: String
description: |-
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
optional: true
- {name: project, type: String, description: 'Google Cloud project ID. If not set,
the default one will be used.', optional: true}
- {name: location, type: String, description: Google Cloud region. AutoML Tables only
supports us-central1., default: us-central1, optional: true}
outputs:
- {name: dataset_name, type: GoogleCloudVertexAiTabularDatasetName}
- {name: dataset_dict, type: JsonObject}
implementation:
container:
image: python:3.9
command:
- sh
- -c
- (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
'google-cloud-aiplatform==1.1.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3
-m pip install --quiet --no-warn-script-location 'google-cloud-aiplatform==1.1.1'
--user) && "$0" "$@"
- sh
- -ec
- |
program_path=$(mktemp)
printf "%s" "$0" > "$program_path"
python3 -u "$program_path" "$@"
- |
def create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(
data_uri,
display_name = None,
encryption_spec_key_name = None,
project = None,
location = 'us-central1',
):
'''Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_uri: Google Cloud BigQuery URI pointing to the table that should be imported into the dataset,
for example 'bq://project.dataset.table'.
display_name: Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
encryption_spec_key_name (Optional[str]):
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
project: Google Cloud project ID. If not set, the default one will be used.
location: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: Dataset name (fully-qualified)
dataset_dict: Dataset object in JSON format
'''
import datetime
import json
import logging
from google.cloud import aiplatform
from google.protobuf import json_format
logging.getLogger().setLevel(logging.INFO)
if not display_name:
display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
aiplatform.init(
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
)
dataset = aiplatform.TabularDataset.create(
display_name=display_name,
bq_source=data_uri,
)
(_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
logging.info(f'Created dataset {dataset.name}.')
logging.info(f'Link: {dataset_web_url}')
dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
print(dataset_json)
return (dataset.resource_name, dataset_json, dataset_web_url)
def _serialize_json(obj) -> str:
if isinstance(obj, str):
return obj
import json
def default_serializer(obj):
if hasattr(obj, 'to_struct'):
return obj.to_struct()
else:
raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
return json.dumps(obj, default=default_serializer, sort_keys=True)
import argparse
_parser = argparse.ArgumentParser(prog='Create tabular dataset from BigQuery for Google Cloud Vertex AI', description='Creates Google Cloud Vertex AI Tabular Dataset from a BigQuery table.')
_parser.add_argument("--data-uri", dest="data_uri", type=str, required=True, default=argparse.SUPPRESS)
_parser.add_argument("--display-name", dest="display_name", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--encryption-spec-key-name", dest="encryption_spec_key_name", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--project", dest="project", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("--location", dest="location", type=str, required=False, default=argparse.SUPPRESS)
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
_parsed_args = vars(_parser.parse_args())
_output_files = _parsed_args.pop("_output_paths", [])
_outputs = create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(**_parsed_args)
_output_serializers = [
str,
_serialize_json,
]
import os
for idx, output_file in enumerate(_output_files):
try:
os.makedirs(os.path.dirname(output_file))
except OSError:
pass
with open(output_file, 'w') as f:
f.write(_output_serializers[idx](_outputs[idx]))
args:
- --data-uri
- {inputValue: data_uri}
- if:
cond: {isPresent: display_name}
then:
- --display-name
- {inputValue: display_name}
- if:
cond: {isPresent: encryption_spec_key_name}
then:
- --encryption-spec-key-name
- {inputValue: encryption_spec_key_name}
- if:
cond: {isPresent: project}
then:
- --project
- {inputValue: project}
- if:
cond: {isPresent: location}
then:
- --location
- {inputValue: location}
- '----output-paths'
- {outputPath: dataset_name}
- {outputPath: dataset_dict}
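
A short sketch, again an assumption rather than part of the commit, of loading this generated component.yaml from a local checkout of the repository and submitting a run with the KFP v1 client; the endpoint, file path, and parameter values are placeholders.

import kfp
from kfp.components import load_component_from_file

# Path assumes a local checkout of the repository; adjust as needed.
create_dataset_op = load_component_from_file(
    'components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml')

def dataset_pipeline(data_uri: str = 'bq://my-project.my_dataset.my_table'):
    create_dataset_op(data_uri=data_uri, display_name='My_tabular_dataset')

if __name__ == '__main__':
    # Placeholder KFP endpoint; replace with a real Kubeflow Pipelines host.
    client = kfp.Client(host='https://my-kfp-endpoint.example.com')
    client.create_run_from_pipeline_func(dataset_pipeline, arguments={})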
@@ -0,0 +1,88 @@
from typing import NamedTuple

from kfp.components import create_component_from_func

def create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI(
data_uri: 'GoogleCloudStorageUri', # data_type: "CSV"
display_name: str = None,
encryption_spec_key_name: str = None,
project: str = None,
location: str = 'us-central1',
) -> NamedTuple('Outputs', [
('dataset_name', 'GoogleCloudVertexAiTabularDatasetName'),
('dataset_dict', dict),
]):
'''Creates Google Cloud Vertex AI Tabular Dataset from CSV data stored in GCS.
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
Args:
data_uri: Google Cloud Storage URI pointing to the data in CSV format that should be imported into the dataset.
The bucket must be a regional bucket in the us-central1 region.
The file name must have a (case-insensitive) '.CSV' file extension.
display_name: Display name for the AutoML Dataset.
Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
encryption_spec_key_name (Optional[str]):
Optional. The Cloud KMS resource identifier of the customer
managed encryption key used to protect a resource. Has the
form:
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
The key needs to be in the same region as where the compute
resource is created.
project: Google Cloud project ID. If not set, the default one will be used.
location: Google Cloud region. AutoML Tables only supports us-central1.
Returns:
dataset_name: Dataset name (fully-qualified)
dataset_dict: Dataset object in JSON format
'''

import datetime
import json
import logging

from google.cloud import aiplatform
from google.protobuf import json_format

logging.getLogger().setLevel(logging.INFO)

if not display_name:
display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

# Hack to enable passing multiple URIs.
# Creating another component or adding another input seems like too much hassle for now.
# An alternative would have been to accept comma-delimited or semicolon-delimited URIs.
if data_uri.startswith("["):
data_uris = json.loads(data_uri)
else:
data_uris = [data_uri]

aiplatform.init(
project=project,
location=location,
encryption_spec_key_name=encryption_spec_key_name,
)
dataset = aiplatform.TabularDataset.create(
display_name=display_name,
gcs_source=data_uris,
)
(_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
logging.info(f'Created dataset {dataset.name}.')
logging.info(f'Link: {dataset_web_url}')
dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
print(dataset_json)
return (dataset.resource_name, dataset_json, dataset_web_url)


if __name__ == '__main__':
create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI_op = create_component_from_func(
create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI,
base_image='python:3.9',
packages_to_install=['google-cloud-aiplatform==1.1.1'],
output_component_file='component.yaml',
annotations={
"author": "Alexey Volkov <alexey.volkov@ark-kun.com>",
"canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml",
},
)
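
A minimal sketch of the multi-URI "hack" noted in the code above, again an assumption rather than part of the commit: the data_uri argument accepts either a single GCS URI or a JSON-encoded list of URIs passed as one string. Bucket and object names are placeholders; the component URL is the canonical_location declared above.

import json
from kfp.components import load_component_from_url

create_dataset_from_gcs_op = load_component_from_url(
    'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/'
    'components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml'
)

def dataset_from_gcs_pipeline():
    # Several CSV shards of the same table; the component detects the leading
    # '[' and json.loads the string back into a list of URIs.
    uris = [
        'gs://my-bucket/data/part-0001.csv',
        'gs://my-bucket/data/part-0002.csv',
    ]
    create_dataset_from_gcs_op(
        data_uri=json.dumps(uris),
        display_name='My_sharded_dataset',
    )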
