Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add pipeline client init and run to vertex AI
- Loading branch information
Showing
5 changed files
with
784 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,351 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# Copyright 2021 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
import time | ||
from typing import Optional, Dict, List | ||
|
||
import logging | ||
import re | ||
import sys | ||
|
||
from google.auth import credentials as auth_credentials | ||
|
||
from google.cloud.aiplatform import base | ||
from google.cloud.aiplatform import compat | ||
from google.cloud.aiplatform import initializer | ||
from google.cloud.aiplatform import utils | ||
from google.cloud.aiplatform.utils import pipeline_runtime_config_builder | ||
|
||
from google.cloud.aiplatform.compat.services import pipeline_service_client | ||
from google.cloud.aiplatform.compat.types import ( | ||
pipeline_job_v1beta1 as gca_pipeline_job_v1beta1, | ||
pipeline_state_v1beta1 as gca_pipeline_state_v1beta1, | ||
) | ||
|
||
from google.rpc import code_pb2 | ||
|
||
logging.basicConfig(level=logging.INFO, stream=sys.stdout) | ||
_LOGGER = base.Logger(__name__) | ||
|
||
_PIPELINE_COMPLETE_STATES = set( | ||
[ | ||
gca_pipeline_state_v1beta1.PipelineState.PIPELINE_STATE_SUCCEEDED, | ||
gca_pipeline_state_v1beta1.PipelineState.PIPELINE_STATE_FAILED, | ||
gca_pipeline_state_v1beta1.PipelineState.PIPELINE_STATE_CANCELLED, | ||
gca_pipeline_state_v1beta1.PipelineState.PIPELINE_STATE_PAUSED, | ||
] | ||
) | ||
|
||
_PIPELINE_CLIENT_VERSION='v1beta1' | ||
|
||
# AIPlatformPipelines service API job name relative name prefix pattern. | ||
_JOB_NAME_PATTERN = '{parent}/pipelineJobs/{job_id}' | ||
|
||
# Pattern for valid names used as a Vertex resource name. | ||
_VALID_NAME_PATTERN = re.compile('^[a-z][-a-z0-9]{0,127}$') | ||
|
||
def _set_enable_caching_value(pipeline_spec: Dict, | ||
enable_caching: bool) -> None: | ||
"""Sets pipeline tasks caching options. | ||
Args: | ||
pipeline_spec: The dictionary of pipeline spec. | ||
enable_caching: Whether to enable caching. | ||
""" | ||
for component in [pipeline_spec['root']] + list( | ||
pipeline_spec['components'].values()): | ||
if 'dag' in component: | ||
for task in component['dag']['tasks'].values(): | ||
task['cachingOptions'] = {'enableCache': enable_caching} | ||
|
||
|
||
class PipelineJob(base.VertexAiResourceNounWithFutureManager): | ||
|
||
client_class = utils.PipelineClientWithOverride | ||
_is_client_prediction_client = False | ||
|
||
_resource_noun = "pipelineJobs" | ||
_getter_method = "get_pipeline_job" | ||
_list_method = "list_pipeline_jobs" | ||
_cancel_method = "cancel_pipeline_job" | ||
_delete_method = "delete_pipeline_job" | ||
|
||
def __init__( | ||
self, | ||
display_name: str, | ||
job_spec_path: str, | ||
job_id: Optional[str] = None, | ||
pipeline_root: Optional[str] = None, | ||
parameter_values: Optional[Dict] = None, | ||
enable_caching: bool = True, | ||
encryption_spec_key_name: Optional[str] = None, | ||
network: Optional[str] = None, | ||
labels: Optional[Dict] = None, | ||
service_account: Optional[str] = None, | ||
credentials: Optional[auth_credentials.Credentials] = None, | ||
project: Optional[str] = None, | ||
location: Optional[str] = None, | ||
): | ||
"""Retrieves a PipelineJob resource and instantiates its | ||
representation. | ||
Args: | ||
display_name (str): | ||
Required. The user-defined name of this Pipeline. | ||
job_spec_path (str): | ||
The path of PipelineJob JSON file. It can be a local path or a | ||
GS URI. Example: "gs://project.name" | ||
job_id (Optional[str]): | ||
Optionally, the user can provide the unique ID of the job run. | ||
If not specified, pipeline name + timestamp will be used. | ||
pipeline_root (Optional[str]): | ||
Optionally the user can override the pipeline root | ||
specified during the compile time. Default to be staging bucket. | ||
parameter_values (Optional[Dict]): | ||
The mapping from runtime parameter names to its values that | ||
control the pipeline run. | ||
enable_caching (bool): | ||
Required. Whether to turn on caching for the run. Defaults to True. | ||
encryption_spec_key_name (Optional[str]): | ||
Optional. The Cloud KMS resource identifier of the customer | ||
managed encryption key used to protect the job. Has the | ||
form: | ||
``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``. | ||
The key needs to be in the same region as where the compute | ||
resource is created. | ||
If this is set, then all | ||
resources created by the BatchPredictionJob will | ||
be encrypted with the provided encryption key. | ||
Overrides encryption_spec_key_name set in aiplatform.init. | ||
labels (Optional[Dict]): | ||
The user defined metadata to organize PipelineJob. | ||
credentials (Optional[auth_credentials.Credentials]): | ||
Custom credentials to use to create this batch prediction | ||
job. Overrides credentials set in aiplatform.init. | ||
project: Optional[str] = None, | ||
Optional project to retrieve PipelineJob from. If not set, | ||
project set in aiplatform.init will be used. | ||
location: Optional[str] = None, | ||
Optional location to retrieve PipelineJob from. If not set, | ||
location set in aiplatform.init will be used. | ||
""" | ||
utils.validate_display_name(display_name) | ||
|
||
super().__init__(project=project, location=location, credentials=credentials) | ||
|
||
self._parent = initializer.global_config.common_location_path( | ||
project=project, location=location | ||
) | ||
pipeline_root = pipeline_root or initializer.global_config.staging_bucket | ||
pipeline_spec = utils.load_json(job_spec_path) | ||
pipeline_name = pipeline_spec['pipelineSpec']['pipelineInfo']['name'] | ||
job_id = job_id or '{pipeline_name}-{timestamp}'.format( | ||
pipeline_name=re.sub('[^-0-9a-z]+', '-', | ||
pipeline_name.lower()).lstrip('-').rstrip('-'), | ||
timestamp=_get_current_time().strftime('%Y%m%d%H%M%S')) | ||
if not _VALID_NAME_PATTERN.match(job_id): | ||
raise ValueError( | ||
'Generated job ID: {} is illegal as a Vertex pipelines job ID. ' | ||
'Expecting an ID following the regex pattern ' | ||
'"[a-z][-a-z0-9]{{0,127}}"'.format(job_id)) | ||
|
||
job_name = _JOB_NAME_PATTERN.format(parent=self._parent, job_id=job_id) | ||
|
||
pipeline_spec['name'] = job_name | ||
pipeline_spec['displayName'] = job_id | ||
|
||
builder = pipeline_runtime_config_builder.PipelineRuntimeConfigBuilder.from_job_spec_json( | ||
pipeline_spec) | ||
builder.update_pipeline_root(pipeline_root) | ||
builder.update_runtime_parameters(parameter_values) | ||
|
||
runtime_config = builder.build() | ||
pipeline_spec['runtimeConfig'] = runtime_config | ||
|
||
_set_enable_caching_value(pipeline_spec['pipelineSpec'], enable_caching) | ||
|
||
if encryption_spec_key_name is not None: | ||
pipeline_spec['encryptionSpec'] = {'kmsKeyName': encryption_spec_key_name} | ||
if service_account is not None: | ||
pipeline_spec['serviceAccount'] = service_account | ||
if network is not None: | ||
pipeline_spec['network'] = network | ||
|
||
if labels: | ||
if not isinstance(labels, Dict): | ||
raise ValueError( | ||
'Expect labels to be a mapping of string key value pairs. ' | ||
'Got "{}" of type "{}"'.format(labels, type(labels))) | ||
for k, v in labels.items(): | ||
if not isinstance(k, str) or not isinstance(v, str): | ||
raise ValueError( | ||
'Expect labels to be a mapping of string key value pairs. ' | ||
'Got "{}".'.format(labels)) | ||
|
||
pipeline_spec['labels'] = labels | ||
|
||
self._gca_resource = gca_pipeline_job_v1beta1.PipelineJob( | ||
display_name=display_name, | ||
pipeline_spec=pipeline_spec, | ||
labels=labels, | ||
runtime_config=None, | ||
encryption_spec=initializer.global_config.get_encryption_spec( | ||
encryption_spec_key_name=encryption_spec_key_name | ||
), | ||
service_account=service_account, | ||
network=network, | ||
) | ||
|
||
@base.optional_sync() | ||
def run( | ||
self, | ||
service_account: Optional[str] = None, | ||
network: Optional[str] = None, | ||
sync: bool = True, | ||
) -> None: | ||
"""Run this configured PipelineJob. | ||
Args: | ||
service_account (str): | ||
Optional. Specifies the service account for workload run-as account. | ||
Users submitting jobs must have act-as permission on this run-as account. | ||
network (str): | ||
Optional. The full name of the Compute Engine network to which the job | ||
should be peered. For example, projects/12345/global/networks/myVPC. | ||
Private services access must already be configured for the network. | ||
If left unspecified, the job is not peered with any network. | ||
sync (bool): | ||
Whether to execute this method synchronously. If False, this method | ||
will unblock and it will be executed in a concurrent Future. | ||
""" | ||
|
||
if service_account: | ||
self._gca_resource.pipeline_spec.service_account = service_account | ||
|
||
if network: | ||
self._gca_resource.pipeline_spec.network = network | ||
|
||
_LOGGER.log_create_with_lro(self.__class__) | ||
|
||
self._gca_resource = self.api_client.select_version(_PIPELINE_CLIENT_VERSION).create_pipeline_job( | ||
parent=self._parent, | ||
pipeline_job=self._gca_resource | ||
) | ||
|
||
_LOGGER.log_create_complete_with_getter( | ||
self.__class__, self._gca_resource, "pipeline_job" | ||
) | ||
|
||
_LOGGER.info("View Pipeline Job:\n%s" % self._dashboard_uri()) | ||
|
||
self._block_until_complete() | ||
|
||
@property | ||
def pipeline_spec(self): | ||
return self._gca_resource.pipeline_spec | ||
|
||
@property | ||
def state(self) -> Optional[gca_pipeline_state_v1beta1.PipelineState]: | ||
"""Current pipeline state.""" | ||
|
||
if self._assert_has_run(): | ||
return | ||
|
||
self._sync_gca_resource() | ||
return self._gca_resource.state | ||
|
||
@property | ||
def _has_run(self) -> bool: | ||
"""Helper property to check if this pipeline job has been run.""" | ||
return self._gca_resource is not None | ||
|
||
def _assert_has_run(self) -> bool: | ||
"""Helper method to assert that this pipeline has run.""" | ||
if not self._has_run: | ||
if self._is_waiting_to_run(): | ||
return True | ||
raise RuntimeError( | ||
"PipelineJob has not been launched. You must run this" | ||
" PipelineJob using PipelineJob.run. " | ||
) | ||
return False | ||
|
||
@property | ||
def has_failed(self) -> bool: | ||
"""Returns True if pipeline has failed. | ||
False otherwise. | ||
""" | ||
self._assert_has_run() | ||
return self.state == gca_pipeline_state_v1beta1.PipelineState.PIPELINE_STATE_FAILED | ||
|
||
def _dashboard_uri(self) -> str: | ||
"""Helper method to compose the dashboard uri where pipeline can be | ||
viewed.""" | ||
fields = utils.extract_fields_from_resource_name(self.resource_name) | ||
url = f"https://console.cloud.google.com/ai/platform/locations/{fields.location}/pipelines/runs/{fields.id}?project={fields.project}" | ||
return url | ||
|
||
def _sync_gca_resource(self): | ||
"""Helper method to sync the local gca_source against the service.""" | ||
|
||
self._gca_resource = self.api_client.select_version(_PIPELINE_CLIENT_VERSION).get_pipeline_job( | ||
name=self.resource_name | ||
) | ||
|
||
def _block_until_complete(self): | ||
"""Helper method to block and check on job until complete.""" | ||
|
||
# Used these numbers so failures surface fast | ||
wait = 5 # start at five seconds | ||
log_wait = 5 | ||
max_wait = 60 * 5 # 5 minute wait | ||
multiplier = 2 # scale wait by 2 every iteration | ||
|
||
previous_time = time.time() | ||
while self.state not in _PIPELINE_COMPLETE_STATES: | ||
current_time = time.time() | ||
if current_time - previous_time >= log_wait: | ||
_LOGGER.info( | ||
"%s %s current state:\n%s" | ||
% ( | ||
self.__class__.__name__, | ||
self._gca_resource.name, | ||
self._gca_resource.state, | ||
) | ||
) | ||
log_wait = min(log_wait * multiplier, max_wait) | ||
previous_time = current_time | ||
time.sleep(wait) | ||
|
||
self._raise_failure() | ||
|
||
_LOGGER.log_action_completed_against_resource("run", "completed", self) | ||
|
||
if self._gca_resource.name and not self.has_failed: | ||
_LOGGER.info("Pipeline Job available at:\n%s" % self._dashboard_uri()) | ||
|
||
def _raise_failure(self): | ||
"""Helper method to raise failure if PipelineJob fails. | ||
Raises: | ||
RuntimeError: If pipeline failed. | ||
""" | ||
|
||
if self._gca_resource.error.code != code_pb2.OK: | ||
raise RuntimeError("Pipeline failed with:\n%s" % self._gca_resource.error) | ||
|
Oops, something went wrong.