From 6e50b41e9c91ddaa86c1c1d8876944cbdd51f388 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Wed, 12 May 2021 20:04:59 -0400 Subject: [PATCH 01/29] checkpoint --- google/cloud/aiplatform/__init__.py | 3 +- google/cloud/aiplatform/jobs.py | 159 ++++++++++++++++++++++++++-- 2 files changed, 155 insertions(+), 7 deletions(-) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 58eb824454..c631c2fd3c 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -27,7 +27,7 @@ ) from google.cloud.aiplatform.models import Endpoint from google.cloud.aiplatform.models import Model -from google.cloud.aiplatform.jobs import BatchPredictionJob +from google.cloud.aiplatform.jobs import BatchPredictionJob, CustomJob from google.cloud.aiplatform.training_jobs import ( CustomTrainingJob, CustomContainerTrainingJob, @@ -55,6 +55,7 @@ "AutoMLTextTrainingJob", "AutoMLVideoTrainingJob", "BatchPredictionJob", + "CustomJob", "CustomTrainingJob", "CustomContainerTrainingJob", "CustomPythonPackageTrainingJob", diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index ee6d46dde9..29d6073d9c 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -22,29 +22,33 @@ import time import logging + from google.cloud import storage from google.cloud import bigquery from google.auth import credentials as auth_credentials +from google.protobuf import duration_pb2 # type: ignore from google.cloud import aiplatform from google.cloud.aiplatform import base -from google.cloud.aiplatform import initializer from google.cloud.aiplatform import compat from google.cloud.aiplatform import constants +from google.cloud.aiplatform import initializer +from google.cloud.aiplatform import training_jobs from google.cloud.aiplatform import utils from google.cloud.aiplatform.compat.services import job_service_client from google.cloud.aiplatform.compat.types import ( - io as gca_io_compat, - io_v1beta1 as gca_io_v1beta1, - job_state as gca_job_state, batch_prediction_job as gca_bp_job_compat, batch_prediction_job_v1 as gca_bp_job_v1, batch_prediction_job_v1beta1 as gca_bp_job_v1beta1, + custom_job as gca_custom_job_compat, + explanation_v1beta1 as gca_explanation_v1beta1, + io as gca_io_compat, + io_v1beta1 as gca_io_v1beta1, + job_state as gca_job_state, machine_resources as gca_machine_resources_compat, machine_resources_v1beta1 as gca_machine_resources_v1beta1, - explanation_v1beta1 as gca_explanation_v1beta1, ) logging.basicConfig(level=logging.INFO, stream=sys.stdout) @@ -173,7 +177,7 @@ def _block_until_complete(self): ) ) log_wait = min(log_wait * multiplier, max_wait) - previous_time = current_time + previous_time = current_time time.sleep(wait) _LOGGER.log_action_completed_against_resource("", "run", self) @@ -777,6 +781,149 @@ class CustomJob(_Job): _job_type = "training" pass + def __init__(self, + display_name: str, + worker_pool_specs: Union[Dict], + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[auth_credentials.Credentials] = None, + encryption_spec_key_name: Optional[str] = None, + staging_bucket: Optional[str] = None): + + base.AiPlatformResourceNounWithFutureManager.__init__(self, + project=project, + location=location, + credentials=credentials + ) + + self._parent = aiplatform.initializer.global_config.common_location_path( + project=project, + location=location + ) + + staging_bucket = staging_bucket or initializer.global_config.staging_bucket + + if 
not staging_bucket:
+            raise RuntimeError(
+                "staging_bucket should be passed to CustomJob constructor or "
+                "should be set using aiplatform.init(staging_bucket='gs://my-bucket')"
+            )
+
+        self._gca_resource = gca_custom_job_compat.CustomJob(
+            display_name=display_name,
+            job_spec = gca_custom_job_compat.CustomJobSpec(
+                worker_pool_specs=worker_pool_specs,
+                base_output_directory=gca_io_compat.GcsDestination(output_uri_prefix=staging_bucket),
+            ),
+            encryption_spec= initializer.global_config.get_encryption_spec(
+                encryption_spec_key_name=encryption_spec_key_name
+            )
+        )
+
+
+    @classmethod
+    def from_local_script(
+            cls,
+            display_name: str,
+            script_path: str,
+            container_uri: str,
+            args: Optional[List[Union[str, float, int]]] = None,
+            requirements: Optional[Sequence[str]] = None,
+            environment_variables: Optional[Dict[str, str]] = None,
+            replica_count: int = 1,
+            machine_type: str = "n1-standard-4",
+            accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
+            accelerator_count: int = 0,
+            project: Optional[str] = None,
+            location: Optional[str] = None,
+            staging_bucket: Optional[str]= None,
+            credentials: Optional[auth_credentials.Credentials] = None,
+            encryption_spec_key_name: Optional[str] = None,
+        ) -> 'CustomJob':
+
+        project = project or initializer.global_config.project
+        location = location or initializer.global_config.location
+        staging_bucket = staging_bucket or initializer.global_config.staging_bucket
+
+        if not staging_bucket:
+            raise RuntimeError(
+                "staging_bucket should be passed to CustomJob.from_local_script or "
+                "should be set using aiplatform.init(staging_bucket='gs://my-bucket')"
+            )
+
+        worker_pool_specs = training_jobs._DistributedTrainingSpec.chief_worker_pool(
+            replica_count=replica_count,
+            machine_type=machine_type,
+            accelerator_count=accelerator_count,
+            accelerator_type=accelerator_type,
+        ).pool_specs
+
+
+        python_packager = training_jobs._TrainingScriptPythonPackager(
+            script_path=script_path, requirements=requirements
+        )
+
+        package_gcs_uri = python_packager.package_and_copy_to_gcs(
+            gcs_staging_dir = staging_bucket,
+            project = project,
+            credentials = credentials,
+        )
+
+        for spec in worker_pool_specs:
+            spec["pythonPackageSpec"] = {
+                "executorImageUri": container_uri,
+                "pythonModule": python_packager.module_name,
+                "packageUris": [package_gcs_uri],
+            }
+
+            if args:
+                spec["pythonPackageSpec"]["args"] = args
+
+            if environment_variables:
+                spec["pythonPackageSpec"]["env"] = [
+                    {"name": key, "value": value}
+                    for key, value in environment_variables.items()
+                ]
+
+        return cls(
+            display_name=display_name,
+            worker_pool_specs=worker_pool,
+            project=project,
+            location=location,
+            credentials=credentials,
+            encryption_spec_key_name=encryption_spec_key_name,
+            staging_bucket=staging_bucket)
+
+
+    @base.optional_sync()
+    def run(
+        self,
+        service_account: Optional[str] = None,
+        network: Optional[str] = None,
+        timeout: Optional[int] = None,  # seconds
+        restart_job_on_worker_restart: bool=False,
+        sync: bool = True):
+
+        if service_account:
+            self._gca_resource.service_account = service_account
+
+        if network:
+            self._gca_resource.network = network
+
+
+        if timeout or restart_job_on_worker_restart:
+            timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
+            self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling(
+                timeout=timeout,
+                restart_job_on_worker_restart=restart_job_on_worker_restart
+            )
+
+        self._gca_resource = self.api_client.create_custom_job(
+            parent=self._parent, custom_job=self._gca_resource
+        )
+
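+        # NOTE: create_custom_job returns as soon as the CustomJob resource is
+        # created; the polling call below blocks until the job reaches a
+        # terminal state.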
self._block_until_complete() + class DataLabelingJob(_Job): _resource_noun = "dataLabelingJobs" From c1dfd629b8691c5122deb635a981e98e7b79c690 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Thu, 13 May 2021 11:02:46 -0400 Subject: [PATCH 02/29] checkpoint --- google/__init__.py | 0 google/cloud/__init__.py | 0 google/cloud/aiplatform/jobs.py | 56 +-- google/cloud/aiplatform/training_jobs.py | 458 +----------------- .../{utils.py => utils/__init__.py} | 67 +++ google/cloud/aiplatform/utils/source_utils.py | 216 +++++++++ .../aiplatform/utils/worker_spec_utils.py | 181 +++++++ tests/unit/aiplatform/test_end_to_end.py | 5 +- tests/unit/aiplatform/test_training_jobs.py | 85 ++-- 9 files changed, 543 insertions(+), 525 deletions(-) create mode 100644 google/__init__.py create mode 100644 google/cloud/__init__.py rename google/cloud/aiplatform/{utils.py => utils/__init__.py} (87%) create mode 100644 google/cloud/aiplatform/utils/source_utils.py create mode 100644 google/cloud/aiplatform/utils/worker_spec_utils.py diff --git a/google/__init__.py b/google/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/google/cloud/__init__.py b/google/cloud/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index 29d6073d9c..1d068d3435 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -22,7 +22,6 @@ import time import logging - from google.cloud import storage from google.cloud import bigquery @@ -34,8 +33,9 @@ from google.cloud.aiplatform import compat from google.cloud.aiplatform import constants from google.cloud.aiplatform import initializer -from google.cloud.aiplatform import training_jobs from google.cloud.aiplatform import utils +from google.cloud.aiplatform.utils import source_utils +from google.cloud.aiplatform.utils import worker_spec_utils from google.cloud.aiplatform.compat.services import job_service_client from google.cloud.aiplatform.compat.types import ( @@ -823,23 +823,23 @@ def __init__(self, @classmethod def from_local_script( - cls, - display_name: str, - script_path: str, - container_uri: str, - args: Optional[List[Union[str, float, int]]] = None, - requirements: Optional[Sequence[str]] = None, - environment_variables: Optional[Dict[str, str]] = None, - replica_count: int = 1, - machine_type: str = "n1-standard-4", - accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", - accelerator_count: int = 0, - project: Optional[str] = None, - location: Optional[str] = None, - staging_bucket: Optional[str]= None, - credentials: Optional[auth_credentials.Credentials] = None, - encryption_spec_key_name: Optional[str] = None, - ) -> 'CustomJob': + cls, + display_name: str, + script_path: str, + container_uri: str, + args: Optional[List[Union[str, float, int]]] = None, + requirements: Optional[Sequence[str]] = None, + environment_variables: Optional[Dict[str, str]] = None, + replica_count: int = 1, + machine_type: str = "n1-standard-4", + accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", + accelerator_count: int = 0, + project: Optional[str] = None, + location: Optional[str] = None, + staging_bucket: Optional[str]= None, + credentials: Optional[auth_credentials.Credentials] = None, + encryption_spec_key_name: Optional[str] = None, + ) -> 'CustomJob': project = project or initializer.global_config.project location = location or initializer.global_config.location @@ -851,7 +851,7 @@ def from_local_script( "should be set using 
aiplatform.init(staging_bucket='gs://my-bucket')" ) - worker_pool_specs = training_jobs._DistributedTrainingSpec.chief_worker_pool( + worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=replica_count, machine_type=machine_type, accelerator_count=accelerator_count, @@ -859,7 +859,7 @@ def from_local_script( ).pool_specs - python_packager = training_jobs._TrainingScriptPythonPackager( + python_packager = source_utils._TrainingScriptPythonPackager( script_path=script_path, requirements=requirements ) @@ -870,24 +870,24 @@ def from_local_script( ) for spec in worker_pool_specs: - spec["pythonPackageSpec"] = { - "executorImageUri": container_uri, - "pythonModule": python_packager.module_name, - "packageUris": [package_gcs_uri], + spec["python_package_spec"] = { + "executor_image_uri": container_uri, + "python_module": python_packager.module_name, + "package_uris": [package_gcs_uri], } if args: - spec["pythonPackageSpec"]["args"] = args + spec["python_package_spec"]["args"] = args if environment_variables: - spec["pythonPackageSpec"]["env"] = [ + spec["python_package_spec"]["env"] = [ {"name": key, "value": value} for key, value in environment_variables.items() ] return cls( display_name=display_name, - worker_pool_specs=worker_pool, + worker_pool_specs=worker_pool_specs, project=project, location=location, credentials=credentials, diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 2912806a12..f8f56bd5da 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -15,16 +15,9 @@ # limitations under the License. # -import datetime -import functools -import logging -import pathlib -import shutil -import subprocess import sys -import tempfile import time -from typing import Callable, Dict, List, Optional, NamedTuple, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union import abc @@ -38,25 +31,25 @@ from google.cloud.aiplatform import utils from google.cloud.aiplatform.compat.types import ( - accelerator_type as gca_accelerator_type, env_var as gca_env_var, io as gca_io, model as gca_model, pipeline_state as gca_pipeline_state, training_pipeline as gca_training_pipeline, ) +from google.cloud.aiplatform.utils import _timestamped_gcs_dir +from google.cloud.aiplatform.utils.source_utils import _TrainingScriptPythonPackager +from google.cloud.aiplatform.utils.worker_spec_utils import _DistributedTrainingSpec from google.cloud.aiplatform.v1.schema.trainingjob import ( definition_v1 as training_job_inputs, ) -from google.cloud import storage from google.rpc import code_pb2 import proto -logging.basicConfig(level=logging.INFO, stream=sys.stdout) _LOGGER = base.Logger(__name__) _PIPELINE_COMPLETE_STATES = set( @@ -780,449 +773,6 @@ def cancel(self) -> None: self.api_client.cancel_training_pipeline(name=self.resource_name) -def _timestamped_gcs_dir(root_gcs_path: str, dir_name_prefix: str) -> str: - """Composes a timestamped GCS directory. - - Args: - root_gcs_path: GCS path to put the timestamped directory. - dir_name_prefix: Prefix to add the timestamped directory. - Returns: - Timestamped gcs directory path in root_gcs_path. 
- """ - timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") - dir_name = "-".join([dir_name_prefix, timestamp]) - if root_gcs_path.endswith("/"): - root_gcs_path = root_gcs_path[:-1] - gcs_path = "/".join([root_gcs_path, dir_name]) - if not gcs_path.startswith("gs://"): - return "gs://" + gcs_path - return gcs_path - - -def _timestamped_copy_to_gcs( - local_file_path: str, - gcs_dir: str, - project: Optional[str] = None, - credentials: Optional[auth_credentials.Credentials] = None, -) -> str: - """Copies a local file to a GCS path. - - The file copied to GCS is the name of the local file prepended with an - "aiplatform-{timestamp}-" string. - - Args: - local_file_path (str): Required. Local file to copy to GCS. - gcs_dir (str): - Required. The GCS directory to copy to. - project (str): - Project that contains the staging bucket. Default will be used if not - provided. Model Builder callers should pass this in. - credentials (auth_credentials.Credentials): - Custom credentials to use with bucket. Model Builder callers should pass - this in. - Returns: - gcs_path (str): The path of the copied file in gcs. - """ - - gcs_bucket, gcs_blob_prefix = utils.extract_bucket_and_prefix_from_gcs_path(gcs_dir) - - local_file_name = pathlib.Path(local_file_path).name - timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") - blob_path = "-".join(["aiplatform", timestamp, local_file_name]) - - if gcs_blob_prefix: - blob_path = "/".join([gcs_blob_prefix, blob_path]) - - # TODO(b/171202993) add user agent - client = storage.Client(project=project, credentials=credentials) - bucket = client.bucket(gcs_bucket) - blob = bucket.blob(blob_path) - blob.upload_from_filename(local_file_path) - - gcs_path = "".join(["gs://", "/".join([blob.bucket.name, blob.name])]) - return gcs_path - - -def _get_python_executable() -> str: - """Returns Python executable. - - Returns: - Python executable to use for setuptools packaging. - Raises: - EnvironmentError: If Python executable is not found. - """ - - python_executable = sys.executable - - if not python_executable: - raise EnvironmentError("Cannot find Python executable for packaging.") - return python_executable - - -class _TrainingScriptPythonPackager: - """Converts a Python script into Python package suitable for aiplatform - training. - - Copies the script to specified location. - - Class Attributes: - _TRAINER_FOLDER: Constant folder name to build package. - _ROOT_MODULE: Constant root name of module. - _TEST_MODULE_NAME: Constant name of module that will store script. - _SETUP_PY_VERSION: Constant version of this created python package. - _SETUP_PY_TEMPLATE: Constant template used to generate setup.py file. - _SETUP_PY_SOURCE_DISTRIBUTION_CMD: - Constant command to generate the source distribution package. 
- - Attributes: - script_path: local path of script to package - requirements: list of Python dependencies to add to package - - Usage: - - packager = TrainingScriptPythonPackager('my_script.py', ['pandas', 'pytorch']) - gcs_path = packager.package_and_copy_to_gcs( - gcs_staging_dir='my-bucket', - project='my-prject') - module_name = packager.module_name - - The package after installed can be executed as: - python -m aiplatform_custom_trainer_script.task - """ - - _TRAINER_FOLDER = "trainer" - _ROOT_MODULE = "aiplatform_custom_trainer_script" - _TASK_MODULE_NAME = "task" - _SETUP_PY_VERSION = "0.1" - - _SETUP_PY_TEMPLATE = """from setuptools import find_packages -from setuptools import setup - -setup( - name='{name}', - version='{version}', - packages=find_packages(), - install_requires=({requirements}), - include_package_data=True, - description='My training application.' -)""" - - _SETUP_PY_SOURCE_DISTRIBUTION_CMD = "setup.py sdist --formats=gztar" - - # Module name that can be executed during training. ie. python -m - module_name = f"{_ROOT_MODULE}.{_TASK_MODULE_NAME}" - - def __init__(self, script_path: str, requirements: Optional[Sequence[str]] = None): - """Initializes packager. - - Args: - script_path (str): Required. Local path to script. - requirements (Sequence[str]): - List of python packages dependencies of script. - """ - - self.script_path = script_path - self.requirements = requirements or [] - - def make_package(self, package_directory: str) -> str: - """Converts script into a Python package suitable for python module - execution. - - Args: - package_directory (str): Directory to build package in. - Returns: - source_distribution_path (str): Path to built package. - Raises: - RunTimeError: If package creation fails. - """ - # The root folder to builder the package in - package_path = pathlib.Path(package_directory) - - # Root directory of the package - trainer_root_path = package_path / self._TRAINER_FOLDER - - # The root module of the python package - trainer_path = trainer_root_path / self._ROOT_MODULE - - # __init__.py path in root module - init_path = trainer_path / "__init__.py" - - # The module that will contain the script - script_out_path = trainer_path / f"{self._TASK_MODULE_NAME}.py" - - # The path to setup.py in the package. - setup_py_path = trainer_root_path / "setup.py" - - # The path to the generated source distribution. - source_distribution_path = ( - trainer_root_path - / "dist" - / f"{self._ROOT_MODULE}-{self._SETUP_PY_VERSION}.tar.gz" - ) - - trainer_root_path.mkdir() - trainer_path.mkdir() - - # Make empty __init__.py - with init_path.open("w"): - pass - - # Format the setup.py file. - setup_py_output = self._SETUP_PY_TEMPLATE.format( - name=self._ROOT_MODULE, - requirements=",".join(f'"{r}"' for r in self.requirements), - version=self._SETUP_PY_VERSION, - ) - - # Write setup.py - with setup_py_path.open("w") as fp: - fp.write(setup_py_output) - - # Copy script as module of python package. - shutil.copy(self.script_path, script_out_path) - - # Run setup.py to create the source distribution. - setup_cmd = [ - _get_python_executable() - ] + self._SETUP_PY_SOURCE_DISTRIBUTION_CMD.split() - - p = subprocess.Popen( - args=setup_cmd, - cwd=trainer_root_path, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - output, error = p.communicate() - - # Raise informative error if packaging fails. 
- if p.returncode != 0: - raise RuntimeError( - "Packaging of training script failed with code %d\n%s \n%s" - % (p.returncode, output.decode(), error.decode()) - ) - - return str(source_distribution_path) - - def package_and_copy(self, copy_method: Callable[[str], str]) -> str: - """Packages the script and executes copy with given copy_method. - - Args: - copy_method Callable[[str], str] - Takes a string path, copies to a desired location, and returns the - output path location. - Returns: - output_path str: Location of copied package. - """ - - with tempfile.TemporaryDirectory() as tmpdirname: - source_distribution_path = self.make_package(tmpdirname) - output_location = copy_method(source_distribution_path) - _LOGGER.info("Training script copied to:\n%s." % output_location) - return output_location - - def package_and_copy_to_gcs( - self, - gcs_staging_dir: str, - project: str = None, - credentials: Optional[auth_credentials.Credentials] = None, - ) -> str: - """Packages script in Python package and copies package to GCS bucket. - - Args - gcs_staging_dir (str): Required. GCS Staging directory. - project (str): Required. Project where GCS Staging bucket is located. - credentials (auth_credentials.Credentials): - Optional credentials used with GCS client. - Returns: - GCS location of Python package. - """ - - copy_method = functools.partial( - _timestamped_copy_to_gcs, - gcs_dir=gcs_staging_dir, - project=project, - credentials=credentials, - ) - return self.package_and_copy(copy_method=copy_method) - - -class _MachineSpec(NamedTuple): - """Specification container for Machine specs used for distributed training. - - Usage: - - spec = _MachineSpec( - replica_count=10, - machine_type='n1-standard-4', - accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80') - - Note that container and python package specs are not stored with this spec. - """ - - replica_count: int = 0 - machine_type: str = "n1-standard-4" - accelerator_count: int = 0 - accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED" - - def _get_accelerator_type(self) -> Optional[str]: - """Validates accelerator_type and returns the name of the accelerator. - - Returns: - None if no accelerator or valid accelerator name. - - Raise: - ValueError if accelerator type is invalid. - """ - - # Raises ValueError if invalid accelerator_type - utils.validate_accelerator_type(self.accelerator_type) - - accelerator_enum = getattr( - gca_accelerator_type.AcceleratorType, self.accelerator_type - ) - - if ( - accelerator_enum - != gca_accelerator_type.AcceleratorType.ACCELERATOR_TYPE_UNSPECIFIED - ): - return self.accelerator_type - - @property - def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]: - """Return specification as a Dict.""" - spec = { - "machineSpec": {"machineType": self.machine_type}, - "replicaCount": self.replica_count, - } - accelerator_type = self._get_accelerator_type() - if accelerator_type and self.accelerator_count: - spec["machineSpec"]["acceleratorType"] = accelerator_type - spec["machineSpec"]["acceleratorCount"] = self.accelerator_count - - return spec - - @property - def is_empty(self) -> bool: - """Returns True is replica_count > 0 False otherwise.""" - return self.replica_count <= 0 - - -class _DistributedTrainingSpec(NamedTuple): - """Configuration for distributed training worker pool specs. 
- - AI Platform Training expects configuration in this order: - [ - chief spec, # can only have one replica - worker spec, - parameter server spec, - evaluator spec - ] - - Usage: - - dist_training_spec = _DistributedTrainingSpec( - chief_spec = _MachineSpec( - replica_count=1, - machine_type='n1-standard-4', - accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80' - ), - worker_spec = _MachineSpec( - replica_count=10, - machine_type='n1-standard-4', - accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80' - ) - ) - """ - - chief_spec: _MachineSpec = _MachineSpec() - worker_spec: _MachineSpec = _MachineSpec() - parameter_server_spec: _MachineSpec = _MachineSpec() - evaluator_spec: _MachineSpec = _MachineSpec() - - @property - def pool_specs( - self, - ) -> List[Dict[str, Union[int, str, Dict[str, Union[int, str]]]]]: - """Return each pools spec in correct order for AI Platform as a list of - dicts. - - Also removes specs if they are empty but leaves specs in if there unusual - specifications to not break the ordering in AI Platform Training. - ie. 0 chief replica, 10 worker replica, 3 ps replica - - Returns: - Order list of worker pool specs suitable for AI Platform Training. - """ - if self.chief_spec.replica_count > 1: - raise ValueError("Chief spec replica count cannot be greater than 1.") - - spec_order = [ - self.chief_spec, - self.worker_spec, - self.parameter_server_spec, - self.evaluator_spec, - ] - specs = [s.spec_dict for s in spec_order] - for i in reversed(range(len(spec_order))): - if spec_order[i].is_empty: - specs.pop() - else: - break - return specs - - @classmethod - def chief_worker_pool( - cls, - replica_count: int = 0, - machine_type: str = "n1-standard-4", - accelerator_count: int = 0, - accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", - ) -> "_DistributedTrainingSpec": - """Parameterizes Config to support only chief with worker replicas. - - For replica is assigned to chief and the remainder to workers. All spec have the - same machine type, accelerator count, and accelerator type. - - Args: - replica_count (int): - The number of worker replicas. Assigns 1 chief replica and - replica_count - 1 worker replicas. - machine_type (str): - The type of machine to use for training. - accelerator_type (str): - Hardware accelerator type. One of ACCELERATOR_TYPE_UNSPECIFIED, - NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, - NVIDIA_TESLA_T4 - accelerator_count (int): - The number of accelerators to attach to a worker replica. - - Returns: - _DistributedTrainingSpec representing one chief and n workers all of same - type. If replica_count <= 0 then an empty spec is returned. 
- """ - if replica_count <= 0: - return cls() - - chief_spec = _MachineSpec( - replica_count=1, - machine_type=machine_type, - accelerator_count=accelerator_count, - accelerator_type=accelerator_type, - ) - - worker_spec = _MachineSpec( - replica_count=replica_count - 1, - machine_type=machine_type, - accelerator_count=accelerator_count, - accelerator_type=accelerator_type, - ) - - return cls(chief_spec=chief_spec, worker_spec=worker_spec) - - class _CustomTrainingJob(_TrainingJob): """ABC for Custom Training Pipelines..""" diff --git a/google/cloud/aiplatform/utils.py b/google/cloud/aiplatform/utils/__init__.py similarity index 87% rename from google/cloud/aiplatform/utils.py rename to google/cloud/aiplatform/utils/__init__.py index ff86fc1cb8..c847a56244 100644 --- a/google/cloud/aiplatform/utils.py +++ b/google/cloud/aiplatform/utils/__init__.py @@ -17,6 +17,8 @@ import abc +import datetime +import pathlib from collections import namedtuple import logging import re @@ -25,6 +27,8 @@ from google.api_core import client_options from google.api_core import gapic_v1 from google.auth import credentials as auth_credentials +from google.cloud import storage + from google.cloud.aiplatform import compat from google.cloud.aiplatform import constants from google.cloud.aiplatform import initializer @@ -499,3 +503,66 @@ def __init__(self, warning_level: int): def filter(self, record): return record.levelname == self._warning_level + + +def _timestamped_gcs_dir(root_gcs_path: str, dir_name_prefix: str) -> str: + """Composes a timestamped GCS directory. + + Args: + root_gcs_path: GCS path to put the timestamped directory. + dir_name_prefix: Prefix to add the timestamped directory. + Returns: + Timestamped gcs directory path in root_gcs_path. + """ + timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") + dir_name = "-".join([dir_name_prefix, timestamp]) + if root_gcs_path.endswith("/"): + root_gcs_path = root_gcs_path[:-1] + gcs_path = "/".join([root_gcs_path, dir_name]) + if not gcs_path.startswith("gs://"): + return "gs://" + gcs_path + return gcs_path + + +def _timestamped_copy_to_gcs( + local_file_path: str, + gcs_dir: str, + project: Optional[str] = None, + credentials: Optional[auth_credentials.Credentials] = None, +) -> str: + """Copies a local file to a GCS path. + + The file copied to GCS is the name of the local file prepended with an + "aiplatform-{timestamp}-" string. + + Args: + local_file_path (str): Required. Local file to copy to GCS. + gcs_dir (str): + Required. The GCS directory to copy to. + project (str): + Project that contains the staging bucket. Default will be used if not + provided. Model Builder callers should pass this in. + credentials (auth_credentials.Credentials): + Custom credentials to use with bucket. Model Builder callers should pass + this in. + Returns: + gcs_path (str): The path of the copied file in gcs. 
+ """ + + gcs_bucket, gcs_blob_prefix = extract_bucket_and_prefix_from_gcs_path(gcs_dir) + + local_file_name = pathlib.Path(local_file_path).name + timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") + blob_path = "-".join(["aiplatform", timestamp, local_file_name]) + + if gcs_blob_prefix: + blob_path = "/".join([gcs_blob_prefix, blob_path]) + + # TODO(b/171202993) add user agent + client = storage.Client(project=project, credentials=credentials) + bucket = client.bucket(gcs_bucket) + blob = bucket.blob(blob_path) + blob.upload_from_filename(local_file_path) + + gcs_path = "".join(["gs://", "/".join([blob.bucket.name, blob.name])]) + return gcs_path \ No newline at end of file diff --git a/google/cloud/aiplatform/utils/source_utils.py b/google/cloud/aiplatform/utils/source_utils.py new file mode 100644 index 0000000000..8539e3122d --- /dev/null +++ b/google/cloud/aiplatform/utils/source_utils.py @@ -0,0 +1,216 @@ +import functools +import pathlib +import shutil +import subprocess +import sys +import tempfile +from typing import Optional, Sequence, Callable + +from google.auth import credentials as auth_credentials +from google.cloud.aiplatform import base +from google.cloud.aiplatform import utils + +_LOGGER = base.Logger(__name__) + + +def _get_python_executable() -> str: + """Returns Python executable. + + Returns: + Python executable to use for setuptools packaging. + Raises: + EnvironmentError: If Python executable is not found. + """ + + python_executable = sys.executable + + if not python_executable: + raise EnvironmentError("Cannot find Python executable for packaging.") + return python_executable + + +class _TrainingScriptPythonPackager: + """Converts a Python script into Python package suitable for aiplatform + training. + + Copies the script to specified location. + + Class Attributes: + _TRAINER_FOLDER: Constant folder name to build package. + _ROOT_MODULE: Constant root name of module. + _TEST_MODULE_NAME: Constant name of module that will store script. + _SETUP_PY_VERSION: Constant version of this created python package. + _SETUP_PY_TEMPLATE: Constant template used to generate setup.py file. + _SETUP_PY_SOURCE_DISTRIBUTION_CMD: + Constant command to generate the source distribution package. + + Attributes: + script_path: local path of script to package + requirements: list of Python dependencies to add to package + + Usage: + + packager = TrainingScriptPythonPackager('my_script.py', ['pandas', 'pytorch']) + gcs_path = packager.package_and_copy_to_gcs( + gcs_staging_dir='my-bucket', + project='my-prject') + module_name = packager.module_name + + The package after installed can be executed as: + python -m aiplatform_custom_trainer_script.task + """ + + _TRAINER_FOLDER = "trainer" + _ROOT_MODULE = "aiplatform_custom_trainer_script" + _TASK_MODULE_NAME = "task" + _SETUP_PY_VERSION = "0.1" + + _SETUP_PY_TEMPLATE = """from setuptools import find_packages +from setuptools import setup + +setup( + name='{name}', + version='{version}', + packages=find_packages(), + install_requires=({requirements}), + include_package_data=True, + description='My training application.' +)""" + + _SETUP_PY_SOURCE_DISTRIBUTION_CMD = "setup.py sdist --formats=gztar" + + # Module name that can be executed during training. ie. python -m + module_name = f"{_ROOT_MODULE}.{_TASK_MODULE_NAME}" + + def __init__(self, script_path: str, requirements: Optional[Sequence[str]] = None): + """Initializes packager. + + Args: + script_path (str): Required. Local path to script. 
+ requirements (Sequence[str]): + List of python packages dependencies of script. + """ + + self.script_path = script_path + self.requirements = requirements or [] + + def make_package(self, package_directory: str) -> str: + """Converts script into a Python package suitable for python module + execution. + + Args: + package_directory (str): Directory to build package in. + Returns: + source_distribution_path (str): Path to built package. + Raises: + RunTimeError: If package creation fails. + """ + # The root folder to builder the package in + package_path = pathlib.Path(package_directory) + + # Root directory of the package + trainer_root_path = package_path / self._TRAINER_FOLDER + + # The root module of the python package + trainer_path = trainer_root_path / self._ROOT_MODULE + + # __init__.py path in root module + init_path = trainer_path / "__init__.py" + + # The module that will contain the script + script_out_path = trainer_path / f"{self._TASK_MODULE_NAME}.py" + + # The path to setup.py in the package. + setup_py_path = trainer_root_path / "setup.py" + + # The path to the generated source distribution. + source_distribution_path = ( + trainer_root_path + / "dist" + / f"{self._ROOT_MODULE}-{self._SETUP_PY_VERSION}.tar.gz" + ) + + trainer_root_path.mkdir() + trainer_path.mkdir() + + # Make empty __init__.py + with init_path.open("w"): + pass + + # Format the setup.py file. + setup_py_output = self._SETUP_PY_TEMPLATE.format( + name=self._ROOT_MODULE, + requirements=",".join(f'"{r}"' for r in self.requirements), + version=self._SETUP_PY_VERSION, + ) + + # Write setup.py + with setup_py_path.open("w") as fp: + fp.write(setup_py_output) + + # Copy script as module of python package. + shutil.copy(self.script_path, script_out_path) + + # Run setup.py to create the source distribution. + setup_cmd = [ + _get_python_executable() + ] + self._SETUP_PY_SOURCE_DISTRIBUTION_CMD.split() + + p = subprocess.Popen( + args=setup_cmd, + cwd=trainer_root_path, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + output, error = p.communicate() + + # Raise informative error if packaging fails. + if p.returncode != 0: + raise RuntimeError( + "Packaging of training script failed with code %d\n%s \n%s" + % (p.returncode, output.decode(), error.decode()) + ) + + return str(source_distribution_path) + + def package_and_copy(self, copy_method: Callable[[str], str]) -> str: + """Packages the script and executes copy with given copy_method. + + Args: + copy_method Callable[[str], str] + Takes a string path, copies to a desired location, and returns the + output path location. + Returns: + output_path str: Location of copied package. + """ + + with tempfile.TemporaryDirectory() as tmpdirname: + source_distribution_path = self.make_package(tmpdirname) + output_location = copy_method(source_distribution_path) + _LOGGER.info("Training script copied to:\n%s." % output_location) + return output_location + + def package_and_copy_to_gcs( + self, + gcs_staging_dir: str, + project: str = None, + credentials: Optional[auth_credentials.Credentials] = None, + ) -> str: + """Packages script in Python package and copies package to GCS bucket. + + Args + gcs_staging_dir (str): Required. GCS Staging directory. + project (str): Required. Project where GCS Staging bucket is located. + credentials (auth_credentials.Credentials): + Optional credentials used with GCS client. + Returns: + GCS location of Python package. 
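+
+        Example (illustrative; the timestamp component varies at runtime):
+            packager.package_and_copy_to_gcs(gcs_staging_dir="gs://my-bucket/staging")
+            # -> "gs://my-bucket/staging/aiplatform-{timestamp}-aiplatform_custom_trainer_script-0.1.tar.gz"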
+ """ + + copy_method = functools.partial( + utils._timestamped_copy_to_gcs, + gcs_dir=gcs_staging_dir, + project=project, + credentials=credentials, + ) + return self.package_and_copy(copy_method=copy_method) \ No newline at end of file diff --git a/google/cloud/aiplatform/utils/worker_spec_utils.py b/google/cloud/aiplatform/utils/worker_spec_utils.py new file mode 100644 index 0000000000..a23b997f48 --- /dev/null +++ b/google/cloud/aiplatform/utils/worker_spec_utils.py @@ -0,0 +1,181 @@ +from typing import NamedTuple, Optional, Dict, Union, List + +from google.cloud.aiplatform import utils +from google.cloud.aiplatform.compat.types import accelerator_type as gca_accelerator_type_compat + + +class _MachineSpec(NamedTuple): + """Specification container for Machine specs used for distributed training. + + Usage: + + spec = _MachineSpec( + replica_count=10, + machine_type='n1-standard-4', + accelerator_count=2, + accelerator_type='NVIDIA_TESLA_K80') + + Note that container and python package specs are not stored with this spec. + """ + + replica_count: int = 0 + machine_type: str = "n1-standard-4" + accelerator_count: int = 0 + accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED" + + def _get_accelerator_type(self) -> Optional[str]: + """Validates accelerator_type and returns the name of the accelerator. + + Returns: + None if no accelerator or valid accelerator name. + + Raise: + ValueError if accelerator type is invalid. + """ + + # Raises ValueError if invalid accelerator_type + utils.validate_accelerator_type(self.accelerator_type) + + accelerator_enum = getattr( + gca_accelerator_type_compat.AcceleratorType, self.accelerator_type + ) + + if ( + accelerator_enum + != gca_accelerator_type_compat.AcceleratorType.ACCELERATOR_TYPE_UNSPECIFIED + ): + return self.accelerator_type + + @property + def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]: + """Return specification as a Dict.""" + spec = { + "machine_spec": {"machine_type": self.machine_type}, + "replica_count": self.replica_count, + } + accelerator_type = self._get_accelerator_type() + if accelerator_type and self.accelerator_count: + spec["machine_spec"]["accelerator_type"] = accelerator_type + spec["machine_spec"]["accelerator_count"] = self.accelerator_count + + return spec + + @property + def is_empty(self) -> bool: + """Returns True is replica_count > 0 False otherwise.""" + return self.replica_count <= 0 + + +class _DistributedTrainingSpec(NamedTuple): + """Configuration for distributed training worker pool specs. + + AI Platform Training expects configuration in this order: + [ + chief spec, # can only have one replica + worker spec, + parameter server spec, + evaluator spec + ] + + Usage: + + dist_training_spec = _DistributedTrainingSpec( + chief_spec = _MachineSpec( + replica_count=1, + machine_type='n1-standard-4', + accelerator_count=2, + accelerator_type='NVIDIA_TESLA_K80' + ), + worker_spec = _MachineSpec( + replica_count=10, + machine_type='n1-standard-4', + accelerator_count=2, + accelerator_type='NVIDIA_TESLA_K80' + ) + ) + """ + + chief_spec: _MachineSpec = _MachineSpec() + worker_spec: _MachineSpec = _MachineSpec() + parameter_server_spec: _MachineSpec = _MachineSpec() + evaluator_spec: _MachineSpec = _MachineSpec() + + @property + def pool_specs( + self, + ) -> List[Dict[str, Union[int, str, Dict[str, Union[int, str]]]]]: + """Return each pools spec in correct order for AI Platform as a list of + dicts. 
+ + Also removes specs if they are empty but leaves specs in if there unusual + specifications to not break the ordering in AI Platform Training. + ie. 0 chief replica, 10 worker replica, 3 ps replica + + Returns: + Order list of worker pool specs suitable for AI Platform Training. + """ + if self.chief_spec.replica_count > 1: + raise ValueError("Chief spec replica count cannot be greater than 1.") + + spec_order = [ + self.chief_spec, + self.worker_spec, + self.parameter_server_spec, + self.evaluator_spec, + ] + specs = [s.spec_dict for s in spec_order] + for i in reversed(range(len(spec_order))): + if spec_order[i].is_empty: + specs.pop() + else: + break + return specs + + @classmethod + def chief_worker_pool( + cls, + replica_count: int = 0, + machine_type: str = "n1-standard-4", + accelerator_count: int = 0, + accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", + ) -> "_DistributedTrainingSpec": + """Parameterizes Config to support only chief with worker replicas. + + For replica is assigned to chief and the remainder to workers. All spec have the + same machine type, accelerator count, and accelerator type. + + Args: + replica_count (int): + The number of worker replicas. Assigns 1 chief replica and + replica_count - 1 worker replicas. + machine_type (str): + The type of machine to use for training. + accelerator_type (str): + Hardware accelerator type. One of ACCELERATOR_TYPE_UNSPECIFIED, + NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, + NVIDIA_TESLA_T4 + accelerator_count (int): + The number of accelerators to attach to a worker replica. + + Returns: + _DistributedTrainingSpec representing one chief and n workers all of same + type. If replica_count <= 0 then an empty spec is returned. + """ + if replica_count <= 0: + return cls() + + chief_spec = _MachineSpec( + replica_count=1, + machine_type=machine_type, + accelerator_count=accelerator_count, + accelerator_type=accelerator_type, + ) + + worker_spec = _MachineSpec( + replica_count=replica_count - 1, + machine_type=machine_type, + accelerator_count=accelerator_count, + accelerator_type=accelerator_type, + ) + + return cls(chief_spec=chief_spec, worker_spec=worker_spec) \ No newline at end of file diff --git a/tests/unit/aiplatform/test_end_to_end.py b/tests/unit/aiplatform/test_end_to_end.py index 69c5517a69..f4b1355679 100644 --- a/tests/unit/aiplatform/test_end_to_end.py +++ b/tests/unit/aiplatform/test_end_to_end.py @@ -19,6 +19,7 @@ from importlib import reload +import google.cloud.aiplatform.utils.source_utils from google.cloud import aiplatform from google.cloud.aiplatform import initializer from google.cloud.aiplatform import models @@ -212,7 +213,7 @@ def test_dataset_create_to_model_predict( }, "pythonPackageSpec": { "executorImageUri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, @@ -393,7 +394,7 @@ def test_dataset_create_to_model_predict_with_pipeline_fail( }, "pythonPackageSpec": { "executorImageUri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, 
}, diff --git a/tests/unit/aiplatform/test_training_jobs.py b/tests/unit/aiplatform/test_training_jobs.py index 8fd82c7727..8f783edc15 100644 --- a/tests/unit/aiplatform/test_training_jobs.py +++ b/tests/unit/aiplatform/test_training_jobs.py @@ -30,6 +30,9 @@ from google.auth import credentials as auth_credentials +import google.cloud.aiplatform.utils +import google.cloud.aiplatform.utils.source_utils +import google.cloud.aiplatform.utils.worker_spec_utils from google.cloud import aiplatform from google.cloud.aiplatform import datasets @@ -234,7 +237,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_bucket( mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = training_jobs._timestamped_copy_to_gcs( + gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_BUCKET_NAME, project=_TEST_PROJECT, @@ -261,7 +264,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_gcs_path( mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = training_jobs._timestamped_copy_to_gcs( + gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_GCS_PATH_WITH_TRAILING_SLASH, project=_TEST_PROJECT, @@ -289,7 +292,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_trailing_slash( mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = training_jobs._timestamped_copy_to_gcs( + gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_GCS_PATH, project=_TEST_PROJECT, @@ -315,7 +318,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client(self, mock_client_bucket): mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = training_jobs._timestamped_copy_to_gcs( + gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_BUCKET_NAME, project=_TEST_PROJECT, @@ -332,10 +335,10 @@ def test_timestamp_copy_to_gcs_calls_gcs_client(self, mock_client_bucket): def test_get_python_executable_raises_if_None(self): with patch.object(sys, "executable", new=None): with pytest.raises(EnvironmentError): - training_jobs._get_python_executable() + google.cloud.aiplatform.utils.source_utils._get_python_executable() def test_get_python_executable_returns_python_executable(self): - assert "python" in training_jobs._get_python_executable().lower() + assert "python" in google.cloud.aiplatform.utils.source_utils._get_python_executable().lower() class TestTrainingScriptPythonPackager: @@ -347,7 +350,7 @@ def setup_method(self): def teardown_method(self): pathlib.Path(_TEST_LOCAL_SCRIPT_FILE_NAME).unlink() - python_package_file = f"{training_jobs._TrainingScriptPythonPackager._ROOT_MODULE}-{training_jobs._TrainingScriptPythonPackager._SETUP_PY_VERSION}.tar.gz" + python_package_file = f"{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}.tar.gz" if pathlib.Path(python_package_file).is_file(): pathlib.Path(python_package_file).unlink() subprocess.check_output( @@ -355,34 +358,34 @@ def teardown_method(self): "pip3", "uninstall", "-y", - training_jobs._TrainingScriptPythonPackager._ROOT_MODULE, + google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._ROOT_MODULE, ] ) def test_packager_creates_and_copies_python_package(self): - tsp = 
training_jobs._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) tsp.package_and_copy(copy_method=local_copy_method) assert pathlib.Path( f"{tsp._ROOT_MODULE}-{tsp._SETUP_PY_VERSION}.tar.gz" ).is_file() def test_created_package_module_is_installable_and_can_be_run(self): - tsp = training_jobs._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) source_dist_path = tsp.package_and_copy(copy_method=local_copy_method) subprocess.check_output(["pip3", "install", source_dist_path]) module_output = subprocess.check_output( - [training_jobs._get_python_executable(), "-m", tsp.module_name] + [google.cloud.aiplatform.utils.source_utils._get_python_executable(), "-m", tsp.module_name] ) assert "hello world" in module_output.decode() def test_requirements_are_in_package(self): - tsp = training_jobs._TrainingScriptPythonPackager( + tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager( _TEST_LOCAL_SCRIPT_FILE_NAME, requirements=_TEST_REQUIREMENTS ) source_dist_path = tsp.package_and_copy(copy_method=local_copy_method) with tarfile.open(source_dist_path) as tf: with tempfile.TemporaryDirectory() as tmpdirname: - setup_py_path = f"{training_jobs._TrainingScriptPythonPackager._ROOT_MODULE}-{training_jobs._TrainingScriptPythonPackager._SETUP_PY_VERSION}/setup.py" + setup_py_path = f"{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}/setup.py" tf.extract(setup_py_path, path=tmpdirname) setup_py = core.run_setup( pathlib.Path(tmpdirname, setup_py_path), stop_after="init" @@ -395,7 +398,7 @@ def test_packaging_fails_whith_RuntimeError(self): mock_subprocess.communicate.return_value = (b"", b"") mock_subprocess.returncode = 1 mock_popen.return_value = mock_subprocess - tsp = training_jobs._TrainingScriptPythonPackager( + tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager( _TEST_LOCAL_SCRIPT_FILE_NAME ) with pytest.raises(RuntimeError): @@ -404,7 +407,7 @@ def test_packaging_fails_whith_RuntimeError(self): def test_package_and_copy_to_gcs_copies_to_gcs(self, mock_client_bucket): mock_client_bucket, mock_blob = mock_client_bucket - tsp = training_jobs._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) gcs_path = tsp.package_and_copy_to_gcs( gcs_staging_dir=_TEST_BUCKET_NAME, project=_TEST_PROJECT @@ -512,7 +515,7 @@ def mock_model_service_get(): @pytest.fixture def mock_python_package_to_gcs(): with mock.patch.object( - training_jobs._TrainingScriptPythonPackager, "package_and_copy_to_gcs" + google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager, "package_and_copy_to_gcs" ) as mock_package_to_copy_gcs: mock_package_to_copy_gcs.return_value = _TEST_OUTPUT_PYTHON_PACKAGE_PATH yield mock_package_to_copy_gcs @@ -638,7 +641,7 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": 
[_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -797,7 +800,7 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1072,7 +1075,7 @@ def test_run_call_pipeline_service_create_with_no_dataset( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1324,7 +1327,7 @@ def test_run_call_pipeline_service_create_distributed_training( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1339,7 +1342,7 @@ def test_run_call_pipeline_service_create_distributed_training( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1552,7 +1555,7 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, + "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, @@ -2689,7 +2692,7 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset_raises_if_anno class Test_MachineSpec: def test_machine_spec_return_spec_dict(self): - test_spec = training_jobs._MachineSpec( + test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2708,7 +2711,7 @@ def test_machine_spec_return_spec_dict(self): assert test_spec.spec_dict == true_spec_dict def test_machine_spec_return_spec_dict_with_no_accelerator(self): - test_spec = training_jobs._MachineSpec( + test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=0, @@ -2723,7 +2726,7 @@ def test_machine_spec_return_spec_dict_with_no_accelerator(self): assert test_spec.spec_dict == true_spec_dict def test_machine_spec_spec_dict_raises_invalid_accelerator(self): - test_spec = training_jobs._MachineSpec( + test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2734,7 +2737,7 @@ def test_machine_spec_spec_dict_raises_invalid_accelerator(self): test_spec.spec_dict def 
test_machine_spec_spec_dict_is_empty(self): - test_spec = training_jobs._MachineSpec( + test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=0, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2744,7 +2747,7 @@ def test_machine_spec_spec_dict_is_empty(self): assert test_spec.is_empty def test_machine_spec_spec_dict_is_not_empty(self): - test_spec = training_jobs._MachineSpec( + test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2757,26 +2760,26 @@ def test_machine_spec_spec_dict_is_not_empty(self): class Test_DistributedTrainingSpec: def test_machine_spec_returns_pool_spec(self): - spec = training_jobs._DistributedTrainingSpec( - chief_spec=training_jobs._MachineSpec( + spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec( + chief_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=training_jobs._MachineSpec( + worker_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=10, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - parameter_server_spec=training_jobs._MachineSpec( + parameter_server_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=training_jobs._MachineSpec( + evaluator_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2823,7 +2826,7 @@ def test_machine_spec_returns_pool_spec(self): def test_chief_worker_pool_returns_spec(self): - chief_worker_spec = training_jobs._DistributedTrainingSpec.chief_worker_pool( + chief_worker_spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=10, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2853,7 +2856,7 @@ def test_chief_worker_pool_returns_spec(self): def test_chief_worker_pool_returns_just_chief(self): - chief_worker_spec = training_jobs._DistributedTrainingSpec.chief_worker_pool( + chief_worker_spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2875,8 +2878,8 @@ def test_chief_worker_pool_returns_just_chief(self): def test_machine_spec_raise_with_more_than_one_chief_replica(self): - spec = training_jobs._DistributedTrainingSpec( - chief_spec=training_jobs._MachineSpec( + spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec( + chief_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=2, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2889,21 +2892,21 @@ def test_machine_spec_raise_with_more_than_one_chief_replica(self): def test_machine_spec_handles_missing_pools(self): - spec = training_jobs._DistributedTrainingSpec( - chief_spec=training_jobs._MachineSpec( + spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec( + 
chief_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=training_jobs._MachineSpec(replica_count=0), - parameter_server_spec=training_jobs._MachineSpec( + worker_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec(replica_count=0), + parameter_server_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=training_jobs._MachineSpec(replica_count=0), + evaluator_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec(replica_count=0), ) true_pool_spec = [ From 3178b11fe769306f09315732c1a1e175e05a8054 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 09:16:21 -0400 Subject: [PATCH 03/29] checkpoint --- google/cloud/aiplatform/jobs.py | 114 ++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 7 deletions(-) diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index 1d068d3435..535e0fb9dc 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -47,6 +47,7 @@ io as gca_io_compat, io_v1beta1 as gca_io_v1beta1, job_state as gca_job_state, + hyperparameter_tuning_job as gca_hyperparameter_tuning_job_compat, machine_resources as gca_machine_resources_compat, machine_resources_v1beta1 as gca_machine_resources_v1beta1, ) @@ -925,6 +926,13 @@ def run( self._block_until_complete() + @property + def worker_pool_specs(self): + return self._gca_resource.job_spec.worker_pool_specs + + + + class DataLabelingJob(_Job): _resource_noun = "dataLabelingJobs" _getter_method = "get_data_labeling_job" @@ -935,10 +943,102 @@ class DataLabelingJob(_Job): pass -class HyperparameterTuningJob(_Job): - _resource_noun = "hyperparameterTuningJobs" - _getter_method = "get_hyperparameter_tuning_job" - _list_method = "list_hyperparameter_tuning_jobs" - _cancel_method = "cancel_hyperparameter_tuning_job" - _delete_method = "delete_hyperparameter_tuning_job" - pass +# class HyperparameterTuningJob(_Job): +# _resource_noun = "hyperparameterTuningJobs" +# _getter_method = "get_hyperparameter_tuning_job" +# _list_method = "list_hyperparameter_tuning_jobs" +# _cancel_method = "cancel_hyperparameter_tuning_job" +# _delete_method = "delete_hyperparameter_tuning_job" + + +# def __init__(self, +# display_name: str, +# custom_job: CustomJob, +# metric_spec: Dict[str, str], +# parameter_spec: dict[str, hyperparameter_tuning_job.Parameter], +# max_trial_count: int, +# parallel_trial_count: int, +# max_failed_trials_count: int = 0, +# observation_noise: Optional[str] = 'low', +# algorithm: Optional[str] = 'random', +# measurement_selection: Optional[str] = 'best', +# # project: Optional[str] = None, +# # location: Optional[str] = None, +# # credentials: Optional[auth_credentials.Credentials] = None, +# encryption_spec_key_name: Optional[str] = None, +# # staging_bucket: Optional[str] = None +# ): +# base.AiPlatformResourceNounWithFutureManager.__init__(self, +# project=custom_job.project, +# location=custom_job.location, +# credentials=custom_job.credentials +# ) + +# metrics = [ +# gca_study.StudySpec.MetricSpec(metric_id=metric_id, goal=goal.upper()) +# for metric_id, goal in metric_spec_dict.items() +# ] + +# parameters = [ +# parameter.to_parameter_spec(parameter_id=parameter_id) +# for 
parameter_id, parameter in parameter_spec.items() +# ] + +# study_spec = gca_study_compat.StudySpec( +# metrics = metrics, +# parameters = p +# algorithm + + +# ) + +# self._gca_resource = gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob( +# display_name=display_name, +# study_spec=, +# max_trial_count=max_trial_count, +# parallel_trial_count=parallel_trial_count, +# max_failed_trial_count=max_failed_trial_count, +# trial_job_spec=custom_job.job_spec._gca_resource.copy(), +# encryption_spec= initializer.global_config.get_encryption_spec( +# encryption_spec_key_name = encryption_spec_key_name +# ) +# ) + +# @staticmethod +# def _convert_metric_spec_dict_to_metric_spec( +# metric_spec_dict: Dict[str, str]) -> gca_study_compat.StudySpec.MetricSpec: +# return + + +# @base.optional_sync() +# def run( +# self, +# service_account: Optional[str] = None, +# network: Optional[str] = None, +# timeout: Optional[int] = None, # seconds +# restart_job_on_worker_restart: bool=False, +# sync: bool = True): + +# if service_account: +# self._gca_resource.trial_job_spec.service_account = service_account + +# if network: +# self._gca_resource.trial_job_spec.network = network + + +# if timeout or restart_job_on_worker_restart: +# timout = duration_pb2.Duration(seconds=timout) if timeout else None +# self._gca_resource.trial_job_spec.scheduling = gca_custom_job_compat.Scheduling( +# timeout=timeout, +# restart_job_on_worker_restart=restart_job_on_worker_restart +# ) + +# self._gca_resource = self.api_client.create_hyperparameter_tuning_job( +# parent=self._parent, +# hyperparameter_tuning_job=self._gca_resource +# ) + +# self._block_until_complete() + + + From a6fe1d7e0ff20d4bb01084733b3f051901dd0f2a Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 11:48:39 -0400 Subject: [PATCH 04/29] checkpoint --- google/cloud/aiplatform/__init__.py | 7 +- google/cloud/aiplatform/compat/__init__.py | 2 + .../cloud/aiplatform/compat/types/__init__.py | 2 + .../cloud/aiplatform/hyperparameter_tuning.py | 152 ++++++++++++ google/cloud/aiplatform/jobs.py | 227 ++++++++++-------- 5 files changed, 290 insertions(+), 100 deletions(-) create mode 100644 google/cloud/aiplatform/hyperparameter_tuning.py diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 70b90b39fd..36edec3960 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -26,9 +26,11 @@ TimeSeriesDataset, VideoDataset, ) +from google.cloud.aiplatform import hyperparameter_tuning +from google.cloud.aiplatform.metadata import metadata from google.cloud.aiplatform.models import Endpoint from google.cloud.aiplatform.models import Model -from google.cloud.aiplatform.jobs import BatchPredictionJob, CustomJob +from google.cloud.aiplatform.jobs import BatchPredictionJob, CustomJob, HyperparameterTuningJob from google.cloud.aiplatform.training_jobs import ( CustomTrainingJob, CustomContainerTrainingJob, @@ -39,7 +41,6 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) -from google.cloud.aiplatform.metadata import metadata """ Usage: @@ -60,6 +61,7 @@ "explain", "gapic", "init", + "hyperparameter_tuning" "log_params", "log_metrics", "get_experiment_df", @@ -77,6 +79,7 @@ "CustomPythonPackageTrainingJob", "Endpoint", "ImageDataset", + "HyperparameterTuningJob", "Model", "TabularDataset", "TextDataset", diff --git a/google/cloud/aiplatform/compat/__init__.py b/google/cloud/aiplatform/compat/__init__.py index 980c554fe1..55a72fea16 100644 --- 
a/google/cloud/aiplatform/compat/__init__.py +++ b/google/cloud/aiplatform/compat/__init__.py @@ -70,6 +70,7 @@ types.prediction_service = types.prediction_service_v1beta1 types.specialist_pool = types.specialist_pool_v1beta1 types.specialist_pool_service = types.specialist_pool_service_v1beta1 + types.study = types.study_v1beta1 types.training_pipeline = types.training_pipeline_v1beta1 types.metadata_service = types.metadata_service_v1beta1 types.tensorboard_service = types.tensorboard_service_v1beta1 @@ -120,6 +121,7 @@ types.prediction_service = types.prediction_service_v1 types.specialist_pool = types.specialist_pool_v1 types.specialist_pool_service = types.specialist_pool_service_v1 + types.study = types.study_v1 types.training_pipeline = types.training_pipeline_v1 __all__ = ( diff --git a/google/cloud/aiplatform/compat/types/__init__.py b/google/cloud/aiplatform/compat/types/__init__.py index f45bb2e11e..7bd512e7e8 100644 --- a/google/cloud/aiplatform/compat/types/__init__.py +++ b/google/cloud/aiplatform/compat/types/__init__.py @@ -49,6 +49,7 @@ prediction_service as prediction_service_v1beta1, specialist_pool as specialist_pool_v1beta1, specialist_pool_service as specialist_pool_service_v1beta1, + study as study_v1beta1, training_pipeline as training_pipeline_v1beta1, metadata_service as metadata_service_v1beta1, tensorboard_service as tensorboard_service_v1beta1, @@ -90,6 +91,7 @@ prediction_service as prediction_service_v1, specialist_pool as specialist_pool_v1, specialist_pool_service as specialist_pool_service_v1, + study as study_v1, training_pipeline as training_pipeline_v1, ) diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py new file mode 100644 index 0000000000..045e3f955e --- /dev/null +++ b/google/cloud/aiplatform/hyperparameter_tuning.py @@ -0,0 +1,152 @@ +import abc +from typing import Dict, List, Optional, Tuple, Union + +import proto + +from google.cloud.aiplatform.compat.types import study as gca_study_compat + +_scale_type_map = { + 'linear': gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE, + 'log': gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE, + 'reverse_log': gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_REVERSE_LOG_SCALE, +} + + +class _ParameterSpec(metaclass=abc.ABCMeta): + + def __init__( + self, + conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None, + parent_values: Optional[List[Union[float, int, str]]] = None): + + self.conditional_parameter_spec = conditional_parameter_spec + self.parent_values = parent_values + + @property + @classmethod + @abc.abstractmethod + def _proto_parameter_value_class(self) -> proto.Message: + pass + + @property + @classmethod + @abc.abstractmethod + def _parameter_value_map(self) -> Tuple[Tuple[str, str]]: + pass + + @property + @classmethod + @abc.abstractmethod + def _parameter_spec_value_key(self) -> Tuple[Tuple[str, str]]: + pass + + + @property + def _proto_parameter_value_spec(self) -> proto.Message: + proto_parameter_value_spec = self._proto_parameter_value_class() + for self_attr_key, proto_attr_key in self._parameter_value_map: + setattr(proto_parameter_value_spec, proto_attr_key, getattr(self, self_attr_key)) + return proto_parameter_value_spec + + + def _to_parameter_spec(self, parameter_id: str) -> gca_study_compat.StudySpec.ParameterSpec: + # TODO: Conditional parameters + parameter_spec = gca_study_compat.StudySpec.ParameterSpec( + parameter_id=parameter_id, + 
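+            # NOTE: 'scale' is only set by the numeric and discrete specs
+            # below; CategoricalValueSpec defines no 'scale' attribute, so
+            # this two-argument getattr would raise AttributeError for it.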
scale_type=_scale_type_map.get(getattr(self, 'scale')) + ) + + setattr(parameter_spec, self._parameter_spec_value_key, self._proto_parameter_value_spec) + + return parameter_spec + + +class DoubleParameterSpec(_ParameterSpec): + + _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.DoubleValueSpec + _parameter_value_map = (('min', 'min_value'), ('max', 'max_value')) + _parameter_spec_value_key = 'double_value_spec' + + def __init__( + self, + min: float, + max: float, + scale: str, + conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None, + parent_values: Optional[List[Union[float, int, str]]] = None + ): + + super().__init__( + conditional_parameter_spec=conditional_parameter_spec, + parent_values=parent_values) + + self.min = min + self.max = max + self.scale=scale + + +class IntegerParameterSpec(_ParameterSpec): + + _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.IntegerValueSpec + _parameter_value_map = (('min', 'min_value'), ('max', 'max_value')) + _parameter_spec_value_key = 'integer_value_spec' + + def __init__( + self, + min: int, + max: int, + scale: str, + conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None, + parent_values: Optional[List[Union[float, int, str]]] = None + ): + + super().__init__( + conditional_parameter_spec=conditional_parameter_spec, + parent_value=parent_values) + + self.min = min + self.max = max, + self.scale=scale + +class CategoricalValueSpec(_ParameterSpec): + + _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.CategoricalValueSpec + _parameter_value_map = (('values', 'values')) + _parameter_spec_value_key = 'categorical_value_spec' + + def __init__( + self, + values: List[str], + conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None, + parent_values: Optional[List[Union[float, int, str]]] = None + ): + + super().__init__( + conditional_parameter_spec=conditional_parameter_spec, + parent_value=parent_values) + + self.values = values + + +class DiscreteValueSpec(_ParameterSpec): + + _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.DiscreteValueSpec + _parameter_value_map = (('values', 'values')) + _parameter_spec_value_key = 'discrete_value_spec' + + def __init__( + self, + values: List[float], + scale: str, + conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None, + parent_values: Optional[List[Union[float, int, str]]] = None + ): + + super().__init__( + conditional_parameter_spec=conditional_parameter_spec, + parent_value=parent_values) + + self.values = values + self.scale = scale + + diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index 535e0fb9dc..aa06fc6370 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -18,6 +18,7 @@ from typing import Iterable, Optional, Union, Sequence, Dict, List import abc +import copy import sys import time import logging @@ -33,6 +34,7 @@ from google.cloud.aiplatform import compat from google.cloud.aiplatform import constants from google.cloud.aiplatform import initializer +from google.cloud.aiplatform import hyperparameter_tuning from google.cloud.aiplatform import utils from google.cloud.aiplatform.utils import source_utils from google.cloud.aiplatform.utils import worker_spec_utils @@ -50,6 +52,7 @@ hyperparameter_tuning_job as gca_hyperparameter_tuning_job_compat, machine_resources as gca_machine_resources_compat, machine_resources_v1beta1 as gca_machine_resources_v1beta1, + study as gca_study_compat ) 
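# A minimal usage sketch of the parameter spec helpers defined in
# hyperparameter_tuning.py above (assuming the PATCH 04 state of that
# module); the parameter ids, bounds, and scales here are illustrative only:
from google.cloud.aiplatform import hyperparameter_tuning as hpt

parameter_spec = {
    "learning_rate": hpt.DoubleParameterSpec(min=1e-4, max=1e-1, scale="log"),
    "batch_size": hpt.DiscreteValueSpec(values=[16, 32, 64], scale="linear"),
}

# Each helper serializes itself into a StudySpec.ParameterSpec proto; this is
# the same conversion HyperparameterTuningJob performs over a parameter dict:
parameter_protos = [
    spec._to_parameter_spec(parameter_id=param_id)
    for param_id, spec in parameter_spec.items()
]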
logging.basicConfig(level=logging.INFO, stream=sys.stdout) @@ -181,7 +184,7 @@ def _block_until_complete(self): previous_time = current_time time.sleep(wait) - _LOGGER.log_action_completed_against_resource("", "run", self) + _LOGGER.log_action_completed_against_resource("run", "completed", self) # Error is only populated when the job state is # JOB_STATE_FAILED or JOB_STATE_CANCELLED. @@ -903,7 +906,7 @@ def run( network: Optional[str] = None, timeout: Optional[int] = None, # seconds restart_job_on_worker_restart: bool=False, - sync: bool = True): + sync: bool = True) -> None: if service_account: self._gca_resource.service_account = service_account @@ -919,16 +922,24 @@ def run( restart_job_on_worker_restart=restart_job_on_worker_restart ) + _LOGGER.log_create_with_lro(self.__class__) + self._gca_resource = self.api_client.create_custom_job( parent=self._parent, custom_job=self._gca_resource ) + _LOGGER.log_create_complete(self.__class__, self._gca_resource, "custom_job") + + _LOGGER.info( + "View Custom Job:\n%s" % self._dashboard_uri() + ) + self._block_until_complete() @property - def worker_pool_specs(self): - return self._gca_resource.job_spec.worker_pool_specs + def job_spec(self): + return self._gca_resource.job_spec @@ -943,102 +954,122 @@ class DataLabelingJob(_Job): pass -# class HyperparameterTuningJob(_Job): -# _resource_noun = "hyperparameterTuningJobs" -# _getter_method = "get_hyperparameter_tuning_job" -# _list_method = "list_hyperparameter_tuning_jobs" -# _cancel_method = "cancel_hyperparameter_tuning_job" -# _delete_method = "delete_hyperparameter_tuning_job" +_search_algorithm_to_proto_value = { + 'random': gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH, + 'grid': gca_study_compat.StudySpec.Algorithm.GRID_SEARCH + +} + +_measurement_selection_to_proto_value = { + 'best': gca_study_compat.StudySpec.MeasurementSelectionType.BEST_MEASUREMENT, + 'last': gca_study_compat.StudySpec.MeasurementSelectionType.LAST_MEASUREMENT +} + +class HyperparameterTuningJob(_Job): + _resource_noun = "hyperparameterTuningJobs" + _getter_method = "get_hyperparameter_tuning_job" + _list_method = "list_hyperparameter_tuning_jobs" + _cancel_method = "cancel_hyperparameter_tuning_job" + _delete_method = "delete_hyperparameter_tuning_job" + _job_type = "training" -# def __init__(self, -# display_name: str, -# custom_job: CustomJob, -# metric_spec: Dict[str, str], -# parameter_spec: dict[str, hyperparameter_tuning_job.Parameter], -# max_trial_count: int, -# parallel_trial_count: int, -# max_failed_trials_count: int = 0, -# observation_noise: Optional[str] = 'low', -# algorithm: Optional[str] = 'random', -# measurement_selection: Optional[str] = 'best', -# # project: Optional[str] = None, -# # location: Optional[str] = None, -# # credentials: Optional[auth_credentials.Credentials] = None, -# encryption_spec_key_name: Optional[str] = None, -# # staging_bucket: Optional[str] = None -# ): -# base.AiPlatformResourceNounWithFutureManager.__init__(self, -# project=custom_job.project, -# location=custom_job.location, -# credentials=custom_job.credentials -# ) - -# metrics = [ -# gca_study.StudySpec.MetricSpec(metric_id=metric_id, goal=goal.upper()) -# for metric_id, goal in metric_spec_dict.items() -# ] - -# parameters = [ -# parameter.to_parameter_spec(parameter_id=parameter_id) -# for parameter_id, parameter in parameter_spec.items() -# ] - -# study_spec = gca_study_compat.StudySpec( -# metrics = metrics, -# parameters = p -# algorithm - - -# ) - -# self._gca_resource = 
gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob( -# display_name=display_name, -# study_spec=, -# max_trial_count=max_trial_count, -# parallel_trial_count=parallel_trial_count, -# max_failed_trial_count=max_failed_trial_count, -# trial_job_spec=custom_job.job_spec._gca_resource.copy(), -# encryption_spec= initializer.global_config.get_encryption_spec( -# encryption_spec_key_name = encryption_spec_key_name -# ) -# ) - -# @staticmethod -# def _convert_metric_spec_dict_to_metric_spec( -# metric_spec_dict: Dict[str, str]) -> gca_study_compat.StudySpec.MetricSpec: -# return - - -# @base.optional_sync() -# def run( -# self, -# service_account: Optional[str] = None, -# network: Optional[str] = None, -# timeout: Optional[int] = None, # seconds -# restart_job_on_worker_restart: bool=False, -# sync: bool = True): - -# if service_account: -# self._gca_resource.trial_job_spec.service_account = service_account - -# if network: -# self._gca_resource.trial_job_spec.network = network - - -# if timeout or restart_job_on_worker_restart: -# timout = duration_pb2.Duration(seconds=timout) if timeout else None -# self._gca_resource.trial_job_spec.scheduling = gca_custom_job_compat.Scheduling( -# timeout=timeout, -# restart_job_on_worker_restart=restart_job_on_worker_restart -# ) - -# self._gca_resource = self.api_client.create_hyperparameter_tuning_job( -# parent=self._parent, -# hyperparameter_tuning_job=self._gca_resource -# ) - -# self._block_until_complete() + def __init__(self, + display_name: str, + custom_job: CustomJob, + metric_spec: Dict[str, str], + parameter_spec: Dict[str, hyperparameter_tuning._ParameterSpec], + max_trial_count: int, + parallel_trial_count: int, + max_failed_trial_count: int = 0, + search_algorithm: Optional[str] = 'random', + # observation_noise: Optional[str] = 'low', + measurement_selection: Optional[str] = 'best', + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[auth_credentials.Credentials] = None, + encryption_spec_key_name: Optional[str] = None, + # staging_bucket: Optional[str] = None + ): + base.AiPlatformResourceNounWithFutureManager.__init__(self, + project=project, + location=location, + credentials=credentials, + ) + + self._parent = aiplatform.initializer.global_config.common_location_path( + project=project, + location=location + ) + + metrics = [ + gca_study_compat.StudySpec.MetricSpec(metric_id=metric_id, goal=goal.upper()) + for metric_id, goal in metric_spec.items() + ] + + parameters = [ + parameter._to_parameter_spec(parameter_id=parameter_id) + for parameter_id, parameter in parameter_spec.items() + ] + + study_spec = gca_study_compat.StudySpec( + metrics = metrics, + parameters = parameters, + algorithm = _search_algorithm_to_proto_value[search_algorithm], + # observation_noise = observation_noise.upper(), + measurement_selection_type = _measurement_selection_to_proto_value[measurement_selection] + ) + + self._gca_resource = gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob( + display_name=display_name, + study_spec=study_spec, + max_trial_count=max_trial_count, + parallel_trial_count=parallel_trial_count, + max_failed_trial_count=max_failed_trial_count, + trial_job_spec=copy.deepcopy(custom_job.job_spec), + encryption_spec= initializer.global_config.get_encryption_spec( + encryption_spec_key_name = encryption_spec_key_name + ) + ) + + @base.optional_sync() + def run( + self, + service_account: Optional[str] = None, + network: Optional[str] = None, + timeout: Optional[int] = None, # seconds + 
restart_job_on_worker_restart: bool=False,
+        sync: bool = True) -> None:
+
+        if service_account:
+            self._gca_resource.trial_job_spec.service_account = service_account
+
+        if network:
+            self._gca_resource.trial_job_spec.network = network
+
+
+        if timeout or restart_job_on_worker_restart:
+            timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
+            self._gca_resource.trial_job_spec.scheduling = gca_custom_job_compat.Scheduling(
+                timeout=timeout,
+                restart_job_on_worker_restart=restart_job_on_worker_restart
+            )
+
+        _LOGGER.log_create_with_lro(self.__class__)
+
+        self._gca_resource = self.api_client.create_hyperparameter_tuning_job(
+            parent=self._parent,
+            hyperparameter_tuning_job=self._gca_resource
+        )
+
+        _LOGGER.log_create_complete(self.__class__, self._gca_resource, "hpt_job")
+
+        _LOGGER.info(
+            "View HyperparameterTuningJob:\n%s" % self._dashboard_uri()
+        )
+
+
+        self._block_until_complete()

From b5cc6e59c6998b1bc6cf507949a8b1871eb7f735 Mon Sep 17 00:00:00 2001
From: Sasha Sobran
Date: Fri, 14 May 2021 11:55:21 -0400
Subject: [PATCH 05/29] chore: update test imports

---
 tests/unit/aiplatform/test_training_jobs.py | 88 ++++++++++-----------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/tests/unit/aiplatform/test_training_jobs.py b/tests/unit/aiplatform/test_training_jobs.py
index 8f783edc15..04c5d609d5 100644
--- a/tests/unit/aiplatform/test_training_jobs.py
+++ b/tests/unit/aiplatform/test_training_jobs.py
@@ -30,9 +30,9 @@
 
 from google.auth import credentials as auth_credentials
 
-import google.cloud.aiplatform.utils
-import google.cloud.aiplatform.utils.source_utils
-import google.cloud.aiplatform.utils.worker_spec_utils
+from google.cloud.aiplatform import utils
+from utils import source_utils
+from utils import worker_spec_utils
 
 from google.cloud import aiplatform
 from google.cloud.aiplatform import datasets
@@ -237,7 +237,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_bucket(
 
         mock_client_bucket, mock_blob = mock_client_bucket
 
-        gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs(
+        gcs_path = utils._timestamped_copy_to_gcs(
             local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH,
             gcs_dir=_TEST_BUCKET_NAME,
             project=_TEST_PROJECT,
@@ -264,7 +264,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_gcs_path(
 
         mock_client_bucket, mock_blob = mock_client_bucket
 
-        gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs(
+        gcs_path = utils._timestamped_copy_to_gcs(
             local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH,
             gcs_dir=_TEST_GCS_PATH_WITH_TRAILING_SLASH,
             project=_TEST_PROJECT,
@@ -292,7 +292,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_trailing_slash(
 
         mock_client_bucket, mock_blob = mock_client_bucket
 
-        gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs(
+        gcs_path = utils._timestamped_copy_to_gcs(
             local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH,
             gcs_dir=_TEST_GCS_PATH,
             project=_TEST_PROJECT,
@@ -318,7 +318,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client(self, mock_client_bucket):
 
         mock_client_bucket, mock_blob = mock_client_bucket
 
-        gcs_path = google.cloud.aiplatform.utils._timestamped_copy_to_gcs(
+        gcs_path = utils._timestamped_copy_to_gcs(
            local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH,
            gcs_dir=_TEST_BUCKET_NAME,
            project=_TEST_PROJECT,
@@ -335,10 +335,10 @@ def test_timestamp_copy_to_gcs_calls_gcs_client(self, mock_client_bucket):
 
     def test_get_python_executable_raises_if_None(self):
         with patch.object(sys, "executable", new=None):
             with pytest.raises(EnvironmentError):
-
google.cloud.aiplatform.utils.source_utils._get_python_executable() + source_utils._get_python_executable() def test_get_python_executable_returns_python_executable(self): - assert "python" in google.cloud.aiplatform.utils.source_utils._get_python_executable().lower() + assert "python" in source_utils._get_python_executable().lower() class TestTrainingScriptPythonPackager: @@ -350,7 +350,7 @@ def setup_method(self): def teardown_method(self): pathlib.Path(_TEST_LOCAL_SCRIPT_FILE_NAME).unlink() - python_package_file = f"{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}.tar.gz" + python_package_file = f"{source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}.tar.gz" if pathlib.Path(python_package_file).is_file(): pathlib.Path(python_package_file).unlink() subprocess.check_output( @@ -358,34 +358,34 @@ def teardown_method(self): "pip3", "uninstall", "-y", - google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._ROOT_MODULE, + source_utils._TrainingScriptPythonPackager._ROOT_MODULE, ] ) def test_packager_creates_and_copies_python_package(self): - tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) tsp.package_and_copy(copy_method=local_copy_method) assert pathlib.Path( f"{tsp._ROOT_MODULE}-{tsp._SETUP_PY_VERSION}.tar.gz" ).is_file() def test_created_package_module_is_installable_and_can_be_run(self): - tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) source_dist_path = tsp.package_and_copy(copy_method=local_copy_method) subprocess.check_output(["pip3", "install", source_dist_path]) module_output = subprocess.check_output( - [google.cloud.aiplatform.utils.source_utils._get_python_executable(), "-m", tsp.module_name] + [source_utils._get_python_executable(), "-m", tsp.module_name] ) assert "hello world" in module_output.decode() def test_requirements_are_in_package(self): - tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager( + tsp = source_utils._TrainingScriptPythonPackager( _TEST_LOCAL_SCRIPT_FILE_NAME, requirements=_TEST_REQUIREMENTS ) source_dist_path = tsp.package_and_copy(copy_method=local_copy_method) with tarfile.open(source_dist_path) as tf: with tempfile.TemporaryDirectory() as tmpdirname: - setup_py_path = f"{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}/setup.py" + setup_py_path = f"{source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}/setup.py" tf.extract(setup_py_path, path=tmpdirname) setup_py = core.run_setup( pathlib.Path(tmpdirname, setup_py_path), stop_after="init" @@ -398,7 +398,7 @@ def test_packaging_fails_whith_RuntimeError(self): mock_subprocess.communicate.return_value = (b"", b"") mock_subprocess.returncode = 1 mock_popen.return_value = mock_subprocess - tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager( + tsp = source_utils._TrainingScriptPythonPackager( _TEST_LOCAL_SCRIPT_FILE_NAME ) with pytest.raises(RuntimeError): @@ -407,7 
+407,7 @@ def test_packaging_fails_whith_RuntimeError(self): def test_package_and_copy_to_gcs_copies_to_gcs(self, mock_client_bucket): mock_client_bucket, mock_blob = mock_client_bucket - tsp = google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) gcs_path = tsp.package_and_copy_to_gcs( gcs_staging_dir=_TEST_BUCKET_NAME, project=_TEST_PROJECT @@ -515,7 +515,7 @@ def mock_model_service_get(): @pytest.fixture def mock_python_package_to_gcs(): with mock.patch.object( - google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager, "package_and_copy_to_gcs" + source_utils._TrainingScriptPythonPackager, "package_and_copy_to_gcs" ) as mock_package_to_copy_gcs: mock_package_to_copy_gcs.return_value = _TEST_OUTPUT_PYTHON_PACKAGE_PATH yield mock_package_to_copy_gcs @@ -641,7 +641,7 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, + "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -800,7 +800,7 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, + "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1075,7 +1075,7 @@ def test_run_call_pipeline_service_create_with_no_dataset( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, + "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1327,7 +1327,7 @@ def test_run_call_pipeline_service_create_distributed_training( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, + "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1342,7 +1342,7 @@ def test_run_call_pipeline_service_create_distributed_training( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, + "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, @@ -1555,7 +1555,7 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( }, "pythonPackageSpec": { "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, + "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, @@ -2692,7 +2692,7 @@ def 
test_run_call_pipeline_service_create_with_nontabular_dataset_raises_if_anno class Test_MachineSpec: def test_machine_spec_return_spec_dict(self): - test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2711,7 +2711,7 @@ def test_machine_spec_return_spec_dict(self): assert test_spec.spec_dict == true_spec_dict def test_machine_spec_return_spec_dict_with_no_accelerator(self): - test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=0, @@ -2726,7 +2726,7 @@ def test_machine_spec_return_spec_dict_with_no_accelerator(self): assert test_spec.spec_dict == true_spec_dict def test_machine_spec_spec_dict_raises_invalid_accelerator(self): - test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2737,7 +2737,7 @@ def test_machine_spec_spec_dict_raises_invalid_accelerator(self): test_spec.spec_dict def test_machine_spec_spec_dict_is_empty(self): - test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=0, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2747,7 +2747,7 @@ def test_machine_spec_spec_dict_is_empty(self): assert test_spec.is_empty def test_machine_spec_spec_dict_is_not_empty(self): - test_spec = google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2760,26 +2760,26 @@ def test_machine_spec_spec_dict_is_not_empty(self): class Test_DistributedTrainingSpec: def test_machine_spec_returns_pool_spec(self): - spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec( - chief_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + spec = worker_spec_utils._DistributedTrainingSpec( + chief_spec=worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + worker_spec=worker_spec_utils._MachineSpec( replica_count=10, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - parameter_server_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + parameter_server_spec=worker_spec_utils._MachineSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + evaluator_spec=worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2826,7 +2826,7 @@ def test_machine_spec_returns_pool_spec(self): def test_chief_worker_pool_returns_spec(self): - chief_worker_spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( + chief_worker_spec = 
worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=10, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2856,7 +2856,7 @@ def test_chief_worker_pool_returns_spec(self): def test_chief_worker_pool_returns_just_chief(self): - chief_worker_spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( + chief_worker_spec = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2878,8 +2878,8 @@ def test_chief_worker_pool_returns_just_chief(self): def test_machine_spec_raise_with_more_than_one_chief_replica(self): - spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec( - chief_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + spec = worker_spec_utils._DistributedTrainingSpec( + chief_spec=worker_spec_utils._MachineSpec( replica_count=2, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2892,21 +2892,21 @@ def test_machine_spec_raise_with_more_than_one_chief_replica(self): def test_machine_spec_handles_missing_pools(self): - spec = google.cloud.aiplatform.utils.worker_spec_utils._DistributedTrainingSpec( - chief_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + spec = worker_spec_utils._DistributedTrainingSpec( + chief_spec=worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec(replica_count=0), - parameter_server_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec( + worker_spec=worker_spec_utils._MachineSpec(replica_count=0), + parameter_server_spec=worker_spec_utils._MachineSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=google.cloud.aiplatform.utils.worker_spec_utils._MachineSpec(replica_count=0), + evaluator_spec=worker_spec_utils._MachineSpec(replica_count=0), ) true_pool_spec = [ From 4a1b0cae263bbda951b4b04d944e3dba9922e223 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 12:05:52 -0400 Subject: [PATCH 06/29] fix: remove added __init__ files --- google/__init__.py | 0 google/cloud/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 google/__init__.py delete mode 100644 google/cloud/__init__.py diff --git a/google/__init__.py b/google/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/google/cloud/__init__.py b/google/cloud/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 From 10f4f80dc29b03db318bbd217c604e0187d3e45e Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 12:15:36 -0400 Subject: [PATCH 07/29] chore: update test imports --- tests/unit/aiplatform/test_training_jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/aiplatform/test_training_jobs.py b/tests/unit/aiplatform/test_training_jobs.py index 04c5d609d5..b4745d29f1 100644 --- a/tests/unit/aiplatform/test_training_jobs.py +++ b/tests/unit/aiplatform/test_training_jobs.py @@ -31,8 +31,8 @@ from google.auth import credentials as auth_credentials from google.cloud.aiplatform import utils -from utils import source_utils -from utils import worker_spec_utils +from google.cloud.aiplatform.utils 
import source_utils +from google.cloud.aiplatform.utils import worker_spec_utils from google.cloud import aiplatform from google.cloud.aiplatform import datasets From d61079c49189cb10f0317a0e2c9d3f4a723301fc Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 13:07:10 -0400 Subject: [PATCH 08/29] feat: add hp tuning metric reporter to training utils --- google/cloud/aiplatform/__init__.py | 2 + google/cloud/aiplatform/training_utils.py | 88 ++++++++++++++++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 36edec3960..99f7df09d3 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -41,6 +41,7 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) +from google.cloud.aiplatform import training_utils """ Usage: @@ -67,6 +68,7 @@ "get_experiment_df", "get_pipeline_df", "start_run", + "training_utils", "AutoMLImageTrainingJob", "AutoMLTabularTrainingJob", "AutoMLForecastingTrainingJob", diff --git a/google/cloud/aiplatform/training_utils.py b/google/cloud/aiplatform/training_utils.py index fea60c5005..71645a8446 100644 --- a/google/cloud/aiplatform/training_utils.py +++ b/google/cloud/aiplatform/training_utils.py @@ -15,9 +15,10 @@ # limitations under the License. # +import collections import json import os - +import time from typing import Dict, Optional @@ -103,3 +104,88 @@ def tf_config(self) -> Optional[Dict]: return json.loads(tf_config_env) else: return None + + +_DEFAULT_HYPERPARAMETER_METRIC_TAG = 'training/hptuning/metric' +_DEFAULT_METRIC_PATH = '/tmp/hypertune/output.metrics' +# TODO(0olwzo0): consider to make it configurable +_MAX_NUM_METRIC_ENTRIES_TO_PRESERVE = 100 + + +class _HyperparameterTuningJobReporterSingleton: + """Main class for HyperTune.""" + + initialized = False + + @classmethod + def initialize(cls): + if cls.initialized: + return + + cls.metric_path = os.environ.get('CLOUD_ML_HP_METRIC_FILE', + _DEFAULT_METRIC_PATH) + if not os.path.exists(os.path.dirname(cls.metric_path)): + os.makedirs(os.path.dirname(cls.metric_path)) + + cls.trial_id = os.environ.get('CLOUD_ML_TRIAL_ID', 0) + cls.metrics_queue = collections.deque( + maxlen=_MAX_NUM_METRIC_ENTRIES_TO_PRESERVE) + + cls.initialized = True + + @classmethod + def _dump_metrics_to_file(cls): + with open(cls.metric_path, 'w') as metric_file: + for metric in cls.metrics_queue: + metric_file.write(json.dumps(metric, sort_keys=True) + '\n') + + @classmethod + def report_hyperparameter_tuning_metric(cls, + hyperparameter_metric_tag, + metric_value, + global_step=None, + checkpoint_path=''): + """Method to report hyperparameter tuning metric. + Args: + hyperparameter_metric_tag: The hyperparameter metric name this metric + value is associated with. Should keep consistent with the tag + specified in HyperparameterSpec. + metric_value: float, the values for the hyperparameter metric to report. + global_step: int, the global step this metric value is associated with. + checkpoint_path: The checkpoint path which can be used to warmstart from. 
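+
+        Example:
+            A minimal sketch; the metric tag and values are illustrative:
+
+                _HyperparameterTuningJobReporterSingleton.initialize()
+                _HyperparameterTuningJobReporterSingleton.report_hyperparameter_tuning_metric(
+                    hyperparameter_metric_tag='accuracy',
+                    metric_value=0.95,
+                    global_step=1000)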
+ """ + metric_value = float(metric_value) + metric_tag = _DEFAULT_HYPERPARAMETER_METRIC_TAG + if hyperparameter_metric_tag: + metric_tag = hyperparameter_metric_tag + metric_body = { + 'timestamp': time.time(), + 'trial': str(cls.trial_id), + metric_tag: str(metric_value), + 'global_step': str(int(global_step) if global_step else 0), + 'checkpoint_path': checkpoint_path + } + cls.metrics_queue.append(metric_body) + cls._dump_metrics_to_file() + + +def report_hyperparameter_tuning_metric( + metrics: Dict[str, float], + global_step: Optional[int] = None, + checkpoint_path='' + ): + _HyperparameterTuningJobReporterSingleton.initialize() + + for hyperparameter_metric_tag, metric_value in metrics.items(): + _HyperparameterTuningJobReporterSingleton.report_hyperparameter_tuning_metric( + hyperparameter_metric_tag=hyperparameter_metric_tag, + metric_value=metric_value, + global_step=global_step, + checkpoint_path=checkpoint_path + ) + + + + + + From 455944dc80812ed31669262a9e1e38b71dc64da1 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 13:11:08 -0400 Subject: [PATCH 09/29] chore: make plural --- google/cloud/aiplatform/training_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/aiplatform/training_utils.py b/google/cloud/aiplatform/training_utils.py index 71645a8446..48af524e3f 100644 --- a/google/cloud/aiplatform/training_utils.py +++ b/google/cloud/aiplatform/training_utils.py @@ -169,7 +169,7 @@ def report_hyperparameter_tuning_metric(cls, cls._dump_metrics_to_file() -def report_hyperparameter_tuning_metric( +def report_hyperparameter_tuning_metrics( metrics: Dict[str, float], global_step: Optional[int] = None, checkpoint_path='' From a8e0da693a099b49815037442d54cbdced53ee5b Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 16:07:48 -0400 Subject: [PATCH 10/29] feat: added trials property, refactored job classes, updating training pipelines to use snake case instead of camel case to be consistent with custom job --- google/cloud/aiplatform/__init__.py | 9 +- google/cloud/aiplatform/base.py | 5 + .../cloud/aiplatform/hyperparameter_tuning.py | 121 +++-- google/cloud/aiplatform/jobs.py | 248 +++++---- google/cloud/aiplatform/training_jobs.py | 60 +-- google/cloud/aiplatform/training_utils.py | 68 ++- google/cloud/aiplatform/utils/__init__.py | 2 +- google/cloud/aiplatform/utils/source_utils.py | 2 +- .../aiplatform/utils/worker_spec_utils.py | 6 +- tests/unit/aiplatform/test_end_to_end.py | 46 +- tests/unit/aiplatform/test_training_jobs.py | 504 ++++++++++-------- 11 files changed, 585 insertions(+), 486 deletions(-) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 99f7df09d3..32ec7538e0 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -30,7 +30,11 @@ from google.cloud.aiplatform.metadata import metadata from google.cloud.aiplatform.models import Endpoint from google.cloud.aiplatform.models import Model -from google.cloud.aiplatform.jobs import BatchPredictionJob, CustomJob, HyperparameterTuningJob +from google.cloud.aiplatform.jobs import ( + BatchPredictionJob, + CustomJob, + HyperparameterTuningJob, +) from google.cloud.aiplatform.training_jobs import ( CustomTrainingJob, CustomContainerTrainingJob, @@ -62,8 +66,7 @@ "explain", "gapic", "init", - "hyperparameter_tuning" - "log_params", + "hyperparameter_tuning" "log_params", "log_metrics", "get_experiment_df", "get_pipeline_df", diff --git a/google/cloud/aiplatform/base.py 
index f46db9c47e..b42b630f75 100644
--- a/google/cloud/aiplatform/base.py
+++ b/google/cloud/aiplatform/base.py
@@ -543,6 +543,11 @@ def update_time(self) -> datetime.datetime:
         self._sync_gca_resource()
         return self._gca_resource.update_time
 
+    @property
+    def gca_resource(self) -> proto.Message:
+        """The underlying resource proto representation."""
+        return self._gca_resource
+
     def __repr__(self) -> str:
         return f"{object.__repr__(self)} \nresource name: {self.resource_name}"
 
diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py
index 045e3f955e..5a2ef35fda 100644
--- a/google/cloud/aiplatform/hyperparameter_tuning.py
+++ b/google/cloud/aiplatform/hyperparameter_tuning.py
@@ -6,18 +6,19 @@
 from google.cloud.aiplatform.compat.types import study as gca_study_compat
 
 _scale_type_map = {
-    'linear': gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
-    'log': gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE,
-    'reverse_log': gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_REVERSE_LOG_SCALE,
+    "linear": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
+    "log": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE,
+    "reverse_log": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_REVERSE_LOG_SCALE,
+    "unspecified": gca_study_compat.StudySpec.ParameterSpec.ScaleType.SCALE_TYPE_UNSPECIFIED,
 }
 
 
 class _ParameterSpec(metaclass=abc.ABCMeta):
-
     def __init__(
         self,
-        conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None):
+        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        parent_values: Optional[List[Union[float, int, str]]] = None,
+    ):
 
         self.conditional_parameter_spec = conditional_parameter_spec
         self.parent_values = parent_values
@@ -39,114 +40,131 @@ def _parameter_value_map(self) -> Tuple[Tuple[str, str]]:
     @abc.abstractmethod
     def _parameter_spec_value_key(self) -> Tuple[Tuple[str, str]]:
         pass
-
-    @property
+    @property
     def _proto_parameter_value_spec(self) -> proto.Message:
         proto_parameter_value_spec = self._proto_parameter_value_class()
         for self_attr_key, proto_attr_key in self._parameter_value_map:
-            setattr(proto_parameter_value_spec, proto_attr_key, getattr(self, self_attr_key))
+            setattr(
+                proto_parameter_value_spec, proto_attr_key, getattr(self, self_attr_key)
+            )
         return proto_parameter_value_spec
-
-    def _to_parameter_spec(self, parameter_id: str) -> gca_study_compat.StudySpec.ParameterSpec:
+    def _to_parameter_spec(
+        self, parameter_id: str
+    ) -> gca_study_compat.StudySpec.ParameterSpec:
         # TODO: Conditional parameters
         parameter_spec = gca_study_compat.StudySpec.ParameterSpec(
-            parameter_id=parameter_id,
-            scale_type=_scale_type_map.get(getattr(self, 'scale'))
-        )
+            parameter_id=parameter_id,
+            scale_type=_scale_type_map.get(getattr(self, "scale", "unspecified")),
+        )
 
-        setattr(parameter_spec, self._parameter_spec_value_key, self._proto_parameter_value_spec)
+        setattr(
+            parameter_spec,
+            self._parameter_spec_value_key,
+            self._proto_parameter_value_spec,
+        )
 
         return parameter_spec
 
 
 class DoubleParameterSpec(_ParameterSpec):
 
-    _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.DoubleValueSpec
-    _parameter_value_map = (('min', 'min_value'), ('max', 'max_value'))
-    _parameter_spec_value_key = 'double_value_spec'
-
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.DoubleValueSpec
+    )
+    _parameter_value_map = (("min", "min_value"), ("max", "max_value"))
+    _parameter_spec_value_key = "double_value_spec"
+
     def __init__(
         self,
         min: float,
         max: float,
         scale: str,
-        conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None
-    ):
+        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        parent_values: Optional[List[Union[float, int, str]]] = None,
+    ):
 
         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
-            parent_values=parent_values)
+            parent_values=parent_values,
+        )
 
         self.min = min
         self.max = max
-        self.scale=scale
+        self.scale = scale
 
 
 class IntegerParameterSpec(_ParameterSpec):
-
-    _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.IntegerValueSpec
-    _parameter_value_map = (('min', 'min_value'), ('max', 'max_value'))
-    _parameter_spec_value_key = 'integer_value_spec'
+
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.IntegerValueSpec
+    )
+    _parameter_value_map = (("min", "min_value"), ("max", "max_value"))
+    _parameter_spec_value_key = "integer_value_spec"
 
     def __init__(
         self,
         min: int,
         max: int,
         scale: str,
-        conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None
-    ):
+        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        parent_values: Optional[List[Union[float, int, str]]] = None,
+    ):
 
         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
-            parent_value=parent_values)
+            parent_values=parent_values,
+        )
 
         self.min = min
-        self.max = max,
-        self.scale=scale
+        self.max = max
+        self.scale = scale
+
 
 class CategoricalValueSpec(_ParameterSpec):
 
-    _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.CategoricalValueSpec
-    _parameter_value_map = (('values', 'values'))
-    _parameter_spec_value_key = 'categorical_value_spec'
-
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.CategoricalValueSpec
+    )
+    _parameter_value_map = (("values", "values"),)
+    _parameter_spec_value_key = "categorical_value_spec"
+
    def __init__(
         self,
         values: List[str],
-        conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None
-    ):
+        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        parent_values: Optional[List[Union[float, int, str]]] = None,
+    ):
 
         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
-            parent_value=parent_values)
+            parent_values=parent_values,
+        )
 
         self.values = values
 
 
 class DiscreteValueSpec(_ParameterSpec):
 
-    _proto_parameter_value_class = gca_study_compat.StudySpec.ParameterSpec.DiscreteValueSpec
-    _parameter_value_map = (('values', 'values'))
-    _parameter_spec_value_key = 'discrete_value_spec'
-
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.DiscreteValueSpec
+    )
+    _parameter_value_map = (("values", "values"),)
+    _parameter_spec_value_key = "discrete_value_spec"
+
     def __init__(
         self,
         values: List[float],
         scale: str,
-        conditional_parameter_spec: Optional[Dict[str, '_Parameter']] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None
-    ):
+        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        parent_values: Optional[List[Union[float, int, str]]] = None,
+    ):
 
         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
-            parent_value=parent_values)
+            parent_values=parent_values,
+        )
 
         self.values = values
         self.scale = scale
-
-
diff --git a/google/cloud/aiplatform/jobs.py
b/google/cloud/aiplatform/jobs.py index aa06fc6370..25f065f502 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -36,8 +36,8 @@ from google.cloud.aiplatform import initializer from google.cloud.aiplatform import hyperparameter_tuning from google.cloud.aiplatform import utils -from google.cloud.aiplatform.utils import source_utils -from google.cloud.aiplatform.utils import worker_spec_utils +from google.cloud.aiplatform.utils import source_utils +from google.cloud.aiplatform.utils import worker_spec_utils from google.cloud.aiplatform.compat.services import job_service_client from google.cloud.aiplatform.compat.types import ( @@ -52,7 +52,7 @@ hyperparameter_tuning_job as gca_hyperparameter_tuning_job_compat, machine_resources as gca_machine_resources_compat, machine_resources_v1beta1 as gca_machine_resources_v1beta1, - study as gca_study_compat + study as gca_study_compat, ) logging.basicConfig(level=logging.INFO, stream=sys.stdout) @@ -776,7 +776,76 @@ def iter_outputs( ) -class CustomJob(_Job): +class _RunnableJob(_Job): + def __init__( + self, + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[auth_credentials.Credentials] = None, + ): + + base.AiPlatformResourceNounWithFutureManager.__init__( + self, project=project, location=location, credentials=credentials + ) + + self._parent = aiplatform.initializer.global_config.common_location_path( + project=project, location=location + ) + + @abc.abstractmethod + def run(self) -> None: + pass + + @property + def _has_run(self) -> bool: + return bool(getattr(self._gca_resource, "name")) + + @property + def state(self) -> gca_job_state.JobState: + if not self._has_run: + raise RuntimeError("Job has not run. No state available.") + + return super().state + + @classmethod + def get( + cls, + resource_name: str, + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[auth_credentials.Credentials] = None, + ) -> "_RunnableJob": + """Get an AI Platform Job for the given resource_name. + + Args: + resource_name (str): + Required. A fully-qualified resource name or ID. + project (str): + Optional project to retrieve dataset from. If not set, project + set in aiplatform.init will be used. + location (str): + Optional location to retrieve dataset from. If not set, location + set in aiplatform.init will be used. + credentials (auth_credentials.Credentials): + Custom credentials to use to upload this model. Overrides + credentials set in aiplatform.init. + + Returns: + An AI Platform Job. 
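+
+        Example:
+            A sketch of retrieving an existing job; the resource name below
+            is illustrative:
+
+                job = CustomJob.get(
+                    "projects/my-project/locations/us-central1/customJobs/12345"
+                )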
+ """ + self = cls._empty_constructor( + project=project, + location=location, + credentials=credentials, + resource_name=resource_name, + ) + + self._gca_resource = self._get_gca_resource(resource_name=resource_name) + + return self + + +class CustomJob(_RunnableJob): _resource_noun = "customJobs" _getter_method = "get_custom_job" _list_method = "list_custom_job" @@ -785,25 +854,18 @@ class CustomJob(_Job): _job_type = "training" pass - def __init__(self, + def __init__( + self, display_name: str, worker_pool_specs: Union[Dict], project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, encryption_spec_key_name: Optional[str] = None, - staging_bucket: Optional[str] = None): - - base.AiPlatformResourceNounWithFutureManager.__init__(self, - project=project, - location=location, - credentials=credentials - ) + staging_bucket: Optional[str] = None, + ): - self._parent = aiplatform.initializer.global_config.common_location_path( - project=project, - location=location - ) + super().__init__(project=project, location=location, credentials=credentials) staging_bucket = staging_bucket or initializer.global_config.staging_bucket @@ -813,18 +875,19 @@ def __init__(self, "should be set using aiplatform.init(staging_bucket='gs://my-bucket')" ) - self._gca_resource = gca_custom_job_compat.CustomJob( + self._gca_resource = gca_custom_job_compat.CustomJob( display_name=display_name, - job_spec = gca_custom_job_compat.CustomJobSpec( + job_spec=gca_custom_job_compat.CustomJobSpec( worker_pool_specs=worker_pool_specs, - base_output_directory=gca_io_compat.GcsDestination(output_uri_prefix=staging_bucket), + base_output_directory=gca_io_compat.GcsDestination( + output_uri_prefix=staging_bucket ), - encryption_spec= initializer.global_config.get_encryption_spec( + ), + encryption_spec=initializer.global_config.get_encryption_spec( encryption_spec_key_name=encryption_spec_key_name - ) + ), ) - @classmethod def from_local_script( cls, @@ -840,10 +903,10 @@ def from_local_script( accelerator_count: int = 0, project: Optional[str] = None, location: Optional[str] = None, - staging_bucket: Optional[str]= None, + staging_bucket: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, encryption_spec_key_name: Optional[str] = None, - ) -> 'CustomJob': + ) -> "CustomJob": project = project or initializer.global_config.project location = location or initializer.global_config.location @@ -862,15 +925,12 @@ def from_local_script( accelerator_type=accelerator_type, ).pool_specs - python_packager = source_utils._TrainingScriptPythonPackager( - script_path=script_path, requirements=requirements - ) + script_path=script_path, requirements=requirements + ) package_gcs_uri = python_packager.package_and_copy_to_gcs( - gcs_staging_dir = staging_bucket, - project = project, - credentials = credentials, + gcs_staging_dir=staging_bucket, project=project, credentials=credentials, ) for spec in worker_pool_specs: @@ -896,17 +956,18 @@ def from_local_script( location=location, credentials=credentials, encryption_spec_key_name=encryption_spec_key_name, - staging_bucket=staging_bucket) - + staging_bucket=staging_bucket, + ) @base.optional_sync() def run( self, service_account: Optional[str] = None, network: Optional[str] = None, - timeout: Optional[int] = None, # seconds - restart_job_on_worker_restart: bool=False, - sync: bool = True) -> None: + timeout: Optional[int] = None, # seconds + restart_job_on_worker_restart: bool = False, + sync: bool = 
True, + ) -> None: if service_account: self._gca_resource.service_account = service_account @@ -914,34 +975,28 @@ def run( if network: self._gca_resource.network = network - if timeout or restart_job_on_worker_restart: timout = duration_pb2.Duration(seconds=timout) if timeout else None self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling( - timeout=timeout, - restart_job_on_worker_restart=restart_job_on_worker_restart - ) + timeout=timeout, + restart_job_on_worker_restart=restart_job_on_worker_restart, + ) _LOGGER.log_create_with_lro(self.__class__) self._gca_resource = self.api_client.create_custom_job( - parent=self._parent, custom_job=self._gca_resource - ) + parent=self._parent, custom_job=self._gca_resource + ) _LOGGER.log_create_complete(self.__class__, self._gca_resource, "custom_job") - _LOGGER.info( - "View Custom Job:\n%s" % self._dashboard_uri() - ) + _LOGGER.info("View Custom Job:\n%s" % self._dashboard_uri()) self._block_until_complete() - @property def job_spec(self): return self._gca_resource.job_spec - - class DataLabelingJob(_Job): @@ -954,27 +1009,27 @@ class DataLabelingJob(_Job): pass -_search_algorithm_to_proto_value = { - 'random': gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH, - 'grid': gca_study_compat.StudySpec.Algorithm.GRID_SEARCH - +_SEARCH_ALGORITHM_TO_PROTO_VALUE = { + "random": gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH, + "grid": gca_study_compat.StudySpec.Algorithm.GRID_SEARCH, } -_measurement_selection_to_proto_value = { - 'best': gca_study_compat.StudySpec.MeasurementSelectionType.BEST_MEASUREMENT, - 'last': gca_study_compat.StudySpec.MeasurementSelectionType.LAST_MEASUREMENT +_MEASUREMENT_SELECTION_TO_PROTO_VALUE = { + "best": gca_study_compat.StudySpec.MeasurementSelectionType.BEST_MEASUREMENT, + "last": gca_study_compat.StudySpec.MeasurementSelectionType.LAST_MEASUREMENT, } -class HyperparameterTuningJob(_Job): + +class HyperparameterTuningJob(_RunnableJob): _resource_noun = "hyperparameterTuningJobs" _getter_method = "get_hyperparameter_tuning_job" _list_method = "list_hyperparameter_tuning_jobs" _cancel_method = "cancel_hyperparameter_tuning_job" _delete_method = "delete_hyperparameter_tuning_job" _job_type = "training" - - def __init__(self, + def __init__( + self, display_name: str, custom_job: CustomJob, metric_spec: Dict[str, str], @@ -982,28 +1037,19 @@ def __init__(self, max_trial_count: int, parallel_trial_count: int, max_failed_trial_count: int = 0, - search_algorithm: Optional[str] = 'random', - # observation_noise: Optional[str] = 'low', - measurement_selection: Optional[str] = 'best', + search_algorithm: Optional[str] = "random", + measurement_selection: Optional[str] = "best", project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, encryption_spec_key_name: Optional[str] = None, - # staging_bucket: Optional[str] = None ): - base.AiPlatformResourceNounWithFutureManager.__init__(self, - project=project, - location=location, - credentials=credentials, - ) - - self._parent = aiplatform.initializer.global_config.common_location_path( - project=project, - location=location - ) + super().__init__(project=project, location=location, credentials=credentials) metrics = [ - gca_study_compat.StudySpec.MetricSpec(metric_id=metric_id, goal=goal.upper()) + gca_study_compat.StudySpec.MetricSpec( + metric_id=metric_id, goal=goal.upper() + ) for metric_id, goal in metric_spec.items() ] @@ -1013,23 +1059,24 @@ def __init__(self, ] study_spec = 
gca_study_compat.StudySpec( - metrics = metrics, - parameters = parameters, - algorithm = _search_algorithm_to_proto_value[search_algorithm], - # observation_noise = observation_noise.upper(), - measurement_selection_type = _measurement_selection_to_proto_value[measurement_selection] + metrics=metrics, + parameters=parameters, + algorithm=_SEARCH_ALGORITHM_TO_PROTO_VALUE[search_algorithm], + measurement_selection_type=_MEASUREMENT_SELECTION_TO_PROTO_VALUE[ + measurement_selection + ], ) self._gca_resource = gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob( - display_name=display_name, - study_spec=study_spec, - max_trial_count=max_trial_count, - parallel_trial_count=parallel_trial_count, - max_failed_trial_count=max_failed_trial_count, - trial_job_spec=copy.deepcopy(custom_job.job_spec), - encryption_spec= initializer.global_config.get_encryption_spec( - encryption_spec_key_name = encryption_spec_key_name - ) + display_name=display_name, + study_spec=study_spec, + max_trial_count=max_trial_count, + parallel_trial_count=parallel_trial_count, + max_failed_trial_count=max_failed_trial_count, + trial_job_spec=copy.deepcopy(custom_job.job_spec), + encryption_spec=initializer.global_config.get_encryption_spec( + encryption_spec_key_name=encryption_spec_key_name + ), ) @base.optional_sync() @@ -1037,9 +1084,10 @@ def run( self, service_account: Optional[str] = None, network: Optional[str] = None, - timeout: Optional[int] = None, # seconds - restart_job_on_worker_restart: bool=False, - sync: bool = True) -> None: + timeout: Optional[int] = None, # seconds + restart_job_on_worker_restart: bool = False, + sync: bool = True, + ) -> None: if service_account: self._gca_resource.trial_job_spec.service_account = service_account @@ -1047,29 +1095,25 @@ def run( if network: self._gca_resource.trial_job_spec.network = network - if timeout or restart_job_on_worker_restart: timout = duration_pb2.Duration(seconds=timout) if timeout else None self._gca_resource.trial_job_spec.scheduling = gca_custom_job_compat.Scheduling( - timeout=timeout, - restart_job_on_worker_restart=restart_job_on_worker_restart - ) + timeout=timeout, + restart_job_on_worker_restart=restart_job_on_worker_restart, + ) _LOGGER.log_create_with_lro(self.__class__) self._gca_resource = self.api_client.create_hyperparameter_tuning_job( - parent=self._parent, - hyperparameter_tuning_job=self._gca_resource - ) + parent=self._parent, hyperparameter_tuning_job=self._gca_resource + ) _LOGGER.log_create_complete(self.__class__, self._gca_resource, "hpt_job") - _LOGGER.info( - "View HyperparameterTuningJob:\n%s" % self._dashboard_uri() - ) - + _LOGGER.info("View HyperparameterTuningJob:\n%s" % self._dashboard_uri()) self._block_until_complete() - - + @property + def trials(self) -> List[gca_study_compat.Trial]: + return list(self._gca_resource.trials) diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index f8f56bd5da..0121c585f5 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -38,8 +38,8 @@ training_pipeline as gca_training_pipeline, ) from google.cloud.aiplatform.utils import _timestamped_gcs_dir -from google.cloud.aiplatform.utils.source_utils import _TrainingScriptPythonPackager -from google.cloud.aiplatform.utils.worker_spec_utils import _DistributedTrainingSpec +from google.cloud.aiplatform.utils import source_utils +from google.cloud.aiplatform.utils import worker_spec_utils from google.cloud.aiplatform.v1.schema.trainingjob import 
( definition_v1 as training_job_inputs, @@ -1005,7 +1005,7 @@ def _prepare_and_validate_run( machine_type: str = "n1-standard-4", accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", accelerator_count: int = 0, - ) -> Tuple[_DistributedTrainingSpec, Optional[gca_model.Model]]: + ) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]: """Create worker pool specs and managed model as well validating the run. @@ -1055,7 +1055,7 @@ def _prepare_and_validate_run( model_display_name = model_display_name or self._display_name + "-model" # validates args and will raise - worker_pool_specs = _DistributedTrainingSpec.chief_worker_pool( + worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=replica_count, machine_type=machine_type, accelerator_count=accelerator_count, @@ -1073,7 +1073,7 @@ def _prepare_and_validate_run( def _prepare_training_task_inputs_and_output_dir( self, - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, base_output_dir: Optional[str] = None, service_account: Optional[str] = None, network: Optional[str] = None, @@ -1081,7 +1081,7 @@ def _prepare_training_task_inputs_and_output_dir( """Prepares training task inputs and output directory for custom job. Args: - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. base_output_dir (str): GCS output directory of job. If not provided a @@ -1106,12 +1106,12 @@ def _prepare_training_task_inputs_and_output_dir( _LOGGER.info("Training Output directory:\n%s " % base_output_dir) training_task_inputs = { - "workerPoolSpecs": worker_pool_specs, - "baseOutputDirectory": {"output_uri_prefix": base_output_dir}, + "worker_pool_specs": worker_pool_specs, + "base_output_directory": {"output_uri_prefix": base_output_dir}, } if service_account: - training_task_inputs["serviceAccount"] = service_account + training_task_inputs["service_account"] = service_account if network: training_task_inputs["network"] = network @@ -1531,7 +1531,7 @@ def run( ) # make and copy package - python_packager = _TrainingScriptPythonPackager( + python_packager = source_utils._TrainingScriptPythonPackager( script_path=self._script_path, requirements=self._requirements ) @@ -1557,7 +1557,7 @@ def run( @base.optional_sync(construct_object_on_arg="managed_model") def _run( self, - python_packager: _TrainingScriptPythonPackager, + python_packager: source_utils._TrainingScriptPythonPackager, dataset: Optional[ Union[ datasets.ImageDataset, @@ -1567,7 +1567,7 @@ def _run( ] ], annotation_schema_uri: Optional[str], - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, managed_model: Optional[gca_model.Model] = None, args: Optional[List[Union[str, float, int]]] = None, environment_variables: Optional[Dict[str, str]] = None, @@ -1584,7 +1584,7 @@ def _run( """Packages local script and launches training_job. Args: - python_packager (_TrainingScriptPythonPackager): + python_packager (source_utils._TrainingScriptPythonPackager): Required. Python Packager pointing to training script locally. dataset ( Union[ @@ -1598,7 +1598,7 @@ def _run( annotation_schema_uri (str): Google Cloud Storage URI points to a YAML file describing annotation schema. - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. 
managed_model (gca_model.Model): Model proto if this script produces a Managed Model. @@ -1682,17 +1682,17 @@ def _run( ) for spec in worker_pool_specs: - spec["pythonPackageSpec"] = { - "executorImageUri": self._container_uri, - "pythonModule": python_packager.module_name, - "packageUris": [package_gcs_uri], + spec["python_package_spec"] = { + "executor_image_uri": self._container_uri, + "python_module": python_packager.module_name, + "package_uris": [package_gcs_uri], } if args: - spec["pythonPackageSpec"]["args"] = args + spec["python_package_spec"]["args"] = args if environment_variables: - spec["pythonPackageSpec"]["env"] = [ + spec["python_package_spec"]["env"] = [ {"name": key, "value": value} for key, value in environment_variables.items() ] @@ -2146,7 +2146,7 @@ def _run( ] ], annotation_schema_uri: Optional[str], - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, managed_model: Optional[gca_model.Model] = None, args: Optional[List[Union[str, float, int]]] = None, environment_variables: Optional[Dict[str, str]] = None, @@ -2174,7 +2174,7 @@ def _run( annotation_schema_uri (str): Google Cloud Storage URI points to a YAML file describing annotation schema. - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. managed_model (gca_model.Model): Model proto if this script produces a Managed Model. @@ -3943,7 +3943,7 @@ def _run( ] ], annotation_schema_uri: Optional[str], - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, managed_model: Optional[gca_model.Model] = None, args: Optional[List[Union[str, float, int]]] = None, environment_variables: Optional[Dict[str, str]] = None, @@ -3972,7 +3972,7 @@ def _run( annotation_schema_uri (str): Google Cloud Storage URI points to a YAML file describing annotation schema. - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. managed_model (gca_model.Model): Model proto if this script produces a Managed Model. @@ -4035,17 +4035,17 @@ def _run( produce an AI Platform Model. 
""" for spec in worker_pool_specs: - spec["pythonPackageSpec"] = { - "executorImageUri": self._container_uri, - "pythonModule": self._python_module, - "packageUris": [self._package_gcs_uri], + spec["python_package_spec"] = { + "executor_image_uri": self._container_uri, + "python_module": self._python_module, + "package_uris": [self._package_gcs_uri], } if args: - spec["pythonPackageSpec"]["args"] = args + spec["python_package_spec"]["args"] = args if environment_variables: - spec["pythonPackageSpec"]["env"] = [ + spec["python_package_spec"]["env"] = [ {"name": key, "value": value} for key, value in environment_variables.items() ] diff --git a/google/cloud/aiplatform/training_utils.py b/google/cloud/aiplatform/training_utils.py index 48af524e3f..95e4d2429a 100644 --- a/google/cloud/aiplatform/training_utils.py +++ b/google/cloud/aiplatform/training_utils.py @@ -106,8 +106,8 @@ def tf_config(self) -> Optional[Dict]: return None -_DEFAULT_HYPERPARAMETER_METRIC_TAG = 'training/hptuning/metric' -_DEFAULT_METRIC_PATH = '/tmp/hypertune/output.metrics' +_DEFAULT_HYPERPARAMETER_METRIC_TAG = "training/hptuning/metric" +_DEFAULT_METRIC_PATH = "/tmp/hypertune/output.metrics" # TODO(0olwzo0): consider to make it configurable _MAX_NUM_METRIC_ENTRIES_TO_PRESERVE = 100 @@ -122,29 +122,33 @@ def initialize(cls): if cls.initialized: return - cls.metric_path = os.environ.get('CLOUD_ML_HP_METRIC_FILE', - _DEFAULT_METRIC_PATH) + cls.metric_path = os.environ.get( + "CLOUD_ML_HP_METRIC_FILE", _DEFAULT_METRIC_PATH + ) if not os.path.exists(os.path.dirname(cls.metric_path)): os.makedirs(os.path.dirname(cls.metric_path)) - cls.trial_id = os.environ.get('CLOUD_ML_TRIAL_ID', 0) + cls.trial_id = os.environ.get("CLOUD_ML_TRIAL_ID", 0) cls.metrics_queue = collections.deque( - maxlen=_MAX_NUM_METRIC_ENTRIES_TO_PRESERVE) + maxlen=_MAX_NUM_METRIC_ENTRIES_TO_PRESERVE + ) cls.initialized = True @classmethod def _dump_metrics_to_file(cls): - with open(cls.metric_path, 'w') as metric_file: + with open(cls.metric_path, "w") as metric_file: for metric in cls.metrics_queue: - metric_file.write(json.dumps(metric, sort_keys=True) + '\n') + metric_file.write(json.dumps(metric, sort_keys=True) + "\n") @classmethod - def report_hyperparameter_tuning_metric(cls, - hyperparameter_metric_tag, - metric_value, - global_step=None, - checkpoint_path=''): + def report_hyperparameter_tuning_metric( + cls, + hyperparameter_metric_tag, + metric_value, + global_step=None, + checkpoint_path="", + ): """Method to report hyperparameter tuning metric. 
Args: hyperparameter_metric_tag: The hyperparameter metric name this metric @@ -159,33 +163,25 @@ def report_hyperparameter_tuning_metric(cls, if hyperparameter_metric_tag: metric_tag = hyperparameter_metric_tag metric_body = { - 'timestamp': time.time(), - 'trial': str(cls.trial_id), + "timestamp": time.time(), + "trial": str(cls.trial_id), metric_tag: str(metric_value), - 'global_step': str(int(global_step) if global_step else 0), - 'checkpoint_path': checkpoint_path + "global_step": str(int(global_step) if global_step else 0), + "checkpoint_path": checkpoint_path, } cls.metrics_queue.append(metric_body) cls._dump_metrics_to_file() def report_hyperparameter_tuning_metrics( - metrics: Dict[str, float], - global_step: Optional[int] = None, - checkpoint_path='' - ): - _HyperparameterTuningJobReporterSingleton.initialize() - - for hyperparameter_metric_tag, metric_value in metrics.items(): - _HyperparameterTuningJobReporterSingleton.report_hyperparameter_tuning_metric( - hyperparameter_metric_tag=hyperparameter_metric_tag, - metric_value=metric_value, - global_step=global_step, - checkpoint_path=checkpoint_path - ) - - - - - - + metrics: Dict[str, float], global_step: Optional[int] = None, checkpoint_path="" +): + _HyperparameterTuningJobReporterSingleton.initialize() + + for hyperparameter_metric_tag, metric_value in metrics.items(): + _HyperparameterTuningJobReporterSingleton.report_hyperparameter_tuning_metric( + hyperparameter_metric_tag=hyperparameter_metric_tag, + metric_value=metric_value, + global_step=global_step, + checkpoint_path=checkpoint_path, + ) diff --git a/google/cloud/aiplatform/utils/__init__.py b/google/cloud/aiplatform/utils/__init__.py index c847a56244..22a4d985bb 100644 --- a/google/cloud/aiplatform/utils/__init__.py +++ b/google/cloud/aiplatform/utils/__init__.py @@ -565,4 +565,4 @@ def _timestamped_copy_to_gcs( blob.upload_from_filename(local_file_path) gcs_path = "".join(["gs://", "/".join([blob.bucket.name, blob.name])]) - return gcs_path \ No newline at end of file + return gcs_path diff --git a/google/cloud/aiplatform/utils/source_utils.py b/google/cloud/aiplatform/utils/source_utils.py index 8539e3122d..f84e37b52a 100644 --- a/google/cloud/aiplatform/utils/source_utils.py +++ b/google/cloud/aiplatform/utils/source_utils.py @@ -213,4 +213,4 @@ def package_and_copy_to_gcs( project=project, credentials=credentials, ) - return self.package_and_copy(copy_method=copy_method) \ No newline at end of file + return self.package_and_copy(copy_method=copy_method) diff --git a/google/cloud/aiplatform/utils/worker_spec_utils.py b/google/cloud/aiplatform/utils/worker_spec_utils.py index a23b997f48..ba81352d73 100644 --- a/google/cloud/aiplatform/utils/worker_spec_utils.py +++ b/google/cloud/aiplatform/utils/worker_spec_utils.py @@ -1,7 +1,9 @@ from typing import NamedTuple, Optional, Dict, Union, List from google.cloud.aiplatform import utils -from google.cloud.aiplatform.compat.types import accelerator_type as gca_accelerator_type_compat +from google.cloud.aiplatform.compat.types import ( + accelerator_type as gca_accelerator_type_compat, +) class _MachineSpec(NamedTuple): @@ -178,4 +180,4 @@ def chief_worker_pool( accelerator_type=accelerator_type, ) - return cls(chief_spec=chief_spec, worker_spec=worker_spec) \ No newline at end of file + return cls(chief_spec=chief_spec, worker_spec=worker_spec) diff --git a/tests/unit/aiplatform/test_end_to_end.py b/tests/unit/aiplatform/test_end_to_end.py index f4b1355679..e4a50b4014 100644 --- a/tests/unit/aiplatform/test_end_to_end.py 
+++ b/tests/unit/aiplatform/test_end_to_end.py @@ -19,7 +19,7 @@ from importlib import reload -import google.cloud.aiplatform.utils.source_utils +from google.cloud.aiplatform.utils import source_utils from google.cloud import aiplatform from google.cloud.aiplatform import initializer from google.cloud.aiplatform import models @@ -205,16 +205,16 @@ def test_dataset_create_to_model_predict( true_args = test_training_jobs._TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": test_training_jobs._TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": test_training_jobs._TEST_MACHINE_TYPE, - "acceleratorType": test_training_jobs._TEST_ACCELERATOR_TYPE, - "acceleratorCount": test_training_jobs._TEST_ACCELERATOR_COUNT, + "replica_count": test_training_jobs._TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": test_training_jobs._TEST_MACHINE_TYPE, + "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE, + "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -249,8 +249,8 @@ def test_dataset_create_to_model_predict( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": { + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { "output_uri_prefix": test_training_jobs._TEST_BASE_OUTPUT_DIR }, }, @@ -386,16 +386,16 @@ def test_dataset_create_to_model_predict_with_pipeline_fail( true_args = test_training_jobs._TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": test_training_jobs._TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": test_training_jobs._TEST_MACHINE_TYPE, - "acceleratorType": test_training_jobs._TEST_ACCELERATOR_TYPE, - "acceleratorCount": test_training_jobs._TEST_ACCELERATOR_COUNT, + "replica_count": test_training_jobs._TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": test_training_jobs._TEST_MACHINE_TYPE, + "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE, + "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": google.cloud.aiplatform.utils.source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -431,8 +431,8 @@ def test_dataset_create_to_model_predict_with_pipeline_fail( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": { + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { "output_uri_prefix": 
test_training_jobs._TEST_BASE_OUTPUT_DIR }, }, diff --git a/tests/unit/aiplatform/test_training_jobs.py b/tests/unit/aiplatform/test_training_jobs.py index b4745d29f1..75478263e8 100644 --- a/tests/unit/aiplatform/test_training_jobs.py +++ b/tests/unit/aiplatform/test_training_jobs.py @@ -515,7 +515,7 @@ def mock_model_service_get(): @pytest.fixture def mock_python_package_to_gcs(): with mock.patch.object( - source_utils._TrainingScriptPythonPackager, "package_and_copy_to_gcs" + source_utils._TrainingScriptPythonPackager, "package_and_copy_to_gcs" ) as mock_package_to_copy_gcs: mock_package_to_copy_gcs.return_value = _TEST_OUTPUT_PYTHON_PACKAGE_PATH yield mock_package_to_copy_gcs @@ -633,16 +633,16 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -702,9 +702,11 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, - "serviceAccount": _TEST_SERVICE_ACCOUNT, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, + "service_account": _TEST_SERVICE_ACCOUNT, "network": _TEST_NETWORK, }, struct_pb2.Value(), @@ -792,16 +794,16 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -861,8 +863,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + 
"base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1067,16 +1071,16 @@ def test_run_call_pipeline_service_create_with_no_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -1097,8 +1101,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1319,31 +1325,31 @@ def test_run_call_pipeline_service_create_distributed_training( true_worker_pool_spec = [ { - "replicaCount": 1, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 1, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, }, { - "replicaCount": 9, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 9, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -1385,8 +1391,10 @@ def test_run_call_pipeline_service_create_distributed_training( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": true_worker_pool_spec, - "baseOutputDirectory": {"output_uri_prefix": 
_TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": true_worker_pool_spec, + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1547,16 +1555,16 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": source_utils._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -1612,8 +1620,10 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1787,11 +1797,11 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -1855,8 +1865,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1935,11 +1947,11 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2002,8 +2014,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": 
_TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -2192,11 +2206,11 @@ def test_run_call_pipeline_service_create_with_no_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2220,8 +2234,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -2421,11 +2437,11 @@ def test_run_call_pipeline_service_create_distributed_training( true_worker_pool_spec = [ { - "replicaCount": 1, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 1, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2434,11 +2450,11 @@ def test_run_call_pipeline_service_create_distributed_training( }, }, { - "replicaCount": 9, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 9, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2483,8 +2499,10 @@ def test_run_call_pipeline_service_create_distributed_training( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": true_worker_pool_spec, - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": true_worker_pool_spec, + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -2561,11 +2579,11 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2625,9 +2643,11 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": 
[true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, - "serviceAccount": _TEST_SERVICE_ACCOUNT, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, + "service_account": _TEST_SERVICE_ACCOUNT, "network": _TEST_NETWORK, }, struct_pb2.Value(), @@ -2700,12 +2720,12 @@ def test_machine_spec_return_spec_dict(self): ) true_spec_dict = { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": _TEST_REPLICA_COUNT, + "replica_count": _TEST_REPLICA_COUNT, } assert test_spec.spec_dict == true_spec_dict @@ -2719,8 +2739,8 @@ def test_machine_spec_return_spec_dict_with_no_accelerator(self): ) true_spec_dict = { - "machineSpec": {"machineType": _TEST_MACHINE_TYPE}, - "replicaCount": _TEST_REPLICA_COUNT, + "machine_spec": {"machine_type": _TEST_MACHINE_TYPE}, + "replica_count": _TEST_REPLICA_COUNT, } assert test_spec.spec_dict == true_spec_dict @@ -2789,36 +2809,36 @@ def test_machine_spec_returns_pool_spec(self): true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 10, + "replica_count": 10, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 3, + "replica_count": 3, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, ] @@ -2835,20 +2855,20 @@ def test_chief_worker_pool_returns_spec(self): true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 9, + "replica_count": 9, }, ] @@ -2865,12 +2885,12 @@ def 
test_chief_worker_pool_returns_just_chief(self): true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, } ] @@ -2911,21 +2931,21 @@ def test_machine_spec_handles_missing_pools(self): true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, - {"machineSpec": {"machineType": "n1-standard-4"}, "replicaCount": 0}, + {"machine_spec": {"machine_type": "n1-standard-4"}, "replica_count": 0}, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 3, + "replica_count": 3, }, ] @@ -3002,16 +3022,16 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -3071,9 +3091,11 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, - "serviceAccount": _TEST_SERVICE_ACCOUNT, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, + "service_account": _TEST_SERVICE_ACCOUNT, "network": _TEST_NETWORK, }, struct_pb2.Value(), @@ -3155,16 +3177,16 @@ def test_run_call_pipeline_service_create_with_tabular_dataset_without_model_dis true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": 
[_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -3223,8 +3245,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset_without_model_dis training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3304,16 +3328,16 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -3372,8 +3396,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3566,16 +3592,16 @@ def test_run_call_pipeline_service_create_with_no_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -3595,8 +3621,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3802,30 +3830,30 @@ def 
test_run_call_pipeline_service_create_distributed_training( true_worker_pool_spec = [ { - "replicaCount": 1, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 1, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, }, { - "replicaCount": 9, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 9, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, }, @@ -3866,8 +3894,10 @@ def test_run_call_pipeline_service_create_distributed_training( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": true_worker_pool_spec, - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": true_worker_pool_spec, + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3943,16 +3973,16 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -4008,8 +4038,10 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), From c9259bf24bb58793ffa94655087503773e6c9fe7 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 16:56:42 -0400 Subject: [PATCH 11/29] 
chore: lint
---
 google/cloud/aiplatform/__init__.py           |   3 +-
 .../cloud/aiplatform/hyperparameter_tuning.py |  10 +-
 google/cloud/aiplatform/jobs.py               | 106 +++++++++++++++---
 google/cloud/aiplatform/training_jobs.py      |   1 -
 tests/unit/aiplatform/test_end_to_end.py      |   1 -
 5 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index 32ec7538e0..d544b4bc8f 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -66,7 +66,8 @@
     "explain",
     "gapic",
     "init",
-    "hyperparameter_tuning" "log_params",
+    "hyperparameter_tuning",
+    "log_params",
     "log_metrics",
     "get_experiment_df",
     "get_pipeline_df",
diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py
index 5a2ef35fda..de9ecac84e 100644
--- a/google/cloud/aiplatform/hyperparameter_tuning.py
+++ b/google/cloud/aiplatform/hyperparameter_tuning.py
@@ -15,7 +15,7 @@ class _ParameterSpec(metaclass=abc.ABCMeta):
     def __init__(
         self,
-        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
@@ -80,7 +80,7 @@ def __init__(
         min: float,
         max: float,
         scale: str,
-        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
@@ -107,7 +107,7 @@ def __init__(
         min: int,
         max: int,
         scale: str,
-        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
@@ -132,7 +132,7 @@ class CategoricalValueSpec(_ParameterSpec):
     def __init__(
         self,
         values: List[str],
-        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
@@ -156,7 +156,7 @@ def __init__(
         self,
         values: List[float],
         scale: str,
-        conditional_parameter_spec: Optional[Dict[str, "_Parameter"]] = None,
+        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 25f065f502..b96f4b3d0f 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -777,12 +777,22 @@ def iter_outputs(
 
 
 class _RunnableJob(_Job):
+    """ABC to interface a job as a runnable training class."""
+
     def __init__(
         self,
         project: Optional[str] = None,
         location: Optional[str] = None,
         credentials: Optional[auth_credentials.Credentials] = None,
     ):
+        """Initializes job with project, location, and api_client.
+
+        Args:
+            project(str): Project of the resource noun.
+            location(str): The location of the resource noun.
+            credentials(google.auth.credentials.Credentials): Optional custom
+                credentials to use when interacting with the resource noun.
+ """ base.AiPlatformResourceNounWithFutureManager.__init__( self, project=project, location=location, credentials=credentials @@ -798,10 +808,16 @@ def run(self) -> None: @property def _has_run(self) -> bool: + """Property returns true if this class has a resource name.""" return bool(getattr(self._gca_resource, "name")) @property def state(self) -> gca_job_state.JobState: + """Current state of job. + + Raises: + RuntimeError if job run has not been called. + """ if not self._has_run: raise RuntimeError("Job has not run. No state available.") @@ -845,25 +861,93 @@ def get( return self +class DataLabelingJob(_Job): + _resource_noun = "dataLabelingJobs" + _getter_method = "get_data_labeling_job" + _list_method = "list_data_labeling_jobs" + _cancel_method = "cancel_data_labeling_job" + _delete_method = "delete_data_labeling_job" + _job_type = "labeling-tasks" + pass + + class CustomJob(_RunnableJob): + """Creates an AI Platform (Unified) Custom Job.""" + _resource_noun = "customJobs" _getter_method = "get_custom_job" _list_method = "list_custom_job" _cancel_method = "cancel_custom_job" _delete_method = "delete_custom_job" _job_type = "training" - pass + def __init__( self, display_name: str, - worker_pool_specs: Union[Dict], + worker_pool_specs: Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]], project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, encryption_spec_key_name: Optional[str] = None, staging_bucket: Optional[str] = None, ): + """Cosntruct a Custom Job with Worker Pool Specs. + + Example usage: + worker_pool_specs = [ + { + "machine_spec": { + "machine_type": "n1-standard-4", + "accelerator_type": "NVIDIA_TESLA_K80", + "accelerator_count": 1, + }, + "replica_count": 1, + "container_spec": { + "image_uri": container_image_uri, + "command": [], + "args": [], + }, + } + ] + + my_job = aiplatform.CustomJob( + display_name='my_job', + worker_pool_specs=worker_pool_specs + ) + + my_job.run() + + + For more information on configuring worker pool specs please visit: + https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job + + + Args: + display_name (str): Required. The user-defined name of this Custom Job. + worker_pool_specs (Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]]): + Required. The spec of the worker pools including machine type and Docker image. + Can provided as a list of dictionaries or list of WorkerPoolSpec proto messages. + project (str): + Project to run the custom job in. Overrides project set in aiplatform.init. + location (str): + Location to run the custom job in. Overrides location set in aiplatform.init. + credentials (auth_credentials.Credentials): + Custom credentials to use to run call custom job service. Overrides + credentials set in aiplatform.init. + encryption_spec_key_name (str): + Customer-managed encryption key name for a + CustomJob. If this is set, then all resources + created by the CustomJob will be encrypted with + the provided encryption key. + staging_bucket (str): + Bucket for produced custom job artifacts. Overrides + staging_bucket set in aiplatform.init. + + Raises: + RuntimeError is not staging bucket was set using aiplatfrom.init and a staging + bucket was not passed in. 
+ """ super().__init__(project=project, location=location, credentials=credentials) @@ -907,6 +991,9 @@ def from_local_script( credentials: Optional[auth_credentials.Credentials] = None, encryption_spec_key_name: Optional[str] = None, ) -> "CustomJob": + """Configures a custom job from a local script. + + """ project = project or initializer.global_config.project location = location or initializer.global_config.location @@ -976,7 +1063,7 @@ def run( self._gca_resource.network = network if timeout or restart_job_on_worker_restart: - timout = duration_pb2.Duration(seconds=timout) if timeout else None + timeout = duration_pb2.Duration(seconds=timeout) if timeout else None self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling( timeout=timeout, restart_job_on_worker_restart=restart_job_on_worker_restart, @@ -999,16 +1086,6 @@ def job_spec(self): return self._gca_resource.job_spec -class DataLabelingJob(_Job): - _resource_noun = "dataLabelingJobs" - _getter_method = "get_data_labeling_job" - _list_method = "list_data_labeling_jobs" - _cancel_method = "cancel_data_labeling_job" - _delete_method = "delete_data_labeling_job" - _job_type = "labeling-tasks" - pass - - _SEARCH_ALGORITHM_TO_PROTO_VALUE = { "random": gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH, "grid": gca_study_compat.StudySpec.Algorithm.GRID_SEARCH, @@ -1020,6 +1097,7 @@ class DataLabelingJob(_Job): } + class HyperparameterTuningJob(_RunnableJob): _resource_noun = "hyperparameterTuningJobs" _getter_method = "get_hyperparameter_tuning_job" @@ -1096,7 +1174,7 @@ def run( self._gca_resource.trial_job_spec.network = network if timeout or restart_job_on_worker_restart: - timout = duration_pb2.Duration(seconds=timout) if timeout else None + timeout = duration_pb2.Duration(seconds=timeout) if timeout else None self._gca_resource.trial_job_spec.scheduling = gca_custom_job_compat.Scheduling( timeout=timeout, restart_job_on_worker_restart=restart_job_on_worker_restart, diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 0121c585f5..470e30bf56 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -15,7 +15,6 @@ # limitations under the License. # -import sys import time from typing import Dict, List, Optional, Sequence, Tuple, Union diff --git a/tests/unit/aiplatform/test_end_to_end.py b/tests/unit/aiplatform/test_end_to_end.py index e4a50b4014..4aede65f08 100644 --- a/tests/unit/aiplatform/test_end_to_end.py +++ b/tests/unit/aiplatform/test_end_to_end.py @@ -24,7 +24,6 @@ from google.cloud.aiplatform import initializer from google.cloud.aiplatform import models from google.cloud.aiplatform import schema -from google.cloud.aiplatform import training_jobs from google.cloud.aiplatform_v1.types import ( dataset as gca_dataset, From df97a2d0250624b8b8104bfa5283e7a390dbed04 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Fri, 14 May 2021 19:30:09 -0400 Subject: [PATCH 12/29] checkpoint --- google/cloud/aiplatform/jobs.py | 85 ++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index b96f4b3d0f..9e58f5d1c4 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -929,19 +929,19 @@ def __init__( Required. The spec of the worker pools including machine type and Docker image. Can provided as a list of dictionaries or list of WorkerPoolSpec proto messages. 
            project (str):
-                Project to run the custom job in. Overrides project set in aiplatform.init.
+                Optional. Project to run the custom job in. Overrides project set in aiplatform.init.
            location (str):
-                Location to run the custom job in. Overrides location set in aiplatform.init.
+                Optional. Location to run the custom job in. Overrides location set in aiplatform.init.
            credentials (auth_credentials.Credentials):
-                Custom credentials to use to run call custom job service. Overrides
+                Optional. Custom credentials to use to call the custom job service. Overrides
                credentials set in aiplatform.init.
            encryption_spec_key_name (str):
-                Customer-managed encryption key name for a
+                Optional. Customer-managed encryption key name for a
                CustomJob. If this is set, then all resources
                created by the CustomJob will be encrypted with
                the provided encryption key.
            staging_bucket (str):
-                Bucket for produced custom job artifacts. Overrides
+                Optional. Bucket for produced custom job artifacts. Overrides
                staging_bucket set in aiplatform.init.
@@ -987,13 +987,62 @@ def from_local_script(
         accelerator_count: int = 0,
         project: Optional[str] = None,
         location: Optional[str] = None,
-        staging_bucket: Optional[str] = None,
         credentials: Optional[auth_credentials.Credentials] = None,
         encryption_spec_key_name: Optional[str] = None,
+        staging_bucket: Optional[str] = None,
     ) -> "CustomJob":
         """Configures a custom job from a local script.

-        """
+        Args:
+            display_name (str):
+                Required. The user-defined name of this CustomJob.
+            script_path (str): Required. Local path to the training script.
+            container_uri (str):
+                Required. URI of the training container image to use for the custom job.
+            requirements (Sequence[str]):
+                Optional. List of Python package dependencies of the script.
+            environment_variables (Dict[str, str]):
+                Optional. Environment variables to be passed to the container.
+                Should be a dictionary where keys are environment variable names
+                and values are environment variable values for those names.
+                At most 10 environment variables can be specified.
+                The name of each environment variable must be unique.
+
+                environment_variables = {
+                    'MY_KEY': 'MY_VALUE'
+                }
+            replica_count (int):
+                Optional. The number of worker replicas. If replica_count = 1 then one chief
+                replica will be provisioned. If replica_count > 1 the remainder will be
+                provisioned as a worker replica pool.
+            machine_type (str):
+                Optional. The type of machine to use for training.
+            accelerator_type (str):
+                Optional. Hardware accelerator type. One of ACCELERATOR_TYPE_UNSPECIFIED,
+                NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4,
+                NVIDIA_TESLA_T4
+            accelerator_count (int):
+                Optional. The number of accelerators to attach to a worker replica.
+            project (str):
+                Optional. Project to run the custom job in. Overrides project set in aiplatform.init.
+            location (str):
+                Optional. Location to run the custom job in. Overrides location set in aiplatform.init.
+            credentials (auth_credentials.Credentials):
+                Optional. Custom credentials to use to call the custom job service. Overrides
+                credentials set in aiplatform.init.
+            encryption_spec_key_name (str):
+                Optional. Customer-managed encryption key name for a
+                CustomJob. If this is set, then all resources
+                created by the CustomJob will be encrypted with
+                the provided encryption key.
+            staging_bucket (str):
+                Optional. Bucket for produced custom job artifacts. Overrides
+                staging_bucket set in aiplatform.init.
+ + Raises: + RuntimeError is not staging bucket was set using aiplatfrom.init and a staging + bucket was not passed in. + """ project = project or initializer.global_config.project location = location or initializer.global_config.location @@ -1055,6 +1104,28 @@ def run( restart_job_on_worker_restart: bool = False, sync: bool = True, ) -> None: + """Run this configured CustomJob. + + Args: + service_account (str): + Optional. Specifies the service account for workload run-as account. + Users submitting jobs must have act-as permission on this run-as account. + network (str): + Optional. The full name of the Compute Engine network to which the job + should be peered. For example, projects/12345/global/networks/myVPC. + Private services access must already be configured for the network. + If left unspecified, the job is not peered with any network. + timeout (int): + The maximum job running time in seconds. The default is 7 days. + restart_job_on_worker_restart (bool): + Restarts the entire CustomJob if a worker + gets restarted. This feature can be used by + distributed training jobs that are not resilient + to workers leaving and joining a job. + sync (bool): + Whether to execute this method synchronously. If False, this method + will unblock and it will be executed in a concurrent Future. + """ if service_account: self._gca_resource.service_account = service_account From 23b3249d61876cd724e1e7f676502483713bddf5 Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Mon, 17 May 2021 15:58:52 -0400 Subject: [PATCH 13/29] feat: add custom job and hp tuning job tests --- google/cloud/aiplatform/base.py | 23 ++ .../cloud/aiplatform/hyperparameter_tuning.py | 101 ++++- google/cloud/aiplatform/jobs.py | 145 ++++++- google/cloud/aiplatform/utils/source_utils.py | 17 + .../aiplatform/utils/worker_spec_utils.py | 16 + tests/unit/aiplatform/test_custom_job.py | 323 +++++++++++++++ .../test_hyperparametertuning_job.py | 369 ++++++++++++++++++ 7 files changed, 967 insertions(+), 27 deletions(-) create mode 100644 tests/unit/aiplatform/test_custom_job.py create mode 100644 tests/unit/aiplatform/test_hyperparametertuning_job.py diff --git a/google/cloud/aiplatform/base.py b/google/cloud/aiplatform/base.py index b42b630f75..a9fcef24bd 100644 --- a/google/cloud/aiplatform/base.py +++ b/google/cloud/aiplatform/base.py @@ -101,6 +101,29 @@ def log_create_complete( f"{variable_name} = aiplatform.{cls.__name__}('{resource.name}')" ) + def log_create_complete_with_getter( + self, + cls: Type["AiPlatformResourceNoun"], + resource: proto.Message, + variable_name: str, + ): + """Logs create event is complete. + + Will also include code snippet to instantiate resource in SDK. + + Args: + cls (AiPlatformResourceNoun): + AI Platform Resource Noun class that is being created. + resource (proto.Message): + AI Platform Resourc proto.Message + variable_name (str): Name of variable to use for code snippet + """ + self._logger.info(f"{cls.__name__} created. 
Resource name: {resource.name}") + self._logger.info(f"To use this {cls.__name__} in another session:") + self._logger.info( + f"{variable_name} = aiplatform.{cls.__name__}.get('{resource.name}')" + ) + def log_action_start_against_resource( self, action: str, noun: str, resource_noun_obj: "AiPlatformResourceNoun" ): diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py index de9ecac84e..765de6f422 100644 --- a/google/cloud/aiplatform/hyperparameter_tuning.py +++ b/google/cloud/aiplatform/hyperparameter_tuning.py @@ -1,18 +1,38 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import abc -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union import proto from google.cloud.aiplatform.compat.types import study as gca_study_compat -_scale_type_map = { +_SCALE_TYPE_MAP = { "linear": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE, "log": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE, "reverse_log": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_REVERSE_LOG_SCALE, + "unspecified": gca_study_compat.StudySpec.ParameterSpec.ScaleType.SCALE_TYPE_UNSPECIFIED, } class _ParameterSpec(metaclass=abc.ABCMeta): + """Base class represents a single parameter to optimize.""" + def __init__( self, conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None, @@ -26,22 +46,26 @@ def __init__( @classmethod @abc.abstractmethod def _proto_parameter_value_class(self) -> proto.Message: + """The proto represenation of this parameter.""" pass @property @classmethod @abc.abstractmethod def _parameter_value_map(self) -> Tuple[Tuple[str, str]]: + """A Tuple map of parameter key to underlying proto key.""" pass @property @classmethod @abc.abstractmethod def _parameter_spec_value_key(self) -> Tuple[Tuple[str, str]]: + """The ParameterSpec key this parameter should be assigned.""" pass @property def _proto_parameter_value_spec(self) -> proto.Message: + """Converts this parameter to it's parameter value representation.""" proto_parameter_value_spec = self._proto_parameter_value_class() for self_attr_key, proto_attr_key in self._parameter_value_map: setattr( @@ -52,10 +76,11 @@ def _proto_parameter_value_spec(self) -> proto.Message: def _to_parameter_spec( self, parameter_id: str ) -> gca_study_compat.StudySpec.ParameterSpec: + """Converts this parameter to ParameterSpec.""" # TODO: Conditional parameters parameter_spec = gca_study_compat.StudySpec.ParameterSpec( parameter_id=parameter_id, - scale_type=_scale_type_map.get(getattr(self, "scale")), + scale_type=_SCALE_TYPE_MAP.get(getattr(self, "scale", "unspecified")), ) setattr( @@ -83,6 +108,21 @@ def __init__( conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None, parent_values: Optional[List[Union[float, int, str]]] = None, ): + """ + Value specification for a parameter in ``DOUBLE`` type. 
+
+        Args:
+            min (float):
+                Required. Inclusive minimum value of the
+                parameter.
+            max (float):
+                Required. Inclusive maximum value of the
+                parameter.
+            scale (str):
+                Required. The type of scaling that should be applied to this parameter.
+
+                Accepts: 'linear', 'log', 'reverse_log'
+        """

         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
@@ -110,59 +150,94 @@ def __init__(
         conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
+        """
+        Value specification for a parameter in ``INTEGER`` type.
+
+        Args:
+            min (int):
+                Required. Inclusive minimum value of the
+                parameter.
+            max (int):
+                Required. Inclusive maximum value of the
+                parameter.
+            scale (str):
+                Required. The type of scaling that should be applied to this parameter.
+
+                Accepts: 'linear', 'log', 'reverse_log'
+        """

         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
-            parent_value=parent_values,
+            parent_values=parent_values,
         )

         self.min = min
-        self.max = (max,)
+        self.max = max
         self.scale = scale


-class CategoricalValueSpec(_ParameterSpec):
+class CategoricalParameterSpec(_ParameterSpec):
     _proto_parameter_value_class = (
         gca_study_compat.StudySpec.ParameterSpec.CategoricalValueSpec
     )
-    _parameter_value_map = ("values", "values")
+    _parameter_value_map = (("values", "values"),)
     _parameter_spec_value_key = "categorical_value_spec"

     def __init__(
         self,
-        values: List[str],
+        values: Sequence[str],
         conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
+        """Value specification for a parameter in ``CATEGORICAL`` type.
+
+        Args:
+            values (Sequence[str]):
+                Required. The list of possible categories.
+        """

         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
-            parent_value=parent_values,
+            parent_values=parent_values,
         )

         self.values = values


-class DiscreteValueSpec(_ParameterSpec):
+class DiscreteParameterSpec(_ParameterSpec):
     _proto_parameter_value_class = (
         gca_study_compat.StudySpec.ParameterSpec.DiscreteValueSpec
     )
-    _parameter_value_map = ("values", "values")
+    _parameter_value_map = (("values", "values"),)
     _parameter_spec_value_key = "discrete_value_spec"

     def __init__(
         self,
-        values: List[float],
+        values: Sequence[float],
         scale: str,
         conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
         parent_values: Optional[List[Union[float, int, str]]] = None,
     ):
+        """Value specification for a parameter in ``DISCRETE`` type.
+
+        Args:
+            values (Sequence[float]):
+                Required. A list of possible values.
+                The list should be in increasing order and at
+                least 1e-10 apart. For instance, this parameter
+                might have possible settings of 1.5, 2.5, and
+                4.0. This list should not contain more than
+                1,000 values.
+            scale (str):
+                Required. The type of scaling that should be applied to this parameter.
+
+                Accepts: 'linear', 'log', 'reverse_log'
+        """

         super().__init__(
             conditional_parameter_spec=conditional_parameter_spec,
-            parent_value=parent_values,
+            parent_values=parent_values,
         )

         self.values = values
diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 9e58f5d1c4..9e36189877 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -184,12 +184,20 @@ def _block_until_complete(self):
         previous_time = current_time
         time.sleep(wait)

-        _LOGGER.log_action_completed_against_resource("run", "completed", self)
-
+        _LOGGER.info(
+            "%s %s current state:\n%s"
+            % (
+                self.__class__.__name__,
+                self._gca_resource.name,
+                self._gca_resource.state,
+            )
+        )
         # Error is only populated when the job state is
         # JOB_STATE_FAILED or JOB_STATE_CANCELLED.
-        if self.state in _JOB_ERROR_STATES:
+        if self._gca_resource.state in _JOB_ERROR_STATES:
             raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)
+        else:
+            _LOGGER.log_action_completed_against_resource("run", "completed", self)

     @classmethod
     def list(
@@ -809,7 +817,7 @@ def run(self) -> None:
     @property
     def _has_run(self) -> bool:
         """Property returns true if this class has a resource name."""
-        return bool(getattr(self._gca_resource, "name"))
+        return bool(self._gca_resource.name)

     @property
     def state(self) -> gca_job_state.JobState:
@@ -872,8 +880,8 @@ class DataLabelingJob(_Job):


 class CustomJob(_RunnableJob):
-    """Creates an AI Platform (Unified) Custom Job."""
-
+    """AI Platform (Unified) Custom Job."""
+
     _resource_noun = "customJobs"
     _getter_method = "get_custom_job"
     _list_method = "list_custom_job"
@@ -881,7 +889,6 @@ class CustomJob(_RunnableJob):
     _delete_method = "delete_custom_job"
     _job_type = "training"

-
     def __init__(
         self,
         display_name: str,
@@ -924,7 +931,10 @@ def __init__(

         Args:

-            display_name (str): Required. The user-defined name of this Custom Job.
+            display_name (str):
+                Required. The user-defined name of the CustomJob.
+                The name can be up to 128 characters long and can consist
+                of any UTF-8 characters.
             worker_pool_specs (Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]]):
                 Required. The spec of the worker pools including machine type and Docker image.
                 Can be provided as a list of dictionaries or a list of WorkerPoolSpec proto messages.
@@ -1042,7 +1052,7 @@ def from_local_script(
         Raises:
             RuntimeError: If staging_bucket was not set using aiplatform.init and a staging
            bucket was not passed in.
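Taken together, the arguments above translate into a call like the following sketch, in which the project, bucket, script path, and container image are placeholders:

```python
from google.cloud import aiplatform

aiplatform.init(
    project="my-project",
    location="us-central1",
    staging_bucket="gs://my-bucket",  # required here or via the staging_bucket argument
)

# "task.py" stands in for a local training script; it is packaged and
# staged to the bucket, then executed inside the given container image.
job = aiplatform.CustomJob.from_local_script(
    display_name="my-script-job",
    script_path="task.py",
    container_uri="gcr.io/my-project/training-container:latest",
    requirements=["pandas>=1.2"],
    environment_variables={"MY_KEY": "MY_VALUE"},
    replica_count=1,
    machine_type="n1-standard-4",
)

job.run()
```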
-        """
+        """

         project = project or initializer.global_config.project
         location = location or initializer.global_config.location
@@ -1128,10 +1138,10 @@ def run(
         """

         if service_account:
-            self._gca_resource.service_account = service_account
+            self._gca_resource.job_spec.service_account = service_account

         if network:
-            self._gca_resource.network = network
+            self._gca_resource.job_spec.network = network

         if timeout or restart_job_on_worker_restart:
             timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
@@ -1146,7 +1156,9 @@ def run(
             parent=self._parent, custom_job=self._gca_resource
         )

-        _LOGGER.log_create_complete(self.__class__, self._gca_resource, "custom_job")
+        _LOGGER.log_create_complete_with_getter(
+            self.__class__, self._gca_resource, "custom_job"
+        )

         _LOGGER.info("View Custom Job:\n%s" % self._dashboard_uri())

@@ -1168,8 +1180,9 @@ def job_spec(self):
 }


-
 class HyperparameterTuningJob(_RunnableJob):
+    """AI Platform (Unified) Hyperparameter Tuning Job."""
+
     _resource_noun = "hyperparameterTuningJobs"
     _getter_method = "get_hyperparameter_tuning_job"
     _list_method = "list_hyperparameter_tuning_jobs"
@@ -1193,6 +1206,86 @@ def __init__(
         credentials: Optional[auth_credentials.Credentials] = None,
         encryption_spec_key_name: Optional[str] = None,
     ):
+        """
+        Configures a Hyperparameter Tuning Job.
+
+        Args:
+            display_name (str):
+                Required. The user-defined name of the HyperparameterTuningJob.
+                The name can be up to 128 characters long and can consist
+                of any UTF-8 characters.
+            custom_job (aiplatform.CustomJob):
+                Required. Configured CustomJob. The worker pool spec from this custom job
+                applies to the CustomJobs created in all the trials.
+            metric_spec (Dict[str, str]):
+                Required. Dictionary representing metrics to optimize. The dictionary key is the metric_id,
+                which is reported by your training job, and the dictionary value is the
+                optimization goal of the metric ('minimize' or 'maximize'). Example:
+
+                metric_spec = {'loss': 'minimize', 'accuracy': 'maximize'}
+
+            parameter_spec (Dict[str, hyperparameter_tuning._ParameterSpec]):
+                Required. Dictionary representing parameters to optimize. The dictionary key is the parameter_id,
+                which is passed to your training job as a command-line keyword argument, and the
+                dictionary value is the parameter specification of the metric.
+
+
+                from google.cloud.aiplatform import hyperparameter_tuning as hpt
+
+                parameter_spec={
+                    'decay': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
+                    'learning_rate': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
+                    'batch_size': hpt.DiscreteParameterSpec(values=[4, 8, 16, 32, 64, 128], scale='linear')
+                }
+
+                Supported parameter specifications can be found in aiplatform.hyperparameter_tuning.
+                These parameter specifications are currently supported:
+                DoubleParameterSpec, IntegerParameterSpec, CategoricalParameterSpec, DiscreteParameterSpec
+
+            max_trial_count (int):
+                Required. The desired total number of Trials.
+            parallel_trial_count (int):
+                Required. The desired number of Trials to run in parallel.
+            max_failed_trial_count (int):
+                Optional. The number of failed Trials that need to be
+                seen before failing the HyperparameterTuningJob.
+                If set to 0, AI Platform decides how many Trials
+                must fail before the whole job fails.
+            search_algorithm (str):
+                The search algorithm specified for the Study.
+ + Accepts: 'random', 'grid' + measurement_selection (str): + This indicates which measurement to use if/when the service + automatically selects the final measurement from previously reported + intermediate measurements. + + Accepts: 'best', 'last' + + Choose this based on two considerations: + A) Do you expect your measurements to monotonically improve? If so, + choose 'last'. On the other hand, if you're in a situation + where your system can "over-train" and you expect the performance to + get better for a while but then start declining, choose + 'best'. B) Are your measurements significantly noisy + and/or irreproducible? If so, 'best' will tend to be + over-optimistic, and it may be better to choose 'last'. If + both or neither of (A) and (B) apply, it doesn't matter which + selection type is chosen. + project (str): + Optional. Project to run the HyperparameterTuningjob in. Overrides project set in aiplatform.init. + location (str): + Optional. Location to run the HyperparameterTuning in. Overrides location set in aiplatform.init. + credentials (auth_credentials.Credentials): + Optional. Custom credentials to use to run call HyperparameterTuning service. Overrides + credentials set in aiplatform.init. + encryption_spec_key_name (str): + Optional. Customer-managed encryption key options for a + HyperparameterTuningJob. If this is set, then + all resources created by the + HyperparameterTuningJob will be encrypted with + the provided encryption key. + """ super().__init__(project=project, location=location, credentials=credentials) metrics = [ @@ -1237,6 +1330,28 @@ def run( restart_job_on_worker_restart: bool = False, sync: bool = True, ) -> None: + """Run this configured CustomJob. + + Args: + service_account (str): + Optional. Specifies the service account for workload run-as account. + Users submitting jobs must have act-as permission on this run-as account. + network (str): + Optional. The full name of the Compute Engine network to which the job + should be peered. For example, projects/12345/global/networks/myVPC. + Private services access must already be configured for the network. + If left unspecified, the job is not peered with any network. + timeout (int): + The maximum job running time in seconds. The default is 7 days. + restart_job_on_worker_restart (bool): + Restarts the entire CustomJob if a worker + gets restarted. This feature can be used by + distributed training jobs that are not resilient + to workers leaving and joining a job. + sync (bool): + Whether to execute this method synchronously. If False, this method + will unblock and it will be executed in a concurrent Future. + """ if service_account: self._gca_resource.trial_job_spec.service_account = service_account @@ -1257,7 +1372,9 @@ def run( parent=self._parent, hyperparameter_tuning_job=self._gca_resource ) - _LOGGER.log_create_complete(self.__class__, self._gca_resource, "hpt_job") + _LOGGER.log_create_complete_with_getter( + self.__class__, self._gca_resource, "hpt_job" + ) _LOGGER.info("View HyperparameterTuningJob:\n%s" % self._dashboard_uri()) diff --git a/google/cloud/aiplatform/utils/source_utils.py b/google/cloud/aiplatform/utils/source_utils.py index f84e37b52a..b7fcef806f 100644 --- a/google/cloud/aiplatform/utils/source_utils.py +++ b/google/cloud/aiplatform/utils/source_utils.py @@ -1,3 +1,20 @@ +# -*- coding: utf-8 -*- +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + import functools import pathlib import shutil diff --git a/google/cloud/aiplatform/utils/worker_spec_utils.py b/google/cloud/aiplatform/utils/worker_spec_utils.py index ba81352d73..9a681d3b98 100644 --- a/google/cloud/aiplatform/utils/worker_spec_utils.py +++ b/google/cloud/aiplatform/utils/worker_spec_utils.py @@ -1,3 +1,19 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + from typing import NamedTuple, Optional, Dict, Union, List from google.cloud.aiplatform import utils diff --git a/tests/unit/aiplatform/test_custom_job.py b/tests/unit/aiplatform/test_custom_job.py new file mode 100644 index 0000000000..3d5e9e510b --- /dev/null +++ b/tests/unit/aiplatform/test_custom_job.py @@ -0,0 +1,323 @@ +# -*- coding: utf-8 -*- +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
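The expected job proto in the tests below is built with the same `Duration` conversion that `run()` performs on its integer `timeout` argument; a standalone sketch of that conversion (values are arbitrary):

```python
from google.protobuf import duration_pb2  # type: ignore

from google.cloud.aiplatform.compat.types import custom_job as gca_custom_job_compat

timeout = 8000  # seconds, as passed to CustomJob.run()
scheduling = gca_custom_job_compat.Scheduling(
    timeout=duration_pb2.Duration(seconds=timeout) if timeout else None,
    restart_job_on_worker_restart=True,
)
```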
+# + +import pytest + +import copy +from importlib import reload +from unittest import mock +from unittest.mock import patch + +from google.protobuf import duration_pb2 # type: ignore +from google.rpc import status_pb2 + +import test_training_jobs +from test_training_jobs import mock_python_package_to_gcs + +from google.cloud import aiplatform +from google.cloud.aiplatform.compat.types import custom_job as gca_custom_job_compat +from google.cloud.aiplatform.compat.types import io as gca_io_compat +from google.cloud.aiplatform.compat.types import job_state as gca_job_state_compat +from google.cloud.aiplatform.compat.types import ( + encryption_spec as gca_encryption_spec_compat, +) +from google.cloud.aiplatform_v1.services.job_service import client as job_service_client + +_TEST_PROJECT = "test-project" +_TEST_LOCATION = "us-central1" +_TEST_ID = "1028944691210842416" +_TEST_DISPLAY_NAME = "my_job_1234" + +_TEST_PARENT = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}" + +_TEST_CUSTOM_JOB_NAME = f"{_TEST_PARENT}/customJobs/{_TEST_ID}" + +_TEST_TRAINING_CONTAINER_IMAGE = "gcr.io/test-training/container:image" + +_TEST_WORKER_POOL_SPEC = [ + { + "machine_spec": { + "machine_type": "n1-standard-4", + "accelerator_type": "NVIDIA_TESLA_K80", + "accelerator_count": 1, + }, + "replica_count": 1, + "container_spec": { + "image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "command": [], + "args": [], + }, + } +] + +_TEST_STAGING_BUCKET = "gs://test-staging-bucket" + +# CMEK encryption +_TEST_DEFAULT_ENCRYPTION_KEY_NAME = "key_default" +_TEST_DEFAULT_ENCRYPTION_SPEC = gca_encryption_spec_compat.EncryptionSpec( + kms_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME +) + +_TEST_SERVICE_ACCOUNT = "vinnys@my-project.iam.gserviceaccount.com" + + +_TEST_NETWORK = f"projects/{_TEST_PROJECT}/global/networks/{_TEST_ID}" + +_TEST_TIMEOUT = 8000 +_TEST_RESTART_JOB_ON_WORKER_RESTART = True + +_TEST_BASE_CUSTOM_JOB_PROTO = gca_custom_job_compat.CustomJob( + display_name=_TEST_DISPLAY_NAME, + job_spec=gca_custom_job_compat.CustomJobSpec( + worker_pool_specs=_TEST_WORKER_POOL_SPEC, + base_output_directory=gca_io_compat.GcsDestination( + output_uri_prefix=_TEST_STAGING_BUCKET + ), + scheduling=gca_custom_job_compat.Scheduling( + timeout=duration_pb2.Duration(seconds=_TEST_TIMEOUT), + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + ), + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + ), + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, +) + + +def _get_custom_job_proto(state=None, name=None, error=None): + custom_job_proto = copy.deepcopy(_TEST_BASE_CUSTOM_JOB_PROTO) + custom_job_proto.name = name + custom_job_proto.state = state + custom_job_proto.error = error + return custom_job_proto + + +@pytest.fixture +def get_custom_job_mock(): + with patch.object( + job_service_client.JobServiceClient, "get_custom_job" + ) as get_custom_job_mock: + get_custom_job_mock.side_effect = [ + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED, + ), + ] + yield get_custom_job_mock + + +@pytest.fixture +def get_custom_job_mock_with_fail(): + with patch.object( + job_service_client.JobServiceClient, "get_custom_job" + ) as get_custom_job_mock: + get_custom_job_mock.side_effect = [ + _get_custom_job_proto( + 
name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_FAILED, + error=status_pb2.Status(message="Test Error"), + ), + ] + yield get_custom_job_mock + + +@pytest.fixture +def create_custom_job_mock(): + with mock.patch.object( + job_service_client.JobServiceClient, "create_custom_job" + ) as create_custom_job_mock: + create_custom_job_mock.return_value = _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ) + yield create_custom_job_mock + + +class TestCustomJob: + def setup_method(self): + reload(aiplatform.initializer) + reload(aiplatform) + + def teardown_method(self): + aiplatform.initializer.global_pool.shutdown(wait=True) + + @pytest.mark.parametrize("sync", [True, False]) + def test_create_custom_job(self, create_custom_job_mock, get_custom_job_mock, sync): + + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC + ) + + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_custom_job = _get_custom_job_proto() + + create_custom_job_mock.assert_called_once_with( + parent=_TEST_PARENT, custom_job=expected_custom_job + ) + + assert job.job_spec == expected_custom_job.job_spec + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_run_custom_job_with_fail_raises( + self, create_custom_job_mock, get_custom_job_mock_with_fail, sync + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC + ) + + with pytest.raises(RuntimeError): + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_custom_job = _get_custom_job_proto() + + create_custom_job_mock.assert_called_once_with( + parent=_TEST_PARENT, custom_job=expected_custom_job + ) + + assert job.job_spec == expected_custom_job.job_spec + assert job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED + + def test_custom_job_get_state_raises_without_run(self): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC + ) + + with pytest.raises(RuntimeError): + print(job.state) + + def test_no_staging_bucket_raises(self): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with pytest.raises(RuntimeError): + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, + 
worker_pool_specs=_TEST_WORKER_POOL_SPEC, + ) + + def test_get_custom_job(self, get_custom_job_mock): + + job = aiplatform.CustomJob.get(_TEST_CUSTOM_JOB_NAME) + + get_custom_job_mock.assert_called_once_with(name=_TEST_CUSTOM_JOB_NAME) + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_PENDING + ) + assert job.job_spec == _TEST_BASE_CUSTOM_JOB_PROTO.job_spec + + @pytest.mark.usefixtures("mock_python_package_to_gcs") + @pytest.mark.parametrize("sync", [True, False]) + def test_create_from_local_script( + self, get_custom_job_mock, create_custom_job_mock, sync + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + # configuration on this is tested in test_training_jobs.py + job = aiplatform.CustomJob.from_local_script( + display_name=_TEST_DISPLAY_NAME, + script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME, + container_uri=_TEST_TRAINING_CONTAINER_IMAGE, + ) + + job.run(sync=sync) + + job.wait() + + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) + + @pytest.mark.usefixtures("mock_python_package_to_gcs") + @pytest.mark.parametrize("sync", [True, False]) + def test_create_from_local_script_raises_with_no_staging_bucket( + self, get_custom_job_mock, create_custom_job_mock, sync + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + with pytest.raises(RuntimeError): + + # configuration on this is tested in test_training_jobs.py + job = aiplatform.CustomJob.from_local_script( + display_name=_TEST_DISPLAY_NAME, + script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME, + container_uri=_TEST_TRAINING_CONTAINER_IMAGE, + ) diff --git a/tests/unit/aiplatform/test_hyperparametertuning_job.py b/tests/unit/aiplatform/test_hyperparametertuning_job.py new file mode 100644 index 0000000000..d406e07efa --- /dev/null +++ b/tests/unit/aiplatform/test_hyperparametertuning_job.py @@ -0,0 +1,369 @@ +# -*- coding: utf-8 -*- +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
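The expected `StudySpec` in the tests below is what the `hpt` parameter specs serialize to. A small sketch of that mapping via the internal `_to_parameter_spec` helper shown earlier (an internal API, used here purely for illustration; the `min_value` attribute name is assumed from the expected proto):

```python
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google.cloud.aiplatform.compat.types import study as gca_study_compat

lr = hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log")
proto_spec = lr._to_parameter_spec(parameter_id="lr")

# "log" maps to UNIT_LOG_SCALE through _SCALE_TYPE_MAP.
assert (
    proto_spec.scale_type
    == gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE
)
assert proto_spec.double_value_spec.min_value == 0.001
```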
+# + +import pytest + +import copy +from importlib import reload +from unittest import mock +from unittest.mock import patch + +from google.protobuf import duration_pb2 # type: ignore +from google.rpc import status_pb2 + +from google.cloud import aiplatform +from google.cloud.aiplatform import hyperparameter_tuning as hpt +from google.cloud.aiplatform.compat.types import job_state as gca_job_state_compat +from google.cloud.aiplatform.compat.types import ( + encryption_spec as gca_encryption_spec_compat, +) +from google.cloud.aiplatform.compat.types import ( + hyperparameter_tuning_job as gca_hyperparameter_tuning_job_compat, +) +from google.cloud.aiplatform.compat.types import study as gca_study_compat +from google.cloud.aiplatform_v1.services.job_service import client as job_service_client + +import test_custom_job + +_TEST_PROJECT = "test-project" +_TEST_LOCATION = "us-central1" +_TEST_ID = "1028944691210842416" +_TEST_DISPLAY_NAME = "my_hp_job_1234" + +_TEST_PARENT = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}" + +_TEST_STAGING_BUCKET = test_custom_job._TEST_STAGING_BUCKET + +_TEST_HYPERPARAMETERTUNING_JOB_NAME = ( + f"{_TEST_PARENT}/hyperparameterTuningJobs/{_TEST_ID}" +) + +# CMEK encryption +_TEST_DEFAULT_ENCRYPTION_KEY_NAME = "key_default" +_TEST_DEFAULT_ENCRYPTION_SPEC = gca_encryption_spec_compat.EncryptionSpec( + kms_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME +) + +_TEST_SERVICE_ACCOUNT = "vinnys@my-project.iam.gserviceaccount.com" + + +_TEST_NETWORK = f"projects/{_TEST_PROJECT}/global/networks/{_TEST_ID}" + +_TEST_TIMEOUT = 8000 +_TEST_RESTART_JOB_ON_WORKER_RESTART = True + +_TEST_METRIC_SPEC_KEY = "test-metric" +_TEST_METRIC_SPEC_VALUE = "maximize" + +_TEST_PARALLEL_TRIAL_COUNT = 8 +_TEST_MAX_TRIAL_COUNT = 64 +_TEST_MAX_FAILED_TRIAL_COUNT = 4 +_TEST_SEARCH_ALGORITHM = "random" +_TEST_MEASUREMENT_SELECTION = "best" + + +_TEST_BASE_HYPERPARAMETER_TUNING_JOB_PROTO = gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + study_spec=gca_study_compat.StudySpec( + metrics=[ + gca_study_compat.StudySpec.MetricSpec( + metric_id=_TEST_METRIC_SPEC_KEY, goal=_TEST_METRIC_SPEC_VALUE.upper() + ) + ], + parameters=[ + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="lr", + scale_type=gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE, + double_value_spec=gca_study_compat.StudySpec.ParameterSpec.DoubleValueSpec( + min_value=0.001, max_value=0.1 + ), + ), + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="units", + scale_type=gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE, + integer_value_spec=gca_study_compat.StudySpec.ParameterSpec.IntegerValueSpec( + min_value=4, max_value=1028 + ), + ), + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="activation", + categorical_value_spec=gca_study_compat.StudySpec.ParameterSpec.CategoricalValueSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + ), + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="batch_size", + scale_type=gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE, + discrete_value_spec=gca_study_compat.StudySpec.ParameterSpec.DiscreteValueSpec( + values=[16, 32] + ), + ), + ], + algorithm=gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH, + measurement_selection_type=gca_study_compat.StudySpec.MeasurementSelectionType.BEST_MEASUREMENT, + ), + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + 
trial_job_spec=test_custom_job._TEST_BASE_CUSTOM_JOB_PROTO.job_spec, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, +) + + +def _get_hyperparameter_tuning_job_proto(state=None, name=None, error=None): + custom_job_proto = copy.deepcopy(_TEST_BASE_HYPERPARAMETER_TUNING_JOB_PROTO) + custom_job_proto.name = name + custom_job_proto.state = state + custom_job_proto.error = error + return custom_job_proto + + +@pytest.fixture +def get_hyperparameter_tuning_job_mock(): + with patch.object( + job_service_client.JobServiceClient, "get_hyperparameter_tuning_job" + ) as get_hyperparameter_tuning_job_mock: + get_hyperparameter_tuning_job_mock.side_effect = [ + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED, + ), + ] + yield get_hyperparameter_tuning_job_mock + + +@pytest.fixture +def get_hyperparameter_tuning_job_mock_with_fail(): + with patch.object( + job_service_client.JobServiceClient, "get_hyperparameter_tuning_job" + ) as get_hyperparameter_tuning_job_mock: + get_hyperparameter_tuning_job_mock.side_effect = [ + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_FAILED, + error=status_pb2.Status(message="Test Error"), + ), + ] + yield get_hyperparameter_tuning_job_mock + + +@pytest.fixture +def create_hyperparameter_tuning_job_mock(): + with mock.patch.object( + job_service_client.JobServiceClient, "create_hyperparameter_tuning_job" + ) as create_hyperparameter_tuning_job_mock: + create_hyperparameter_tuning_job_mock.return_value = _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ) + yield create_hyperparameter_tuning_job_mock + + +class TestCustomJob: + def setup_method(self): + reload(aiplatform.initializer) + reload(aiplatform) + + def teardown_method(self): + aiplatform.initializer.global_pool.shutdown(wait=True) + + @pytest.mark.parametrize("sync", [True, False]) + def test_create_hyperparameter_tuning_job( + self, + create_hyperparameter_tuning_job_mock, + get_hyperparameter_tuning_job_mock, + sync, + ): + + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + custom_job = aiplatform.CustomJob( + display_name=test_custom_job._TEST_DISPLAY_NAME, + worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC, + ) + + job = aiplatform.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + custom_job=custom_job, + metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE}, + parameter_spec={ + "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"), + "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"), + "activation": hpt.CategoricalParameterSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + 
"batch_size": hpt.DiscreteParameterSpec( + values=[16, 32], scale="linear" + ), + }, + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + search_algorithm=_TEST_SEARCH_ALGORITHM, + measurement_selection=_TEST_MEASUREMENT_SELECTION, + ) + + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto() + + create_hyperparameter_tuning_job_mock.assert_called_once_with( + parent=_TEST_PARENT, + hyperparameter_tuning_job=expected_hyperparameter_tuning_job, + ) + + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_run_hyperparameter_tuning_job_with_fail_raises( + self, + create_hyperparameter_tuning_job_mock, + get_hyperparameter_tuning_job_mock_with_fail, + sync, + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + custom_job = aiplatform.CustomJob( + display_name=test_custom_job._TEST_DISPLAY_NAME, + worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC, + ) + + job = aiplatform.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + custom_job=custom_job, + metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE}, + parameter_spec={ + "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"), + "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"), + "activation": hpt.CategoricalParameterSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + "batch_size": hpt.DiscreteParameterSpec( + values=[16, 32], scale="linear" + ), + }, + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + search_algorithm=_TEST_SEARCH_ALGORITHM, + measurement_selection=_TEST_MEASUREMENT_SELECTION, + ) + + with pytest.raises(RuntimeError): + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto() + + create_hyperparameter_tuning_job_mock.assert_called_once_with( + parent=_TEST_PARENT, + hyperparameter_tuning_job=expected_hyperparameter_tuning_job, + ) + + assert job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED + + def test_hyperparameter_tuning_job_get_state_raises_without_run(self): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + custom_job = aiplatform.CustomJob( + display_name=test_custom_job._TEST_DISPLAY_NAME, + worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC, + ) + + job = aiplatform.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + custom_job=custom_job, + metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE}, + parameter_spec={ + "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"), + "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"), + "activation": hpt.CategoricalParameterSpec( + values=["relu", "sigmoid", 
"elu", "selu", "tanh"] + ), + "batch_size": hpt.DiscreteParameterSpec( + values=[16, 32, 64], scale="linear" + ), + }, + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + search_algorithm=_TEST_SEARCH_ALGORITHM, + measurement_selection=_TEST_MEASUREMENT_SELECTION, + ) + + with pytest.raises(RuntimeError): + print(job.state) + + def test_get_hyperparameter_tuning_job(self, get_hyperparameter_tuning_job_mock): + + job = aiplatform.HyperparameterTuningJob.get( + _TEST_HYPERPARAMETERTUNING_JOB_NAME + ) + + get_hyperparameter_tuning_job_mock.assert_called_once_with( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME + ) + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_PENDING + ) From bf9452b662f93390735f0878b083d2384afec02b Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Mon, 17 May 2021 16:10:03 -0400 Subject: [PATCH 14/29] chore: remove training utils. We will re-evaluate whether to add these as a separate package --- google/cloud/aiplatform/__init__.py | 2 - google/cloud/aiplatform/training_utils.py | 187 ------------------- tests/unit/aiplatform/test_training_utils.py | 144 -------------- 3 files changed, 333 deletions(-) delete mode 100644 google/cloud/aiplatform/training_utils.py delete mode 100644 tests/unit/aiplatform/test_training_utils.py diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index d544b4bc8f..6aa8f64161 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -45,7 +45,6 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, ) -from google.cloud.aiplatform import training_utils """ Usage: @@ -72,7 +71,6 @@ "get_experiment_df", "get_pipeline_df", "start_run", - "training_utils", "AutoMLImageTrainingJob", "AutoMLTabularTrainingJob", "AutoMLForecastingTrainingJob", diff --git a/google/cloud/aiplatform/training_utils.py b/google/cloud/aiplatform/training_utils.py deleted file mode 100644 index 95e4d2429a..0000000000 --- a/google/cloud/aiplatform/training_utils.py +++ /dev/null @@ -1,187 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import collections -import json -import os -import time -from typing import Dict, Optional - - -class EnvironmentVariables: - """Passes on OS' environment variables.""" - - @property - def training_data_uri(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for training data. None if - environment variable not set. - """ - return os.environ.get("AIP_TRAINING_DATA_URI") - - @property - def validation_data_uri(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for validation data. None - if environment variable not set. - """ - return os.environ.get("AIP_VALIDATION_DATA_URI") - - @property - def test_data_uri(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for test data. 
None if - environment variable not set. - """ - return os.environ.get("AIP_TEST_DATA_URI") - - @property - def model_dir(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for saving model artefacts. - None if environment variable not set. - """ - return os.environ.get("AIP_MODEL_DIR") - - @property - def checkpoint_dir(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for saving checkpoints. - None if environment variable not set. - """ - return os.environ.get("AIP_CHECKPOINT_DIR") - - @property - def tensorboard_log_dir(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for saving TensorBoard logs. - None if environment variable not set. - """ - return os.environ.get("AIP_TENSORBOARD_LOG_DIR") - - @property - def cluster_spec(self) -> Optional[Dict]: - """ - Returns: - json string as described in https://cloud.google.com/ai-platform-unified/docs/training/distributed-training#cluster-variables - None if environment variable not set. - """ - cluster_spec_env = os.environ.get("CLUSTER_SPEC") - if cluster_spec_env is not None: - return json.loads(cluster_spec_env) - else: - return None - - @property - def tf_config(self) -> Optional[Dict]: - """ - Returns: - json string as described in https://cloud.google.com/ai-platform-unified/docs/training/distributed-training#tf-config - None if environment variable not set. - """ - tf_config_env = os.environ.get("TF_CONFIG") - if tf_config_env is not None: - return json.loads(tf_config_env) - else: - return None - - -_DEFAULT_HYPERPARAMETER_METRIC_TAG = "training/hptuning/metric" -_DEFAULT_METRIC_PATH = "/tmp/hypertune/output.metrics" -# TODO(0olwzo0): consider to make it configurable -_MAX_NUM_METRIC_ENTRIES_TO_PRESERVE = 100 - - -class _HyperparameterTuningJobReporterSingleton: - """Main class for HyperTune.""" - - initialized = False - - @classmethod - def initialize(cls): - if cls.initialized: - return - - cls.metric_path = os.environ.get( - "CLOUD_ML_HP_METRIC_FILE", _DEFAULT_METRIC_PATH - ) - if not os.path.exists(os.path.dirname(cls.metric_path)): - os.makedirs(os.path.dirname(cls.metric_path)) - - cls.trial_id = os.environ.get("CLOUD_ML_TRIAL_ID", 0) - cls.metrics_queue = collections.deque( - maxlen=_MAX_NUM_METRIC_ENTRIES_TO_PRESERVE - ) - - cls.initialized = True - - @classmethod - def _dump_metrics_to_file(cls): - with open(cls.metric_path, "w") as metric_file: - for metric in cls.metrics_queue: - metric_file.write(json.dumps(metric, sort_keys=True) + "\n") - - @classmethod - def report_hyperparameter_tuning_metric( - cls, - hyperparameter_metric_tag, - metric_value, - global_step=None, - checkpoint_path="", - ): - """Method to report hyperparameter tuning metric. - Args: - hyperparameter_metric_tag: The hyperparameter metric name this metric - value is associated with. Should keep consistent with the tag - specified in HyperparameterSpec. - metric_value: float, the values for the hyperparameter metric to report. - global_step: int, the global step this metric value is associated with. - checkpoint_path: The checkpoint path which can be used to warmstart from. 
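With `training_utils` removed, code that relied on the metric reporter can write the same JSON-lines format inline. A rough equivalent of what the deleted singleton produced (simplified to append records rather than keep a bounded queue):

```python
import json
import os
import time

def report_metric(tag: str, value: float, global_step: int = 0) -> None:
    # Same file path and record keys as the removed reporter.
    path = os.environ.get("CLOUD_ML_HP_METRIC_FILE", "/tmp/hypertune/output.metrics")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    record = {
        "timestamp": time.time(),
        "trial": str(os.environ.get("CLOUD_ML_TRIAL_ID", 0)),
        tag: str(float(value)),
        "global_step": str(global_step),
        "checkpoint_path": "",
    }
    with open(path, "a") as metric_file:
        metric_file.write(json.dumps(record, sort_keys=True) + "\n")
```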
- """ - metric_value = float(metric_value) - metric_tag = _DEFAULT_HYPERPARAMETER_METRIC_TAG - if hyperparameter_metric_tag: - metric_tag = hyperparameter_metric_tag - metric_body = { - "timestamp": time.time(), - "trial": str(cls.trial_id), - metric_tag: str(metric_value), - "global_step": str(int(global_step) if global_step else 0), - "checkpoint_path": checkpoint_path, - } - cls.metrics_queue.append(metric_body) - cls._dump_metrics_to_file() - - -def report_hyperparameter_tuning_metrics( - metrics: Dict[str, float], global_step: Optional[int] = None, checkpoint_path="" -): - _HyperparameterTuningJobReporterSingleton.initialize() - - for hyperparameter_metric_tag, metric_value in metrics.items(): - _HyperparameterTuningJobReporterSingleton.report_hyperparameter_tuning_metric( - hyperparameter_metric_tag=hyperparameter_metric_tag, - metric_value=metric_value, - global_step=global_step, - checkpoint_path=checkpoint_path, - ) diff --git a/tests/unit/aiplatform/test_training_utils.py b/tests/unit/aiplatform/test_training_utils.py deleted file mode 100644 index 1d4b839151..0000000000 --- a/tests/unit/aiplatform/test_training_utils.py +++ /dev/null @@ -1,144 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
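Likewise, the `EnvironmentVariables` accessors exercised by the tests removed below reduce to direct environment lookups; a minimal sketch of the replacement pattern:

```python
import json
import os

# The same AIP_* variables the removed EnvironmentVariables class wrapped.
model_dir = os.environ.get("AIP_MODEL_DIR")
checkpoint_dir = os.environ.get("AIP_CHECKPOINT_DIR")

cluster_spec_env = os.environ.get("CLUSTER_SPEC")
cluster_spec = json.loads(cluster_spec_env) if cluster_spec_env else None
```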
-#
-
-import json
-import os
-import pytest
-
-from google.cloud.aiplatform import training_utils
-from unittest import mock
-
-_TEST_TRAINING_DATA_URI = "gs://training-data-uri"
-_TEST_VALIDATION_DATA_URI = "gs://test-validation-data-uri"
-_TEST_TEST_DATA_URI = "gs://test-data-uri"
-_TEST_MODEL_DIR = "gs://test-model-dir"
-_TEST_CHECKPOINT_DIR = "gs://test-checkpoint-dir"
-_TEST_TENSORBOARD_LOG_DIR = "gs://test-tensorboard-log-dir"
-_TEST_CLUSTER_SPEC = """{
-    "cluster": {
-        "worker_pools":[
-            {
-                "index":0,
-                "replicas":[
-                    "training-workerpool0-ab-0:2222"
-                ]
-            },
-            {
-                "index":1,
-                "replicas":[
-                    "training-workerpool1-ab-0:2222",
-                    "training-workerpool1-ab-1:2222"
-                ]
-            }
-        ]
-    },
-    "environment": "cloud",
-    "task": {
-        "worker_pool_index":0,
-        "replica_index":0,
-        "trial":"TRIAL_ID"
-    }
-}"""
-
-
-class TestTrainingUtils:
-    @pytest.fixture
-    def mock_environment(self):
-        env_vars = {
-            "AIP_TRAINING_DATA_URI": _TEST_TRAINING_DATA_URI,
-            "AIP_VALIDATION_DATA_URI": _TEST_VALIDATION_DATA_URI,
-            "AIP_TEST_DATA_URI": _TEST_TEST_DATA_URI,
-            "AIP_MODEL_DIR": _TEST_MODEL_DIR,
-            "AIP_CHECKPOINT_DIR": _TEST_CHECKPOINT_DIR,
-            "AIP_TENSORBOARD_LOG_DIR": _TEST_TENSORBOARD_LOG_DIR,
-            "CLUSTER_SPEC": _TEST_CLUSTER_SPEC,
-            "TF_CONFIG": _TEST_CLUSTER_SPEC,
-        }
-        with mock.patch.dict(os.environ, env_vars):
-            yield
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_training_data_uri(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.training_data_uri == _TEST_TRAINING_DATA_URI
-
-    def test_training_data_uri_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.training_data_uri is None
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_validation_data_uri(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.validation_data_uri == _TEST_VALIDATION_DATA_URI
-
-    def test_validation_data_uri_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.validation_data_uri is None
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_test_data_uri(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.test_data_uri == _TEST_TEST_DATA_URI
-
-    def test_test_data_uri_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.test_data_uri is None
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_model_dir(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.model_dir == _TEST_MODEL_DIR
-
-    def test_model_dir_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.model_dir is None
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_checkpoint_dir(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.checkpoint_dir == _TEST_CHECKPOINT_DIR
-
-    def test_checkpoint_dir_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.checkpoint_dir is None
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_tensorboard_log_dir(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.tensorboard_log_dir == _TEST_TENSORBOARD_LOG_DIR
-
-    def test_tensorboard_log_dir_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.tensorboard_log_dir is None
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_cluster_spec(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.cluster_spec == json.loads(_TEST_CLUSTER_SPEC)
-
-    def test_cluster_spec_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.cluster_spec is None
-
-    @pytest.mark.usefixtures("mock_environment")
-    def test_tf_config(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.tf_config == json.loads(_TEST_CLUSTER_SPEC)
-
-    def test_tf_config_none(self):
-        env_vars = training_utils.EnvironmentVariables()
-        assert env_vars.tf_config is None

From 78a29f4911d3da22e2451e9855ba5bf0e42184fa Mon Sep 17 00:00:00 2001
From: Sasha Sobran
Date: Mon, 17 May 2021 16:16:59 -0400
Subject: [PATCH 15/29] chore: add additional documentation

---
 google/cloud/aiplatform/jobs.py | 54 +++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 9e36189877..5a72596177 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -901,6 +901,7 @@ def __init__(
     ):
         """Construct a Custom Job with Worker Pool Specs.
 
+        ```
         Example usage:
         worker_pool_specs = [
                 {
                     "machine_spec": {
                         "machine_type": "n1-standard-4",
                         "accelerator_type": "NVIDIA_TESLA_K80",
                         "accelerator_count": 1,
                     },
                     "replica_count": 1,
                     "container_spec": {
                         "image_uri": container_image_uri,
                         "command": [],
                         "args": [],
                     },
                 }
             ]
 
         my_job = aiplatform.CustomJob(
             display_name='my_job',
             worker_pool_specs=worker_pool_specs
         )
 
         my_job.run()
+        ```
 
         For more information on configuring worker pool specs please visit:
         https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job
@@ -1209,6 +1211,58 @@ def __init__(
         """
         Configures a HyperparameterTuning Job.
 
+        Example usage:
+
+        ```
+        from google.cloud.aiplatform import hyperparamter_tuning as hpt
+
+        worker_pool_specs = [
+            {
+                "machine_spec": {
+                    "machine_type": "n1-standard-4",
+                    "accelerator_type": "NVIDIA_TESLA_K80",
+                    "accelerator_count": 1,
+                },
+                "replica_count": 1,
+                "container_spec": {
+                    "image_uri": container_image_uri,
+                    "command": [],
+                    "args": [],
+                },
+            }
+        ]
+
+        custom_job = aiplatform.CustomJob(
+            display_name='my_job',
+            worker_pool_specs=worker_pool_specs
+        )
+
+
+        hp_job = aiplatform.HyperparameterTuningJob(
+            display_name='hp-test',
+            custom_job=custom_job,
+            metric_spec={
+                'loss': 'minimize',
+            },
+            parameter_spec={
+                'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
+                'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
+                'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
+                'batch_size': hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
+            },
+            max_trial_count=128,
+            parallel_trial_count=8,
+        )
+
+        hp_job.run()
+
+        print(hp_job.trials)
+        ```
+
+
+        For more information on using hyperparameter tuning please visit:
+        https://cloud.google.com/ai-platform-unified/docs/training/using-hyperparameter-tuning
 
         Args:
             display_name (str):
                 Required. The user-defined name of the HyperparameterTuningJob.
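A note on how trials produce the metrics that the `metric_spec` documented above refers to: the reporting helper deleted earlier in this series wrote one JSON object per line to the HyperTune metrics file. Below is a minimal standalone sketch of that wire format, assuming only the conventions visible in the deleted code (`CLOUD_ML_HP_METRIC_FILE`, `CLOUD_ML_TRIAL_ID`, and the default `/tmp/hypertune/output.metrics` path); `report_metric` is an illustrative name, not an SDK function, and appending is a simplification of the deleted helper's bounded queue.

```
import json
import os
import time


def report_metric(tag: str, value: float, global_step: int = 0) -> None:
    # Resolve the metrics file the same way the deleted helper did.
    metric_path = os.environ.get(
        "CLOUD_ML_HP_METRIC_FILE", "/tmp/hypertune/output.metrics"
    )
    os.makedirs(os.path.dirname(metric_path), exist_ok=True)

    # One metric entry: all values serialized as strings, keyed by the
    # metric tag, stamped with the trial ID from the environment.
    entry = {
        "timestamp": time.time(),
        "trial": str(os.environ.get("CLOUD_ML_TRIAL_ID", 0)),
        tag: str(float(value)),
        "global_step": str(int(global_step)),
    }

    # The deleted helper kept the last 100 entries in a deque and rewrote
    # the whole file on each report; appending is the simplest equivalent
    # for a sketch.
    with open(metric_path, "a") as metric_file:
        metric_file.write(json.dumps(entry, sort_keys=True) + "\n")


report_metric("loss", 0.345, global_step=100)
```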
From 2434aaf12f240e2ceeab0df7b621bc5d1c416e11 Mon Sep 17 00:00:00 2001
From: Sasha Sobran
Date: Mon, 17 May 2021 16:20:40 -0400
Subject: [PATCH 16/29] chore: rename test

---
 ...erparametertuning_job.py => test_hyperparameter_tuning_job.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/unit/aiplatform/{test_hyperparametertuning_job.py => test_hyperparameter_tuning_job.py} (100%)

diff --git a/tests/unit/aiplatform/test_hyperparametertuning_job.py b/tests/unit/aiplatform/test_hyperparameter_tuning_job.py
similarity index 100%
rename from tests/unit/aiplatform/test_hyperparametertuning_job.py
rename to tests/unit/aiplatform/test_hyperparameter_tuning_job.py

From 7752fc2b2a4969803b13b852cecb2deb9c11c6f7 Mon Sep 17 00:00:00 2001
From: Sasha Sobran
Date: Mon, 17 May 2021 16:26:40 -0400
Subject: [PATCH 17/29] chore: remove conditional parameter spec arguments
 from public parameters, will follow up to add them in

---
 .../cloud/aiplatform/hyperparameter_tuning.py | 45 ++++---------------
 1 file changed, 8 insertions(+), 37 deletions(-)

diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py
index 765de6f422..2c265cd4f1 100644
--- a/google/cloud/aiplatform/hyperparameter_tuning.py
+++ b/google/cloud/aiplatform/hyperparameter_tuning.py
@@ -101,12 +101,7 @@ class DoubleParameterSpec(_ParameterSpec):
     _parameter_spec_value_key = "double_value_spec"
 
     def __init__(
-        self,
-        min: float,
-        max: float,
-        scale: str,
-        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None,
+        self, min: float, max: float, scale: str,
     ):
         """
         Value specification for a parameter in ``DOUBLE`` type.
@@ -124,10 +119,7 @@ def __init__(
             Accepts: 'linear', 'log', 'reverse_log'
         """
 
-        super().__init__(
-            conditional_parameter_spec=conditional_parameter_spec,
-            parent_values=parent_values,
-        )
+        super().__init__()
 
         self.min = min
         self.max = max
@@ -143,12 +135,7 @@ class IntegerParameterSpec(_ParameterSpec):
     _parameter_spec_value_key = "integer_value_spec"
 
     def __init__(
-        self,
-        min: int,
-        max: int,
-        scale: str,
-        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None,
+        self, min: int, max: int, scale: str,
    ):
         """
         Value specification for a parameter in ``INTEGER`` type.
@@ -166,10 +153,7 @@ def __init__(
             Accepts: 'linear', 'log', 'reverse_log'
         """
 
-        super().__init__(
-            conditional_parameter_spec=conditional_parameter_spec,
-            parent_values=parent_values,
-        )
+        super().__init__()
 
         self.min = min
         self.max = max
@@ -185,10 +169,7 @@ class CategoricalParameterSpec(_ParameterSpec):
     _parameter_spec_value_key = "categorical_value_spec"
 
     def __init__(
-        self,
-        values: Sequence[str],
-        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
-        parent_values: Optional[List[Union[float, int, str]]] = None,
+        self, values: Sequence[str],
     ):
         """Value specification for a parameter in ``CATEGORICAL`` type.
 
         Args:
             values (Sequence[str]):
                 Required. The list of possible categories.
""" - super().__init__( - conditional_parameter_spec=conditional_parameter_spec, - parent_values=parent_values, - ) + super().__init__() self.values = values @@ -214,11 +192,7 @@ class DiscreteParameterSpec(_ParameterSpec): _parameter_spec_value_key = "discrete_value_spec" def __init__( - self, - values: Sequence[float], - scale: str, - conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None, - parent_values: Optional[List[Union[float, int, str]]] = None, + self, values: Sequence[float], scale: str, ): """Value specification for a parameter in ``DISCRETE`` type. @@ -235,10 +209,7 @@ def __init__( Accepts: 'linear', 'log', 'reverse_log' """ - super().__init__( - conditional_parameter_spec=conditional_parameter_spec, - parent_values=parent_values, - ) + super().__init__() self.values = values self.scale = scale From aab2d9c6ffdf1ef3f1c1357cd4b78a912a07650c Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Mon, 17 May 2021 17:38:13 -0400 Subject: [PATCH 18/29] chore: lint --- .../cloud/aiplatform/hyperparameter_tuning.py | 8 +++---- google/cloud/aiplatform/jobs.py | 24 +++++++++---------- tests/unit/aiplatform/test_custom_job.py | 6 ++--- .../test_hyperparameter_tuning_job.py | 1 - 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py index 2c265cd4f1..56e9fcda4e 100644 --- a/google/cloud/aiplatform/hyperparameter_tuning.py +++ b/google/cloud/aiplatform/hyperparameter_tuning.py @@ -113,7 +113,7 @@ def __init__( max (float): Required. Inclusive maximum value of the parameter. - scale (str): + scale (str): Required. The type of scaling that should be applied to this parameter. Accepts: 'linear', 'log', 'reverse_log' @@ -147,7 +147,7 @@ def __init__( max (float): Required. Inclusive maximum value of the parameter. - scale (str): + scale (str): Required. The type of scaling that should be applied to this parameter. Accepts: 'linear', 'log', 'reverse_log' @@ -174,7 +174,7 @@ def __init__( """Value specification for a parameter in ``CATEGORICAL`` type. Args: - values (Sequence[str]): + values (Sequence[str]): Required. The list of possible categories. """ @@ -203,7 +203,7 @@ def __init__( might have possible settings of 1.5, 2.5, and 4.0. This list should not contain more than 1,000 values. - scale (str): + scale (str): Required. The type of scaling that should be applied to this parameter. Accepts: 'linear', 'log', 'reverse_log' diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index 5a72596177..ab59562331 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -923,12 +923,12 @@ def __init__( display_name='my_job', worker_pool_specs=worker_pool_specs ) - + my_job.run() ``` - For more information on configuring worker pool specs please visit: + For more information on configuring worker pool specs please visit: https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job @@ -937,7 +937,7 @@ def __init__( Required. The user-defined name of the HyperparameterTuningJob. The name can be up to 128 characters long and can be consist of any UTF-8 characters. - worker_pool_specs (Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]]): + worker_pool_specs (Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]]): Required. The spec of the worker pools including machine type and Docker image. Can provided as a list of dictionaries or list of WorkerPoolSpec proto messages. 
         project (str):
@@ -1136,7 +1136,7 @@ def run(
                 to workers leaving and joining a job.
             sync (bool):
                 Whether to execute this method synchronously. If False, this method
-                will unblock and it will be executed in a concurrent Future. 
+                will unblock and it will be executed in a concurrent Future.
         """
 
         if service_account:
@@ -1236,7 +1236,7 @@ def __init__(
             display_name='my_job',
             worker_pool_specs=worker_pool_specs
        )
-        
+
 
         hp_job = aiplatform.HyperparameterTuningJob(
             display_name='hp-test',
@@ -1251,7 +1251,7 @@ def __init__(
                 'batch_size': hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
             },
             max_trial_count=128,
-            parallel_trial_count=8, 
+            parallel_trial_count=8,
         )
 
         hp_job.run()
@@ -1273,7 +1273,7 @@ def __init__(
                 applies to the CustomJobs created in all the trials.
             metric_spec: Dict[str, str]
                 Required. Dictionary representing metrics to optimize. The dictionary key is the metric_id,
-                which is reported by your training job, and the dictionary value is the 
+                which is reported by your training job, and the dictionary value is the
                 optimization goal of the metric ('minimize' or 'maximize').
                 example: metric_spec = {'loss': 'minimize', 'accuracy': 'maximize'}
@@ -1283,15 +1283,15 @@ def __init__(
                 which is passed into your training job as a command line keyword argument, and the
                 dictionary value is the parameter specification of the metric.
-                
+
                 from google.cloud.aiplatform import hyperparameter_tuning as hpt
-                
+
                 parameter_spec={
                     'decay': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
                     'learning_rate': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
                     'batch_size': hpt.DiscreteParameterSpec(values=[4, 8, 16, 32, 64, 128], scale='linear')
                 }
-                
+
                 Supported parameter specifications can be found in aiplatform.hyperparameter_tuning.
                 These parameter specifications are currently supported:
                 DoubleParameterSpec, IntegerParameterSpec, CategoricalParameterSpec, DiscreteParameterSpec
@@ -1338,7 +1338,7 @@ def __init__(
                 HyperparameterTuningJob. If this is set, then all
                 resources created by the
                 HyperparameterTuningJob will be encrypted with
-                the provided encryption key. 
+                the provided encryption key.
         """
         super().__init__(project=project, location=location, credentials=credentials)
@@ -1404,7 +1404,7 @@ def run(
                 to workers leaving and joining a job.
             sync (bool):
                 Whether to execute this method synchronously. If False, this method
-                will unblock and it will be executed in a concurrent Future. 
+                will unblock and it will be executed in a concurrent Future.
""" if service_account: diff --git a/tests/unit/aiplatform/test_custom_job.py b/tests/unit/aiplatform/test_custom_job.py index 3d5e9e510b..37c2ac3df0 100644 --- a/tests/unit/aiplatform/test_custom_job.py +++ b/tests/unit/aiplatform/test_custom_job.py @@ -25,7 +25,7 @@ from google.rpc import status_pb2 import test_training_jobs -from test_training_jobs import mock_python_package_to_gcs +from test_training_jobs import mock_python_package_to_gcs # noqa: F401 from google.cloud import aiplatform from google.cloud.aiplatform.compat.types import custom_job as gca_custom_job_compat @@ -260,7 +260,7 @@ def test_no_staging_bucket_raises(self): aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) with pytest.raises(RuntimeError): - job = aiplatform.CustomJob( + job = aiplatform.CustomJob( # noqa: F841 display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC, ) @@ -316,7 +316,7 @@ def test_create_from_local_script_raises_with_no_staging_bucket( with pytest.raises(RuntimeError): # configuration on this is tested in test_training_jobs.py - job = aiplatform.CustomJob.from_local_script( + job = aiplatform.CustomJob.from_local_script( # noqa: F841 display_name=_TEST_DISPLAY_NAME, script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME, container_uri=_TEST_TRAINING_CONTAINER_IMAGE, diff --git a/tests/unit/aiplatform/test_hyperparameter_tuning_job.py b/tests/unit/aiplatform/test_hyperparameter_tuning_job.py index d406e07efa..fcd15f93ac 100644 --- a/tests/unit/aiplatform/test_hyperparameter_tuning_job.py +++ b/tests/unit/aiplatform/test_hyperparameter_tuning_job.py @@ -21,7 +21,6 @@ from unittest import mock from unittest.mock import patch -from google.protobuf import duration_pb2 # type: ignore from google.rpc import status_pb2 from google.cloud import aiplatform From 1b3bad1910d47c58d8529d9583c6c74741f150bf Mon Sep 17 00:00:00 2001 From: Sasha Sobran Date: Mon, 17 May 2021 17:42:38 -0400 Subject: [PATCH 19/29] chore: resolve reviewers's comments --- google/cloud/aiplatform/hyperparameter_tuning.py | 2 +- google/cloud/aiplatform/jobs.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py index 56e9fcda4e..74a292e7b9 100644 --- a/google/cloud/aiplatform/hyperparameter_tuning.py +++ b/google/cloud/aiplatform/hyperparameter_tuning.py @@ -46,7 +46,7 @@ def __init__( @classmethod @abc.abstractmethod def _proto_parameter_value_class(self) -> proto.Message: - """The proto represenation of this parameter.""" + """The proto representation of this parameter.""" pass @property diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index ab59562331..0ea014e3d0 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -1183,7 +1183,7 @@ def job_spec(self): class HyperparameterTuningJob(_RunnableJob): - """AI Pltatform(Unified) HyperparameterTuning Job""" + """AI Pltaform (Unified) Hyperparameter Tuning Job.""" _resource_noun = "hyperparameterTuningJobs" _getter_method = "get_hyperparameter_tuning_job" @@ -1396,7 +1396,7 @@ def run( Private services access must already be configured for the network. If left unspecified, the job is not peered with any network. timeout (int): - The maximum job running time in seconds. The default is 7 days. + Optional. The maximum job running time in seconds. The default is 7 days. restart_job_on_worker_restart (bool): Restarts the entire CustomJob if a worker gets restarted. 
                 This feature can be used by distributed training jobs that are not resilient
                 to workers leaving and joining a job.
@@ -1414,9 +1414,9 @@ def run(
             self._gca_resource.trial_job_spec.network = network
 
         if timeout or restart_job_on_worker_restart:
-            timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
+            duration = duration_pb2.Duration(seconds=timeout) if timeout else None
             self._gca_resource.trial_job_spec.scheduling = gca_custom_job_compat.Scheduling(
-                timeout=timeout,
+                timeout=duration,
                 restart_job_on_worker_restart=restart_job_on_worker_restart,
             )

From a492741496d3b8f0b99e2d136b4b789dae666dac Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:12:42 -0400
Subject: [PATCH 20/29] Update google/cloud/aiplatform/hyperparameter_tuning.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/hyperparameter_tuning.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py
index 74a292e7b9..a7a0e641cd 100644
--- a/google/cloud/aiplatform/hyperparameter_tuning.py
+++ b/google/cloud/aiplatform/hyperparameter_tuning.py
@@ -204,9 +204,9 @@ def __init__(
                 4.0. This list should not contain more than 1,000 values.
 
             scale (str):
-            Required. The type of scaling that should be applied to this parameter.
+                Required. The type of scaling that should be applied to this parameter.
 
-            Accepts: 'linear', 'log', 'reverse_log'
+                Accepts: 'linear', 'log', 'reverse_log'
         """
 
         super().__init__()

From 24d3949da59b8c0ac8ff09e30a6fad38eccc4824 Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:12:53 -0400
Subject: [PATCH 21/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 0ea014e3d0..1ad51ad864 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1183,7 +1183,7 @@ def job_spec(self):
 
 
 class HyperparameterTuningJob(_RunnableJob):
-    """AI Pltaform (Unified) Hyperparameter Tuning Job."""
+    """AI Platform (Unified) Hyperparameter Tuning Job."""
 
     _resource_noun = "hyperparameterTuningJobs"
     _getter_method = "get_hyperparameter_tuning_job"

From 8e3e99405702711c460754ca639c3bf65f710623 Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:13:15 -0400
Subject: [PATCH 22/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 1ad51ad864..5ca036a0fe 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1008,9 +1008,12 @@ def from_local_script(
         Args:
             display_name (str):
                 Required. The user-defined name of this CustomJob.
-            script_path (str): Required. Local path to training script.
+            script_path (str):
+                Required. Local path to training script.
             container_uri (str):
                 Required. URI of the training container image to use for the custom job.
+            args (Optional[List[Union[str, float, int]]]):
+                Optional. Command line arguments to be passed to the Python task.
             requirements (Sequence[str]):
                 Optional. List of Python package dependencies of the script.
             environment_variables (Dict[str, str]):

From 2f355794020f6fcfa28ca6a5eb9475c3ec92a478 Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:13:42 -0400
Subject: [PATCH 23/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 5ca036a0fe..10c29457ab 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1005,6 +1005,21 @@ def from_local_script(
     ) -> "CustomJob":
         """Configures a custom job from a local script.
 
+        Example usage:
+        ```
+        job = aiplatform.CustomJob.from_local_script(
+            display_name="my-custom-job",
+            script_path="training_script.py",
+            container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
+            requirements=["gcsfs==0.7.1"],
+            replica_count=1,
+            args=['--dataset', 'gs://my-bucket/my-dataset',
+            '--model_output_uri', 'gs://my-bucket/model']
+        )
+
+        job.run()
+        ```
+
         Args:
             display_name (str):
                 Required. The user-defined name of this CustomJob.

From c30a80ffaec77ae77c8fd931671286dcfc07b6a4 Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:13:58 -0400
Subject: [PATCH 24/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 10c29457ab..b6dc131ac7 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1130,7 +1130,7 @@ def run(
         self,
         service_account: Optional[str] = None,
         network: Optional[str] = None,
-        timeout: Optional[int] = None,  # seconds
+        timeout: Optional[int] = None,
         restart_job_on_worker_restart: bool = False,
         sync: bool = True,
     ) -> None:

From 3a66659d8fd47170e823992d3aeaf514386cca6a Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:14:07 -0400
Subject: [PATCH 25/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index b6dc131ac7..20b3baa3eb 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1232,7 +1232,7 @@ def __init__(
         Example usage:
 
         ```
-        from google.cloud.aiplatform import hyperparamter_tuning as hpt
+        from google.cloud.aiplatform import hyperparameter_tuning as hpt

From 1c9168518bb005d017ed91a3b27caf8f66a0671f Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:14:44 -0400
Subject: [PATCH 26/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 20b3baa3eb..ff2e00b422 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1324,9 +1324,23 @@ def __init__(
                 If set to 0, AI Platform decides how many Trials
                 must fail before the whole job fails.
             search_algorithm (str):
-                The search algorithm specified for the Study.
-
-                Accepts: 'random', 'grid'
+                The search algorithm specified for the Study.
+                Accepts one of the following:
+                    `None` - If you do not specify an algorithm, your job uses
+                    the default AI Platform algorithm. The default algorithm
+                    applies Bayesian optimization to arrive at the optimal
+                    solution with a more effective search over the parameter space.
+
+                    'grid' - A simple grid search within the feasible space. This
+                    option is particularly useful if you want to specify a quantity
+                    of trials that is greater than the number of points in the
+                    feasible space. In such cases, if you do not specify a grid
+                    search, the AI Platform default algorithm may generate duplicate
+                    suggestions. To use grid search, all parameter specs must be
+                    of type `IntegerParameterSpec`, `CategoricalParameterSpec`,
+                    or `DiscreteParameterSpec`.
+
+                    'random' - A simple random search within the feasible space.
             measurement_selection (str):
                 This indicates which measurement to use if/when the service
                 automatically selects the final measurement from previously reported

From e6a53b5a4686b7cef9cb5335615ef2b138eebdaf Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:16:33 -0400
Subject: [PATCH 27/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index ff2e00b422..9230c14ae3 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1192,6 +1192,7 @@ def job_spec(self):
 _SEARCH_ALGORITHM_TO_PROTO_VALUE = {
     "random": gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH,
     "grid": gca_study_compat.StudySpec.Algorithm.GRID_SEARCH,
+    None: gca_study_compat.StudySpec.Algorithm.ALGORITHM_UNSPECIFIED,
 }
 
 _MEASUREMENT_SELECTION_TO_PROTO_VALUE = {

From 4e07e6dd6c86c708fbf4a224ea98258c16b4bdb6 Mon Sep 17 00:00:00 2001
From: sasha-gitg <44654632+sasha-gitg@users.noreply.github.com>
Date: Tue, 18 May 2021 09:16:41 -0400
Subject: [PATCH 28/29] Update google/cloud/aiplatform/jobs.py

Co-authored-by: Vinny Senthil
---
 google/cloud/aiplatform/jobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 9230c14ae3..9c8412e372 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1220,7 +1220,7 @@ def __init__(
         max_trial_count: int,
         parallel_trial_count: int,
         max_failed_trial_count: int = 0,
-        search_algorithm: Optional[str] = "random",
+        search_algorithm: Optional[str] = None,
         measurement_selection: Optional[str] = "best",
         project: Optional[str] = None,
         location: Optional[str] = None,

From d5128b0f8b7a2a5a2d719cc99b4861f177acefbb Mon Sep 17 00:00:00 2001
From: Sasha Sobran
Date: Tue, 18 May 2021 09:52:52 -0400
Subject: [PATCH 29/29] chore: lint

---
 google/cloud/aiplatform/jobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py
index 9c8412e372..7b1f5cccc5 100644
--- a/google/cloud/aiplatform/jobs.py
+++ b/google/cloud/aiplatform/jobs.py
@@ -1325,7 +1325,7 @@ def __init__(
                 If set to 0, AI Platform decides how many Trials
                 must fail before the whole job fails.
             search_algorithm (str):
-                The search algorithm specified for the Study. 
+                The search algorithm specified for the Study.
                 Accepts one of the following:
                     `None` - If you do not specify an algorithm, your job uses
                     the default AI Platform algorithm. The default algorithm
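A short usage sketch tying together the `search_algorithm` behavior introduced in the last few patches: `None` (the new default) maps to `ALGORITHM_UNSPECIFIED`, which lets the service apply its default Bayesian optimization, while `'grid'` is only valid when every parameter is an Integer, Categorical, or Discrete spec. The display names, image URI, and trial counts below are illustrative placeholders, and the snippet assumes `aiplatform.init(...)` has already been called with a project, location, and staging bucket.

```
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Minimal single-replica trial job; the container image is a placeholder.
worker_pool_specs = [
    {
        "machine_spec": {"machine_type": "n1-standard-4"},
        "replica_count": 1,
        "container_spec": {"image_uri": "gcr.io/my-project/my-training-image"},
    }
]

custom_job = aiplatform.CustomJob(
    display_name="grid-search-trial-job",
    worker_pool_specs=worker_pool_specs,
)

# All three parameters are Integer/Categorical/Discrete specs, so 'grid'
# is allowed here; omitting search_algorithm would fall back to the
# service's default Bayesian optimization instead.
hp_job = aiplatform.HyperparameterTuningJob(
    display_name="grid-search-test",
    custom_job=custom_job,
    metric_spec={"loss": "minimize"},
    parameter_spec={
        "units": hpt.IntegerParameterSpec(min=4, max=128, scale="linear"),
        "activation": hpt.CategoricalParameterSpec(values=["relu", "selu"]),
        "batch_size": hpt.DiscreteParameterSpec(values=[16, 32, 64], scale="linear"),
    },
    max_trial_count=24,
    parallel_trial_count=3,
    search_algorithm="grid",
)

hp_job.run()
```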