diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index 720aa46b21..20d8141a22 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -1070,13 +1070,15 @@ def from_local_script( display_name: str, script_path: str, container_uri: str, - args: Optional[List[Union[str, float, int]]] = None, + args: Optional[Sequence[str]] = None, requirements: Optional[Sequence[str]] = None, environment_variables: Optional[Dict[str, str]] = None, replica_count: int = 1, machine_type: str = "n1-standard-4", accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", accelerator_count: int = 0, + boot_disk_type: str = "pd-ssd", + boot_disk_size_gb: int = 100, base_output_dir: Optional[str] = None, project: Optional[str] = None, location: Optional[str] = None, @@ -1110,7 +1112,7 @@ def from_local_script( Required. Local path to training script. container_uri (str): Required: Uri of the training container image to use for custom job. - args (Optional[List[Union[str, float, int]]]): + args (Optional[Sequence[str]]): Optional. Command line arguments to be passed to the Python task. requirements (Sequence[str]): Optional. List of python packages dependencies of script. @@ -1136,6 +1138,13 @@ def from_local_script( NVIDIA_TESLA_T4 accelerator_count (int): Optional. The number of accelerators to attach to a worker replica. + boot_disk_type (str): + Optional. Type of the boot disk, default is `pd-ssd`. + Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or + `pd-standard` (Persistent Disk Hard Disk Drive). + boot_disk_size_gb (int): + Optional. Size in GB of the boot disk, default is 100GB. + boot disk size must be within the range of [100, 64000]. base_output_dir (str): Optional. GCS output directory of job. If not provided a timestamped directory in the staging directory will be used. @@ -1188,6 +1197,8 @@ def from_local_script( machine_type=machine_type, accelerator_count=accelerator_count, accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, ).pool_specs python_packager = source_utils._TrainingScriptPythonPackager( diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index db7db10f2f..52418096be 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -1139,6 +1139,8 @@ def _prepare_and_validate_run( machine_type: str = "n1-standard-4", accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", accelerator_count: int = 0, + boot_disk_type: str = "pd-ssd", + boot_disk_size_gb: int = 100, ) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]: """Create worker pool specs and managed model as well validating the run. @@ -1172,6 +1174,13 @@ def _prepare_and_validate_run( NVIDIA_TESLA_T4 accelerator_count (int): The number of accelerators to attach to a worker replica. + boot_disk_type (str): + Type of the boot disk, default is `pd-ssd`. + Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or + `pd-standard` (Persistent Disk Hard Disk Drive). + boot_disk_size_gb (int): + Size in GB of the boot disk, default is 100GB. + boot disk size must be within the range of [100, 64000]. Returns: Worker pools specs and managed model for run. @@ -1204,6 +1213,8 @@ def _prepare_and_validate_run( machine_type=machine_type, accelerator_count=accelerator_count, accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, ).pool_specs managed_model = self._managed_model @@ -1588,6 +1599,8 @@ def run( machine_type: str = "n1-standard-4", accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", accelerator_count: int = 0, + boot_disk_type: str = "pd-ssd", + boot_disk_size_gb: int = 100, training_fraction_split: float = 0.8, validation_fraction_split: float = 0.1, test_fraction_split: float = 0.1, @@ -1724,6 +1737,13 @@ def run( NVIDIA_TESLA_T4 accelerator_count (int): The number of accelerators to attach to a worker replica. + boot_disk_type (str): + Type of the boot disk, default is `pd-ssd`. + Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or + `pd-standard` (Persistent Disk Hard Disk Drive). + boot_disk_size_gb (int): + Size in GB of the boot disk, default is 100GB. + boot disk size must be within the range of [100, 64000]. training_fraction_split (float): The fraction of the input data that is to be used to train the Model. This is ignored if Dataset is not provided. @@ -1774,6 +1794,8 @@ def run( machine_type=machine_type, accelerator_count=accelerator_count, accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, ) # make and copy package @@ -2241,6 +2263,8 @@ def run( machine_type: str = "n1-standard-4", accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", accelerator_count: int = 0, + boot_disk_type: str = "pd-ssd", + boot_disk_size_gb: int = 100, training_fraction_split: float = 0.8, validation_fraction_split: float = 0.1, test_fraction_split: float = 0.1, @@ -2370,6 +2394,13 @@ def run( NVIDIA_TESLA_T4 accelerator_count (int): The number of accelerators to attach to a worker replica. + boot_disk_type (str): + Type of the boot disk, default is `pd-ssd`. + Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or + `pd-standard` (Persistent Disk Hard Disk Drive). + boot_disk_size_gb (int): + Size in GB of the boot disk, default is 100GB. + boot disk size must be within the range of [100, 64000]. training_fraction_split (float): The fraction of the input data that is to be used to train the Model. This is ignored if Dataset is not provided. @@ -2425,6 +2456,8 @@ def run( machine_type=machine_type, accelerator_count=accelerator_count, accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, ) return self._run( @@ -4402,6 +4435,8 @@ def run( machine_type: str = "n1-standard-4", accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", accelerator_count: int = 0, + boot_disk_type: str = "pd-ssd", + boot_disk_size_gb: int = 100, training_fraction_split: float = 0.8, validation_fraction_split: float = 0.1, test_fraction_split: float = 0.1, @@ -4531,6 +4566,13 @@ def run( NVIDIA_TESLA_T4 accelerator_count (int): The number of accelerators to attach to a worker replica. + boot_disk_type (str): + Type of the boot disk, default is `pd-ssd`. + Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or + `pd-standard` (Persistent Disk Hard Disk Drive). + boot_disk_size_gb (int): + Size in GB of the boot disk, default is 100GB. + boot disk size must be within the range of [100, 64000]. training_fraction_split (float): The fraction of the input data that is to be used to train the Model. This is ignored if Dataset is not provided. @@ -4581,6 +4623,8 @@ def run( machine_type=machine_type, accelerator_count=accelerator_count, accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, ) return self._run( diff --git a/google/cloud/aiplatform/utils/worker_spec_utils.py b/google/cloud/aiplatform/utils/worker_spec_utils.py index 385ac83979..1c0b60540f 100644 --- a/google/cloud/aiplatform/utils/worker_spec_utils.py +++ b/google/cloud/aiplatform/utils/worker_spec_utils.py @@ -22,16 +22,19 @@ ) -class _MachineSpec(NamedTuple): - """Specification container for Machine specs used for distributed training. +class _WorkerPoolSpec(NamedTuple): + """Specification container for Worker Pool specs used for distributed training. Usage: - spec = _MachineSpec( + spec = _WorkerPoolSpec( replica_count=10, machine_type='n1-standard-4', accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80') + accelerator_type='NVIDIA_TESLA_K80', + boot_disk_type='pd-ssd', + boot_disk_size_gb=100, + ) Note that container and python package specs are not stored with this spec. """ @@ -40,6 +43,8 @@ class _MachineSpec(NamedTuple): machine_type: str = "n1-standard-4" accelerator_count: int = 0 accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED" + boot_disk_type: str = "pd-ssd" + boot_disk_size_gb: int = 100 def _get_accelerator_type(self) -> Optional[str]: """Validates accelerator_type and returns the name of the accelerator. @@ -70,7 +75,12 @@ def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]: spec = { "machine_spec": {"machine_type": self.machine_type}, "replica_count": self.replica_count, + "disk_spec": { + "boot_disk_type": self.boot_disk_type, + "boot_disk_size_gb": self.boot_disk_size_gb, + }, } + accelerator_type = self._get_accelerator_type() if accelerator_type and self.accelerator_count: spec["machine_spec"]["accelerator_type"] = accelerator_type @@ -98,25 +108,29 @@ class _DistributedTrainingSpec(NamedTuple): Usage: dist_training_spec = _DistributedTrainingSpec( - chief_spec = _MachineSpec( + chief_spec = _WorkerPoolSpec( replica_count=1, machine_type='n1-standard-4', accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80' - ), - worker_spec = _MachineSpec( + accelerator_type='NVIDIA_TESLA_K80', + boot_disk_type='pd-ssd', + boot_disk_size_gb=100, + ), + worker_spec = _WorkerPoolSpec( replica_count=10, machine_type='n1-standard-4', accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80' - ) + accelerator_type='NVIDIA_TESLA_K80', + boot_disk_type='pd-ssd', + boot_disk_size_gb=100, + ), ) """ - chief_spec: _MachineSpec = _MachineSpec() - worker_spec: _MachineSpec = _MachineSpec() - parameter_server_spec: _MachineSpec = _MachineSpec() - evaluator_spec: _MachineSpec = _MachineSpec() + chief_spec: _WorkerPoolSpec = _WorkerPoolSpec() + worker_spec: _WorkerPoolSpec = _WorkerPoolSpec() + parameter_server_spec: _WorkerPoolSpec = _WorkerPoolSpec() + evaluator_spec: _WorkerPoolSpec = _WorkerPoolSpec() @property def pool_specs( @@ -156,6 +170,8 @@ def chief_worker_pool( machine_type: str = "n1-standard-4", accelerator_count: int = 0, accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", + boot_disk_type: str = "pd-ssd", + boot_disk_size_gb: int = 100, ) -> "_DistributedTrainingSpec": """Parameterizes Config to support only chief with worker replicas. @@ -174,6 +190,13 @@ def chief_worker_pool( NVIDIA_TESLA_T4 accelerator_count (int): The number of accelerators to attach to a worker replica. + boot_disk_type (str): + Type of the boot disk (default is `pd-ssd`). + Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or + `pd-standard` (Persistent Disk Hard Disk Drive). + boot_disk_size_gb (int): + Size in GB of the boot disk (default is 100GB). + boot disk size must be within the range of [100, 64000]. Returns: _DistributedTrainingSpec representing one chief and n workers all of same @@ -182,18 +205,22 @@ def chief_worker_pool( if replica_count <= 0: return cls() - chief_spec = _MachineSpec( + chief_spec = _WorkerPoolSpec( replica_count=1, machine_type=machine_type, accelerator_count=accelerator_count, accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, ) - worker_spec = _MachineSpec( + worker_spec = _WorkerPoolSpec( replica_count=replica_count - 1, machine_type=machine_type, accelerator_count=accelerator_count, accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, ) return cls(chief_spec=chief_spec, worker_spec=worker_spec) diff --git a/tests/unit/aiplatform/test_custom_job.py b/tests/unit/aiplatform/test_custom_job.py index da4fc1fbe7..f44a1471cc 100644 --- a/tests/unit/aiplatform/test_custom_job.py +++ b/tests/unit/aiplatform/test_custom_job.py @@ -54,6 +54,8 @@ _TEST_TRAINING_CONTAINER_IMAGE = "gcr.io/test-training/container:image" +_TEST_RUN_ARGS = ["-v", "0.1", "--test=arg"] + _TEST_WORKER_POOL_SPEC = [ { "machine_spec": { @@ -62,10 +64,11 @@ "accelerator_count": 1, }, "replica_count": 1, + "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100}, "container_spec": { "image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "command": [], - "args": [], + "args": _TEST_RUN_ARGS, }, } ] @@ -490,3 +493,41 @@ def test_create_custom_job_without_base_output_dir(self,): assert job.job_spec.base_output_directory.output_uri_prefix.startswith( f"{_TEST_STAGING_BUCKET}/aiplatform-custom-job" ) + + @pytest.mark.usefixtures("mock_python_package_to_gcs") + @pytest.mark.parametrize("sync", [True, False]) + def test_create_from_local_script_with_all_args( + self, get_custom_job_mock, create_custom_job_mock, sync + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + # configuration on this is tested in test_training_jobs.py + job = aiplatform.CustomJob.from_local_script( + display_name=_TEST_DISPLAY_NAME, + script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME, + container_uri=_TEST_TRAINING_CONTAINER_IMAGE, + args=_TEST_RUN_ARGS, + requirements=test_training_jobs._TEST_REQUIREMENTS, + environment_variables=test_training_jobs._TEST_ENVIRONMENT_VARIABLES, + replica_count=test_training_jobs._TEST_REPLICA_COUNT, + machine_type=test_training_jobs._TEST_MACHINE_TYPE, + accelerator_type=test_training_jobs._TEST_ACCELERATOR_TYPE, + accelerator_count=test_training_jobs._TEST_ACCELERATOR_COUNT, + boot_disk_type=test_training_jobs._TEST_BOOT_DISK_TYPE, + boot_disk_size_gb=test_training_jobs._TEST_BOOT_DISK_SIZE_GB, + base_output_dir=_TEST_BASE_OUTPUT_DIR, + labels=_TEST_LABELS, + ) + + job.run(sync=sync) + + job.wait() + + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) diff --git a/tests/unit/aiplatform/test_end_to_end.py b/tests/unit/aiplatform/test_end_to_end.py index 35006a3e95..d9e0788f39 100644 --- a/tests/unit/aiplatform/test_end_to_end.py +++ b/tests/unit/aiplatform/test_end_to_end.py @@ -211,6 +211,10 @@ def test_dataset_create_to_model_predict( "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE, "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, @@ -394,6 +398,10 @@ def test_dataset_create_to_model_predict_with_pipeline_fail( "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE, "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, diff --git a/tests/unit/aiplatform/test_training_jobs.py b/tests/unit/aiplatform/test_training_jobs.py index 15824d3172..3e694e6a1e 100644 --- a/tests/unit/aiplatform/test_training_jobs.py +++ b/tests/unit/aiplatform/test_training_jobs.py @@ -95,6 +95,10 @@ _TEST_ACCELERATOR_TYPE = "NVIDIA_TESLA_K80" _TEST_INVALID_ACCELERATOR_TYPE = "NVIDIA_DOES_NOT_EXIST" _TEST_ACCELERATOR_COUNT = 1 +_TEST_BOOT_DISK_TYPE_DEFAULT = "pd-ssd" +_TEST_BOOT_DISK_SIZE_GB_DEFAULT = 100 +_TEST_BOOT_DISK_TYPE = "pd-standard" +_TEST_BOOT_DISK_SIZE_GB = 300 _TEST_MODEL_DISPLAY_NAME = "model-display-name" _TEST_LABELS = {"key": "value"} _TEST_MODEL_LABELS = {"model_key": "model_value"} @@ -691,6 +695,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, @@ -858,6 +866,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, @@ -1136,6 +1148,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, @@ -1392,6 +1408,10 @@ def test_run_call_pipeline_service_create_distributed_training( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, @@ -1407,6 +1427,10 @@ def test_run_call_pipeline_service_create_distributed_training( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, @@ -1657,6 +1681,10 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset_without_model_ "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": source_utils._TrainingScriptPythonPackager.module_name, @@ -1906,6 +1934,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, "command": _TEST_TRAINING_CONTAINER_CMD, @@ -2063,6 +2095,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, "command": _TEST_TRAINING_CONTAINER_CMD, @@ -2323,6 +2359,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, "command": _TEST_TRAINING_CONTAINER_CMD, @@ -2556,6 +2596,10 @@ def test_run_call_pipeline_service_create_distributed_training( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, "command": _TEST_TRAINING_CONTAINER_CMD, @@ -2569,6 +2613,10 @@ def test_run_call_pipeline_service_create_distributed_training( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, "command": _TEST_TRAINING_CONTAINER_CMD, @@ -2701,6 +2749,10 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, "command": _TEST_TRAINING_CONTAINER_CMD, @@ -2830,9 +2882,9 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset_raises_if_anno ) -class Test_MachineSpec: +class Test_WorkerPoolSpec: def test_machine_spec_return_spec_dict(self): - test_spec = worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._WorkerPoolSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2846,12 +2898,41 @@ def test_machine_spec_return_spec_dict(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": _TEST_REPLICA_COUNT, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, + } + + assert test_spec.spec_dict == true_spec_dict + + def test_machine_spec_return_spec_with_boot_disk_dict(self): + test_spec = worker_spec_utils._WorkerPoolSpec( + replica_count=_TEST_REPLICA_COUNT, + machine_type=_TEST_MACHINE_TYPE, + accelerator_count=_TEST_ACCELERATOR_COUNT, + accelerator_type=_TEST_ACCELERATOR_TYPE, + boot_disk_type=_TEST_BOOT_DISK_TYPE, + boot_disk_size_gb=_TEST_BOOT_DISK_SIZE_GB, + ) + + true_spec_dict = { + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, + }, + "replica_count": _TEST_REPLICA_COUNT, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB, + }, } assert test_spec.spec_dict == true_spec_dict def test_machine_spec_return_spec_dict_with_no_accelerator(self): - test_spec = worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._WorkerPoolSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=0, @@ -2861,12 +2942,16 @@ def test_machine_spec_return_spec_dict_with_no_accelerator(self): true_spec_dict = { "machine_spec": {"machine_type": _TEST_MACHINE_TYPE}, "replica_count": _TEST_REPLICA_COUNT, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, } assert test_spec.spec_dict == true_spec_dict def test_machine_spec_spec_dict_raises_invalid_accelerator(self): - test_spec = worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._WorkerPoolSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2877,7 +2962,7 @@ def test_machine_spec_spec_dict_raises_invalid_accelerator(self): test_spec.spec_dict def test_machine_spec_spec_dict_is_empty(self): - test_spec = worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._WorkerPoolSpec( replica_count=0, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2887,7 +2972,7 @@ def test_machine_spec_spec_dict_is_empty(self): assert test_spec.is_empty def test_machine_spec_spec_dict_is_not_empty(self): - test_spec = worker_spec_utils._MachineSpec( + test_spec = worker_spec_utils._WorkerPoolSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2901,25 +2986,25 @@ class Test_DistributedTrainingSpec: def test_machine_spec_returns_pool_spec(self): spec = worker_spec_utils._DistributedTrainingSpec( - chief_spec=worker_spec_utils._MachineSpec( + chief_spec=worker_spec_utils._WorkerPoolSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=worker_spec_utils._MachineSpec( + worker_spec=worker_spec_utils._WorkerPoolSpec( replica_count=10, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - parameter_server_spec=worker_spec_utils._MachineSpec( + parameter_server_spec=worker_spec_utils._WorkerPoolSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=worker_spec_utils._MachineSpec( + evaluator_spec=worker_spec_utils._WorkerPoolSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2935,6 +3020,10 @@ def test_machine_spec_returns_pool_spec(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 1, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, { "machine_spec": { @@ -2943,6 +3032,10 @@ def test_machine_spec_returns_pool_spec(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 10, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, { "machine_spec": { @@ -2951,6 +3044,10 @@ def test_machine_spec_returns_pool_spec(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 3, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, { "machine_spec": { @@ -2959,6 +3056,10 @@ def test_machine_spec_returns_pool_spec(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 1, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, ] @@ -2981,6 +3082,10 @@ def test_chief_worker_pool_returns_spec(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 1, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, { "machine_spec": { @@ -2989,6 +3094,10 @@ def test_chief_worker_pool_returns_spec(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 9, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, ] @@ -3011,6 +3120,10 @@ def test_chief_worker_pool_returns_just_chief(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 1, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, } ] @@ -3019,7 +3132,7 @@ def test_chief_worker_pool_returns_just_chief(self): def test_machine_spec_raise_with_more_than_one_chief_replica(self): spec = worker_spec_utils._DistributedTrainingSpec( - chief_spec=worker_spec_utils._MachineSpec( + chief_spec=worker_spec_utils._WorkerPoolSpec( replica_count=2, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -3033,20 +3146,20 @@ def test_machine_spec_raise_with_more_than_one_chief_replica(self): def test_machine_spec_handles_missing_pools(self): spec = worker_spec_utils._DistributedTrainingSpec( - chief_spec=worker_spec_utils._MachineSpec( + chief_spec=worker_spec_utils._WorkerPoolSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=worker_spec_utils._MachineSpec(replica_count=0), - parameter_server_spec=worker_spec_utils._MachineSpec( + worker_spec=worker_spec_utils._WorkerPoolSpec(replica_count=0), + parameter_server_spec=worker_spec_utils._WorkerPoolSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=worker_spec_utils._MachineSpec(replica_count=0), + evaluator_spec=worker_spec_utils._WorkerPoolSpec(replica_count=0), ) true_pool_spec = [ @@ -3057,8 +3170,19 @@ def test_machine_spec_handles_missing_pools(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 1, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, + }, + { + "machine_spec": {"machine_type": "n1-standard-4"}, + "replica_count": 0, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, - {"machine_spec": {"machine_type": "n1-standard-4"}, "replica_count": 0}, { "machine_spec": { "machine_type": _TEST_MACHINE_TYPE, @@ -3066,6 +3190,10 @@ def test_machine_spec_handles_missing_pools(self): "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "replica_count": 3, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, }, ] @@ -3149,6 +3277,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": _TEST_PYTHON_MODULE_NAME, @@ -3308,6 +3440,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset_without_model_dis "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": _TEST_PYTHON_MODULE_NAME, @@ -3462,6 +3598,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": _TEST_PYTHON_MODULE_NAME, @@ -3727,6 +3867,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": _TEST_PYTHON_MODULE_NAME, @@ -3967,6 +4111,10 @@ def test_run_call_pipeline_service_create_distributed_training( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": _TEST_PYTHON_MODULE_NAME, @@ -3981,6 +4129,10 @@ def test_run_call_pipeline_service_create_distributed_training( "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": _TEST_PYTHON_MODULE_NAME, @@ -4113,6 +4265,10 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset_without_model_ "accelerator_type": _TEST_ACCELERATOR_TYPE, "accelerator_count": _TEST_ACCELERATOR_COUNT, }, + "disk_spec": { + "boot_disk_type": _TEST_BOOT_DISK_TYPE_DEFAULT, + "boot_disk_size_gb": _TEST_BOOT_DISK_SIZE_GB_DEFAULT, + }, "python_package_spec": { "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, "python_module": _TEST_PYTHON_MODULE_NAME,