feat: enable boot disk for CustomTrainingJob, CustomPythonPackageTrainingJob, CustomContainerTrainingJob
morgandu committed Aug 11, 2021
1 parent b478075 commit 2b06a22
Showing 4 changed files with 244 additions and 6 deletions.
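
For context, a minimal usage sketch of the new parameters on the public run() API (the project, bucket, script path, and container image below are illustrative placeholders, not part of this commit):

from google.cloud import aiplatform

aiplatform.init(
    project="my-project",             # placeholder project
    location="us-central1",
    staging_bucket="gs://my-bucket",  # placeholder bucket
)

job = aiplatform.CustomTrainingJob(
    display_name="train-with-boot-disk",
    script_path="task.py",  # placeholder training script
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest",  # placeholder image
)

job.run(
    replica_count=1,
    machine_type="n1-standard-4",
    # New in this commit: per-replica boot disk configuration.
    boot_disk_type="pd-ssd",    # or "pd-standard"
    boot_disk_size_gb=200,      # must be within [100, 64000]
)
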
44 changes: 44 additions & 0 deletions google/cloud/aiplatform/training_jobs.py
@@ -1111,6 +1111,8 @@ def _prepare_and_validate_run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]:
"""Create worker pool specs and managed model as well validating the
run.
@@ -1134,6 +1136,13 @@ def _prepare_and_validate_run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
Returns:
Worker pool specs and managed model for the run.
@@ -1166,6 +1175,8 @@ def _prepare_and_validate_run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
).pool_specs

managed_model = self._managed_model
@@ -1525,6 +1536,8 @@ def run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
training_fraction_split: float = 0.8,
validation_fraction_split: float = 0.1,
test_fraction_split: float = 0.1,
@@ -1651,6 +1664,13 @@ def run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
training_fraction_split (float):
The fraction of the input data that is to be
used to train the Model. This is ignored if Dataset is not provided.
@@ -1700,6 +1720,8 @@ def run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

# make and copy package
@@ -2147,6 +2169,8 @@ def run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
training_fraction_split: float = 0.8,
validation_fraction_split: float = 0.1,
test_fraction_split: float = 0.1,
@@ -2266,6 +2290,13 @@ def run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
training_fraction_split (float):
The fraction of the input data that is to be
used to train the Model. This is ignored if Dataset is not provided.
@@ -2320,6 +2351,8 @@ def run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

return self._run(
@@ -4162,6 +4195,8 @@ def run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
training_fraction_split: float = 0.8,
validation_fraction_split: float = 0.1,
test_fraction_split: float = 0.1,
@@ -4281,6 +4316,13 @@ def run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
training_fraction_split (float):
The fraction of the input data that is to be
used to train the Model. This is ignored if Dataset is not provided.
@@ -4330,6 +4372,8 @@ def run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

return self._run(
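
The run() arguments above are forwarded by _prepare_and_validate_run into the worker pool builder; a rough sketch of that internal call (an internal helper, shown for illustration only, mirroring the snippet above):

from google.cloud.aiplatform.utils import worker_spec_utils

worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
    replica_count=3,  # becomes 1 chief replica + 2 worker replicas
    machine_type="n1-standard-4",
    accelerator_count=0,
    accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
    boot_disk_type="pd-ssd",
    boot_disk_size_gb=100,
).pool_specs
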
40 changes: 35 additions & 5 deletions google/cloud/aiplatform/utils/worker_spec_utils.py
@@ -31,7 +31,10 @@ class _MachineSpec(NamedTuple):
replica_count=10,
machine_type='n1-standard-4',
accelerator_count=2,
accelerator_type='NVIDIA_TESLA_K80')
accelerator_type='NVIDIA_TESLA_K80',
boot_disk_type='pd-ssd',
boot_disk_size_gb=100,
)
Note that container and python package specs are not stored with this spec.
"""
@@ -40,6 +43,8 @@ class _MachineSpec(NamedTuple):
machine_type: str = "n1-standard-4"
accelerator_count: int = 0
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED"
boot_disk_type: str = "pd-ssd"
boot_disk_size_gb: int = 100

def _get_accelerator_type(self) -> Optional[str]:
"""Validates accelerator_type and returns the name of the accelerator.
@@ -70,7 +75,12 @@ def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]:
spec = {
"machine_spec": {"machine_type": self.machine_type},
"replica_count": self.replica_count,
"disk_spec": {
"boot_disk_type": self.boot_disk_type,
"boot_disk_size_gb": self.boot_disk_size_gb,
},
}

accelerator_type = self._get_accelerator_type()
if accelerator_type and self.accelerator_count:
spec["machine_spec"]["accelerator_type"] = accelerator_type
@@ -102,14 +112,18 @@ class _DistributedTrainingSpec(NamedTuple):
replica_count=1,
machine_type='n1-standard-4',
accelerator_count=2,
accelerator_type='NVIDIA_TESLA_K80'
),
accelerator_type='NVIDIA_TESLA_K80',
boot_disk_type='pd-ssd',
boot_disk_size_gb=100,
),
worker_spec = _MachineSpec(
replica_count=10,
machine_type='n1-standard-4',
accelerator_count=2,
accelerator_type='NVIDIA_TESLA_K80'
)
accelerator_type='NVIDIA_TESLA_K80',
boot_disk_type='pd-ssd',
boot_disk_size_gb=100,
),
)
"""

@@ -156,6 +170,8 @@ def chief_worker_pool(
machine_type: str = "n1-standard-4",
accelerator_count: int = 0,
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
) -> "_DistributedTrainingSpec":
"""Parameterizes Config to support only chief with worker replicas.
@@ -174,6 +190,16 @@ def chief_worker_pool(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
Raises:
Returns:
_DistributedTrainingSpec representing one chief and n workers all of same
@@ -187,13 +213,17 @@ def chief_worker_pool(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

worker_spec = _MachineSpec(
replica_count=replica_count - 1,
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

return cls(chief_spec=chief_spec, worker_spec=worker_spec)
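
With the new _MachineSpec fields, each worker pool spec now carries a disk_spec alongside machine_spec; a quick sketch of the resulting dictionary (an internal helper; spec_dict is assumed to be exposed as a property, as suggested by the hunk above):

from google.cloud.aiplatform.utils.worker_spec_utils import _MachineSpec

spec = _MachineSpec(
    replica_count=1,
    machine_type="n1-standard-4",
    boot_disk_type="pd-ssd",
    boot_disk_size_gb=100,
).spec_dict

# Expected shape (no accelerator configured):
# {
#     "machine_spec": {"machine_type": "n1-standard-4"},
#     "replica_count": 1,
#     "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100},
# }
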
8 changes: 8 additions & 0 deletions tests/unit/aiplatform/test_end_to_end.py
@@ -211,6 +211,10 @@ def test_dataset_create_to_model_predict(
"accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
"accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
},
"disk_spec": {
"boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT,
"boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT,
},
"python_package_spec": {
"executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
"python_module": source_utils._TrainingScriptPythonPackager.module_name,
@@ -394,6 +398,10 @@ def test_dataset_create_to_model_predict_with_pipeline_fail(
"accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
"accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
},
"disk_spec": {
"boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT,
"boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT,
},
"python_package_spec": {
"executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
"python_module": source_utils._TrainingScriptPythonPackager.module_name,
