feat: enable boot disk for CustomTrainingJob, CustomPythonPackageTrainingJob, CustomContainerTrainingJob
morgandu committed Aug 11, 2021
1 parent b478075 commit 2b06a22
Showing 4 changed files with 244 additions and 6 deletions.
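
For context, a minimal usage sketch of the new parameters on the public run() API (the project, bucket, script path, and container image below are illustrative placeholders, not part of this commit):

from google.cloud import aiplatform

aiplatform.init(
    project="my-project",             # placeholder project
    location="us-central1",
    staging_bucket="gs://my-bucket",  # placeholder bucket
)

job = aiplatform.CustomTrainingJob(
    display_name="train-with-boot-disk",
    script_path="task.py",  # placeholder training script
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest",  # placeholder image
)

job.run(
    replica_count=1,
    machine_type="n1-standard-4",
    # New in this commit: per-replica boot disk configuration.
    boot_disk_type="pd-ssd",    # or "pd-standard"
    boot_disk_size_gb=200,      # must be within [100, 64000]
)
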
44 changes: 44 additions & 0 deletions google/cloud/aiplatform/training_jobs.py
@@ -1111,6 +1111,8 @@ def _prepare_and_validate_run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]:
"""Create worker pool specs and managed model as well validating the
run.
@@ -1134,6 +1136,13 @@ def _prepare_and_validate_run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
Returns:
Worker pool specs and managed model for the run.
@@ -1166,6 +1175,8 @@ def _prepare_and_validate_run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
).pool_specs

managed_model = self._managed_model
@@ -1525,6 +1536,8 @@ def run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
training_fraction_split: float = 0.8,
validation_fraction_split: float = 0.1,
test_fraction_split: float = 0.1,
@@ -1651,6 +1664,13 @@ def run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
training_fraction_split (float):
The fraction of the input data that is to be
used to train the Model. This is ignored if Dataset is not provided.
@@ -1700,6 +1720,8 @@ def run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

# make and copy package
@@ -2147,6 +2169,8 @@ def run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
training_fraction_split: float = 0.8,
validation_fraction_split: float = 0.1,
test_fraction_split: float = 0.1,
@@ -2266,6 +2290,13 @@ def run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
training_fraction_split (float):
The fraction of the input data that is to be
used to train the Model. This is ignored if Dataset is not provided.
@@ -2320,6 +2351,8 @@ def run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

return self._run(
@@ -4162,6 +4195,8 @@ def run(
machine_type: str = "n1-standard-4",
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
accelerator_count: int = 0,
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
training_fraction_split: float = 0.8,
validation_fraction_split: float = 0.1,
test_fraction_split: float = 0.1,
@@ -4281,6 +4316,13 @@ def run(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
training_fraction_split (float):
The fraction of the input data that is to be
used to train the Model. This is ignored if Dataset is not provided.
@@ -4330,6 +4372,8 @@ def run(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

return self._run(
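
The run() arguments above are forwarded by _prepare_and_validate_run into the worker pool builder; a rough sketch of that internal call (an internal helper, shown for illustration only, mirroring the snippet above):

from google.cloud.aiplatform.utils import worker_spec_utils

worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
    replica_count=3,  # becomes 1 chief replica + 2 worker replicas
    machine_type="n1-standard-4",
    accelerator_count=0,
    accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
    boot_disk_type="pd-ssd",
    boot_disk_size_gb=100,
).pool_specs
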
40 changes: 35 additions & 5 deletions google/cloud/aiplatform/utils/worker_spec_utils.py
@@ -31,7 +31,10 @@ class _MachineSpec(NamedTuple):
replica_count=10,
machine_type='n1-standard-4',
accelerator_count=2,
accelerator_type='NVIDIA_TESLA_K80')
accelerator_type='NVIDIA_TESLA_K80',
boot_disk_type='pd-ssd',
boot_disk_size_gb=100,
)
Note that container and python package specs are not stored with this spec.
"""
@@ -40,6 +43,8 @@ class _MachineSpec(NamedTuple):
machine_type: str = "n1-standard-4"
accelerator_count: int = 0
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED"
boot_disk_type: str = "pd-ssd"
boot_disk_size_gb: int = 100

def _get_accelerator_type(self) -> Optional[str]:
"""Validates accelerator_type and returns the name of the accelerator.
@@ -70,7 +75,12 @@ def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]:
spec = {
"machine_spec": {"machine_type": self.machine_type},
"replica_count": self.replica_count,
"disk_spec": {
"boot_disk_type": self.boot_disk_type,
"boot_disk_size_gb": self.boot_disk_size_gb,
},
}

accelerator_type = self._get_accelerator_type()
if accelerator_type and self.accelerator_count:
spec["machine_spec"]["accelerator_type"] = accelerator_type
@@ -102,14 +112,18 @@ class _DistributedTrainingSpec(NamedTuple):
replica_count=1,
machine_type='n1-standard-4',
accelerator_count=2,
accelerator_type='NVIDIA_TESLA_K80'
),
accelerator_type='NVIDIA_TESLA_K80',
boot_disk_type='pd-ssd',
boot_disk_size_gb=100,
),
worker_spec = _MachineSpec(
replica_count=10,
machine_type='n1-standard-4',
accelerator_count=2,
accelerator_type='NVIDIA_TESLA_K80'
)
accelerator_type='NVIDIA_TESLA_K80',
boot_disk_type='pd-ssd',
boot_disk_size_gb=100,
),
)
"""

@@ -156,6 +170,8 @@ def chief_worker_pool(
machine_type: str = "n1-standard-4",
accelerator_count: int = 0,
accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
boot_disk_type: str = "pd-ssd",
boot_disk_size_gb: int = 100,
) -> "_DistributedTrainingSpec":
"""Parameterizes Config to support only chief with worker replicas.
@@ -174,6 +190,16 @@ def chief_worker_pool(
NVIDIA_TESLA_T4
accelerator_count (int):
The number of accelerators to attach to a worker replica.
boot_disk_type (str):
Type of the boot disk, default is `pd-ssd`.
Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or
`pd-standard` (Persistent Disk Hard Disk Drive).
boot_disk_size_gb (int):
Size in GB of the boot disk, default is 100GB.
Boot disk size must be within the range of [100, 64000].
Raises:
Returns:
_DistributedTrainingSpec representing one chief and n workers all of same
@@ -187,13 +213,17 @@ def chief_worker_pool(
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

worker_spec = _MachineSpec(
replica_count=replica_count - 1,
machine_type=machine_type,
accelerator_count=accelerator_count,
accelerator_type=accelerator_type,
boot_disk_type=boot_disk_type,
boot_disk_size_gb=boot_disk_size_gb,
)

return cls(chief_spec=chief_spec, worker_spec=worker_spec)
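
With the new _MachineSpec fields, each worker pool spec now carries a disk_spec alongside machine_spec; a quick sketch of the resulting dictionary (an internal helper; spec_dict is assumed to be exposed as a property, as suggested by the hunk above):

from google.cloud.aiplatform.utils.worker_spec_utils import _MachineSpec

spec = _MachineSpec(
    replica_count=1,
    machine_type="n1-standard-4",
    boot_disk_type="pd-ssd",
    boot_disk_size_gb=100,
).spec_dict

# Expected shape (no accelerator configured):
# {
#     "machine_spec": {"machine_type": "n1-standard-4"},
#     "replica_count": 1,
#     "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100},
# }
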
8 changes: 8 additions & 0 deletions tests/unit/aiplatform/test_end_to_end.py
@@ -211,6 +211,10 @@ def test_dataset_create_to_model_predict(
"accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
"accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
},
"disk_spec": {
"boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT,
"boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT,
},
"python_package_spec": {
"executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
"python_module": source_utils._TrainingScriptPythonPackager.module_name,
@@ -394,6 +398,10 @@ def test_dataset_create_to_model_predict_with_pipeline_fail(
"accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE,
"accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT,
},
"disk_spec": {
"boot_disk_type": test_training_jobs._TEST_BOOT_DISK_TYPE_DEFAULT,
"boot_disk_size_gb": test_training_jobs._TEST_BOOT_DISK_SIZE_GB_DEFAULT,
},
"python_package_spec": {
"executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE,
"python_module": source_utils._TrainingScriptPythonPackager.module_name,
