add dag job to support release test (#207)

* add dag job to support release test * fix enum value * change nightly docker * nit update
GoogleCloudPlatform · Mar 19, 2024 · fe9d87f · fe9d87f
1 parent a304f30
commit fe9d87f
Show file tree

Hide file tree

Showing 3 changed files with 222 additions and 21 deletions.
diff --git a/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py b/dags/pytorch_xla/configs/pytorchxla_torchbench_config.py
@@ -14,15 +14,72 @@
 
 """Utilities to construct configs for pytorchxla_torchbench DAG."""
 
-import datetime
+import enum
 from typing import Tuple
 from xlml.apis import gcp_config, metric_config, task, test_config
 import dags.vm_resource as resource
 from dags import gcs_bucket, test_owner
 
 
-def set_up_torchbench_tpu(model_name: str = "") -> Tuple[str]:
+class VERSION(enum.Enum):
+  """PyTorch/XLA version selector accepted by the TorchBench set-up helpers."""
+  NIGHTLY = enum.auto()
+  R2_2 = enum.auto()
+  R2_3 = enum.auto()
+
+
+class VERSION_MAPPING:
+  """Install artifacts for each supported version, grouped in one enum apiece.
+
+  Every nested enum exposes the same member names: torch_xla wheel URLs,
+  pip requirement strings for torch/torchvision/torchaudio, the GPU docker
+  image, the pip index URLs, and the `git clone` branch flags. The nested
+  class names match the member names of `VERSION`.
+  """
+
+  class NIGHTLY(enum.Enum):
+    TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl"
+    TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-nightly-cp38-cp38-linux_x86_64.whl"
+    TORCH = "torch"
+    TORCHVISION = "torchvision"
+    TORCHAUDIO = "torchaudio"
+    # TODO(@piz): update to xla:nightly_3.10_cuda_12.1 once available
+    TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1"
+    TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/nightly/cpu"
+    TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/nightly/cu121"
+    TORCH_REPO_BRANCH = "-b main"
+    TORCH_XLA_REPO_BRANCH = "-b master"
+
+  class R2_2(enum.Enum):
+    TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl"
+    TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl"
+    TORCH = "torch==2.2.0"
+    TORCHVISION = "torchvision==0.17.0"
+    TORCHAUDIO = "torchaudio==2.2.0"
+    TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_cuda_12.1"
+    TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/cpu"
+    TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/cu121"
+    TORCH_REPO_BRANCH = "-b v2.2.0"
+    TORCH_XLA_REPO_BRANCH = "-b v2.2.0"
+
+  # TODO(@siyuan): Please update the 2.3 rc to the latest.
+  class R2_3(enum.Enum):
+    TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.3.0rc2-cp310-cp310-linux_x86_64.whl"
+    TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0rc2-cp310-cp310-linux_x86_64.whl"
+    TORCH = "torch==2.3.0"
+    TORCHVISION = "torchvision==0.18.0"
+    # Keep torchaudio on the same release line as TORCH above; the previous
+    # 2.2.0 pin did not match torch==2.3.0.
+    TORCHAUDIO = "torchaudio==2.3.0"
+    TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.3.0-rc2_3.10_cuda_12.1"
+    TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/test/cpu"
+    TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/test/cu121"
+    TORCH_REPO_BRANCH = "-b v2.3.0-rc2"
+    TORCH_XLA_REPO_BRANCH = "-b v2.3.0-rc2"
+
+
+def set_up_torchbench_tpu(
+    model_name: str = "", test_version: VERSION = VERSION.NIGHTLY
+) -> Tuple[str]:
   """Common set up for TorchBench."""
+  if test_version == VERSION.NIGHTLY:
+    version_mapping = VERSION_MAPPING.NIGHTLY
+  elif test_version == VERSION.R2_2:
+    version_mapping = VERSION_MAPPING.R2_2
+  elif test_version == VERSION.R2_3:
+    version_mapping = VERSION_MAPPING.R2_3
+  else:
+    raise ValueError("version number does not exist in VERSION enum")
 
   def model_install_cmds(output_file=None) -> str:
     """Installs torchbench models.
@@ -47,18 +104,16 @@ def model_install_cmds(output_file=None) -> str:
       "sudo apt-get install libgl1 -y",
       "pip3 install --user numpy pandas",
       (
-          "pip3 install --user --pre torch torchvision torchaudio --index-url"
-          " https://download.pytorch.org/whl/nightly/cpu"
+          f"pip3 install --user --pre {version_mapping.TORCH.value} {version_mapping.TORCHVISION.value} {version_mapping.TORCHAUDIO.value} --index-url {version_mapping.TORCH_INDEX_CPU_URL.value}"
       ),
       (
-          "pip3 install --user 'torch_xla[tpuvm] @"
-          " https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl'"
+          f"pip3 install --user 'torch_xla[tpuvm] @{version_mapping.TORCH_XLA_TPU_WHEEL.value}'"
       ),
       "pip3 install --user psutil",
       "cd; git clone https://github.com/pytorch/benchmark.git",
       f"cd benchmark && {model_install_cmds()}",
-      "cd; git clone https://github.com/pytorch/pytorch.git",
-      "cd; git clone https://github.com/pytorch/xla.git",
+      f"cd; git clone {version_mapping.TORCH_REPO_BRANCH.value} https://github.com/pytorch/pytorch.git",
+      f"cd; git clone {version_mapping.TORCH_XLA_REPO_BRANCH.value} https://github.com/pytorch/xla.git",
   )
 
 
@@ -71,6 +126,7 @@ def get_torchbench_tpu_config(
     time_out_in_min: int,
     network: str = "default",
     subnetwork: str = "default",
+    test_version: VERSION = VERSION.NIGHTLY,
     model_name: str = "",
     extraFlags: str = "",
 ) -> task.TpuQueuedResourceTask:
@@ -80,7 +136,7 @@ def get_torchbench_tpu_config(
       dataset_name=metric_config.DatasetOption.BENCHMARK_DATASET,
   )
 
-  set_up_cmds = set_up_torchbench_tpu(model_name)
+  set_up_cmds = set_up_torchbench_tpu(model_name, test_version)
   local_output_location = "~/xla/benchmarks/output/metric_report.jsonl"
 
   if not model_name or model_name.lower() == "all":
@@ -130,9 +186,19 @@ def get_torchbench_tpu_config(
 
 # Below is the setup for torchbench GPU run.
 def set_up_torchbench_gpu(
-    model_name: str, nvidia_driver_version: str
+    model_name: str,
+    nvidia_driver_version: str,
+    test_version: VERSION,
 ) -> Tuple[str]:
   """Common set up for TorchBench."""
+  if test_version == VERSION.NIGHTLY:
+    version_mapping = VERSION_MAPPING.NIGHTLY
+  elif test_version == VERSION.R2_2:
+    version_mapping = VERSION_MAPPING.R2_2
+  elif test_version == VERSION.R2_3:
+    version_mapping = VERSION_MAPPING.R2_3
+  else:
+    raise ValueError("version number does not exist in VERSION enum")
 
   def model_install_cmds(output_file=None) -> str:
     """Installs torchbench models.
@@ -172,11 +238,11 @@ def get_nvidia_driver_install_cmd(driver_version: str) -> str:
       # Below are the dependencies for benchmark data processing:
       "pip3 install --user numpy pandas",
       # torch related dependencies
-      "pip3 install --user --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121",
-      "cd /tmp/ && git clone https://github.com/pytorch/benchmark.git",
+      f"pip3 install --user --pre {version_mapping.TORCH.value} {version_mapping.TORCHVISION.value} {version_mapping.TORCHAUDIO.value} --index-url {version_mapping.TORCH_INDEX_CUDA_URL.value}",
+      f"cd /tmp/ && git clone https://github.com/pytorch/benchmark.git",
       f" cd benchmark && {model_install_cmds()}",
-      "cd /tmp/ && git clone https://github.com/pytorch/pytorch.git",
-      "cd /tmp/ && git clone https://github.com/pytorch/xla.git",
+      f"cd /tmp/ && git clone {version_mapping.TORCH_REPO_BRANCH.value} https://github.com/pytorch/pytorch.git",
+      f"cd /tmp/ && git clone {version_mapping.TORCH_XLA_REPO_BRANCH.value} https://github.com/pytorch/xla.git",
   )
   docker_cmds = "\n".join(docker_cmds_ls)
 
@@ -191,13 +257,10 @@ def get_nvidia_driver_install_cmd(driver_version: str) -> str:
       "sudo nvidia-smi --lock-gpu-clocks=1200,1200",
       "sudo systemctl restart docker",
       "sudo nvidia-smi -pm 1",
-      (
-          "sudo docker pull"
-          " us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1"
-      ),
+      f"sudo docker pull {version_mapping.TORCH_XLA_GPU_DOCKER.value}",
       (
           "sudo docker run --shm-size 16g --gpus all -it -d --network host --name ml-automation-torchbench"
-          " us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1"
+          f" {version_mapping.TORCH_XLA_GPU_DOCKER.value}"
       ),
       f"sudo docker exec -i ml-automation-torchbench /bin/bash -c '{docker_cmds}'",
   )
@@ -212,6 +275,7 @@ def get_torchbench_gpu_config(
     gpu_zone: resource.Zone,
     time_out_in_min: int,
     nvidia_driver_version: str = "525.125.06",
+    test_version: VERSION = VERSION.NIGHTLY,
     model_name: str = "",
     extraFlags: str = "",
 ) -> task.GpuCreateResourceTask:
@@ -221,7 +285,9 @@ def get_torchbench_gpu_config(
       dataset_name=metric_config.DatasetOption.BENCHMARK_DATASET,
   )
 
-  set_up_cmds = set_up_torchbench_gpu(model_name, nvidia_driver_version)
+  set_up_cmds = set_up_torchbench_gpu(
+      model_name, nvidia_driver_version, test_version
+  )
   local_output_location = "/tmp/xla/benchmarks/output/metric_report.jsonl"
 
   if not model_name or model_name.lower() == "all":

diff --git a/dags/pytorch_xla/pytorchxla-torchbench-release.py b/dags/pytorch_xla/pytorchxla-torchbench-release.py
@@ -0,0 +1,135 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A DAG to run all TorchBench tests with a release version."""
+
+from airflow import models
+import datetime
+from dags import composer_env
+from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
+import dags.vm_resource as resource
+
+SCHEDULED_TIME = None
+
+
+with models.DAG(
+    dag_id="pytorchxla-torchbench-release",
+    schedule=SCHEDULED_TIME,
+    tags=["pytorchxla", "release", "torchbench"],
+    start_date=datetime.datetime(2024, 1, 1),
+    catchup=False,
+) as dag:
+  model = "all" if composer_env.is_prod_env() else "BERT_pytorch"
+  torchbench_extra_flags = [f"--filter={model}"]
+  test_version = config.VERSION.R2_2
+  # Running on V4-8:
+  config.get_torchbench_tpu_config(
+      tpu_version=resource.TpuVersion.V4,
+      tpu_cores=8,
+      project=resource.Project.CLOUD_ML_AUTO_SOLUTIONS,
+      tpu_zone=resource.Zone.US_CENTRAL2_B,
+      runtime_version=resource.RuntimeVersion.TPU_UBUNTU2204_BASE,
+      test_version=test_version,
+      model_name=model,
+      time_out_in_min=1600,
+      extraFlags=" ".join(torchbench_extra_flags),
+  ).run()
+
+  # Running on V5P
+  config.get_torchbench_tpu_config(
+      tpu_version=resource.TpuVersion.V5P,
+      tpu_cores=8,
+      project=resource.Project.TPU_PROD_ENV_AUTOMATED,
+      tpu_zone=resource.Zone.US_EAST5_A,
+      runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV5,
+      network=resource.V5_NETWORKS,
+      subnetwork=resource.V5P_SUBNETWORKS,
+      test_version=test_version,
+      time_out_in_min=700,
+      model_name=model,
+      extraFlags=" ".join(torchbench_extra_flags),
+  ).run()
+
+  # Running on V5E
+  config.get_torchbench_tpu_config(
+      tpu_version=resource.TpuVersion.V5E,
+      tpu_cores=4,
+      project=resource.Project.TPU_PROD_ENV_AUTOMATED,
+      tpu_zone=resource.Zone.US_EAST1_C,
+      runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV5_LITE,
+      network=resource.V5_NETWORKS,
+      subnetwork=resource.V5E_SUBNETWORKS,
+      test_version=test_version,
+      time_out_in_min=1600,
+      model_name=model,
+      extraFlags=" ".join(torchbench_extra_flags),
+  ).run()
+
+  # Running on V100 GPU
+  config.get_torchbench_gpu_config(
+      machine_type=resource.MachineVersion.N1_STANDARD_8,
+      image_project=resource.ImageProject.DEEP_LEARNING_PLATFORM_RELEASE,
+      image_family=resource.ImageFamily.COMMON_CU121_DEBIAN_11,
+      accelerator_type=resource.GpuVersion.V100,
+      count=1,
+      gpu_zone=resource.Zone.US_CENTRAL1_C,
+      test_version=test_version,
+      model_name=model,
+      time_out_in_min=1600,
+      extraFlags=" ".join(torchbench_extra_flags),
+  ).run()
+
+  # Running on A100 GPU
+  config.get_torchbench_gpu_config(
+      machine_type=resource.MachineVersion.A2_HIGHGPU_1G,
+      image_project=resource.ImageProject.DEEP_LEARNING_PLATFORM_RELEASE,
+      image_family=resource.ImageFamily.COMMON_CU121_DEBIAN_11,
+      accelerator_type=resource.GpuVersion.A100,
+      count=1,
+      gpu_zone=resource.Zone.US_CENTRAL1_F,
+      test_version=test_version,
+      model_name=model,
+      time_out_in_min=1600,
+      extraFlags=" ".join(torchbench_extra_flags),
+  ).run()
+
+  # Running on H100 GPU
+  # Note: H100 must use ssd.
+  config.get_torchbench_gpu_config(
+      machine_type=resource.MachineVersion.A3_HIGHGPU_8G,
+      image_project=resource.ImageProject.DEEP_LEARNING_PLATFORM_RELEASE,
+      image_family=resource.ImageFamily.COMMON_CU121_DEBIAN_11,
+      accelerator_type=resource.GpuVersion.H100,
+      count=8,
+      gpu_zone=resource.Zone.US_CENTRAL1_A,
+      nvidia_driver_version="535.86.10",
+      test_version=test_version,
+      model_name=model,
+      time_out_in_min=1600,
+      extraFlags=" ".join(torchbench_extra_flags),
+  ).run()
+
+  # Running on L4 GPU
+  config.get_torchbench_gpu_config(
+      machine_type=resource.MachineVersion.G2_STAND_4,
+      image_project=resource.ImageProject.DEEP_LEARNING_PLATFORM_RELEASE,
+      image_family=resource.ImageFamily.COMMON_CU121_DEBIAN_11,
+      accelerator_type=resource.GpuVersion.L4,
+      count=1,
+      gpu_zone=resource.Zone.US_CENTRAL1_C,
+      test_version=test_version,
+      model_name=model,
+      time_out_in_min=1600,
+      extraFlags=" ".join(torchbench_extra_flags),
+  ).run()
diff --git a/dags/pytorch_xla/pytorchxla_torchbench.py b/dags/pytorch_xla/pytorchxla_torchbench.py
@@ -31,7 +31,7 @@
     start_date=datetime.datetime(2024, 1, 1),
     catchup=False,
 ) as dag:
-  model = "all"
+  model = "all" if composer_env.is_prod_env() else "BERT_pytorch"
   torchbench_extra_flags = [f"--filter={model}"]
   # Running on V4-8:
   config.get_torchbench_tpu_config(