Skip to content

Commit

Permalink
add dag job to support release test (#207)
Browse files Browse the repository at this point in the history
* add dag job to support release test

* fix enum value

* change nightly docker

* nit update
  • Loading branch information
zpcore committed Mar 19, 2024
1 parent a304f30 commit fe9d87f
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 21 deletions.
106 changes: 86 additions & 20 deletions dags/pytorch_xla/configs/pytorchxla_torchbench_config.py
Expand Up @@ -14,15 +14,72 @@

"""Utilities to construct configs for pytorchxla_torchbench DAG."""

import datetime
import enum
from typing import Tuple
from xlml.apis import gcp_config, metric_config, task, test_config
import dags.vm_resource as resource
from dags import gcs_bucket, test_owner


def set_up_torchbench_tpu(model_name: str = "") -> Tuple[str]:
class VERSION(enum.Enum):
    """PyTorch/XLA builds that the TorchBench set-up can target.

    The integer values carry no meaning of their own; members are only
    compared against each other to pick the matching VERSION_MAPPING entry.
    """

    NIGHTLY = 1  # nightly build
    R2_2 = 2  # 2.2 release
    R2_3 = 3  # 2.3 release


class VERSION_MAPPING:
    """Per-version install artifacts used by the TorchBench set-up commands.

    Each nested enum corresponds to one VERSION member and exposes the same
    keys:

    * TORCH_XLA_TPU_WHEEL / TORCH_XLA_CUDA_WHEEL: torch_xla wheel URLs.
    * TORCH / TORCHVISION / TORCHAUDIO: pip requirement specifiers.
    * TORCH_XLA_GPU_DOCKER: docker image used for the GPU run.
    * TORCH_INDEX_CPU_URL / TORCH_INDEX_CUDA_URL: pip ``--index-url`` values.
    * TORCH_REPO_BRANCH / TORCH_XLA_REPO_BRANCH: ``git clone`` branch flags.

    Entries are read through ``.value``. Where two keys of one enum share the
    same string (e.g. both branch flags of a release), ``enum.Enum`` treats
    the second as an alias of the first; ``.value`` lookups are unaffected,
    but iterating the enum will not list the alias.
    """

    class NIGHTLY(enum.Enum):
        """Artifacts for the nightly builds."""

        TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl"
        TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-nightly-cp38-cp38-linux_x86_64.whl"
        TORCH = "torch"
        TORCHVISION = "torchvision"
        TORCHAUDIO = "torchaudio"
        # TODO(@piz): update to xla:nightly_3.10_cuda_12.1 once available
        TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1"
        TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/nightly/cpu"
        TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/nightly/cu121"
        TORCH_REPO_BRANCH = "-b main"
        TORCH_XLA_REPO_BRANCH = "-b master"

    class R2_2(enum.Enum):
        """Artifacts for the 2.2 release."""

        TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl"
        TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl"
        TORCH = "torch==2.2.0"
        TORCHVISION = "torchvision==0.17.0"
        TORCHAUDIO = "torchaudio==2.2.0"
        TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_cuda_12.1"
        TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/cpu"
        TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/cu121"
        TORCH_REPO_BRANCH = "-b v2.2.0"
        TORCH_XLA_REPO_BRANCH = "-b v2.2.0"

    # TODO(@siyuan): Please update the 2.3 rc to the latest.
    class R2_3(enum.Enum):
        """Artifacts for the 2.3 release candidate."""

        TORCH_XLA_TPU_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.3.0rc2-cp310-cp310-linux_x86_64.whl"
        TORCH_XLA_CUDA_WHEEL = "https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0rc2-cp310-cp310-linux_x86_64.whl"
        TORCH = "torch==2.3.0"
        TORCHVISION = "torchvision==0.18.0"
        # torchaudio must track the torch minor version: 2.3.0 pairs with
        # torch 2.3.0 (the previous 2.2.0 pin belongs to torch 2.2.0).
        TORCHAUDIO = "torchaudio==2.3.0"
        TORCH_XLA_GPU_DOCKER = "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.3.0-rc2_3.10_cuda_12.1"
        TORCH_INDEX_CPU_URL = "https://download.pytorch.org/whl/test/cpu"
        TORCH_INDEX_CUDA_URL = "https://download.pytorch.org/whl/test/cu121"
        TORCH_REPO_BRANCH = "-b v2.3.0-rc2"
        TORCH_XLA_REPO_BRANCH = "-b v2.3.0-rc2"


def set_up_torchbench_tpu(
model_name: str = "", test_version: VERSION = VERSION.NIGHTLY
) -> Tuple[str]:
"""Common set up for TorchBench."""
if test_version == VERSION.NIGHTLY:
version_mapping = VERSION_MAPPING.NIGHTLY
elif test_version == VERSION.R2_2:
version_mapping = VERSION_MAPPING.R2_2
elif test_version == VERSION.R2_3:
version_mapping = VERSION_MAPPING.R2_3
else:
raise ValueError("version number does not exist in VERSION enum")

def model_install_cmds(output_file=None) -> str:
"""Installs torchbench models.
Expand All @@ -47,18 +104,16 @@ def model_install_cmds(output_file=None) -> str:
"sudo apt-get install libgl1 -y",
"pip3 install --user numpy pandas",
(
"pip3 install --user --pre torch torchvision torchaudio --index-url"
" https://download.pytorch.org/whl/nightly/cpu"
f"pip3 install --user --pre {version_mapping.TORCH.value} {version_mapping.TORCHVISION.value} {version_mapping.TORCHAUDIO.value} --index-url {version_mapping.TORCH_INDEX_CPU_URL.value}"
),
(
"pip3 install --user 'torch_xla[tpuvm] @"
" https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl'"
f"pip3 install --user 'torch_xla[tpuvm] @{version_mapping.TORCH_XLA_TPU_WHEEL.value}'"
),
"pip3 install --user psutil",
"cd; git clone https://github.com/pytorch/benchmark.git",
f"cd benchmark && {model_install_cmds()}",
"cd; git clone https://github.com/pytorch/pytorch.git",
"cd; git clone https://github.com/pytorch/xla.git",
f"cd; git clone {version_mapping.TORCH_REPO_BRANCH.value} https://github.com/pytorch/pytorch.git",
f"cd; git clone {version_mapping.TORCH_XLA_REPO_BRANCH.value} https://github.com/pytorch/xla.git",
)


Expand All @@ -71,6 +126,7 @@ def get_torchbench_tpu_config(
time_out_in_min: int,
network: str = "default",
subnetwork: str = "default",
test_version: VERSION = VERSION.NIGHTLY,
model_name: str = "",
extraFlags: str = "",
) -> task.TpuQueuedResourceTask:
Expand All @@ -80,7 +136,7 @@ def get_torchbench_tpu_config(
dataset_name=metric_config.DatasetOption.BENCHMARK_DATASET,
)

set_up_cmds = set_up_torchbench_tpu(model_name)
set_up_cmds = set_up_torchbench_tpu(model_name, test_version)
local_output_location = "~/xla/benchmarks/output/metric_report.jsonl"

if not model_name or model_name.lower() == "all":
Expand Down Expand Up @@ -130,9 +186,19 @@ def get_torchbench_tpu_config(

# Below is the setup for torchbench GPU run.
def set_up_torchbench_gpu(
model_name: str, nvidia_driver_version: str
model_name: str,
nvidia_driver_version: str,
test_version: VERSION,
) -> Tuple[str]:
"""Common set up for TorchBench."""
if test_version == VERSION.NIGHTLY:
version_mapping = VERSION_MAPPING.NIGHTLY
elif test_version == VERSION.R2_2:
version_mapping = VERSION_MAPPING.R2_2
elif test_version == VERSION.R2_3:
version_mapping = VERSION_MAPPING.R2_3
else:
raise ValueError("version number does not exist in VERSION enum")

def model_install_cmds(output_file=None) -> str:
"""Installs torchbench models.
Expand Down Expand Up @@ -172,11 +238,11 @@ def get_nvidia_driver_install_cmd(driver_version: str) -> str:
# Below are the dependencies for benchmark data processing:
"pip3 install --user numpy pandas",
# torch related dependencies
"pip3 install --user --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121",
"cd /tmp/ && git clone https://github.com/pytorch/benchmark.git",
f"pip3 install --user --pre {version_mapping.TORCH.value} {version_mapping.TORCHVISION.value} {version_mapping.TORCHAUDIO.value} --index-url {version_mapping.TORCH_INDEX_CUDA_URL.value}",
f"cd /tmp/ && git clone https://github.com/pytorch/benchmark.git",
f" cd benchmark && {model_install_cmds()}",
"cd /tmp/ && git clone https://github.com/pytorch/pytorch.git",
"cd /tmp/ && git clone https://github.com/pytorch/xla.git",
f"cd /tmp/ && git clone {version_mapping.TORCH_REPO_BRANCH.value} https://github.com/pytorch/pytorch.git",
f"cd /tmp/ && git clone {version_mapping.TORCH_XLA_REPO_BRANCH.value} https://github.com/pytorch/xla.git",
)
docker_cmds = "\n".join(docker_cmds_ls)

Expand All @@ -191,13 +257,10 @@ def get_nvidia_driver_install_cmd(driver_version: str) -> str:
"sudo nvidia-smi --lock-gpu-clocks=1200,1200",
"sudo systemctl restart docker",
"sudo nvidia-smi -pm 1",
(
"sudo docker pull"
" us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1"
),
f"sudo docker pull {version_mapping.TORCH_XLA_GPU_DOCKER.value}",
(
"sudo docker run --shm-size 16g --gpus all -it -d --network host --name ml-automation-torchbench"
" us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1"
f" {version_mapping.TORCH_XLA_GPU_DOCKER.value}"
),
f"sudo docker exec -i ml-automation-torchbench /bin/bash -c '{docker_cmds}'",
)
Expand All @@ -212,6 +275,7 @@ def get_torchbench_gpu_config(
gpu_zone: resource.Zone,
time_out_in_min: int,
nvidia_driver_version: str = "525.125.06",
test_version: VERSION = VERSION.NIGHTLY,
model_name: str = "",
extraFlags: str = "",
) -> task.GpuCreateResourceTask:
Expand All @@ -221,7 +285,9 @@ def get_torchbench_gpu_config(
dataset_name=metric_config.DatasetOption.BENCHMARK_DATASET,
)

set_up_cmds = set_up_torchbench_gpu(model_name, nvidia_driver_version)
set_up_cmds = set_up_torchbench_gpu(
model_name, nvidia_driver_version, test_version
)
local_output_location = "/tmp/xla/benchmarks/output/metric_report.jsonl"

if not model_name or model_name.lower() == "all":
Expand Down
135 changes: 135 additions & 0 deletions dags/pytorch_xla/pytorchxla-torchbench-release.py
@@ -0,0 +1,135 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A DAG to run all TorchBench tests with a release version."""

from airflow import models
import datetime
from dags import composer_env
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
import dags.vm_resource as resource

SCHEDULED_TIME = None


with models.DAG(
    dag_id="pytorchxla-torchbench-release",
    schedule=SCHEDULED_TIME,
    tags=["pytorchxla", "release", "torchbench"],
    start_date=datetime.datetime(2024, 1, 1),
    catchup=False,
) as dag:
    # Production sweeps every model; any other environment runs one model.
    model = "BERT_pytorch"
    if composer_env.is_prod_env():
        model = "all"
    torchbench_extra_flags = [f"--filter={model}"]
    test_version = config.VERSION.R2_2

    # The flag string is identical for every target, so build it once.
    joined_flags = " ".join(torchbench_extra_flags)

    # One entry per TPU target, in the order the tasks are created.
    tpu_targets = (
        # Running on V4-8:
        dict(
            tpu_version=resource.TpuVersion.V4,
            tpu_cores=8,
            project=resource.Project.CLOUD_ML_AUTO_SOLUTIONS,
            tpu_zone=resource.Zone.US_CENTRAL2_B,
            runtime_version=resource.RuntimeVersion.TPU_UBUNTU2204_BASE,
            time_out_in_min=1600,
        ),
        # Running on V5P
        dict(
            tpu_version=resource.TpuVersion.V5P,
            tpu_cores=8,
            project=resource.Project.TPU_PROD_ENV_AUTOMATED,
            tpu_zone=resource.Zone.US_EAST5_A,
            runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV5,
            network=resource.V5_NETWORKS,
            subnetwork=resource.V5P_SUBNETWORKS,
            time_out_in_min=700,
        ),
        # Running on V5E
        dict(
            tpu_version=resource.TpuVersion.V5E,
            tpu_cores=4,
            project=resource.Project.TPU_PROD_ENV_AUTOMATED,
            tpu_zone=resource.Zone.US_EAST1_C,
            runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV5_LITE,
            network=resource.V5_NETWORKS,
            subnetwork=resource.V5E_SUBNETWORKS,
            time_out_in_min=1600,
        ),
    )
    for tpu_kwargs in tpu_targets:
        config.get_torchbench_tpu_config(
            test_version=test_version,
            model_name=model,
            extraFlags=joined_flags,
            **tpu_kwargs,
        ).run()

    # One entry per GPU target, in the order the tasks are created. Only the
    # per-target differences are listed; the shared arguments are supplied in
    # the loop below.
    gpu_targets = (
        # Running on V100 GPU
        dict(
            machine_type=resource.MachineVersion.N1_STANDARD_8,
            accelerator_type=resource.GpuVersion.V100,
            count=1,
            gpu_zone=resource.Zone.US_CENTRAL1_C,
        ),
        # Running on A100 GPU
        dict(
            machine_type=resource.MachineVersion.A2_HIGHGPU_1G,
            accelerator_type=resource.GpuVersion.A100,
            count=1,
            gpu_zone=resource.Zone.US_CENTRAL1_F,
        ),
        # Running on H100 GPU
        # Note: H100 must use ssd.
        dict(
            machine_type=resource.MachineVersion.A3_HIGHGPU_8G,
            accelerator_type=resource.GpuVersion.H100,
            count=8,
            gpu_zone=resource.Zone.US_CENTRAL1_A,
            nvidia_driver_version="535.86.10",
        ),
        # Running on L4 GPU
        dict(
            machine_type=resource.MachineVersion.G2_STAND_4,
            accelerator_type=resource.GpuVersion.L4,
            count=1,
            gpu_zone=resource.Zone.US_CENTRAL1_C,
        ),
    )
    for gpu_kwargs in gpu_targets:
        config.get_torchbench_gpu_config(
            image_project=resource.ImageProject.DEEP_LEARNING_PLATFORM_RELEASE,
            image_family=resource.ImageFamily.COMMON_CU121_DEBIAN_11,
            test_version=test_version,
            model_name=model,
            time_out_in_min=1600,
            extraFlags=joined_flags,
            **gpu_kwargs,
        ).run()
2 changes: 1 addition & 1 deletion dags/pytorch_xla/pytorchxla_torchbench.py
Expand Up @@ -31,7 +31,7 @@
start_date=datetime.datetime(2024, 1, 1),
catchup=False,
) as dag:
model = "all"
model = "all" if composer_env.is_prod_env() else "BERT_pytorch"
torchbench_extra_flags = [f"--filter={model}"]
# Running on V4-8:
config.get_torchbench_tpu_config(
Expand Down

0 comments on commit fe9d87f

Please sign in to comment.