Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactored 2.16 release into a more reusable template #205

Merged
merged 1 commit into from Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
97 changes: 60 additions & 37 deletions dags/solutions_team/configs/tensorflow/common.py
Expand Up @@ -14,7 +14,7 @@

"""Utilities to construct common configs."""

from typing import Tuple
from __future__ import annotations


# Keras API
Expand Down Expand Up @@ -60,7 +60,7 @@
)


def set_up_se_nightly() -> Tuple[str]:
def set_up_se_nightly() -> tuple[str]:
"""Adjust grpc_tpu_worker for SE tests"""
return (
"sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:nightly\"/' /etc/systemd/system/tpu-runtime.service",
Expand All @@ -69,60 +69,83 @@ def set_up_se_nightly() -> Tuple[str]:
)


def install_tf_nightly() -> Tuple[str]:
"""Install tf nightly + libtpu."""
def install_tf(
major: Optional[int] = None,
minor: Optional[int] = None,
patch: Optional[int] = None,
libtpu_version: Optional[str] = None,
) -> tuple[str]:
"""Install tf + libtpu.
If the version numbers are set, installs that version. Otherwise just installs using nightly.
Either all of the version numbers need to be set or none of them should be set.
Args:
major (Optional[int]): The major version number
minor (Optional[int]): The minor version number
patch (Optional[int]): The patch version number
libtpu_version (Optional[str]): The libtpu version to install
"""
gs_version_str = "tf-nightly"
if any(x is not None for x in {major, minor, patch}):
msg = "All parts of a version should be specified if any of them are"
assert all(x is not None for x in {major, minor, patch}), msg
gs_version_str = f"tf-{major}-{minor}-{patch}"

libtpu_version_str = "latest"
if libtpu_version is not None:
libtpu_version_str = f"{libtpu_version}/latest"

return (
"pip install tensorflow-text-nightly",
"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-nightly/latest/*.whl /tmp/ && pip install /tmp/tf*.whl --force",
"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/latest/libtpu.so /lib/",
f"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/{gs_version_str}/latest/t*.whl /tmp/ && pip install /tmp/t*.whl --force",
f"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/{libtpu_version_str}/libtpu.so /lib/",
CMD_PRINT_TF_VERSION,
)


def install_tf_2_16() -> Tuple[str]:
"""Install tf 2.16 + libtpu."""
return (
"pip install tensorflow-text-nightly",
"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/2.16/2024-02-20/t*.whl /tmp/ && pip install /tmp/t*.whl --force",
"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/1.10.0/rc0/libtpu.so /lib/",
CMD_PRINT_TF_VERSION,
)
def set_up_keras(version: Optional[str] = None) -> tuple[str]:
"""Common set up for tensorflow Keras tests.
If a version is not set, defaults to nightly.
Args:
version(Optional[str]): The keras version to install
"""
cmd_install_keras = CMD_INSTALL_KERAS_NIGHTLY
if version is not None:
cmd_install_keras = (
f"pip install --upgrade --force-reinstall --no-deps tf-keras=={version}"
)

def set_up_tensorflow_keras() -> Tuple[str]:
"""Common set up for tensorflow Keras tests."""
return (
CMD_INSTALL_KERAS_NIGHTLY,
cmd_install_keras,
"export PATH=$PATH:/root/google-cloud-sdk/bin && cd /tmp && sudo gcloud source repos clone tf2-api-tests --project=cloud-ml-auto-solutions",
"cd /tmp/tf2-api-tests && pip install behave matplotlib",
)


def set_up_tensorflow_2_16_keras() -> Tuple[str]:
"""Common set up for tensorflow Keras 2.16 tests."""
return (
"pip install --upgrade --force-reinstall tf-keras==2.16.0rc0",
"export PATH=$PATH:/root/google-cloud-sdk/bin && cd /tmp && sudo gcloud source repos clone tf2-api-tests --project=cloud-ml-auto-solutions",
"cd /tmp/tf2-api-tests && pip install behave matplotlib",
)
def set_up_tensorflow_models(
models_branch: Optional[str] = None,
keras_version: Optional[str] = None,
) -> tuple[str]:
"""Common set up for tensorflow models for the release.
If any versions are not set, defaults to nightly.
def set_up_google_tensorflow_models() -> Tuple[str]:
"""Common set up for tensorflow models."""
return (
'if [ ! -d "/usr/share/tpu/models" ]; then sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git; fi',
"pip install -r /usr/share/tpu/models/official/requirements.txt",
"pip install tensorflow-recommenders --no-deps",
CMD_INSTALL_KERAS_NIGHTLY,
)
Args:
models_branch (Optional[str]): The models branch to use
keras_version (Optional[str]): The keras version to install
"""
if models_branch is None:
models_branch = "master"

cmd_install_keras = CMD_INSTALL_KERAS_NIGHTLY
if keras_version is not None:
cmd_install_keras = f"pip install --upgrade --force-reinstall --no-deps tf-keras=={keras_version}"

def set_up_google_tensorflow_2_16_models() -> Tuple[str]:
"""Common set up for tensorflow models."""
return (
"sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu",
"sudo git clone -b r2.16.0 https://github.com/tensorflow/models.git",
f'if [ ! -d "/usr/share/tpu/models" ]; then sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && sudo git clone -b {models_branch} https://github.com/tensorflow/models.git; fi',
"pip install -r /usr/share/tpu/models/official/requirements.txt",
"pip install tensorflow-recommenders --no-deps",
"pip install --upgrade --force-reinstall tf-keras==2.16.0rc0",
cmd_install_keras,
)
Expand Up @@ -42,7 +42,7 @@ def get_tf_keras_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = common.install_tf_nightly() + common.set_up_tensorflow_keras()
set_up_cmds = common.set_up_keras() + common.install_tf()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()
keras_test_name = f"tf_keras_api_{test_name}"
Expand Down Expand Up @@ -110,9 +110,7 @@ def get_tf_resnet_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = (
common.install_tf_nightly() + common.set_up_google_tensorflow_models()
)
set_up_cmds = common.set_up_tensorflow_models() + common.install_tf()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()

Expand Down Expand Up @@ -201,9 +199,7 @@ def get_tf_dlrm_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = (
common.install_tf_nightly() + common.set_up_google_tensorflow_models()
)
set_up_cmds = common.set_up_tensorflow_models() + common.install_tf()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()

Expand Down
Expand Up @@ -14,12 +14,23 @@

"""Utilities to construct configs for solutionsteam_tf_nightly_supported DAG."""

from __future__ import annotations

from xlml.apis import gcp_config, metric_config, task, test_config
from dags import gcs_bucket, test_owner
from dags.solutions_team.configs.tensorflow import common
from airflow.models import Variable
from dags.vm_resource import TpuVersion, Project, RuntimeVersion
from typing import List


MAJOR_VERSION = "2"
EricLeFort marked this conversation as resolved.
Show resolved Hide resolved
MINOR_VERSION = "15"
PATCH_VERSION = "1"
LIBTPU_VERSION = "1.9.0"
KERAS_VERSION = "2.15.1"
MODELS_BRANCH = "r2.15.0"

GS_VERSION_STR = f"tf-{MAJOR_VERSION}-{MINOR_VERSION}-{PATCH_VERSION}"


def get_tf_keras_config(
Expand All @@ -42,10 +53,13 @@ def get_tf_keras_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = common.install_tf_2_16() + common.set_up_tensorflow_2_16_keras()
set_up_cmds = common.set_up_keras(KERAS_VERSION)
set_up_cmds += common.install_tf(
MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION, LIBTPU_VERSION
)
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()
keras_test_name = f"tf_2_16_keras_api_{test_name}"
keras_test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_keras_api_{test_name}"
benchmark_id = f"{keras_test_name}-v{tpu_version.value}-{tpu_cores}"
# Add default_var to pass DAG check
# TODO(ranran): replace Variable.get() with XCom when it applies
Expand Down Expand Up @@ -110,8 +124,9 @@ def get_tf_resnet_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = (
common.install_tf_2_16() + common.set_up_google_tensorflow_2_16_models()
set_up_cmds = common.set_up_tensorflow_models(MODELS_BRANCH, KERAS_VERSION)
set_up_cmds += common.install_tf(
MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION, LIBTPU_VERSION
)
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()
Expand All @@ -136,7 +151,7 @@ def get_tf_resnet_config(
},
}

test_name = "tf_2_16_resnet_imagenet"
test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_resnet_imagenet"
benchmark_id = f"{test_name}-v{tpu_version.value}-{tpu_cores}"
# Add default_var to pass DAG check
# TODO(ranran): replace Variable.get() with XCom when it applies
Expand Down Expand Up @@ -183,7 +198,7 @@ def get_tf_dlrm_config(
tpu_cores: int,
tpu_zone: str,
time_out_in_min: int,
bottom_mlp: List[int],
bottom_mlp: list[int],
embedding_dim: int,
train_steps: int,
runtime_version: str,
Expand All @@ -201,8 +216,9 @@ def get_tf_dlrm_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = (
common.install_tf_2_16() + common.set_up_google_tensorflow_2_16_models()
set_up_cmds = common.set_up_tensorflow_models(MODELS_BRANCH, KERAS_VERSION)
set_up_cmds += common.install_tf(
MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION, LIBTPU_VERSION
)
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()
Expand Down Expand Up @@ -273,7 +289,7 @@ def get_tf_dlrm_config(
},
}

test_name = "tf_2_16_dlrm_criteo"
test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_dlrm_criteo"
benchmark_id = f"{test_name}-v{tpu_version.value}-{tpu_cores}"
# Add default_var to pass DAG check
# TODO(ranran): replace Variable.get() with XCom when it applies
Expand Down
Expand Up @@ -18,17 +18,19 @@
from airflow import models
from dags import composer_env
from dags.vm_resource import TpuVersion, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_2_16_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_release_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import common


# Release tests only need to run once, they can be run manually as needed
SCHEDULED_TIME = None
VERSION = f"{tf_config.MAJOR_VERSION}.{tf_config.MINOR_VERSION}"


with models.DAG(
dag_id="tf_2_16_se_nightly_supported",
dag_id=f"tf_{tf_config.MAJOR_VERSION}_{tf_config.MINOR_VERSION}_se_nightly_supported",
EricLeFort marked this conversation as resolved.
Show resolved Hide resolved
schedule=SCHEDULED_TIME,
tags=["solutions_team", "tf", "se", "2.16", "supported", "xlml"],
tags=["solutions_team", "tf", "se", VERSION, "supported", "xlml"],
start_date=datetime.datetime(2024, 1, 4),
catchup=False,
) as dag:
Expand Down
Expand Up @@ -18,18 +18,19 @@
from airflow import models
from dags import composer_env
from dags.vm_resource import TpuVersion, Project, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_2_16_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_release_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import common


# Release tests only need to run once, they can be run manually as needed
SCHEDULED_TIME = None
VERSION = f"{tf_config.MAJOR_VERSION}.{tf_config.MINOR_VERSION}"


with models.DAG(
dag_id="tf_2_16_nightly_supported",
dag_id=f"tf_{tf_config.MAJOR_VERSION}_{tf_config.MINOR_VERSION}_nightly_supported",
schedule=SCHEDULED_TIME,
tags=["solutions_team", "tf", "2.16", "supported", "xlml"],
tags=["solutions_team", "tf", VERSION, "supported", "xlml"],
start_date=datetime.datetime(2023, 8, 16),
catchup=False,
) as dag:
Expand Down