Skip to content

Commit

Permalink
Refactored 2.16 release into a more reusable template
Browse files Browse the repository at this point in the history
  • Loading branch information
EricLeFort committed Mar 16, 2024
1 parent b3a8354 commit d390848
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 53 deletions.
40 changes: 5 additions & 35 deletions dags/solutions_team/configs/tensorflow/common.py
Expand Up @@ -14,7 +14,7 @@

"""Utilities to construct common configs."""

from typing import Tuple
from __future__ import annotations


# Keras API
Expand Down Expand Up @@ -60,7 +60,7 @@
)


def set_up_se_nightly() -> Tuple[str]:
def set_up_se_nightly() -> tuple[str]:
"""Adjust grpc_tpu_worker for SE tests"""
return (
"sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:nightly\"/' /etc/systemd/system/tpu-runtime.service",
Expand All @@ -69,7 +69,7 @@ def set_up_se_nightly() -> Tuple[str]:
)


def install_tf_nightly() -> Tuple[str]:
def install_tf_nightly() -> tuple[str]:
"""Install tf nightly + libtpu."""
return (
"pip install tensorflow-text-nightly",
Expand All @@ -79,17 +79,7 @@ def install_tf_nightly() -> Tuple[str]:
)


def install_tf_2_16() -> Tuple[str, ...]:
    """Build the shell commands that install TF 2.16 and libtpu.

    Returns:
        A tuple of command strings, in tuple order: install
        tensorflow-text-nightly, copy the 2.16 wheel(s) matching ``t*.whl``
        from GCS into /tmp and force-install them, copy ``libtpu.so`` into
        /lib/, and finally ``CMD_PRINT_TF_VERSION``.
    """
    return (
        "pip install tensorflow-text-nightly",
        "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/2.16/2024-02-20/t*.whl /tmp/ && pip install /tmp/t*.whl --force",
        "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/1.10.0/rc0/libtpu.so /lib/",
        CMD_PRINT_TF_VERSION,
    )


def set_up_tensorflow_keras() -> Tuple[str]:
def set_up_tensorflow_keras() -> tuple[str]:
"""Common set up for tensorflow Keras tests."""
return (
CMD_INSTALL_KERAS_NIGHTLY,
Expand All @@ -98,31 +88,11 @@ def set_up_tensorflow_keras() -> Tuple[str]:
)


def set_up_tensorflow_2_16_keras() -> Tuple[str, ...]:
    """Build the shell commands that prepare the Keras 2.16 API tests.

    Returns:
        A tuple of three command strings: force-reinstall
        ``tf-keras==2.16.0rc0``, clone the ``tf2-api-tests`` source repo into
        /tmp, and install the ``behave`` and ``matplotlib`` packages.
    """
    return (
        "pip install --upgrade --force-reinstall tf-keras==2.16.0rc0",
        "export PATH=$PATH:/root/google-cloud-sdk/bin && cd /tmp && sudo gcloud source repos clone tf2-api-tests --project=cloud-ml-auto-solutions",
        "cd /tmp/tf2-api-tests && pip install behave matplotlib",
    )


def set_up_google_tensorflow_models() -> Tuple[str]:
def set_up_google_tensorflow_models() -> tuple[str]:
"""Common set up for tensorflow models."""
return (
'if [ ! -d "/usr/share/tpu/models" ]; then sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git; fi',
"pip install -r /usr/share/tpu/models/official/requirements.txt",
"pip install tensorflow-recommenders --no-deps",
CMD_INSTALL_KERAS_NIGHTLY,
)


def set_up_google_tensorflow_2_16_models() -> Tuple[str, ...]:
    """Build the shell commands that set up tensorflow models for TF 2.16.

    Returns:
        A tuple of five command strings: create /usr/share/tpu and cd into it,
        clone the ``r2.16.0`` branch of tensorflow/models, install the models'
        official requirements, install tensorflow-recommenders without its
        dependencies, and force-reinstall ``tf-keras==2.16.0rc0``.
    """
    return (
        "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu",
        "sudo git clone -b r2.16.0 https://github.com/tensorflow/models.git",
        "pip install -r /usr/share/tpu/models/official/requirements.txt",
        "pip install tensorflow-recommenders --no-deps",
        "pip install --upgrade --force-reinstall tf-keras==2.16.0rc0",
    )
Expand Up @@ -14,12 +14,51 @@

"""Utilities to construct configs for solutionsteam_tf_nightly_supported DAG."""

from __future__ import annotations

from xlml.apis import gcp_config, metric_config, task, test_config
from dags import gcs_bucket, test_owner
from dags.solutions_team.configs.tensorflow import common
from airflow.models import Variable
from dags.vm_resource import TpuVersion, Project, RuntimeVersion
from typing import List


# Release coordinates for the TensorFlow version under test.
# MAJOR_VERSION and MINOR_VERSION are interpolated into the GCS wheel path
# and into the generated test names.
MAJOR_VERSION = "2"
MINOR_VERSION = "16"
# NOTE(review): PATCH_VERSION is not referenced in the visible part of this
# file — confirm whether it is used elsewhere.
PATCH_VERSION = "1"
# Version segment of the libtpu.so artifact path.
LIBTPU_VERSION = "1.10.0"
# Exact tf-keras version pinned by the pip install commands.
KERAS_VERSION = "2.16.0rc0"
# Branch of tensorflow/models that is cloned for the model tests.
MODELS_BRANCH = "r2.16.0"


def install_tf() -> tuple[str, ...]:
    """Build the shell commands that install the release TF build and libtpu.

    The wheel path is derived from MAJOR_VERSION and MINOR_VERSION and the
    libtpu path from LIBTPU_VERSION.

    Returns:
        A tuple of command strings: install tensorflow-text-nightly, copy and
        force-install the release wheel(s), copy ``libtpu.so`` into /lib/, and
        finally ``common.CMD_PRINT_TF_VERSION``.
    """
    # NOTE(review): the "2024-02-20" build date and the "rc0" libtpu segment
    # are still hard-coded here; they presumably need updating per release.
    return (
        "pip install tensorflow-text-nightly",
        f"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/{MAJOR_VERSION}.{MINOR_VERSION}/2024-02-20/t*.whl /tmp/ && pip install /tmp/t*.whl --force",
        f"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/{LIBTPU_VERSION}/rc0/libtpu.so /lib/",
        common.CMD_PRINT_TF_VERSION,
    )


def set_up_keras(keras_version: str | None = None) -> tuple[str, ...]:
    """Build the shell commands that prepare the Keras API tests for a release.

    Args:
        keras_version: tf-keras version to pin. When None (the default), the
            module-level KERAS_VERSION is used, so existing zero-argument
            callers are unaffected.

    Returns:
        A tuple of three command strings: force-reinstall the pinned tf-keras,
        clone the ``tf2-api-tests`` source repo into /tmp, and install the
        ``behave`` and ``matplotlib`` packages.
    """
    if keras_version is None:
        keras_version = KERAS_VERSION
    return (
        f"pip install --upgrade --force-reinstall tf-keras=={keras_version}",
        "export PATH=$PATH:/root/google-cloud-sdk/bin && cd /tmp && sudo gcloud source repos clone tf2-api-tests --project=cloud-ml-auto-solutions",
        "cd /tmp/tf2-api-tests && pip install behave matplotlib",
    )


def set_up_tensorflow_models(
    models_branch: str | None = None,
    keras_version: str | None = None,
) -> tuple[str, ...]:
    """Build the shell commands that set up tensorflow models for a release.

    Args:
        models_branch: Branch of tensorflow/models to clone. When None (the
            default), the module-level MODELS_BRANCH is used.
        keras_version: tf-keras version to pin. When None (the default), the
            module-level KERAS_VERSION is used.

    Returns:
        A tuple of five command strings: create /usr/share/tpu and cd into it,
        clone the requested branch of tensorflow/models, install the models'
        official requirements, install tensorflow-recommenders without its
        dependencies, and force-reinstall the pinned tf-keras.
    """
    if models_branch is None:
        models_branch = MODELS_BRANCH
    if keras_version is None:
        keras_version = KERAS_VERSION
    return (
        "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu",
        f"sudo git clone -b {models_branch} https://github.com/tensorflow/models.git",
        "pip install -r /usr/share/tpu/models/official/requirements.txt",
        "pip install tensorflow-recommenders --no-deps",
        f"pip install --upgrade --force-reinstall tf-keras=={keras_version}",
    )


def get_tf_keras_config(
Expand All @@ -42,10 +81,10 @@ def get_tf_keras_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = common.install_tf_2_16() + common.set_up_tensorflow_2_16_keras()
set_up_cmds = install_tf() + set_up_keras()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()
keras_test_name = f"tf_2_16_keras_api_{test_name}"
keras_test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_keras_api_{test_name}"
benchmark_id = f"{keras_test_name}-v{tpu_version.value}-{tpu_cores}"
# Add default_var to pass DAG check
# TODO(ranran): replace Variable.get() to XCOM when it applies
Expand Down Expand Up @@ -110,9 +149,7 @@ def get_tf_resnet_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = (
common.install_tf_2_16() + common.set_up_google_tensorflow_2_16_models()
)
set_up_cmds = install_tf() + set_up_tensorflow_models()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()

Expand All @@ -136,7 +173,7 @@ def get_tf_resnet_config(
},
}

test_name = "tf_2_16_resnet_imagenet"
test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_resnet_imagenet"
benchmark_id = f"{test_name}-v{tpu_version.value}-{tpu_cores}"
# Add default_var to pass DAG check
# TODO(ranran): replace Variable.get() to XCOM when it applies
Expand Down Expand Up @@ -183,7 +220,7 @@ def get_tf_dlrm_config(
tpu_cores: int,
tpu_zone: str,
time_out_in_min: int,
bottom_mlp: List[int],
bottom_mlp: list[int],
embedding_dim: int,
train_steps: int,
runtime_version: str,
Expand All @@ -201,9 +238,7 @@ def get_tf_dlrm_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = (
common.install_tf_2_16() + common.set_up_google_tensorflow_2_16_models()
)
set_up_cmds = install_tf() + set_up_tensorflow_models()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()

Expand Down Expand Up @@ -273,7 +308,7 @@ def get_tf_dlrm_config(
},
}

test_name = "tf_2_16_dlrm_criteo"
test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_dlrm_criteo"
benchmark_id = f"{test_name}-v{tpu_version.value}-{tpu_cores}"
# Add default_var to pass DAG check
# TODO(ranran): replace Variable.get() to XCOM when it applies
Expand Down
Expand Up @@ -18,17 +18,19 @@
from airflow import models
from dags import composer_env
from dags.vm_resource import TpuVersion, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_2_16_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_release_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import common


# Release tests only need to run once; they can be run manually as needed.
SCHEDULED_TIME = None
VERSION = f"{tf_config.MAJOR_VERSION}.{tf_config.MINOR_VERSION}"


with models.DAG(
dag_id="tf_2_16_se_nightly_supported",
dag_id=f"tf_{tf_config.MAJOR_VERSION}_{tf_config.MINOR_VERSION}_se_nightly_supported",
schedule=SCHEDULED_TIME,
tags=["solutions_team", "tf", "se", "2.16", "supported", "xlml"],
tags=["solutions_team", "tf", "se", VERSION, "supported", "xlml"],
start_date=datetime.datetime(2024, 1, 4),
catchup=False,
) as dag:
Expand Down
Expand Up @@ -18,18 +18,19 @@
from airflow import models
from dags import composer_env
from dags.vm_resource import TpuVersion, Project, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_2_16_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import solutionsteam_tf_release_supported_config as tf_config
from dags.solutions_team.configs.tensorflow import common


# Release tests only need to run once; they can be run manually as needed.
SCHEDULED_TIME = None
VERSION = f"{tf_config.MAJOR_VERSION}.{tf_config.MINOR_VERSION}"


with models.DAG(
dag_id="tf_2_16_nightly_supported",
dag_id=f"tf_{tf_config.MAJOR_VERSION}_{tf_config.MINOR_VERSION}_nightly_supported",
schedule=SCHEDULED_TIME,
tags=["solutions_team", "tf", "2.16", "supported", "xlml"],
tags=["solutions_team", "tf", VERSION, "supported", "xlml"],
start_date=datetime.datetime(2023, 8, 16),
catchup=False,
) as dag:
Expand Down

0 comments on commit d390848

Please sign in to comment.