From 6047b27ff8ae20fa33ee452b4d8914dcfe9efd40 Mon Sep 17 00:00:00 2001 From: Eric Le Fort Date: Mon, 18 Mar 2024 21:00:11 +0000 Subject: [PATCH] Refactored 2.16 release into a more reusable template and updated to 2.15.1 --- .../configs/tensorflow/common.py | 102 +++++++++++------- ...lutionsteam_tf_nightly_supported_config.py | 10 +- ...utionsteam_tf_release_supported_config.py} | 36 +++++-- ... solutionsteam_tf_release_se_supported.py} | 8 +- ... => solutionsteam_tf_release_supported.py} | 7 +- 5 files changed, 103 insertions(+), 60 deletions(-) rename dags/solutions_team/configs/tensorflow/{solutionsteam_tf_2_16_supported_config.py => solutionsteam_tf_release_supported_config.py} (90%) rename dags/solutions_team/{solutionsteam_tf_2_16_se_supported.py => solutionsteam_tf_release_se_supported.py} (95%) rename dags/solutions_team/{solutionsteam_tf_2_16_supported.py => solutionsteam_tf_release_supported.py} (95%) diff --git a/dags/solutions_team/configs/tensorflow/common.py b/dags/solutions_team/configs/tensorflow/common.py index 53b4da79..4e03f5db 100644 --- a/dags/solutions_team/configs/tensorflow/common.py +++ b/dags/solutions_team/configs/tensorflow/common.py @@ -14,7 +14,7 @@ """Utilities to construct common configs.""" -from typing import Tuple +from __future__ import annotations # Keras API @@ -60,7 +60,7 @@ ) -def set_up_se_nightly() -> Tuple[str]: +def set_up_se_nightly() -> tuple[str]: """Adjust grpc_tpu_worker for SE tests""" return ( "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:nightly\"/' /etc/systemd/system/tpu-runtime.service", @@ -69,60 +69,88 @@ def set_up_se_nightly() -> Tuple[str]: ) -def install_tf_nightly() -> Tuple[str]: - """Install tf nightly + libtpu.""" +def install_tf( + major: Optional[int] = None, + minor: Optional[int] = None, + patch: Optional[int] = None, + libtpu_version: Optional[str] = None, +) -> tuple[str]: + """Install tf + libtpu. 
+ + If the version numbers are set, installs that version. Otherwise just installs using nightly. + Either all of the version numbers need to be set or none of them should be set. + + Args: + major (Optional[int]): The major version number + minor (Optional[int]): The minor version number + patch (Optional[int]): The patch version number + libtpu_version (Optional[str]): The libtpu version to install + """ + gs_version_str = "tf-nightly" + if any(x is not None for x in {major, minor, patch}): + msg = "All parts of a version should be specified if any of them are" + assert all(x is not None for x in {major, minor, patch}), msg + gs_version_str = f"tf-{major}-{minor}-{patch}" + + libtpu_version_str = "latest" + if libtpu_version is not None: + libtpu_version_str = f"{libtpu_version}/latest" + return ( "pip install tensorflow-text-nightly", - "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-nightly/latest/*.whl /tmp/ && pip install /tmp/tf*.whl --force", - "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/latest/libtpu.so /lib/", + f"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/{gs_version_str}/latest/t*.whl /tmp/ && pip install /tmp/t*.whl --force", + f"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/{libtpu_version_str}/libtpu.so /lib/", CMD_PRINT_TF_VERSION, ) -def install_tf_2_16() -> Tuple[str]: - """Install tf 2.16 + libtpu.""" - return ( - "pip install tensorflow-text-nightly", - "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/2.16/2024-02-20/t*.whl /tmp/ && pip install /tmp/t*.whl --force", - "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/1.10.0/rc0/libtpu.so /lib/", - CMD_PRINT_TF_VERSION, - ) +def set_up_keras(version: Optional[str] = None) -> tuple[str]: + """Common set up for tensorflow Keras tests. + + If a version is not set, defaults to nightly. 
+ Args: + version(Optional[str]): The keras version to install + """ + cmd_install_keras = CMD_INSTALL_KERAS_NIGHTLY + if version is not None: + cmd_install_keras = ( + f"pip install --upgrade --force-reinstall --no-deps tf-keras=={version}" + ) -def set_up_tensorflow_keras() -> Tuple[str]: - """Common set up for tensorflow Keras tests.""" return ( - CMD_INSTALL_KERAS_NIGHTLY, + cmd_install_keras, "export PATH=$PATH:/root/google-cloud-sdk/bin && cd /tmp && sudo gcloud source repos clone tf2-api-tests --project=cloud-ml-auto-solutions", "cd /tmp/tf2-api-tests && pip install behave matplotlib", ) -def set_up_tensorflow_2_16_keras() -> Tuple[str]: - """Common set up for tensorflow Keras 2.16 tests.""" - return ( - "pip install --upgrade --force-reinstall tf-keras==2.16.0rc0", - "export PATH=$PATH:/root/google-cloud-sdk/bin && cd /tmp && sudo gcloud source repos clone tf2-api-tests --project=cloud-ml-auto-solutions", - "cd /tmp/tf2-api-tests && pip install behave matplotlib", - ) +def set_up_tensorflow_models( + models_branch: Optional[str] = None, + keras_version: Optional[str] = None, +) -> tuple[str]: + """Common set up for tensorflow models for the release. + If any versions are not set, defaults to nightly. -def set_up_google_tensorflow_models() -> Tuple[str]: - """Common set up for tensorflow models.""" - return ( - 'if [ ! 
-d "/usr/share/tpu/models" ]; then sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git; fi', - "pip install -r /usr/share/tpu/models/official/requirements.txt", - "pip install tensorflow-recommenders --no-deps", - CMD_INSTALL_KERAS_NIGHTLY, - ) + Args: + models_branch (Optional[str]): The models branch to use + """ + if models_branch is None: + models_branch = "master" + cmd_install_keras = CMD_INSTALL_KERAS_NIGHTLY + if keras_version is not None: + cmd_install_keras = f"pip install --upgrade --force-reinstall --no-deps tf-keras=={keras_version}" -def set_up_google_tensorflow_2_16_models() -> Tuple[str]: - """Common set up for tensorflow models.""" + print("models branch ERIC") + print(models_branch) + print( + f'if [ ! -d "/usr/share/tpu/models" ]; then sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone -b {models_branch} https://github.com/tensorflow/models.git; fi' + ) return ( - "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu", - "sudo git clone -b r2.16.0 https://github.com/tensorflow/models.git", + f'if [ ! 
-d "/usr/share/tpu/models" ]; then sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && sudo git clone -b {models_branch} https://github.com/tensorflow/models.git; fi', "pip install -r /usr/share/tpu/models/official/requirements.txt", "pip install tensorflow-recommenders --no-deps", - "pip install --upgrade --force-reinstall tf-keras==2.16.0rc0", + cmd_install_keras, ) diff --git a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py index a2477161..ae95a764 100644 --- a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py +++ b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_nightly_supported_config.py @@ -42,7 +42,7 @@ def get_tf_keras_config( dataset_name=metric_config.DatasetOption.XLML_DATASET, ) - set_up_cmds = common.install_tf_nightly() + common.set_up_tensorflow_keras() + set_up_cmds = common.set_up_keras() + common.install_tf() if not is_pjrt and is_pod: set_up_cmds += common.set_up_se_nightly() keras_test_name = f"tf_keras_api_{test_name}" @@ -110,9 +110,7 @@ def get_tf_resnet_config( dataset_name=metric_config.DatasetOption.XLML_DATASET, ) - set_up_cmds = ( - common.install_tf_nightly() + common.set_up_google_tensorflow_models() - ) + set_up_cmds = common.set_up_tensorflow_models() + common.install_tf() if not is_pjrt and is_pod: set_up_cmds += common.set_up_se_nightly() @@ -201,9 +199,7 @@ def get_tf_dlrm_config( dataset_name=metric_config.DatasetOption.XLML_DATASET, ) - set_up_cmds = ( - common.install_tf_nightly() + common.set_up_google_tensorflow_models() - ) + set_up_cmds = common.set_up_tensorflow_models() + common.install_tf() if not is_pjrt and is_pod: set_up_cmds += common.set_up_se_nightly() diff --git a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_2_16_supported_config.py b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_release_supported_config.py similarity index 90% 
rename from dags/solutions_team/configs/tensorflow/solutionsteam_tf_2_16_supported_config.py rename to dags/solutions_team/configs/tensorflow/solutionsteam_tf_release_supported_config.py index 3936cff7..cc06956d 100644 --- a/dags/solutions_team/configs/tensorflow/solutionsteam_tf_2_16_supported_config.py +++ b/dags/solutions_team/configs/tensorflow/solutionsteam_tf_release_supported_config.py @@ -14,12 +14,23 @@ """Utilities to construct configs for solutionsteam_tf_nightly_supported DAG.""" +from __future__ import annotations + from xlml.apis import gcp_config, metric_config, task, test_config from dags import gcs_bucket, test_owner from dags.solutions_team.configs.tensorflow import common from airflow.models import Variable from dags.vm_resource import TpuVersion, Project, RuntimeVersion -from typing import List + + +MAJOR_VERSION = "2" +MINOR_VERSION = "15" +PATCH_VERSION = "1" +LIBTPU_VERSION = "1.9.0" +KERAS_VERSION = "2.15.1" +MODELS_BRANCH = "r2.15.0" + +GS_VERSION_STR = f"tf-{MAJOR_VERSION}-{MINOR_VERSION}-{PATCH_VERSION}" def get_tf_keras_config( @@ -42,10 +53,13 @@ def get_tf_keras_config( dataset_name=metric_config.DatasetOption.XLML_DATASET, ) - set_up_cmds = common.install_tf_2_16() + common.set_up_tensorflow_2_16_keras() + set_up_cmds = common.set_up_keras(KERAS_VERSION) + set_up_cmds += common.install_tf( + MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION, LIBTPU_VERSION + ) if not is_pjrt and is_pod: set_up_cmds += common.set_up_se_nightly() - keras_test_name = f"tf_2_16_keras_api_{test_name}" + keras_test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_keras_api_{test_name}" benchmark_id = f"{keras_test_name}-v{tpu_version.value}-{tpu_cores}" # Add default_var to pass DAG check # TODO(ranran): replace Variable.get() to XCOM when it applies @@ -110,8 +124,9 @@ def get_tf_resnet_config( dataset_name=metric_config.DatasetOption.XLML_DATASET, ) - set_up_cmds = ( - common.install_tf_2_16() + common.set_up_google_tensorflow_2_16_models() + set_up_cmds = 
common.set_up_tensorflow_models(MODELS_BRANCH, KERAS_VERSION) + set_up_cmds += common.install_tf( + MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION, LIBTPU_VERSION ) if not is_pjrt and is_pod: set_up_cmds += common.set_up_se_nightly() @@ -136,7 +151,7 @@ def get_tf_resnet_config( }, } - test_name = "tf_2_16_resnet_imagenet" + test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_resnet_imagenet" benchmark_id = f"{test_name}-v{tpu_version.value}-{tpu_cores}" # Add default_var to pass DAG check # TODO(ranran): replace Variable.get() to XCOM when it applies @@ -183,7 +198,7 @@ def get_tf_dlrm_config( tpu_cores: int, tpu_zone: str, time_out_in_min: int, - bottom_mlp: List[int], + bottom_mlp: list[int], embedding_dim: int, train_steps: int, runtime_version: str, @@ -201,8 +216,9 @@ def get_tf_dlrm_config( dataset_name=metric_config.DatasetOption.XLML_DATASET, ) - set_up_cmds = ( - common.install_tf_2_16() + common.set_up_google_tensorflow_2_16_models() + set_up_cmds = common.set_up_tensorflow_models(MODELS_BRANCH, KERAS_VERSION) + set_up_cmds += common.install_tf( + MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION, LIBTPU_VERSION ) if not is_pjrt and is_pod: set_up_cmds += common.set_up_se_nightly() @@ -273,7 +289,7 @@ def get_tf_dlrm_config( }, } - test_name = "tf_2_16_dlrm_criteo" + test_name = f"tf_{MAJOR_VERSION}_{MINOR_VERSION}_dlrm_criteo" benchmark_id = f"{test_name}-v{tpu_version.value}-{tpu_cores}" # Add default_var to pass DAG check # TODO(ranran): replace Variable.get() to XCOM when it applies diff --git a/dags/solutions_team/solutionsteam_tf_2_16_se_supported.py b/dags/solutions_team/solutionsteam_tf_release_se_supported.py similarity index 95% rename from dags/solutions_team/solutionsteam_tf_2_16_se_supported.py rename to dags/solutions_team/solutionsteam_tf_release_se_supported.py index d0e6ce0b..9a8361b4 100644 --- a/dags/solutions_team/solutionsteam_tf_2_16_se_supported.py +++ b/dags/solutions_team/solutionsteam_tf_release_se_supported.py @@ -18,17 +18,19 @@ from 
airflow import models from dags import composer_env from dags.vm_resource import TpuVersion, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS -from dags.solutions_team.configs.tensorflow import solutionsteam_tf_2_16_supported_config as tf_config +from dags.solutions_team.configs.tensorflow import solutionsteam_tf_release_supported_config as tf_config from dags.solutions_team.configs.tensorflow import common # Release tests only need to run once, they can be run manually as needed SCHEDULED_TIME = None +VERSION = f"{tf_config.MAJOR_VERSION}.{tf_config.MINOR_VERSION}" + with models.DAG( - dag_id="tf_2_16_se_nightly_supported", + dag_id=f"tf_{tf_config.MAJOR_VERSION}_{tf_config.MINOR_VERSION}_se_nightly_supported", schedule=SCHEDULED_TIME, - tags=["solutions_team", "tf", "se", "2.16", "supported", "xlml"], + tags=["solutions_team", "tf", "se", VERSION, "supported", "xlml"], start_date=datetime.datetime(2024, 1, 4), catchup=False, ) as dag: diff --git a/dags/solutions_team/solutionsteam_tf_2_16_supported.py b/dags/solutions_team/solutionsteam_tf_release_supported.py similarity index 95% rename from dags/solutions_team/solutionsteam_tf_2_16_supported.py rename to dags/solutions_team/solutionsteam_tf_release_supported.py index 79198884..b05aeab8 100644 --- a/dags/solutions_team/solutionsteam_tf_2_16_supported.py +++ b/dags/solutions_team/solutionsteam_tf_release_supported.py @@ -18,18 +18,19 @@ from airflow import models from dags import composer_env from dags.vm_resource import TpuVersion, Project, Zone, RuntimeVersion, V5_NETWORKS, V5E_SUBNETWORKS, V5P_SUBNETWORKS -from dags.solutions_team.configs.tensorflow import solutionsteam_tf_2_16_supported_config as tf_config +from dags.solutions_team.configs.tensorflow import solutionsteam_tf_release_supported_config as tf_config from dags.solutions_team.configs.tensorflow import common # Release tests only need to run once, they can be run manually as needed SCHEDULED_TIME = None +VERSION = 
f"{tf_config.MAJOR_VERSION}.{tf_config.MINOR_VERSION}" with models.DAG( - dag_id="tf_2_16_nightly_supported", + dag_id=f"tf_{tf_config.MAJOR_VERSION}_{tf_config.MINOR_VERSION}_nightly_supported", schedule=SCHEDULED_TIME, - tags=["solutions_team", "tf", "2.16", "supported", "xlml"], + tags=["solutions_team", "tf", VERSION, "supported", "xlml"], start_date=datetime.datetime(2023, 8, 16), catchup=False, ) as dag: