Skip to content

CI

CI #3598

Workflow file for this run

# Do not edit this file! It has been generated by .github/gen-workflow-ci.py
name: CI

# Events that start this workflow.
on:
  schedule:
    # run a build on master (this does not publish test results or cancel concurrent builds)
    - cron: '0 10 * * *'  # every day at 10:00 (GitHub evaluates schedules in UTC)
  push:
    # only consider push to master, hotfix-branches, and tags
    # otherwise modify job.config.outputs.push
    branches:
      - 'master'
      - 'hotfix-*'
    tags:
      - 'v*.*.*'
  pull_request:
    # only consider pull requests into master
    branches:
      - master
  workflow_dispatch:

# Start from an empty permission set for the workflow's GITHUB_TOKEN.
permissions: {}
concurrency:
  # The group key decides which in-flight runs cancel each other:
  # - no concurrent builds on a branch (pull_request)
  # - no concurrent builds on the same commit on master (push)
  # - no concurrent builds on the same commit on a tag (push)
  # - concurrent runs on the same commit on master and its tag are allowed (push)
  # - concurrent runs on the same commit on master (push) and a scheduled build (schedule) are allowed
  #
  # A pull_request event only runs on a branch commit, a push event only on a master or tag commit.
  # A schedule event only runs on the master HEAD commit.
  #
  # github.ref is something like refs/heads/master or refs/tags/v0.22.1 or the branch.
  # Including it keeps runs on master and on a tag apart even when they share a commit.
  # github.head_ref is the branch of the pull request.
  # On master, github.head_ref is empty, so the commit SHA is used instead: individual commits
  # to master never cancel each other, while a branch can only have one build at a time.
  #
  # The event name is part of the key so that scheduled and master builds can run concurrently.
  group: 'ci-${{ github.event_name }}-${{ github.ref }}-${{ github.head_ref || github.sha }}'
  cancel-in-progress: true
jobs:
event_file:
  # Stores the payload file of the triggering event (github.event_path) as an
  # artifact named "Event File".
  # NOTE(review): the consumer of this artifact is not visible in this file;
  # confirm it can read artifacts produced by upload-artifact v4.
  name: "Event File"
  runs-on: ubuntu-latest
  steps:
    - name: Upload
      # v3 of this action has been retired by GitHub and no longer uploads; use v4.
      uses: actions/upload-artifact@v4
      with:
        name: Event File
        path: ${{ github.event_path }}
setup-py:
  # Builds a source distribution with setup.py and installs it with pip.
  name: "setup.py"
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        # quoted so YAML keeps the version a string (a bare 3.10 would be read as the float 3.1)
        python-version: '3.8'
    - name: Test setup.py
      env:
        # presumably these make setup.py skip the respective framework / controller
        # extensions - verify against setup.py
        HOROVOD_WITHOUT_TENSORFLOW: 1
        HOROVOD_WITHOUT_PYTORCH: 1
        HOROVOD_WITHOUT_MXNET: 1
        HOROVOD_WITHOUT_GLOO: 1
        HOROVOD_WITHOUT_MPI: 1
      run: |
        python -m pip install --upgrade pip
        python -m pip install setuptools wheel
        python setup.py sdist
        # same interpreter-bound pip invocation as above, verbose to expose build output
        python -m pip -v install dist/horovod-*.tar.gz
init-workflow:
  # Gate job: verifies the generated workflow file is current, decides whether
  # builds and tests are needed, and derives the Buildkite branch label / message.
  name: "Init Workflow"
  runs-on: ubuntu-latest
  outputs:
    # false only for scheduled runs outside the horovod/horovod repository
    run-at-all: ${{ github.event_name != 'schedule' || github.repository == 'horovod/horovod' }}
    # if we don't get a clear 'false', we fall back to building and testing
    run-builds-and-tests: ${{ steps.tests.outputs.needed != 'false' }}
    buildkite-branch-label: "${{ steps.config-buildkite.outputs.branch-label }}"
    buildkite-message: "${{ steps.config-buildkite.outputs.message }}"
  steps:
    - name: Checkout
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        # quoted so YAML keeps the version a string (a bare 3.10 would be read as the float 3.1)
        python-version: '3.8'
    - name: Pip install dependencies
      run: pip install -r .github/requirements.txt
    - name: Check ci.yaml is up-to-date
      # regenerates the workflow file and fails when the result differs from what is checked in
      run: |
        python .github/gen-workflow-ci.py
        if [[ $(git diff .github/workflows/ci.yaml | wc -l) -gt 0 ]]
        then
          echo "::error::Workflow file .github/workflows/ci.yaml is out-dated, please run .github/gen-workflow-ci.py and commit changes"
          exit 1
        fi
      shell: bash
    - name: Check if tests are needed
      id: tests
      env:
        # presumably read by .github/get-changed-code-files.py - verify against that script
        GITHUB_BASE_SHA: ${{ github.event.pull_request.base.sha }}
        GITHUB_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        if [[ "${{ github.event_name }}" == "pull_request" ]]
        then
          changes="$(python .github/get-changed-code-files.py)"
          if [[ -z "$changes" ]]
          then
            echo "No code changes, no need to build and test"
            echo "needed=false" >> "$GITHUB_OUTPUT"
          else
            echo "Code changes, we need to build and test:"
            echo "$changes"
            echo "needed=true" >> "$GITHUB_OUTPUT"
          fi
        else
          echo "This is not part of a pull request, we need to build and test"
          echo "needed=true" >> "$GITHUB_OUTPUT"
        fi
    - name: Configure Buildkite Build
      id: config-buildkite
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # The pull request branch name is chosen by the PR author. It reaches the script
        # through the environment rather than being expanded into the script text, so
        # shell metacharacters in a branch name are never executed.
        BRANCH_REF: ${{ github.event.pull_request.head.ref || github.ref }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        branch="${BRANCH_REF}"
        # strip the ref prefix so only the branch or tag name remains
        branch="${branch#"refs/heads/"}"
        branch="${branch#"refs/tags/"}"
        branch_label="${branch}"
        if [[ "${{ github.event_name }}" == "schedule" ]]
        then
          # we add this label to the branch used by Buildkite to avoid it cancelling one of concurrent schedule and push builds on master
          branch_label="${branch} (schedule)"
        fi
        echo "branch-label=${branch_label}" >> "$GITHUB_OUTPUT"
        if [[ "${{ github.event_name }}" == "pull_request" ]]
        then
          head_sha="${HEAD_SHA}"
          # only the first line of the head commit's message is used
          message="$(gh api "https://api.github.com/repos/horovod/horovod/commits/${head_sha}" -q .commit.message | head -n1)"
          echo "message=${message}" >> "$GITHUB_OUTPUT"
        fi
    - name: Provide PR meta
      if: github.event_name == 'pull_request'
      # writes merge, base and head commit SHAs of the pull request into pr.json
      run: |
        rm -f pr.json
        echo -n "{" >> pr.json
        echo -n " \"merge_sha\": \"${{ github.sha }}\"," >> pr.json
        echo -n " \"base_sha\": \"${{ github.event.pull_request.base.sha }}\"," >> pr.json
        echo -n " \"head_sha\": \"${{ github.event.pull_request.head.sha }}\" " >> pr.json
        echo -n "}" >> pr.json
        cat pr.json
    - name: Upload PR meta
      # v3 of this action has been retired by GitHub and no longer uploads; use v4.
      uses: actions/upload-artifact@v4
      if: github.event_name == 'pull_request'
      with:
        name: PR Meta
        path: pr.json
build-and-test:
  # One job instance per matrix image; skipped unless init-workflow reported
  # both run-at-all and run-builds-and-tests as 'true'.
  name: 'Build and Test (${{ matrix.image }})'
  runs-on: ubuntu-latest
  needs:
    - init-workflow
  if: >
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true'
strategy:
  max-parallel: 10
  # one failing image must not cancel the builds of the other images
  fail-fast: false
  matrix:
    # Each entry names the docker-compose service to build (image), the build
    # timeout in minutes (build_timeout) and one `<Test_Id>: true` flag per test
    # step that should run for that image. Test steps are gated on these flags,
    # so a step whose flag is absent is skipped for that image.
    include:
      # --- CPU images, Gloo only ---
      - image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_8_1-mxnet1_5_1_p0-pyspark3_4_0
        Elastic_Spark_TensorFlow_Tests_2: true
        Elastic_Tests_2: true
        Gloo_Cluster_PyTests: true
        Gloo_Keras_MNIST: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_MNIST: true
        Single_Keras_MNIST: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Keras_MNIST: true
        Spark_Keras_Rossmann_Estimator: true
        Spark_Keras_Rossmann_Run: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_7-tf2_11_0-keras2_11_0-torch1_13_1-mxnet1_9_1-pyspark2_4_8
        Elastic_Spark_TensorFlow_Tests_1: true
        Elastic_Spark_Torch_Tests: true
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_7_0_p2-pyspark3_4_0
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Data_Service: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_11_1-keras2_11_0-torch1_13_1-mxnet1_8_0_p0-pyspark3_4_0
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_3_2
        Elastic_Spark_TensorFlow_Tests_1: true
        Elastic_Spark_Torch_Tests: true
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
        Elastic_Spark_TensorFlow_Tests_1: true
        Elastic_Spark_Torch_Tests: true
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      # --- CPU images with an MPI implementation ---
      - image: test-cpu-mpich-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
        MPI_Cluster_PyTests: true
        MPI_MXNet_MNIST_horovodrun: true
        MPI_Parallel_PyTests: true
        MPI_PyTorch_MNIST_api: true
        MPI_PyTorch_MNIST_horovodrun: true
        MPI_Single_PyTests: true
        MPI_TensorFlow_2_0_Keras_MNIST_api: true
        MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        MPI_TensorFlow_2_0_MNIST_api: true
        MPI_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        build_timeout: 30

      - image: test-cpu-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        MPI_Cluster_PyTests: true
        MPI_MXNet_MNIST_horovodrun: true
        MPI_Parallel_PyTests: true
        MPI_PyTorch_MNIST_api: true
        MPI_PyTorch_MNIST_horovodrun: true
        MPI_Single_PyTests: true
        MPI_TensorFlow_2_0_Keras_MNIST_api: true
        MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        MPI_TensorFlow_2_0_MNIST_api: true
        MPI_TensorFlow_2_0_MNIST_horovodrun: true
        Run_PyTests_test_interactiverun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-openmpi-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
        MPI_Cluster_PyTests: true
        MPI_MXNet_MNIST_horovodrun: true
        MPI_Parallel_PyTests: true
        MPI_PyTorch_MNIST_api: true
        MPI_PyTorch_MNIST_horovodrun: true
        MPI_Single_PyTests: true
        MPI_TensorFlow_2_0_Keras_MNIST_api: true
        MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        MPI_TensorFlow_2_0_MNIST_api: true
        MPI_TensorFlow_2_0_MNIST_horovodrun: true
        Run_PyTests_test_interactiverun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      # --- GPU and mixed images: no test flags are set, longer build timeout ---
      - image: test-gpu-gloo-py3_8-tf1_15_5-keras2_2_4-torch1_12_1-mxnet1_8_0_p0-pyspark3_4_0
        build_timeout: 40

      - image: test-gpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_8_0_p0-pyspark3_4_0
        build_timeout: 40

      - image: test-gpu-gloo-py3_8-tf2_11_1-keras2_11_0-torch1_13_1-mxnet1_8_0_p0-pyspark3_4_0
        build_timeout: 40

      - image: test-gpu-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
        build_timeout: 40

      - image: test-mixed-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
        build_timeout: 40
steps:
- name: Clean up disk space
  # Removing the three big directories completely, i.e.
  #   sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
  # would free 38 GB of disk space but sometimes takes 3-4 minutes.
  # Therefore only some sub-paths are deleted, which are known to be quick (10s) and 20 GB.
  run: |
    # prints the current disk usage inside a collapsible log group
    show_disk_usage() {
      echo "::group::Disk space $1 clean up"
      df -h
      echo ::endgroup::
    }

    # the first entry is a glob pattern: it is stored literally here and only
    # expanded where $target is used unquoted below
    targets=(
      '/usr/share/dotnet/sdk/*/nuGetPackagesArchive.lzma'
      /usr/share/dotnet/shared
      /usr/local/lib/android/sdk/ndk
      /usr/local/lib/android/sdk/build-tools
      /opt/ghc
    )

    show_disk_usage before
    for target in "${targets[@]}"
    do
      echo "::group::Deleting $target"
      # the size report is informational only and must never fail the step
      sudo du -hsc $target | tail -n1 || true
      sudo rm -rf $target
      echo ::endgroup::
    done
    show_disk_usage after
# Check out the repository together with all of its (nested) git submodules.
- uses: actions/checkout@v3
  name: Checkout
  with:
    submodules: 'recursive'
- name: Setup Python
  uses: actions/setup-python@v4
  with:
    # quoted so YAML keeps the version a string (a bare 3.10 would be read as the float 3.1)
    python-version: '3.8'
# Build the docker-compose service of this matrix entry. Later steps refer to
# this step through `steps.build.outcome`.
- id: build
  name: Build
  env:
    COMPOSE_DOCKER_CLI_BUILD: 1
    DOCKER_BUILDKIT: 1
  # arguments of timeout-and-retry.sh are presumably: timeout, number of attempts,
  # pause between attempts, command - verify against .github/timeout-and-retry.sh
  run: >-
    .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10
    docker compose -f docker-compose.test.yml build ${{ matrix.image }}
# Elastic Spark TensorFlow Tests 1 - up to three attempts.
# Runs only for matrix entries that set `Elastic_Spark_TensorFlow_Tests_1` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt runs test_elastic_spark_tensorflow2.py with pytest inside the image
# (30 minute limit) and writes the JUnit XML into its own artifacts directory.
- id: Elastic_Spark_TensorFlow_Tests_1_run_1
  name: 'Elastic Spark TensorFlow Tests 1 [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_TensorFlow_Tests_1 &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
- id: Elastic_Spark_TensorFlow_Tests_1_run_2
  name: 'Elastic Spark TensorFlow Tests 1 [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_TensorFlow_Tests_1 &&
    steps.Elastic_Spark_TensorFlow_Tests_1_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
- id: Elastic_Spark_TensorFlow_Tests_1_run_3
  name: 'Elastic Spark TensorFlow Tests 1 [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_TensorFlow_Tests_1 &&
    steps.Elastic_Spark_TensorFlow_Tests_1_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
# Elastic Spark TensorFlow Tests 2 - up to three attempts.
# Runs only for matrix entries that set `Elastic_Spark_TensorFlow_Tests_2` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt runs test_elastic_spark_tensorflow.py with pytest inside the image
# (30 minute limit) and writes the JUnit XML into its own artifacts directory.
- id: Elastic_Spark_TensorFlow_Tests_2_run_1
  name: 'Elastic Spark TensorFlow Tests 2 [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_TensorFlow_Tests_2 &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
- id: Elastic_Spark_TensorFlow_Tests_2_run_2
  name: 'Elastic Spark TensorFlow Tests 2 [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_TensorFlow_Tests_2 &&
    steps.Elastic_Spark_TensorFlow_Tests_2_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
- id: Elastic_Spark_TensorFlow_Tests_2_run_3
  name: 'Elastic Spark TensorFlow Tests 2 [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_TensorFlow_Tests_2 &&
    steps.Elastic_Spark_TensorFlow_Tests_2_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
# Elastic Spark Torch Tests - up to three attempts.
# Runs only for matrix entries that set `Elastic_Spark_Torch_Tests` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt runs test_elastic_spark_torch.py with pytest inside the image
# (30 minute limit) and writes the JUnit XML into its own artifacts directory.
- id: Elastic_Spark_Torch_Tests_run_1
  name: 'Elastic Spark Torch Tests [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_Torch_Tests &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
- id: Elastic_Spark_Torch_Tests_run_2
  name: 'Elastic Spark Torch Tests [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_Torch_Tests &&
    steps.Elastic_Spark_Torch_Tests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
- id: Elastic_Spark_Torch_Tests_run_3
  name: 'Elastic Spark Torch Tests [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Spark_Torch_Tests &&
    steps.Elastic_Spark_Torch_Tests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
# Elastic Tests 1 - up to three attempts.
# Runs only for matrix entries that set `Elastic_Tests_1` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt runs test_elastic_torch.py and test_elastic_tensorflow2.py with pytest
# inside the image (15 minute limit) and writes the JUnit XML into its own artifacts directory.
- id: Elastic_Tests_1_run_1
  name: 'Elastic Tests 1 [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Tests_1 &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
- id: Elastic_Tests_1_run_2
  name: 'Elastic Tests 1 [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Tests_1 &&
    steps.Elastic_Tests_1_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
- id: Elastic_Tests_1_run_3
  name: 'Elastic Tests 1 [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Tests_1 &&
    steps.Elastic_Tests_1_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
# Elastic Tests 2 - up to three attempts.
# Runs only for matrix entries that set `Elastic_Tests_2` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt runs test_elastic_torch.py, test_elastic_tensorflow.py and
# test_elastic_tensorflow_keras.py with pytest inside the image (15 minute limit)
# and writes the JUnit XML into its own artifacts directory.
- id: Elastic_Tests_2_run_1
  name: 'Elastic Tests 2 [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Tests_2 &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
- id: Elastic_Tests_2_run_2
  name: 'Elastic Tests 2 [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Tests_2 &&
    steps.Elastic_Tests_2_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
- id: Elastic_Tests_2_run_3
  name: 'Elastic Tests 2 [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Elastic_Tests_2 &&
    steps.Elastic_Tests_2_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
# Gloo Cluster PyTests - up to three attempts.
# Runs only for matrix entries that set `Gloo_Cluster_PyTests` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt starts the ssh service inside the container, then runs test_static_run.py
# with pytest (10 minute limit) and writes the JUnit XML into its own artifacts directory.
- id: Gloo_Cluster_PyTests_run_1
  name: 'Gloo Cluster PyTests [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_Cluster_PyTests &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
- id: Gloo_Cluster_PyTests_run_2
  name: 'Gloo Cluster PyTests [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_Cluster_PyTests &&
    steps.Gloo_Cluster_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
- id: Gloo_Cluster_PyTests_run_3
  name: 'Gloo Cluster PyTests [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_Cluster_PyTests &&
    steps.Gloo_Cluster_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
# Gloo Keras MNIST - up to three attempts.
# Runs only for matrix entries that set `Gloo_Keras_MNIST` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt launches the keras_mnist_advanced.py example through horovodrun with
# two processes on localhost and the --gloo option (10 minute limit).
- id: Gloo_Keras_MNIST_run_1
  name: 'Gloo Keras MNIST [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_Keras_MNIST &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
- id: Gloo_Keras_MNIST_run_2
  name: 'Gloo Keras MNIST [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_Keras_MNIST &&
    steps.Gloo_Keras_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
- id: Gloo_Keras_MNIST_run_3
  name: 'Gloo Keras MNIST [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_Keras_MNIST &&
    steps.Gloo_Keras_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
# Gloo MXNet2 MNIST api - up to three attempts.
# Runs only for matrix entries that set `Gloo_MXNet2_MNIST_api` and only after a
# successful build. Attempt N+1 starts only when attempt N failed; attempts 1 and 2
# tolerate failure (continue-on-error), a failing attempt 3 fails the job.
# Each attempt runs the mxnet2_mnist.py example directly with python, passing
# --num-proc 2 --hosts localhost:2 --communication gloo (10 minute limit).
# NOTE(review): no entry of this job's matrix currently sets this flag, so these
# steps appear to be skipped for every image - confirm this is intended.
- id: Gloo_MXNet2_MNIST_api_run_1
  name: 'Gloo MXNet2 MNIST api [attempt 1 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_api &&
    true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
- id: Gloo_MXNet2_MNIST_api_run_2
  name: 'Gloo MXNet2 MNIST api [attempt 2 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_api &&
    steps.Gloo_MXNet2_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
- id: Gloo_MXNet2_MNIST_api_run_3
  name: 'Gloo MXNet2 MNIST api [attempt 3 of 3]'
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_api &&
    steps.Gloo_MXNet2_MNIST_api_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
# Gloo MXNet2 MNIST horovodrun: launches mxnet2_mnist.py through
# `horovodrun -np 2 -H localhost:2 --gloo`, capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_MXNet2_MNIST_horovodrun_run_1
  name: 'Gloo MXNet2 MNIST horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_MXNet2_MNIST_horovodrun
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
- id: Gloo_MXNet2_MNIST_horovodrun_run_2
  name: 'Gloo MXNet2 MNIST horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_MXNet2_MNIST_horovodrun
    && steps.Gloo_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
- id: Gloo_MXNet2_MNIST_horovodrun_run_3
  name: 'Gloo MXNet2 MNIST horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_MXNet2_MNIST_horovodrun
    && steps.Gloo_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
# Gloo MXNet MNIST horovodrun: launches mxnet_mnist.py through
# `horovodrun -np 2 -H localhost:2 --gloo`, capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_MXNet_MNIST_horovodrun_run_1
  name: 'Gloo MXNet MNIST horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_MXNet_MNIST_horovodrun
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
- id: Gloo_MXNet_MNIST_horovodrun_run_2
  name: 'Gloo MXNet MNIST horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_MXNet_MNIST_horovodrun
    && steps.Gloo_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
- id: Gloo_MXNet_MNIST_horovodrun_run_3
  name: 'Gloo MXNet MNIST horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_MXNet_MNIST_horovodrun
    && steps.Gloo_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
# Gloo Parallel PyTests: inside /horovod/test/parallel, feeds every test_*.py
# file one at a time (xargs -n 1) to `horovodrun -np 2 ... /pytest.sh gloo`.
# The whole run is capped at 15 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_Parallel_PyTests_run_1
  name: 'Gloo Parallel PyTests [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_Parallel_PyTests
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
- id: Gloo_Parallel_PyTests_run_2
  name: 'Gloo Parallel PyTests [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_Parallel_PyTests
    && steps.Gloo_Parallel_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
- id: Gloo_Parallel_PyTests_run_3
  name: 'Gloo Parallel PyTests [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_Parallel_PyTests
    && steps.Gloo_Parallel_PyTests_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
# Gloo PyTorch MNIST api: runs pytorch_mnist.py directly with
# `--num-proc 2 --hosts localhost:2 --communication gloo`, reading data from
# /data/pytorch_datasets; capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_PyTorch_MNIST_api_run_1
  name: 'Gloo PyTorch MNIST api [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_PyTorch_MNIST_api
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
- id: Gloo_PyTorch_MNIST_api_run_2
  name: 'Gloo PyTorch MNIST api [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_PyTorch_MNIST_api
    && steps.Gloo_PyTorch_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
- id: Gloo_PyTorch_MNIST_api_run_3
  name: 'Gloo PyTorch MNIST api [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_PyTorch_MNIST_api
    && steps.Gloo_PyTorch_MNIST_api_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
# Gloo PyTorch MNIST horovodrun: launches pytorch_mnist.py through
# `horovodrun -np 2 -H localhost:2 --gloo`, reading data from
# /data/pytorch_datasets; capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_PyTorch_MNIST_horovodrun_run_1
  name: 'Gloo PyTorch MNIST horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_PyTorch_MNIST_horovodrun
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
- id: Gloo_PyTorch_MNIST_horovodrun_run_2
  name: 'Gloo PyTorch MNIST horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_PyTorch_MNIST_horovodrun
    && steps.Gloo_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
- id: Gloo_PyTorch_MNIST_horovodrun_run_3
  name: 'Gloo PyTorch MNIST horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_PyTorch_MNIST_horovodrun
    && steps.Gloo_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
# Gloo Single PyTests: inside /horovod/test/single, feeds every test_*.py file
# one at a time (xargs -n 1) to `/pytest_standalone.sh gloo`.
# The whole run is capped at 15 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_Single_PyTests_run_1
  name: 'Gloo Single PyTests [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_Single_PyTests
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
- id: Gloo_Single_PyTests_run_2
  name: 'Gloo Single PyTests [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_Single_PyTests
    && steps.Gloo_Single_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
- id: Gloo_Single_PyTests_run_3
  name: 'Gloo Single PyTests [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_Single_PyTests
    && steps.Gloo_Single_PyTests_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
# Gloo TensorFlow 2.0 Keras MNIST api: runs tensorflow2_keras_mnist.py directly
# with positional arguments `2 localhost:2 gloo`, capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
  name: 'Gloo TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
  name: 'Gloo TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api
    && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
  name: 'Gloo TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api
    && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
# Gloo TensorFlow 2.0 Keras MNIST horovodrun: launches
# tensorflow2_keras_mnist.py through `horovodrun -np 2 -H localhost:2 --gloo`,
# capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
  name: 'Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
- id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
  name: 'Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun
    && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
- id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
  name: 'Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun
    && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
# Gloo TensorFlow 2.0 MNIST Data Service: within one `bash -c`, starts
# horovod.tensorflow.data.compute_worker in the background (`&`) and then runs
# tensorflow2_mnist_data_service.py in the foreground; both are given
# /tmp/compute.json. The whole command is capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
  name: 'Gloo TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
- id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
  name: 'Gloo TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service
    && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
- id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
  name: 'Gloo TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service
    && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
# Gloo TensorFlow 2.0 MNIST Elastic api: runs tensorflow2_mnist_elastic.py
# directly with positional arguments `2 2 2 localhost:2,127.0.0.1:2`,
# capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
  name: 'Gloo TensorFlow 2.0 MNIST Elastic api [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
  name: 'Gloo TensorFlow 2.0 MNIST Elastic api [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api
    && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
  name: 'Gloo TensorFlow 2.0 MNIST Elastic api [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api
    && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun: launches
# tensorflow2_mnist_elastic.py through
# `horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo`,
# capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
  name: 'Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
  name: 'Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun
    && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
  name: 'Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun
    && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
# Gloo TensorFlow 2.0 MNIST api: runs tensorflow2_mnist.py directly with
# positional arguments `2 localhost:2 gloo`, capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_2_0_MNIST_api_run_1
  name: 'Gloo TensorFlow 2.0 MNIST api [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_api
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_MNIST_api_run_2
  name: 'Gloo TensorFlow 2.0 MNIST api [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_api
    && steps.Gloo_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_MNIST_api_run_3
  name: 'Gloo TensorFlow 2.0 MNIST api [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_api
    && steps.Gloo_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
# Gloo TensorFlow 2.0 MNIST horovodrun: launches tensorflow2_mnist.py through
# `horovodrun -np 2 -H localhost:2 --gloo`, capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
  name: 'Gloo TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
- id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
  name: 'Gloo TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun
    && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
- id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
  name: 'Gloo TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun
    && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
# Gloo TensorFlow MNIST: launches tensorflow_mnist.py through
# `horovodrun -np 2 -H localhost:2 --gloo`, capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: Gloo_TensorFlow_MNIST_run_1
  name: 'Gloo TensorFlow MNIST [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_MNIST
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
- id: Gloo_TensorFlow_MNIST_run_2
  name: 'Gloo TensorFlow MNIST [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_MNIST
    && steps.Gloo_TensorFlow_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
- id: Gloo_TensorFlow_MNIST_run_3
  name: 'Gloo TensorFlow MNIST [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.Gloo_TensorFlow_MNIST
    && steps.Gloo_TensorFlow_MNIST_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
# MPI Cluster PyTests: starts sshd via /etc/init.d/ssh, then runs pytest on
# test_static_run.py in /horovod/test/integration, writing the JUnit report to
# /artifacts/junit.mpi.static.xml (the mounted per-attempt artifacts folder).
# The whole command is capped at 10 minutes.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: MPI_Cluster_PyTests_run_1
  name: 'MPI Cluster PyTests [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Cluster_PyTests
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_run_2
  name: 'MPI Cluster PyTests [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Cluster_PyTests
    && steps.MPI_Cluster_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_run_3
  name: 'MPI Cluster PyTests [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Cluster_PyTests
    && steps.MPI_Cluster_PyTests_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI Cluster PyTests [ONECCL MPI]: inside the container, executes the command
# text read from /oneccl_env, writes '/mpirun_command_mpi' into
# /mpirun_command, starts sshd, then runs pytest on test_static_run.py with a
# JUnit report at /artifacts/junit.mpi.static.xml. Capped at 10 minutes.
# The `\$` keeps the outer shell from expanding $(cat /oneccl_env), so the
# substitution happens in the container's bash instead.
# Up to three attempts; each retry runs only when the previous attempt's
# outcome (the result before continue-on-error is applied) is 'failure'.
- id: MPI_Cluster_PyTests_ONECCL_MPI_run_1
  name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Cluster_PyTests_ONECCL_MPI
  # A failed attempt must not fail the job; the next attempt retries it.
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_ONECCL_MPI_run_2
  name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Cluster_PyTests_ONECCL_MPI
    && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_ONECCL_MPI_run_3
  name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Cluster_PyTests_ONECCL_MPI
    && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI cluster pytest run, ONECCL OFI variant, tried up to three times.
# Attempts 2 and 3 run only when the previous attempt's outcome is 'failure';
# attempts 1 and 2 set continue-on-error, so only a failing third attempt can
# fail the job.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]"
id: MPI_Cluster_PyTests_ONECCL_OFI_run_1
continue-on-error: true
# Requires a successful build step and the MPI_Cluster_PyTests_ONECCL_OFI
# matrix flag. The trailing `&& true` sits where later attempts test the
# previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && true
# Mounts a per-attempt artifacts directory at /artifacts. The inner bash runs
# the contents of /oneccl_env as a command, writes the string
# /mpirun_command_ofi into /mpirun_command, starts ssh and runs
# test_static_run.py with pytest (10 minute timeout, JUnit XML in /artifacts).
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]"
id: MPI_Cluster_PyTests_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]"
id: MPI_Cluster_PyTests_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
shell: bash
# MXNet2 MNIST example started directly with python, tried up to three times.
# Attempts 2 and 3 run only when the previous attempt's outcome is 'failure';
# attempts 1 and 2 set continue-on-error, so only a failing third attempt can
# fail the job.
- name: "MPI MXNet2 MNIST api [attempt 1 of 3]"
id: MPI_MXNet2_MNIST_api_run_1
continue-on-error: true
# Requires a successful build step and the MPI_MXNet2_MNIST_api matrix flag.
# The trailing `&& true` sits where later attempts test the previous outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && true
# Mounts a per-attempt artifacts directory at /artifacts and runs
# mxnet2_mnist.py in the container (10 minute timeout) with
# --num-proc 2 --hosts localhost:2 --communication mpi.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI MXNet2 MNIST api [attempt 2 of 3]"
id: MPI_MXNet2_MNIST_api_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI MXNet2 MNIST api [attempt 3 of 3]"
id: MPI_MXNet2_MNIST_api_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# MXNet2 MNIST example started through the launcher command stored in
# /mpirun_command, tried up to three times. Attempts 2 and 3 run only when the
# previous attempt's outcome is 'failure'; attempts 1 and 2 set
# continue-on-error, so only a failing third attempt can fail the job.
- name: "MPI MXNet2 MNIST horovodrun [attempt 1 of 3]"
id: MPI_MXNet2_MNIST_horovodrun_run_1
continue-on-error: true
# Requires a successful build step and the MPI_MXNet2_MNIST_horovodrun matrix
# flag. The trailing `&& true` sits where later attempts test the previous
# attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && true
# Mounts a per-attempt artifacts directory at /artifacts. In the container
# (10 minute timeout) the contents of /mpirun_command are used as the command
# prefix for `python mxnet2_mnist.py`, with OMP_NUM_THREADS=1 set for it.
# `\$` defers the command substitution to the shell inside the container.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI MXNet2 MNIST horovodrun [attempt 2 of 3]"
id: MPI_MXNet2_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI MXNet2 MNIST horovodrun [attempt 3 of 3]"
id: MPI_MXNet2_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
shell: bash
# MXNet MNIST example started through the launcher command stored in
# /mpirun_command, tried up to three times. Attempts 2 and 3 run only when the
# previous attempt's outcome is 'failure'; attempts 1 and 2 set
# continue-on-error, so only a failing third attempt can fail the job.
- name: "MPI MXNet MNIST horovodrun [attempt 1 of 3]"
id: MPI_MXNet_MNIST_horovodrun_run_1
continue-on-error: true
# Requires a successful build step and the MPI_MXNet_MNIST_horovodrun matrix
# flag. The trailing `&& true` sits where later attempts test the previous
# attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && true
# Mounts a per-attempt artifacts directory at /artifacts. In the container
# (10 minute timeout) the contents of /mpirun_command are used as the command
# prefix for `python mxnet_mnist.py`, with OMP_NUM_THREADS=1 set for it.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI MXNet MNIST horovodrun [attempt 2 of 3]"
id: MPI_MXNet_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI MXNet MNIST horovodrun [attempt 3 of 3]"
id: MPI_MXNet_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# MXNet MNIST example via the launcher command, ONECCL MPI variant, tried up
# to three times. Attempts 2 and 3 run only when the previous attempt's
# outcome is 'failure'; attempts 1 and 2 set continue-on-error, so only a
# failing third attempt can fail the job.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
continue-on-error: true
# Requires a successful build step and the
# MPI_MXNet_MNIST_horovodrun_ONECCL_MPI matrix flag. The trailing `&& true`
# sits where later attempts test the previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && true
# The inner bash runs the contents of /oneccl_env as a command, overwrites
# /mpirun_command with the string /mpirun_command_mpi, then uses the contents
# of /mpirun_command as the command prefix for `python mxnet_mnist.py`
# (OMP_NUM_THREADS=1, 10 minute timeout).
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# MXNet MNIST example via the launcher command, ONECCL OFI variant, tried up
# to three times. Attempts 2 and 3 run only when the previous attempt's
# outcome is 'failure'; attempts 1 and 2 set continue-on-error, so only a
# failing third attempt can fail the job.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
continue-on-error: true
# Requires a successful build step and the
# MPI_MXNet_MNIST_horovodrun_ONECCL_OFI matrix flag. The trailing `&& true`
# sits where later attempts test the previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && true
# The inner bash runs the contents of /oneccl_env as a command, overwrites
# /mpirun_command with the string /mpirun_command_ofi, then uses the contents
# of /mpirun_command as the command prefix for `python mxnet_mnist.py`
# (OMP_NUM_THREADS=1, 10 minute timeout).
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Parallel pytest files run under the launcher command, tried up to three
# times. Attempts 2 and 3 run only when the previous attempt's outcome is
# 'failure'; attempts 1 and 2 set continue-on-error, so only a failing third
# attempt can fail the job.
- name: "MPI Parallel PyTests [attempt 1 of 3]"
id: MPI_Parallel_PyTests_run_1
continue-on-error: true
# Requires a successful build step and the MPI_Parallel_PyTests matrix flag.
# The trailing `&& true` sits where later attempts test the previous outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && true
# Mounts a per-attempt artifacts directory at /artifacts. In the container
# (15 minute timeout) every test_*.py in /horovod/test/parallel is passed, one
# file per invocation (xargs -n 1), to the command read from /mpirun_command
# followed by `/bin/bash /pytest.sh mpi`.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI Parallel PyTests [attempt 2 of 3]"
id: MPI_Parallel_PyTests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI Parallel PyTests [attempt 3 of 3]"
id: MPI_Parallel_PyTests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Parallel pytest files under the launcher command, ONECCL MPI variant, tried
# up to three times. Attempts 2 and 3 run only when the previous attempt's
# outcome is 'failure'; attempts 1 and 2 set continue-on-error, so only a
# failing third attempt can fail the job.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]"
id: MPI_Parallel_PyTests_ONECCL_MPI_run_1
continue-on-error: true
# Requires a successful build step and the MPI_Parallel_PyTests_ONECCL_MPI
# matrix flag. The trailing `&& true` sits where later attempts test the
# previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && true
# The inner bash runs the contents of /oneccl_env as a command and overwrites
# /mpirun_command with the string /mpirun_command_mpi. Each test_*.py in
# /horovod/test/parallel is then passed, one per invocation, to the command
# read from /mpirun_command followed by `/bin/bash /pytest.sh mpi`
# (15 minute timeout).
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]"
id: MPI_Parallel_PyTests_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]"
id: MPI_Parallel_PyTests_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Parallel pytest files under the launcher command, ONECCL OFI variant, tried
# up to three times. Attempts 2 and 3 run only when the previous attempt's
# outcome is 'failure'; attempts 1 and 2 set continue-on-error, so only a
# failing third attempt can fail the job.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]"
id: MPI_Parallel_PyTests_ONECCL_OFI_run_1
continue-on-error: true
# Requires a successful build step and the MPI_Parallel_PyTests_ONECCL_OFI
# matrix flag. The trailing `&& true` sits where later attempts test the
# previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && true
# The inner bash runs the contents of /oneccl_env as a command and overwrites
# /mpirun_command with the string /mpirun_command_ofi. Each test_*.py in
# /horovod/test/parallel is then passed, one per invocation, to the command
# read from /mpirun_command followed by `/bin/bash /pytest.sh mpi`
# (15 minute timeout).
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]"
id: MPI_Parallel_PyTests_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]"
id: MPI_Parallel_PyTests_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# PyTorch MNIST example started directly with python, tried up to three times.
# Attempts 2 and 3 run only when the previous attempt's outcome is 'failure';
# attempts 1 and 2 set continue-on-error, so only a failing third attempt can
# fail the job.
- name: "MPI PyTorch MNIST api [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_api_run_1
continue-on-error: true
# Requires a successful build step and the MPI_PyTorch_MNIST_api matrix flag.
# The trailing `&& true` sits where later attempts test the previous outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && true
# Mounts a per-attempt artifacts directory at /artifacts and runs
# pytorch_mnist.py in the container (10 minute timeout) with
# --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2
# --communication mpi.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI PyTorch MNIST api [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_api_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI PyTorch MNIST api [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_api_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# PyTorch MNIST example started directly with python, ONECCL MPI variant,
# tried up to three times. Attempts 2 and 3 run only when the previous
# attempt's outcome is 'failure'; attempts 1 and 2 set continue-on-error, so
# only a failing third attempt can fail the job.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
continue-on-error: true
# Requires a successful build step and the MPI_PyTorch_MNIST_api_ONECCL_MPI
# matrix flag. The trailing `&& true` sits where later attempts test the
# previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && true
# The inner bash runs the contents of /oneccl_env as a command, overwrites
# /mpirun_command with the string /mpirun_command_mpi and then runs
# pytorch_mnist.py (10 minute timeout). This command line does not itself read
# /mpirun_command.
# NOTE(review): whether the example consumes that file is not visible here.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# PyTorch MNIST example started directly with python, ONECCL OFI variant,
# tried up to three times. Attempts 2 and 3 run only when the previous
# attempt's outcome is 'failure'; attempts 1 and 2 set continue-on-error, so
# only a failing third attempt can fail the job.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
continue-on-error: true
# Requires a successful build step and the MPI_PyTorch_MNIST_api_ONECCL_OFI
# matrix flag. The trailing `&& true` sits where later attempts test the
# previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && true
# The inner bash runs the contents of /oneccl_env as a command, overwrites
# /mpirun_command with the string /mpirun_command_ofi and then runs
# pytorch_mnist.py (10 minute timeout). This command line does not itself read
# /mpirun_command.
# NOTE(review): whether the example consumes that file is not visible here.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# PyTorch MNIST example started through the launcher command stored in
# /mpirun_command, tried up to three times. Attempts 2 and 3 run only when the
# previous attempt's outcome is 'failure'; attempts 1 and 2 set
# continue-on-error, so only a failing third attempt can fail the job.
- name: "MPI PyTorch MNIST horovodrun [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_run_1
continue-on-error: true
# Requires a successful build step and the MPI_PyTorch_MNIST_horovodrun matrix
# flag. The trailing `&& true` sits where later attempts test the previous
# attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && true
# Mounts a per-attempt artifacts directory at /artifacts. In the container
# (10 minute timeout) the contents of /mpirun_command are used as the command
# prefix for `python pytorch_mnist.py --data-dir /data/pytorch_datasets`.
# `\$` defers the command substitution to the shell inside the container.
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI PyTorch MNIST horovodrun [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI PyTorch MNIST horovodrun [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# PyTorch MNIST example via the launcher command, ONECCL MPI variant, tried up
# to three times. Attempts 2 and 3 run only when the previous attempt's
# outcome is 'failure'; attempts 1 and 2 set continue-on-error, so only a
# failing third attempt can fail the job.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1
continue-on-error: true
# Requires a successful build step and the
# MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI matrix flag. The trailing `&& true`
# sits where later attempts test the previous attempt's outcome.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && true
# The inner bash runs the contents of /oneccl_env as a command, overwrites
# /mpirun_command with the string /mpirun_command_mpi, then uses the contents
# of /mpirun_command as the command prefix for
# `python pytorch_mnist.py --data-dir /data/pytorch_datasets`
# (10 minute timeout).
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Retry: same command, separate artifacts directory, only after attempt 1 failed.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Last retry: only after attempt 2 failed; its failure is not ignored.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Retry chain (up to 3 attempts) for "MPI PyTorch MNIST horovodrun [ONECCL OFI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI`; attempts 2 and 3 also
# require the previous attempt's outcome to be 'failure'. `always()` keeps
# the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_ofi'
# into /mpirun_command, then uses that file's content as the launcher for
# pytorch_mnist.py.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1
  name: 'MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2
  name: 'MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI
    && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3
  name: 'MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI
    && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
# Retry chain (up to 3 attempts) for "MPI Single PyTests".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_Single_PyTests`; attempts 2 and 3 also require the previous
# attempt's outcome to be 'failure'. `always()` keeps the steps eligible
# after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 15-minute
# timeout, passes each test_*.py in /horovod/test/single to
# /pytest_standalone.sh (one file per invocation, first argument `mpi`).
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_Single_PyTests_run_1
  name: 'MPI Single PyTests [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_Single_PyTests_run_2
  name: 'MPI Single PyTests [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests
    && steps.MPI_Single_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_Single_PyTests_run_3
  name: 'MPI Single PyTests [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests
    && steps.MPI_Single_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Retry chain (up to 3 attempts) for "MPI Single PyTests [ONECCL MPI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_Single_PyTests_ONECCL_MPI`; attempts 2 and 3 also require the
# previous attempt's outcome to be 'failure'. `always()` keeps the steps
# eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 15-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_mpi'
# into /mpirun_command, then passes each test_*.py in /horovod/test/single
# to /pytest_standalone.sh (one file per invocation, first argument `mpi`).
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_Single_PyTests_ONECCL_MPI_run_1
  name: 'MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests_ONECCL_MPI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_Single_PyTests_ONECCL_MPI_run_2
  name: 'MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests_ONECCL_MPI
    && steps.MPI_Single_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_Single_PyTests_ONECCL_MPI_run_3
  name: 'MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests_ONECCL_MPI
    && steps.MPI_Single_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Retry chain (up to 3 attempts) for "MPI Single PyTests [ONECCL OFI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_Single_PyTests_ONECCL_OFI`; attempts 2 and 3 also require the
# previous attempt's outcome to be 'failure'. `always()` keeps the steps
# eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 15-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_ofi'
# into /mpirun_command, then passes each test_*.py in /horovod/test/single
# to /pytest_standalone.sh (one file per invocation, first argument `mpi`).
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_Single_PyTests_ONECCL_OFI_run_1
  name: 'MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests_ONECCL_OFI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_Single_PyTests_ONECCL_OFI_run_2
  name: 'MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests_ONECCL_OFI
    && steps.MPI_Single_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_Single_PyTests_ONECCL_OFI_run_3
  name: 'MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_Single_PyTests_ONECCL_OFI
    && steps.MPI_Single_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 Keras MNIST api".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_Keras_MNIST_api`; attempts 2 and 3 also require
# the previous attempt's outcome to be 'failure'. `always()` keeps the steps
# eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, runs tensorflow2_keras_mnist.py with arguments `2 localhost:2 mpi`.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_1
  name: 'MPI TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_2
  name: 'MPI TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_3
  name: 'MPI TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI`; attempts 2 and 3
# also require the previous attempt's outcome to be 'failure'. `always()`
# keeps the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_mpi'
# into /mpirun_command, then runs tensorflow2_keras_mnist.py with arguments
# `2 localhost:2 mpi`.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1
  name: 'MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2
  name: 'MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3
  name: 'MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI`; attempts 2 and 3
# also require the previous attempt's outcome to be 'failure'. `always()`
# keeps the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_ofi'
# into /mpirun_command, then runs tensorflow2_keras_mnist.py with arguments
# `2 localhost:2 mpi`.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1
  name: 'MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2
  name: 'MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3
  name: 'MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 Keras MNIST horovodrun".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun`; attempts 2 and 3 also
# require the previous attempt's outcome to be 'failure'. `always()` keeps
# the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, uses the content of /mpirun_command as the launcher for
# tensorflow2_keras_mnist.py.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI`; attempts 2
# and 3 also require the previous attempt's outcome to be 'failure'.
# `always()` keeps the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_mpi'
# into /mpirun_command, then uses that file's content as the launcher for
# tensorflow2_keras_mnist.py.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI`; attempts 2
# and 3 also require the previous attempt's outcome to be 'failure'.
# `always()` keeps the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_ofi'
# into /mpirun_command, then uses that file's content as the launcher for
# tensorflow2_keras_mnist.py.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3
  name: 'MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI
    && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 MNIST api".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_MNIST_api`; attempts 2 and 3 also require the
# previous attempt's outcome to be 'failure'. `always()` keeps the steps
# eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, runs tensorflow2_mnist.py with arguments `2 localhost:2 mpi`.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_MNIST_api_run_1
  name: 'MPI TensorFlow 2.0 MNIST api [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_MNIST_api_run_2
  name: 'MPI TensorFlow 2.0 MNIST api [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api
    && steps.MPI_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_MNIST_api_run_3
  name: 'MPI TensorFlow 2.0 MNIST api [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api
    && steps.MPI_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 MNIST api [ONECCL MPI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI`; attempts 2 and 3 also
# require the previous attempt's outcome to be 'failure'. `always()` keeps
# the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_mpi'
# into /mpirun_command, then runs tensorflow2_mnist.py with arguments
# `2 localhost:2 mpi`.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1
  name: 'MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2
  name: 'MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3
  name: 'MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 MNIST api [ONECCL OFI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI`; attempts 2 and 3 also
# require the previous attempt's outcome to be 'failure'. `always()` keeps
# the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_ofi'
# into /mpirun_command, then runs tensorflow2_mnist.py with arguments
# `2 localhost:2 mpi`.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1
  name: 'MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2
  name: 'MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI
    && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3
  name: 'MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI
    && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 MNIST horovodrun".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_MNIST_horovodrun`; attempts 2 and 3 also
# require the previous attempt's outcome to be 'failure'. `always()` keeps
# the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, uses the content of /mpirun_command as the launcher for
# tensorflow2_mnist.py.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_1
  name: 'MPI TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_2
  name: 'MPI TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun
    && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_3
  name: 'MPI TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun
    && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# Retry chain (up to 3 attempts) for "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI]".
# Each attempt needs a successful `build` step and a truthy
# `matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI`; attempts 2 and 3
# also require the previous attempt's outcome to be 'failure'. `always()`
# keeps the steps eligible after earlier failures in the job.
# The script creates a per-attempt artifacts directory, mounts it at
# /artifacts in the `matrix.image` compose service and, under a 10-minute
# timeout, executes the contents of /oneccl_env, writes '/mpirun_command_mpi'
# into /mpirun_command, then uses that file's content as the launcher for
# tensorflow2_mnist.py.
# Attempt 1: nothing precedes it, so the last clause is a literal `true`.
- id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1
  name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI
    && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# Attempt 2: only after attempt 1 failed; a failure here is tolerated.
- id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2
  name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# Attempt 3: only after attempt 2 failed; a failure here is not tolerated.
- id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3
  name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]'
  if: >-
    always()
    && steps.build.outcome == 'success'
    && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI
    && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# Retry chain for "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI]": up to three attempts,
# each writing to its own artifacts directory. Every attempt requires the build step to have
# succeeded and matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI to be truthy.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && true
# inside the container (10 minute timeout): runs the contents of /oneccl_env as a command,
# writes the path /mpirun_command_ofi into /mpirun_command, then runs the python script prefixed
# with the contents of /mpirun_command; all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Retry chain for "Run PyTests test_interactiverun": up to three attempts, each writing to its
# own artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Run_PyTests_test_interactiverun to be truthy.
- name: "Run PyTests test_interactiverun [attempt 1 of 3]"
id: Run_PyTests_test_interactiverun_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && true
# inside the container (10 minute timeout): runs pytest on integration/test_interactiverun.py
# from /horovod/test and writes the JUnit XML report into /artifacts, which is this attempt's
# artifacts directory mounted from the runner; all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Run PyTests test_interactiverun [attempt 2 of 3]"
id: Run_PyTests_test_interactiverun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Run PyTests test_interactiverun [attempt 3 of 3]"
id: Run_PyTests_test_interactiverun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
shell: bash
# Retry chain for "Single Keras MNIST": up to three attempts, each writing to its own artifacts
# directory. Every attempt requires the build step to have succeeded and
# matrix.Single_Keras_MNIST to be truthy.
- name: "Single Keras MNIST [attempt 1 of 3]"
id: Single_Keras_MNIST_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && true
# inside the container (10 minute timeout): runs keras_mnist_advanced.py directly with python
# (no launcher prefix) for 3 epochs with batch size 64; all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single Keras MNIST [attempt 2 of 3]"
id: Single_Keras_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single Keras MNIST [attempt 3 of 3]"
id: Single_Keras_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
shell: bash
# Retry chain for "Single MXNet2 MNIST": up to three attempts, each writing to its own artifacts
# directory. Every attempt requires the build step to have succeeded and
# matrix.Single_MXNet2_MNIST to be truthy.
- name: "Single MXNet2 MNIST [attempt 1 of 3]"
id: Single_MXNet2_MNIST_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && true
# inside the container (10 minute timeout): runs mxnet2_mnist.py directly with python
# (no launcher prefix) for 3 epochs; all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single MXNet2 MNIST [attempt 2 of 3]"
id: Single_MXNet2_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single MXNet2 MNIST [attempt 3 of 3]"
id: Single_MXNet2_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
shell: bash
# Retry chain for "Single MXNet MNIST": up to three attempts, each writing to its own artifacts
# directory. Every attempt requires the build step to have succeeded and
# matrix.Single_MXNet_MNIST to be truthy.
- name: "Single MXNet MNIST [attempt 1 of 3]"
id: Single_MXNet_MNIST_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && true
# inside the container (10 minute timeout): runs mxnet_mnist.py directly with python
# (no launcher prefix) for 3 epochs; all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single MXNet MNIST [attempt 2 of 3]"
id: Single_MXNet_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single MXNet MNIST [attempt 3 of 3]"
id: Single_MXNet_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Retry chain for "Single MXNet MNIST [ONECCL MPI]": up to three attempts, each writing to its
# own artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Single_MXNet_MNIST_ONECCL_MPI to be truthy.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]"
id: Single_MXNet_MNIST_ONECCL_MPI_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && true
# inside the container (10 minute timeout): runs the contents of /oneccl_env as a command and
# writes the path /mpirun_command_mpi into /mpirun_command; the python script is then invoked
# directly, so /mpirun_command is written but not read by this command line itself;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]"
id: Single_MXNet_MNIST_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]"
id: Single_MXNet_MNIST_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Retry chain for "Single MXNet MNIST [ONECCL OFI]": up to three attempts, each writing to its
# own artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Single_MXNet_MNIST_ONECCL_OFI to be truthy.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]"
id: Single_MXNet_MNIST_ONECCL_OFI_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && true
# inside the container (10 minute timeout): runs the contents of /oneccl_env as a command and
# writes the path /mpirun_command_ofi into /mpirun_command; the python script is then invoked
# directly, so /mpirun_command is written but not read by this command line itself;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]"
id: Single_MXNet_MNIST_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]"
id: Single_MXNet_MNIST_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Retry chain for "Single PyTorch MNIST": up to three attempts, each writing to its own
# artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Single_PyTorch_MNIST to be truthy.
- name: "Single PyTorch MNIST [attempt 1 of 3]"
id: Single_PyTorch_MNIST_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && true
# inside the container (10 minute timeout): runs pytorch_mnist.py directly with python
# (no launcher prefix) for 3 epochs with --data-dir /data/pytorch_datasets;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single PyTorch MNIST [attempt 2 of 3]"
id: Single_PyTorch_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single PyTorch MNIST [attempt 3 of 3]"
id: Single_PyTorch_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Retry chain for "Single PyTorch MNIST [ONECCL MPI]": up to three attempts, each writing to its
# own artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Single_PyTorch_MNIST_ONECCL_MPI to be truthy.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]"
id: Single_PyTorch_MNIST_ONECCL_MPI_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && true
# inside the container (10 minute timeout): runs the contents of /oneccl_env as a command and
# writes the path /mpirun_command_mpi into /mpirun_command; the python script is then invoked
# directly, so /mpirun_command is written but not read by this command line itself;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]"
id: Single_PyTorch_MNIST_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]"
id: Single_PyTorch_MNIST_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Retry chain for "Single PyTorch MNIST [ONECCL OFI]": up to three attempts, each writing to its
# own artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Single_PyTorch_MNIST_ONECCL_OFI to be truthy.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]"
id: Single_PyTorch_MNIST_ONECCL_OFI_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && true
# inside the container (10 minute timeout): runs the contents of /oneccl_env as a command and
# writes the path /mpirun_command_ofi into /mpirun_command; the python script is then invoked
# directly, so /mpirun_command is written but not read by this command line itself;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]"
id: Single_PyTorch_MNIST_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]"
id: Single_PyTorch_MNIST_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Retry chain for "Spark Keras MNIST": up to three attempts, each writing to its own artifacts
# directory. Every attempt requires the build step to have succeeded and
# matrix.Spark_Keras_MNIST to be truthy.
- name: "Spark Keras MNIST [attempt 1 of 3]"
id: Spark_Keras_MNIST_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && true
# inside the container (10 minute timeout): invokes /spark_env.sh with OMP_NUM_THREADS=1 in its
# environment and the python command for keras_spark_mnist.py (2 processes, 3 epochs) as its
# arguments (presumably /spark_env.sh sets up Spark and then runs them; confirm in the image);
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Spark Keras MNIST [attempt 2 of 3]"
id: Spark_Keras_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Spark Keras MNIST [attempt 3 of 3]"
id: Spark_Keras_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Retry chain for "Spark Keras Rossmann Estimator": up to three attempts, each writing to its
# own artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Spark_Keras_Rossmann_Estimator to be truthy.
- name: "Spark Keras Rossmann Estimator [attempt 1 of 3]"
id: Spark_Keras_Rossmann_Estimator_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && true
# inside the container (10 minute timeout): invokes /spark_env.sh with OMP_NUM_THREADS=1 in its
# environment and the python command for keras_spark_rossmann_estimator.py (2 processes,
# 3 epochs, sample rate 0.1, data dir given as a file:// URL) as its arguments;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Spark Keras Rossmann Estimator [attempt 2 of 3]"
id: Spark_Keras_Rossmann_Estimator_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Spark Keras Rossmann Estimator [attempt 3 of 3]"
id: Spark_Keras_Rossmann_Estimator_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Retry chain for "Spark Keras Rossmann Run": up to three attempts, each writing to its own
# artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Spark_Keras_Rossmann_Run to be truthy.
- name: "Spark Keras Rossmann Run [attempt 1 of 3]"
id: Spark_Keras_Rossmann_Run_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && true
# inside the container (10 minute timeout): invokes /spark_env.sh with OMP_NUM_THREADS=1 in its
# environment and the python command for keras_spark_rossmann_run.py (2 processes, 3 epochs,
# sample rate 0.1, data dir given as a file:// URL, no --work-dir) as its arguments;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Spark Keras Rossmann Run [attempt 2 of 3]"
id: Spark_Keras_Rossmann_Run_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Spark Keras Rossmann Run [attempt 3 of 3]"
id: Spark_Keras_Rossmann_Run_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Retry chain for "Spark Lightning MNIST": up to three attempts, each writing to its own
# artifacts directory. Every attempt requires the build step to have succeeded and
# matrix.Spark_Lightning_MNIST to be truthy.
- name: "Spark Lightning MNIST [attempt 1 of 3]"
id: Spark_Lightning_MNIST_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && true
# inside the container (10 minute timeout): invokes /spark_env.sh with OMP_NUM_THREADS=1 in its
# environment and the python command for pytorch_lightning_spark_mnist.py (2 processes,
# 3 epochs) as its arguments; all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Spark Lightning MNIST [attempt 2 of 3]"
id: Spark_Lightning_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Spark Lightning MNIST [attempt 3 of 3]"
id: Spark_Lightning_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Retry chain for "Spark PyTests": up to three attempts, each writing to its own artifacts
# directory. Every attempt requires the build step to have succeeded and matrix.Spark_PyTests
# to be truthy.
- name: "Spark PyTests [attempt 1 of 3]"
id: Spark_PyTests_run_1
# a failure of this attempt is tolerated so that attempt 2 can run
continue-on-error: true
# the trailing `&& true` is a no-op: the first attempt has no earlier attempt to check
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && true
# inside the container (30 minute timeout, longer than the 10 minutes used by the other test
# steps here): lists test_spark*.py in /horovod/test/integration and, via xargs -n 1, calls
# `/bin/bash /pytest_standalone.sh spark <file>` once per file;
# all three attempts run the same command
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
shell: bash
# Attempt 2: runs only when attempt 1 failed; a failure is again tolerated.
- name: "Spark PyTests [attempt 2 of 3]"
id: Spark_PyTests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
shell: bash
# Attempt 3: runs only when attempt 2 failed; a failure here is no longer tolerated.
- name: "Spark PyTests [attempt 3 of 3]"
id: Spark_PyTests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
shell: bash
# Spark TensorFlow 2.0 MNIST Data Service example, under a 10 minute timeout.
# From /horovod/examples/spark/tensorflow2 the command starts compute_worker.py
# via spark-submit in the background (&) and then submits
# tensorflow2_mnist_data_service.py; both receive /tmp/compute.json.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Spark_TensorFlow_2_0_MNIST_Data_Service_run_<n> directory
# at /artifacts.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]"
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]"
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]"
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
shell: bash
# Spark Torch MNIST example: runs pytorch_spark_mnist.py through /spark_env.sh
# with OMP_NUM_THREADS=1, 2 processes and 3 epochs, under a 10 minute timeout.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Spark_Torch_MNIST_run_<n> directory at /artifacts.
# Attempt 1: runs when the build step succeeded and matrix.Spark_Torch_MNIST is
# set; a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Spark Torch MNIST [attempt 1 of 3]"
id: Spark_Torch_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Spark Torch MNIST [attempt 2 of 3]"
id: Spark_Torch_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Spark Torch MNIST [attempt 3 of 3]"
id: Spark_Torch_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Upload every XML file found below artifacts/<image>/ as one artifact named
# "Unit Test Results - <image>". The step runs regardless of earlier step
# results (always()), but only for images whose name contains '-cpu-'.
- name: Upload Test Results
uses: actions/upload-artifact@v3
if: always() && contains(matrix.image, '-cpu-')
with:
name: Unit Test Results - ${{ matrix.image }}
path: artifacts/${{ matrix.image }}/**/*.xml
# Job "build-and-test-heads": builds and tests the images listed in the matrix
# below. It depends on the init-workflow and build-and-test jobs and only runs
# when init-workflow reports run-at-all and run-builds-and-tests as 'true'.
build-and-test-heads:
name: "Build and Test heads (${{ matrix.image }})"
needs: [init-workflow, build-and-test]
if: >
needs.init-workflow.outputs.run-at-all == 'true' &&
needs.init-workflow.outputs.run-builds-and-tests == 'true'
runs-on: ubuntu-latest
# At most two matrix entries run at the same time, and a failing entry does
# not cancel the other one (fail-fast: false).
strategy:
max-parallel: 2
fail-fast: false
matrix:
include:
# CPU image: every flag set to true here enables the test steps whose `if:`
# checks matrix.<flag>; build_timeout is passed to the Build step in minutes.
- image: test-cpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_4_0
Elastic_Tests_1: true
Gloo_Cluster_PyTests: true
Gloo_MXNet2_MNIST_api: true
Gloo_MXNet2_MNIST_horovodrun: true
Gloo_Parallel_PyTests: true
Gloo_PyTorch_MNIST_api: true
Gloo_PyTorch_MNIST_horovodrun: true
Gloo_Single_PyTests: true
Gloo_TensorFlow_2_0_Keras_MNIST_api: true
Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
Gloo_TensorFlow_2_0_MNIST_api: true
Gloo_TensorFlow_2_0_MNIST_horovodrun: true
MPI_Cluster_PyTests: true
MPI_MXNet2_MNIST_api: true
MPI_MXNet2_MNIST_horovodrun: true
MPI_Parallel_PyTests: true
MPI_PyTorch_MNIST_api: true
MPI_PyTorch_MNIST_horovodrun: true
MPI_Single_PyTests: true
MPI_TensorFlow_2_0_Keras_MNIST_api: true
MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
MPI_TensorFlow_2_0_MNIST_api: true
MPI_TensorFlow_2_0_MNIST_horovodrun: true
Run_PyTests_test_interactiverun: true
Single_MXNet2_MNIST: true
Single_PyTorch_MNIST: true
Spark_Lightning_MNIST: true
Spark_PyTests: true
Spark_TensorFlow_2_0_MNIST_Data_Service: true
Spark_Torch_MNIST: true
build_timeout: 30
# GPU image: no test flags are set, so the flag-gated test steps are skipped
# and the image is only built (with a longer build timeout).
- image: test-gpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_4_0
build_timeout: 40
steps:
- name: Clean up disk space
# Deleting these paths entirely frees 38 GB of disk space:
#   sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
# but that sometimes takes 3-4 minutes, so only a selection of paths is
# deleted that is known to be quick (10s) and still frees 20 GB.
#
# The script prints `df -h` before and after, and for each path its size
# (best effort, `|| true`) before removing it, each in its own log group.
# The glob in the first path is escaped in the `for` list and `$dir` is left
# unquoted in the `du` / `rm` calls, so the shell expands it there.
run: |
echo ::group::Disk space before clean up
df -h
echo ::endgroup::
for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
/usr/share/dotnet/shared \
/usr/local/lib/android/sdk/ndk \
/usr/local/lib/android/sdk/build-tools \
/opt/ghc
do
echo ::group::Deleting "$dir"
sudo du -hsc $dir | tail -n1 || true
sudo rm -rf $dir
echo ::endgroup::
done
echo ::group::Disk space after clean up
df -h
echo ::endgroup::
- name: Checkout
# Check out the repository together with its git submodules (recursively).
uses: actions/checkout@v3
with:
submodules: recursive
# Provide Python 3.8 on the runner.
# The version is quoted so YAML keeps it a string instead of parsing it as a
# float (an unquoted 3.10, for example, would be read as 3.1).
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.8'
# Build the docker compose service named after the matrix image, wrapped in
# .github/timeout-and-retry.sh with a timeout of matrix.build_timeout minutes.
# NOTE(review): the "3 10" arguments are presumably the number of tries and the
# wait between them - confirm against the script.
# Later steps check steps.build.outcome, so the id `build` must stay stable.
# The two env variables below switch docker / docker compose to BuildKit.
- name: Build
id: build
run: |
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker compose -f docker-compose.test.yml build ${{ matrix.image }}
env:
COMPOSE_DOCKER_CLI_BUILD: 1
DOCKER_BUILDKIT: 1
# Elastic Spark TensorFlow Tests 1: runs pytest (--forked, debug logging) on
# test_elastic_spark_tensorflow2.py through /spark_env.sh under a 30 minute
# timeout and writes /artifacts/junit.gloo.elastic.spark.tf.xml.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Elastic_Spark_TensorFlow_Tests_1_run_<n> directory.
# NOTE(review): no matrix entry of this job sets
# matrix.Elastic_Spark_TensorFlow_Tests_1, so these steps are skipped here.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Elastic Spark TensorFlow Tests 1 [attempt 1 of 3]"
id: Elastic_Spark_TensorFlow_Tests_1_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Elastic Spark TensorFlow Tests 1 [attempt 2 of 3]"
id: Elastic_Spark_TensorFlow_Tests_1_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Elastic Spark TensorFlow Tests 1 [attempt 3 of 3]"
id: Elastic_Spark_TensorFlow_Tests_1_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
shell: bash
# Elastic Spark TensorFlow Tests 2: runs pytest (--forked, debug logging) on
# test_elastic_spark_tensorflow.py through /spark_env.sh under a 30 minute
# timeout and writes /artifacts/junit.gloo.elastic.spark.tf.xml.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Elastic_Spark_TensorFlow_Tests_2_run_<n> directory.
# NOTE(review): no matrix entry of this job sets
# matrix.Elastic_Spark_TensorFlow_Tests_2, so these steps are skipped here.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Elastic Spark TensorFlow Tests 2 [attempt 1 of 3]"
id: Elastic_Spark_TensorFlow_Tests_2_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Elastic Spark TensorFlow Tests 2 [attempt 2 of 3]"
id: Elastic_Spark_TensorFlow_Tests_2_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Elastic Spark TensorFlow Tests 2 [attempt 3 of 3]"
id: Elastic_Spark_TensorFlow_Tests_2_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
shell: bash
# Elastic Spark Torch Tests: runs pytest (--forked, debug logging) on
# test_elastic_spark_torch.py through /spark_env.sh under a 30 minute timeout
# and writes /artifacts/junit.gloo.elastic.spark.torch.xml.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Elastic_Spark_Torch_Tests_run_<n> directory.
# NOTE(review): no matrix entry of this job sets
# matrix.Elastic_Spark_Torch_Tests, so these steps are skipped here.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Elastic Spark Torch Tests [attempt 1 of 3]"
id: Elastic_Spark_Torch_Tests_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Elastic Spark Torch Tests [attempt 2 of 3]"
id: Elastic_Spark_Torch_Tests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Elastic Spark Torch Tests [attempt 3 of 3]"
id: Elastic_Spark_Torch_Tests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
shell: bash
# Elastic Tests 1: runs pytest (--forked, debug logging) on
# test_elastic_torch.py and test_elastic_tensorflow2.py under a 15 minute
# timeout and writes /artifacts/junit.gloo.elastic.xml.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Elastic_Tests_1_run_<n> directory at /artifacts.
# Attempt 1: runs when the build step succeeded and matrix.Elastic_Tests_1 is
# set; a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Elastic Tests 1 [attempt 1 of 3]"
id: Elastic_Tests_1_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Elastic Tests 1 [attempt 2 of 3]"
id: Elastic_Tests_1_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Elastic Tests 1 [attempt 3 of 3]"
id: Elastic_Tests_1_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
shell: bash
# Elastic Tests 2: runs pytest (--forked, debug logging) on
# test_elastic_torch.py, test_elastic_tensorflow.py and
# test_elastic_tensorflow_keras.py under a 15 minute timeout and writes
# /artifacts/junit.gloo.elastic.xml.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Elastic_Tests_2_run_<n> directory at /artifacts.
# NOTE(review): no matrix entry of this job sets matrix.Elastic_Tests_2,
# so these steps are skipped here.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Elastic Tests 2 [attempt 1 of 3]"
id: Elastic_Tests_2_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Elastic Tests 2 [attempt 2 of 3]"
id: Elastic_Tests_2_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Elastic Tests 2 [attempt 3 of 3]"
id: Elastic_Tests_2_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
shell: bash
# Gloo Cluster PyTests: starts the ssh service in the container, then runs
# pytest (--forked) on test_static_run.py under a 10 minute timeout and writes
# /artifacts/junit.gloo.static.xml.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Gloo_Cluster_PyTests_run_<n> directory at /artifacts.
# Attempt 1: runs when the build step succeeded and matrix.Gloo_Cluster_PyTests
# is set; a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Gloo Cluster PyTests [attempt 1 of 3]"
id: Gloo_Cluster_PyTests_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Gloo Cluster PyTests [attempt 2 of 3]"
id: Gloo_Cluster_PyTests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Gloo Cluster PyTests [attempt 3 of 3]"
id: Gloo_Cluster_PyTests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
shell: bash
# Gloo Keras MNIST example: runs keras_mnist_advanced.py with horovodrun
# (2 processes on localhost, --gloo) under a 10 minute timeout.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Gloo_Keras_MNIST_run_<n> directory at /artifacts.
# NOTE(review): no matrix entry of this job sets matrix.Gloo_Keras_MNIST,
# so these steps are skipped here.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Gloo Keras MNIST [attempt 1 of 3]"
id: Gloo_Keras_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Gloo Keras MNIST [attempt 2 of 3]"
id: Gloo_Keras_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Gloo Keras MNIST [attempt 3 of 3]"
id: Gloo_Keras_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
shell: bash
# Gloo MXNet2 MNIST (api): runs mxnet2_mnist.py directly with python, passing
# --num-proc 2 --hosts localhost:2 --communication gloo, under a 10 minute
# timeout.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Gloo_MXNet2_MNIST_api_run_<n> directory at /artifacts.
# Attempt 1: runs when the build step succeeded and matrix.Gloo_MXNet2_MNIST_api
# is set; a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Gloo MXNet2 MNIST api [attempt 1 of 3]"
id: Gloo_MXNet2_MNIST_api_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Gloo MXNet2 MNIST api [attempt 2 of 3]"
id: Gloo_MXNet2_MNIST_api_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Gloo MXNet2 MNIST api [attempt 3 of 3]"
id: Gloo_MXNet2_MNIST_api_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
shell: bash
# Gloo MXNet2 MNIST (horovodrun): runs mxnet2_mnist.py with horovodrun
# (2 processes on localhost, --gloo) under a 10 minute timeout.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Gloo_MXNet2_MNIST_horovodrun_run_<n> directory at /artifacts.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 1 of 3]"
id: Gloo_MXNet2_MNIST_horovodrun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 2 of 3]"
id: Gloo_MXNet2_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 3 of 3]"
id: Gloo_MXNet2_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
shell: bash
# Gloo MXNet MNIST (horovodrun): runs mxnet_mnist.py with horovodrun
# (2 processes on localhost, --gloo) under a 10 minute timeout.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Gloo_MXNet_MNIST_horovodrun_run_<n> directory at /artifacts.
# NOTE(review): no matrix entry of this job sets
# matrix.Gloo_MXNet_MNIST_horovodrun, so these steps are skipped here.
# Attempt 1: runs when the build step succeeded and the matrix flag is set;
# a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Gloo MXNet MNIST horovodrun [attempt 1 of 3]"
id: Gloo_MXNet_MNIST_horovodrun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Gloo MXNet MNIST horovodrun [attempt 2 of 3]"
id: Gloo_MXNet_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Gloo MXNet MNIST horovodrun [attempt 3 of 3]"
id: Gloo_MXNet_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
shell: bash
# Gloo Parallel PyTests: for every test_*.py in /horovod/test/parallel (one
# file per invocation via xargs -n 1) runs /pytest.sh gloo under horovodrun
# (2 processes on localhost, --gloo), all within a 15 minute timeout.
# The test is tried up to three times; each attempt mounts its own
# artifacts/<image>/Gloo_Parallel_PyTests_run_<n> directory at /artifacts.
# Attempt 1: runs when the build step succeeded and matrix.Gloo_Parallel_PyTests
# is set; a failure is tolerated (continue-on-error: true) so attempt 2 can retry.
- name: "Gloo Parallel PyTests [attempt 1 of 3]"
id: Gloo_Parallel_PyTests_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
shell: bash
# Attempt 2: only runs when attempt 1 failed; a failure is again tolerated.
- name: "Gloo Parallel PyTests [attempt 2 of 3]"
id: Gloo_Parallel_PyTests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
shell: bash
# Attempt 3: only runs when attempt 2 failed; continue-on-error is false,
# so a failure of this last attempt fails the job.
- name: "Gloo Parallel PyTests [attempt 3 of 3]"
id: Gloo_Parallel_PyTests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
shell: bash
# Gloo PyTorch MNIST (api), tried up to three times. Every try needs a successful
# build step and a truthy matrix.Gloo_PyTorch_MNIST_api; tries 2 and 3 also need the
# try before them to have ended with outcome 'failure'.
# Command: pytorch_mnist.py is started with plain `python`, passing
# `--num-proc 2 --hosts localhost:2 --communication gloo`, under a 10m timeout.
- id: Gloo_PyTorch_MNIST_api_run_1
  name: 'Gloo PyTorch MNIST api [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
- id: Gloo_PyTorch_MNIST_api_run_2
  name: 'Gloo PyTorch MNIST api [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
- id: Gloo_PyTorch_MNIST_api_run_3
  name: 'Gloo PyTorch MNIST api [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
# Gloo PyTorch MNIST (horovodrun), tried up to three times. Every try needs a
# successful build step and a truthy matrix.Gloo_PyTorch_MNIST_horovodrun; tries 2
# and 3 also need the try before them to have ended with outcome 'failure'.
# Command: `horovodrun -np 2 -H localhost:2 --gloo` launches pytorch_mnist.py,
# under a 10m timeout.
- id: Gloo_PyTorch_MNIST_horovodrun_run_1
  name: 'Gloo PyTorch MNIST horovodrun [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
- id: Gloo_PyTorch_MNIST_horovodrun_run_2
  name: 'Gloo PyTorch MNIST horovodrun [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
- id: Gloo_PyTorch_MNIST_horovodrun_run_3
  name: 'Gloo PyTorch MNIST horovodrun [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
# Gloo Single PyTests, tried up to three times. Every try needs a successful build
# step and a truthy matrix.Gloo_Single_PyTests; tries 2 and 3 also need the try
# before them to have ended with outcome 'failure'.
# Command: in /horovod/test/single, each test_*.py is handed to
# `/bin/bash /pytest_standalone.sh gloo`, under a 15m timeout.
- id: Gloo_Single_PyTests_run_1
  name: 'Gloo Single PyTests [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
- id: Gloo_Single_PyTests_run_2
  name: 'Gloo Single PyTests [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
- id: Gloo_Single_PyTests_run_3
  name: 'Gloo Single PyTests [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
# Gloo TensorFlow 2.0 Keras MNIST (api), tried up to three times. Every try needs a
# successful build step and a truthy matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api;
# tries 2 and 3 also need the try before them to have ended with outcome 'failure'.
# Command: tensorflow2_keras_mnist.py is started with plain `python` and the
# arguments `2 localhost:2 gloo`, under a 10m timeout.
- id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
  name: 'Gloo TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
  name: 'Gloo TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
  name: 'Gloo TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
# Gloo TensorFlow 2.0 Keras MNIST (horovodrun), tried up to three times. Every try
# needs a successful build step and a truthy
# matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun; tries 2 and 3 also need the try
# before them to have ended with outcome 'failure'.
# Command: `horovodrun -np 2 -H localhost:2 --gloo` launches
# tensorflow2_keras_mnist.py, under a 10m timeout.
- id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
  name: 'Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
- id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
  name: 'Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
- id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
  name: 'Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
# Gloo TensorFlow 2.0 MNIST Data Service, tried up to three times. Every try needs a
# successful build step and a truthy matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service;
# tries 2 and 3 also need the try before them to have ended with outcome 'failure'.
# Command: one `bash -c` starts horovod.tensorflow.data.compute_worker in the
# background (`&`, without `--gloo`) and then runs tensorflow2_mnist_data_service.py
# with `--gloo`; both receive /tmp/compute.json. The whole thing is under a 10m timeout.
- id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
  name: 'Gloo TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
- id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
  name: 'Gloo TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
- id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
  name: 'Gloo TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
# Gloo TensorFlow 2.0 MNIST Elastic (api), tried up to three times. Every try needs a
# successful build step and a truthy matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api;
# tries 2 and 3 also need the try before them to have ended with outcome 'failure'.
# Command: tensorflow2_mnist_elastic.py is started with plain `python` and the
# arguments `2 2 2 localhost:2,127.0.0.1:2`, under a 10m timeout.
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
  name: 'Gloo TensorFlow 2.0 MNIST Elastic api [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
  name: 'Gloo TensorFlow 2.0 MNIST Elastic api [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
  name: 'Gloo TensorFlow 2.0 MNIST Elastic api [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
# Gloo TensorFlow 2.0 MNIST Elastic (horovodrun), tried up to three times. Every try
# needs a successful build step and a truthy
# matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun; tries 2 and 3 also need the
# try before them to have ended with outcome 'failure'.
# Command: `horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo`
# launches tensorflow2_mnist_elastic.py, under a 10m timeout.
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
  name: 'Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
  name: 'Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
- id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
  name: 'Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
# Gloo TensorFlow 2.0 MNIST (api), tried up to three times. Every try needs a
# successful build step and a truthy matrix.Gloo_TensorFlow_2_0_MNIST_api; tries 2
# and 3 also need the try before them to have ended with outcome 'failure'.
# Command: tensorflow2_mnist.py is started with plain `python` and the arguments
# `2 localhost:2 gloo`, under a 10m timeout.
- id: Gloo_TensorFlow_2_0_MNIST_api_run_1
  name: 'Gloo TensorFlow 2.0 MNIST api [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_MNIST_api_run_2
  name: 'Gloo TensorFlow 2.0 MNIST api [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
- id: Gloo_TensorFlow_2_0_MNIST_api_run_3
  name: 'Gloo TensorFlow 2.0 MNIST api [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
# Gloo TensorFlow 2.0 MNIST (horovodrun), tried up to three times. Every try needs a
# successful build step and a truthy matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun;
# tries 2 and 3 also need the try before them to have ended with outcome 'failure'.
# Command: `horovodrun -np 2 -H localhost:2 --gloo` launches tensorflow2_mnist.py,
# under a 10m timeout.
- id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
  name: 'Gloo TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
- id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
  name: 'Gloo TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
- id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
  name: 'Gloo TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
# Gloo TensorFlow MNIST, tried up to three times. Every try needs a successful build
# step and a truthy matrix.Gloo_TensorFlow_MNIST; tries 2 and 3 also need the try
# before them to have ended with outcome 'failure'.
# Command: `horovodrun -np 2 -H localhost:2 --gloo` launches tensorflow_mnist.py,
# under a 10m timeout.
- id: Gloo_TensorFlow_MNIST_run_1
  name: 'Gloo TensorFlow MNIST [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
- id: Gloo_TensorFlow_MNIST_run_2
  name: 'Gloo TensorFlow MNIST [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
- id: Gloo_TensorFlow_MNIST_run_3
  name: 'Gloo TensorFlow MNIST [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
# MPI Cluster PyTests, tried up to three times. Every try needs a successful build
# step and a truthy matrix.MPI_Cluster_PyTests; tries 2 and 3 also need the try
# before them to have ended with outcome 'failure'.
# Command: starts ssh via /etc/init.d/ssh, then runs pytest on test_static_run.py in
# /horovod/test/integration, writing /artifacts/junit.mpi.static.xml; 10m timeout.
- id: MPI_Cluster_PyTests_run_1
  name: 'MPI Cluster PyTests [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_run_2
  name: 'MPI Cluster PyTests [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_run_3
  name: 'MPI Cluster PyTests [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI Cluster PyTests [ONECCL MPI], tried up to three times. Every try needs a
# successful build step and a truthy matrix.MPI_Cluster_PyTests_ONECCL_MPI; tries 2
# and 3 also need the try before them to have ended with outcome 'failure'.
# Command: `\$` keeps `$(cat /oneccl_env)` unexpanded on the runner so it is expanded
# by the inner bash; the script then writes '/mpirun_command_mpi' into
# /mpirun_command, starts ssh and runs pytest on test_static_run.py; 10m timeout.
- id: MPI_Cluster_PyTests_ONECCL_MPI_run_1
  name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_ONECCL_MPI_run_2
  name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_ONECCL_MPI_run_3
  name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI Cluster PyTests [ONECCL OFI], tried up to three times. Every try needs a
# successful build step and a truthy matrix.MPI_Cluster_PyTests_ONECCL_OFI; tries 2
# and 3 also need the try before them to have ended with outcome 'failure'.
# Command: `\$` keeps `$(cat /oneccl_env)` unexpanded on the runner so it is expanded
# by the inner bash; the script then writes '/mpirun_command_ofi' into
# /mpirun_command, starts ssh and runs pytest on test_static_run.py; 10m timeout.
- id: MPI_Cluster_PyTests_ONECCL_OFI_run_1
  name: 'MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_ONECCL_OFI_run_2
  name: 'MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- id: MPI_Cluster_PyTests_ONECCL_OFI_run_3
  name: 'MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI MXNet2 MNIST (api), tried up to three times. Every try needs a successful build
# step and a truthy matrix.MPI_MXNet2_MNIST_api; tries 2 and 3 also need the try
# before them to have ended with outcome 'failure'.
# Command: inside `bash -c`, mxnet2_mnist.py is started with plain `python`, passing
# `--num-proc 2 --hosts localhost:2 --communication mpi`, under a 10m timeout.
- id: MPI_MXNet2_MNIST_api_run_1
  name: 'MPI MXNet2 MNIST api [attempt 1 of 3]'
  # the trailing `&& true` leaves the result of the condition unchanged
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && true
  # a failing try must not fail the job, the next try picks it up
  continue-on-error: true
  shell: bash
  # every try gets its own artifacts directory, mounted at /artifacts in the container
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
- id: MPI_MXNet2_MNIST_api_run_2
  name: 'MPI MXNet2 MNIST api [attempt 2 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
- id: MPI_MXNet2_MNIST_api_run_3
  name: 'MPI MXNet2 MNIST api [attempt 3 of 3]'
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_2.outcome == 'failure'
  # last try: its failure is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
# MPI MXNet2 MNIST horovodrun: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_MXNet2_MNIST_horovodrun is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: with OMP_NUM_THREADS=1, runs mxnet2_mnist.py prefixed by the launcher
# command read from /mpirun_command. The `\$` escape defers that substitution to
# the bash running inside the container. 10 minute timeout.
- name: "MPI MXNet2 MNIST horovodrun [attempt 1 of 3]"
id: MPI_MXNet2_MNIST_horovodrun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI MXNet2 MNIST horovodrun [attempt 2 of 3]"
id: MPI_MXNet2_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI MXNet2 MNIST horovodrun [attempt 3 of 3]"
id: MPI_MXNet2_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
shell: bash
# MPI MXNet MNIST horovodrun: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_MXNet_MNIST_horovodrun is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: with OMP_NUM_THREADS=1, runs mxnet_mnist.py prefixed by the launcher
# command read from /mpirun_command inside the container. 10 minute timeout.
- name: "MPI MXNet MNIST horovodrun [attempt 1 of 3]"
id: MPI_MXNet_MNIST_horovodrun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI MXNet MNIST horovodrun [attempt 2 of 3]"
id: MPI_MXNet_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI MXNet MNIST horovodrun [attempt 3 of 3]"
id: MPI_MXNet_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# MPI MXNet MNIST horovodrun [ONECCL MPI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI
# is truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, first executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), then
# overwrites /mpirun_command with the text '/mpirun_command_mpi' so the later
# $(cat /mpirun_command) expands to that path, and finally runs mxnet_mnist.py
# through it with OMP_NUM_THREADS=1. 10 minute timeout.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# MPI MXNet MNIST horovodrun [ONECCL OFI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI
# is truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, first executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), then
# overwrites /mpirun_command with the text '/mpirun_command_ofi' so the later
# $(cat /mpirun_command) expands to that path, and finally runs mxnet_mnist.py
# through it with OMP_NUM_THREADS=1. 10 minute timeout.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
shell: bash
# MPI Parallel PyTests: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_Parallel_PyTests is truthy;
# always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: in /horovod/test/parallel, lists test_*.py and uses `xargs -n 1` to
# invoke, once per file, the launcher read from /mpirun_command followed by
# `/bin/bash /pytest.sh mpi <file>`. The whole run is capped at 15 minutes.
- name: "MPI Parallel PyTests [attempt 1 of 3]"
id: MPI_Parallel_PyTests_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI Parallel PyTests [attempt 2 of 3]"
id: MPI_Parallel_PyTests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI Parallel PyTests [attempt 3 of 3]"
id: MPI_Parallel_PyTests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# MPI Parallel PyTests [ONECCL MPI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_Parallel_PyTests_ONECCL_MPI is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), overwrites
# /mpirun_command with the text '/mpirun_command_mpi', then in
# /horovod/test/parallel runs that launcher with `/bin/bash /pytest.sh mpi`
# once per test_*.py file via `xargs -n 1`. 15 minute timeout.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]"
id: MPI_Parallel_PyTests_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]"
id: MPI_Parallel_PyTests_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]"
id: MPI_Parallel_PyTests_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# MPI Parallel PyTests [ONECCL OFI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_Parallel_PyTests_ONECCL_OFI is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), overwrites
# /mpirun_command with the text '/mpirun_command_ofi', then in
# /horovod/test/parallel runs that launcher with `/bin/bash /pytest.sh mpi`
# once per test_*.py file via `xargs -n 1`. 15 minute timeout.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]"
id: MPI_Parallel_PyTests_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]"
id: MPI_Parallel_PyTests_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]"
id: MPI_Parallel_PyTests_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
shell: bash
# MPI PyTorch MNIST api: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_PyTorch_MNIST_api is truthy;
# always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: runs pytorch_mnist.py directly with python (--data-dir
# /data/pytorch_datasets, --num-proc 2, --hosts localhost:2, --communication mpi)
# in the compose service named by matrix.image, under a 10 minute timeout.
- name: "MPI PyTorch MNIST api [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_api_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI PyTorch MNIST api [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_api_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI PyTorch MNIST api [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_api_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# MPI PyTorch MNIST api [ONECCL MPI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), overwrites
# /mpirun_command with the text '/mpirun_command_mpi', then runs pytorch_mnist.py
# directly with python (--num-proc 2, --hosts localhost:2, --communication mpi).
# NOTE(review): this command line never reads /mpirun_command itself; whether the
# script consumes that file is not visible here. 10 minute timeout.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# MPI PyTorch MNIST api [ONECCL OFI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), overwrites
# /mpirun_command with the text '/mpirun_command_ofi', then runs pytorch_mnist.py
# directly with python (--num-proc 2, --hosts localhost:2, --communication mpi).
# NOTE(review): this command line never reads /mpirun_command itself; whether the
# script consumes that file is not visible here. 10 minute timeout.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
shell: bash
# MPI PyTorch MNIST horovodrun: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_PyTorch_MNIST_horovodrun is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: runs pytorch_mnist.py (--data-dir /data/pytorch_datasets) prefixed by
# the launcher command read from /mpirun_command inside the container. The `\$`
# escape defers that substitution to the container's bash. 10 minute timeout.
- name: "MPI PyTorch MNIST horovodrun [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI PyTorch MNIST horovodrun [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI PyTorch MNIST horovodrun [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# MPI PyTorch MNIST horovodrun [ONECCL MPI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and
# matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI is truthy; always() keeps the
# step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), overwrites
# /mpirun_command with the text '/mpirun_command_mpi' so the later
# $(cat /mpirun_command) expands to that path, then runs pytorch_mnist.py
# (--data-dir /data/pytorch_datasets) through it. 10 minute timeout.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# MPI PyTorch MNIST horovodrun [ONECCL OFI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and
# matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI is truthy; always() keeps the
# step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), overwrites
# /mpirun_command with the text '/mpirun_command_ofi' so the later
# $(cat /mpirun_command) expands to that path, then runs pytorch_mnist.py
# (--data-dir /data/pytorch_datasets) through it. 10 minute timeout.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
shell: bash
# MPI Single PyTests: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_Single_PyTests is truthy;
# always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: in /horovod/test/single, lists test_*.py and uses `xargs -n 1` to run
# `/bin/bash /pytest_standalone.sh mpi <file>` once per file; no launcher from
# /mpirun_command is prepended here. The whole run is capped at 15 minutes.
- name: "MPI Single PyTests [attempt 1 of 3]"
id: MPI_Single_PyTests_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI Single PyTests [attempt 2 of 3]"
id: MPI_Single_PyTests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI Single PyTests [attempt 3 of 3]"
id: MPI_Single_PyTests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# MPI Single PyTests [ONECCL MPI]: up to three attempts at the same command.
# Gate: the `build` step succeeded and matrix.MPI_Single_PyTests_ONECCL_MPI is
# truthy; always() keeps the step eligible even after an earlier step failed.
# Attempts 1 and 2 set continue-on-error: true, so a failure does not fail the
# job but stays visible as outcome == 'failure', which triggers the next attempt.
# Command: inside the container, executes the contents of /oneccl_env as a
# command (presumably environment setup; confirm against the image), overwrites
# /mpirun_command with the text '/mpirun_command_mpi', then in
# /horovod/test/single runs `/bin/bash /pytest_standalone.sh mpi <file>` once per
# test_*.py file via `xargs -n 1`. 15 minute timeout.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]"
id: MPI_Single_PyTests_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Retry: runs only when attempt 1 reported outcome 'failure'.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]"
id: MPI_Single_PyTests_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Last retry: runs only when attempt 2 failed; continue-on-error is false, so a
# failure here is no longer tolerated.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]"
id: MPI_Single_PyTests_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Retry group "MPI Single PyTests [ONECCL OFI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_Single_PyTests_ONECCL_OFI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 15 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_ofi' into /mpirun_command,
#   3. runs /pytest_standalone.sh mpi once per test_*.py file in /horovod/test/single.
# The escaped \$(...) is expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]"
id: MPI_Single_PyTests_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]"
id: MPI_Single_PyTests_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]"
id: MPI_Single_PyTests_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
shell: bash
# Retry group "MPI TensorFlow 2.0 Keras MNIST api": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_Keras_MNIST_api to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command runs
# /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py with the arguments "2 localhost:2 mpi".
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Retry group "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_mpi' into /mpirun_command,
#   3. runs /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py with the arguments "2 localhost:2 mpi".
# The escaped \$(...) is expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Retry group "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_ofi' into /mpirun_command,
#   3. runs /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py with the arguments "2 localhost:2 mpi".
# The escaped \$(...) is expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
shell: bash
# Retry group "MPI TensorFlow 2.0 Keras MNIST horovodrun": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the content of /mpirun_command is used as
# the command prefix that launches /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py.
# The escaped \$(...) is expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Retry group "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_mpi' into /mpirun_command,
#   3. uses the content of /mpirun_command as the command prefix that launches
#      /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py.
# The escaped \$(...) substitutions are expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Retry group "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_ofi' into /mpirun_command,
#   3. uses the content of /mpirun_command as the command prefix that launches
#      /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py.
# The escaped \$(...) substitutions are expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
shell: bash
# Retry group "MPI TensorFlow 2.0 MNIST api": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_MNIST_api to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command runs
# /horovod/examples/tensorflow2/tensorflow2_mnist.py with the arguments "2 localhost:2 mpi".
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Retry group "MPI TensorFlow 2.0 MNIST api [ONECCL MPI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_mpi' into /mpirun_command,
#   3. runs /horovod/examples/tensorflow2/tensorflow2_mnist.py with the arguments "2 localhost:2 mpi".
# The escaped \$(...) is expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Retry group "MPI TensorFlow 2.0 MNIST api [ONECCL OFI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_ofi' into /mpirun_command,
#   3. runs /horovod/examples/tensorflow2/tensorflow2_mnist.py with the arguments "2 localhost:2 mpi".
# The escaped \$(...) is expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
shell: bash
# Retry group "MPI TensorFlow 2.0 MNIST horovodrun": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_MNIST_horovodrun to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the content of /mpirun_command is used as
# the command prefix that launches /horovod/examples/tensorflow2/tensorflow2_mnist.py.
# The escaped \$(...) is expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Retry group "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_mpi' into /mpirun_command,
#   3. uses the content of /mpirun_command as the command prefix that launches
#      /horovod/examples/tensorflow2/tensorflow2_mnist.py.
# The escaped \$(...) substitutions are expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Retry group "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI]": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) the command:
#   1. executes the content of /oneccl_env (presumably the oneCCL environment setup; confirm against the image),
#   2. writes '/mpirun_command_ofi' into /mpirun_command,
#   3. uses the content of /mpirun_command as the command prefix that launches
#      /horovod/examples/tensorflow2/tensorflow2_mnist.py.
# The escaped \$(...) substitutions are expanded by the bash inside the container, not by the runner's shell.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
shell: bash
# Retry group "Run PyTests test_interactiverun": up to three attempts of the same command.
# Every attempt requires steps.build to have succeeded and matrix.Run_PyTests_test_interactiverun to be truthy.
# Each attempt gets its own artifacts/<image>/<step id> directory on the runner, mounted at /artifacts in the container.
# Inside the container (limited to 10 minutes by /usr/bin/timeout) pytest runs integration/test_interactiverun.py
# from /horovod/test and writes its JUnit report to /artifacts/junit.mpi.integration.xml,
# i.e. into the per-attempt directory mounted from the runner.
#
# Attempt 1: a failure is tolerated (continue-on-error) so that attempt 2 can take over.
# The trailing '&& true' does not change the condition.
- name: "Run PyTests test_interactiverun [attempt 1 of 3]"
id: Run_PyTests_test_interactiverun_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
shell: bash
# Attempt 2: runs only when attempt 1's outcome was 'failure'; a failure is again tolerated.
- name: "Run PyTests test_interactiverun [attempt 2 of 3]"
id: Run_PyTests_test_interactiverun_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
shell: bash
# Attempt 3 (last): runs only when attempt 2's outcome was 'failure'.
# continue-on-error is false, so a failure here fails the job.
- name: "Run PyTests test_interactiverun [attempt 3 of 3]"
id: Run_PyTests_test_interactiverun_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
shell: bash
# Retry chain (up to 3 attempts) running the Keras MNIST example
# (keras_mnist_advanced.py, 3 epochs, batch size 64) directly with python.
# Attempt 1 requires a successful `build` step and matrix.Single_Keras_MNIST;
# each attempt uses its own /artifacts mount and a 10 minute timeout.
- name: "Single Keras MNIST [attempt 1 of 3]"
id: Single_Keras_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single Keras MNIST [attempt 2 of 3]"
id: Single_Keras_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single Keras MNIST [attempt 3 of 3]"
id: Single_Keras_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
shell: bash
# Retry chain (up to 3 attempts) running the MXNet 2 MNIST example
# (mxnet2_mnist.py, 3 epochs) directly with python.
# Attempt 1 requires a successful `build` step and matrix.Single_MXNet2_MNIST;
# each attempt uses its own /artifacts mount and a 10 minute timeout.
- name: "Single MXNet2 MNIST [attempt 1 of 3]"
id: Single_MXNet2_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single MXNet2 MNIST [attempt 2 of 3]"
id: Single_MXNet2_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single MXNet2 MNIST [attempt 3 of 3]"
id: Single_MXNet2_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
shell: bash
# Retry chain (up to 3 attempts) running the MXNet MNIST example
# (mxnet_mnist.py, 3 epochs) directly with python.
# Attempt 1 requires a successful `build` step and matrix.Single_MXNet_MNIST;
# each attempt uses its own /artifacts mount and a 10 minute timeout.
- name: "Single MXNet MNIST [attempt 1 of 3]"
id: Single_MXNet_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single MXNet MNIST [attempt 2 of 3]"
id: Single_MXNet_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single MXNet MNIST [attempt 3 of 3]"
id: Single_MXNet_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Retry chain (up to 3 attempts) running the MXNet MNIST example in the
# oneCCL/MPI configuration. Inside the container the command first executes
# the contents of /oneccl_env (the `\$` keeps the host shell from expanding
# the substitution), writes '/mpirun_command_mpi' into /mpirun_command, and
# then runs mxnet_mnist.py for 3 epochs.
# Attempt 1 requires a successful `build` step and
# matrix.Single_MXNet_MNIST_ONECCL_MPI; each attempt has a 10 minute timeout.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]"
id: Single_MXNet_MNIST_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]"
id: Single_MXNet_MNIST_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]"
id: Single_MXNet_MNIST_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Retry chain (up to 3 attempts) running the MXNet MNIST example in the
# oneCCL/OFI configuration. Inside the container the command first executes
# the contents of /oneccl_env (the `\$` keeps the host shell from expanding
# the substitution), writes '/mpirun_command_ofi' into /mpirun_command, and
# then runs mxnet_mnist.py for 3 epochs.
# Attempt 1 requires a successful `build` step and
# matrix.Single_MXNet_MNIST_ONECCL_OFI; each attempt has a 10 minute timeout.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]"
id: Single_MXNet_MNIST_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]"
id: Single_MXNet_MNIST_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]"
id: Single_MXNet_MNIST_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
shell: bash
# Retry chain (up to 3 attempts) running the PyTorch MNIST example
# (pytorch_mnist.py, 3 epochs, data read from /data/pytorch_datasets)
# directly with python.
# Attempt 1 requires a successful `build` step and matrix.Single_PyTorch_MNIST;
# each attempt uses its own /artifacts mount and a 10 minute timeout.
- name: "Single PyTorch MNIST [attempt 1 of 3]"
id: Single_PyTorch_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single PyTorch MNIST [attempt 2 of 3]"
id: Single_PyTorch_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single PyTorch MNIST [attempt 3 of 3]"
id: Single_PyTorch_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Retry chain (up to 3 attempts) running the PyTorch MNIST example in the
# oneCCL/MPI configuration. Inside the container the command first executes
# the contents of /oneccl_env (the `\$` keeps the host shell from expanding
# the substitution), writes '/mpirun_command_mpi' into /mpirun_command, and
# then runs pytorch_mnist.py for 3 epochs.
# Attempt 1 requires a successful `build` step and
# matrix.Single_PyTorch_MNIST_ONECCL_MPI; each attempt has a 10 minute timeout.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]"
id: Single_PyTorch_MNIST_ONECCL_MPI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]"
id: Single_PyTorch_MNIST_ONECCL_MPI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]"
id: Single_PyTorch_MNIST_ONECCL_MPI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Retry chain (up to 3 attempts) running the PyTorch MNIST example in the
# oneCCL/OFI configuration. Inside the container the command first executes
# the contents of /oneccl_env (the `\$` keeps the host shell from expanding
# the substitution), writes '/mpirun_command_ofi' into /mpirun_command, and
# then runs pytorch_mnist.py for 3 epochs.
# Attempt 1 requires a successful `build` step and
# matrix.Single_PyTorch_MNIST_ONECCL_OFI; each attempt has a 10 minute timeout.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]"
id: Single_PyTorch_MNIST_ONECCL_OFI_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]"
id: Single_PyTorch_MNIST_ONECCL_OFI_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]"
id: Single_PyTorch_MNIST_ONECCL_OFI_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
shell: bash
# Retry chain (up to 3 attempts) running the Spark Keras MNIST example
# (keras_spark_mnist.py with --num-proc 2, 3 epochs) through /spark_env.sh
# with OMP_NUM_THREADS=1.
# Attempt 1 requires a successful `build` step and matrix.Spark_Keras_MNIST;
# each attempt uses its own /artifacts mount and a 10 minute timeout.
- name: "Spark Keras MNIST [attempt 1 of 3]"
id: Spark_Keras_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Spark Keras MNIST [attempt 2 of 3]"
id: Spark_Keras_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Spark Keras MNIST [attempt 3 of 3]"
id: Spark_Keras_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Retry chain (up to 3 attempts) running the Spark Keras Rossmann estimator
# example (keras_spark_rossmann_estimator.py with --num-proc 2, 3 epochs,
# --sample-rate 0.1) through /spark_env.sh with OMP_NUM_THREADS=1.
# Attempt 1 requires a successful `build` step and
# matrix.Spark_Keras_Rossmann_Estimator; each attempt has a 10 minute timeout.
- name: "Spark Keras Rossmann Estimator [attempt 1 of 3]"
id: Spark_Keras_Rossmann_Estimator_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Spark Keras Rossmann Estimator [attempt 2 of 3]"
id: Spark_Keras_Rossmann_Estimator_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Spark Keras Rossmann Estimator [attempt 3 of 3]"
id: Spark_Keras_Rossmann_Estimator_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Retry chain (up to 3 attempts) running the Spark Keras Rossmann "run"
# example (keras_spark_rossmann_run.py with --num-proc 2, 3 epochs,
# --sample-rate 0.1) through /spark_env.sh with OMP_NUM_THREADS=1.
# Attempt 1 requires a successful `build` step and
# matrix.Spark_Keras_Rossmann_Run; each attempt has a 10 minute timeout.
- name: "Spark Keras Rossmann Run [attempt 1 of 3]"
id: Spark_Keras_Rossmann_Run_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Spark Keras Rossmann Run [attempt 2 of 3]"
id: Spark_Keras_Rossmann_Run_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Spark Keras Rossmann Run [attempt 3 of 3]"
id: Spark_Keras_Rossmann_Run_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
shell: bash
# Retry chain (up to 3 attempts) running the Spark PyTorch Lightning MNIST
# example (pytorch_lightning_spark_mnist.py with --num-proc 2, 3 epochs)
# through /spark_env.sh with OMP_NUM_THREADS=1.
# Attempt 1 requires a successful `build` step and
# matrix.Spark_Lightning_MNIST; each attempt has a 10 minute timeout.
- name: "Spark Lightning MNIST [attempt 1 of 3]"
id: Spark_Lightning_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Spark Lightning MNIST [attempt 2 of 3]"
id: Spark_Lightning_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Spark Lightning MNIST [attempt 3 of 3]"
id: Spark_Lightning_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Retry chain (up to 3 attempts) for the Spark integration pytests.
# In /horovod/test/integration every file matching test_spark*.py is passed,
# one file per invocation (xargs -n 1), to `/pytest_standalone.sh spark`.
# Unlike the example runs, these attempts are capped at 30 minutes.
# Attempt 1 requires a successful `build` step and matrix.Spark_PyTests.
- name: "Spark PyTests [attempt 1 of 3]"
id: Spark_PyTests_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Spark PyTests [attempt 2 of 3]"
id: Spark_PyTests_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Spark PyTests [attempt 3 of 3]"
id: Spark_PyTests_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
shell: bash
# Retry chain (up to 3 attempts) for the Spark TensorFlow 2 MNIST data-service
# example. Inside the container two spark-submit jobs (both --master local[2])
# share /tmp/compute.json: compute_worker.py is started in the background
# (`&`), then tensorflow2_mnist_data_service.py runs in the foreground via
# /spark_env.sh with OMP_NUM_THREADS=1.
# NOTE(review): only the foreground command's exit status reaches `timeout`;
# a failure of the background worker alone is not reported by this command.
# Attempt 1 requires a successful `build` step and
# matrix.Spark_TensorFlow_2_0_MNIST_Data_Service; 10 minute timeout each.
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]"
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]"
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]"
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
shell: bash
# Retry chain (up to 3 attempts) running the Spark PyTorch MNIST example
# (pytorch_spark_mnist.py with --num-proc 2, 3 epochs) through /spark_env.sh
# with OMP_NUM_THREADS=1.
# Attempt 1 requires a successful `build` step and matrix.Spark_Torch_MNIST;
# each attempt uses its own /artifacts mount and a 10 minute timeout.
- name: "Spark Torch MNIST [attempt 1 of 3]"
id: Spark_Torch_MNIST_run_1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && true
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Attempt 2 runs only when attempt 1's outcome was 'failure'.
- name: "Spark Torch MNIST [attempt 2 of 3]"
id: Spark_Torch_MNIST_run_2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_1.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Final attempt: runs only when attempt 2 failed; its failure is not tolerated
# (continue-on-error: false).
- name: "Spark Torch MNIST [attempt 3 of 3]"
id: Spark_Torch_MNIST_run_3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_2.outcome == 'failure'
run: |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
shell: bash
# Publish every *.xml file found below artifacts/<image>/ (the per-attempt
# directories mounted at /artifacts by the test steps) as one artifact named
# after the image. always() makes this run even when a test step failed;
# only images whose name contains '-cpu-' are uploaded.
- name: Upload Test Results
uses: actions/upload-artifact@v3
if: always() && contains(matrix.image, '-cpu-')
with:
name: Unit Test Results - ${{ matrix.image }}
path: artifacts/${{ matrix.image }}/**/*.xml
# Job: build the "min" images listed in the matrix below.
# It waits for the init-workflow and build-and-test jobs and only runs when
# init-workflow reports both run-at-all and run-builds-and-tests as 'true'.
build-mins:
name: "Build mins (${{ matrix.image }})"
needs: [init-workflow, build-and-test]
if: >
needs.init-workflow.outputs.run-at-all == 'true' &&
needs.init-workflow.outputs.run-builds-and-tests == 'true'
runs-on: ubuntu-latest
strategy:
# At most two matrix entries run at once; one failing entry does not
# cancel the other.
max-parallel: 2
fail-fast: false
matrix:
include:
# build_timeout is passed to the Build step with an `m` suffix (minutes).
- image: test-cpu-openmpi-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
build_timeout: 30
- image: test-gpu-openmpi-gloo-py3_8-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
build_timeout: 40
steps:
- name: Clean up disk space
# deleting these paths frees 38 GB disk space:
# sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
# but this sometimes takes 3-4 minutes
# so we delete only some sub-paths which are known to be quick (10s) and 20 GB
#
# The script prints `df -h` before and after, and for each path reports its
# size and then removes it, each inside its own log group.
# The first path has its `*` escaped so it stays a literal pattern in the
# `for` list; it is expanded later because $dir is used unquoted in the
# du and rm commands. `|| true` keeps a failing size report from stopping
# the step.
run: |
echo ::group::Disk space before clean up
df -h
echo ::endgroup::
for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
/usr/share/dotnet/shared \
/usr/local/lib/android/sdk/ndk \
/usr/local/lib/android/sdk/build-tools \
/opt/ghc
do
echo ::group::Deleting "$dir"
sudo du -hsc $dir | tail -n1 || true
sudo rm -rf $dir
echo ::endgroup::
done
echo ::group::Disk space after clean up
df -h
echo ::endgroup::
- name: Checkout
uses: actions/checkout@v3
with:
submodules: recursive
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Build
id: build
run: |
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker compose -f docker-compose.test.yml build ${{ matrix.image }}
env:
COMPOSE_DOCKER_CLI_BUILD: 1
DOCKER_BUILDKIT: 1
- name: Upload Test Results
uses: actions/upload-artifact@v3
if: always() && contains(matrix.image, '-cpu-')
with:
name: Unit Test Results - ${{ matrix.image }}
path: artifacts/${{ matrix.image }}/**/*.xml
# Builds Horovod directly on a macOS runner for three framework combinations (see matrix) and
# runs the tests in test/parallel with up to three attempts.
build-and-test-macos:
name: "Build and Test macOS (${{ matrix.image }}-macos)"
needs: [init-workflow, build-and-test]
if: >
needs.init-workflow.outputs.run-at-all == 'true' &&
needs.init-workflow.outputs.run-builds-and-tests == 'true'
runs-on: macos-11
strategy:
max-parallel: 3
fail-fast: false
matrix:
include:
# every entry below is forwarded key by key to the Build step as environment variables
# this entry sets HOROVOD_WITH_MPI and HOROVOD_WITHOUT_GLOO
- image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0
HOROVOD_WITH_MPI: 1
HOROVOD_WITHOUT_GLOO: 1
TENSORFLOW: 1.15.0
KERAS: 2.2.4
PYTORCH: 1.6.0
PYTORCH_LIGHTNING: 1.3.8
TORCHVISION: 0.7.0
MXNET: 1.5.1.post0
# this entry sets HOROVOD_WITHOUT_MPI and HOROVOD_WITH_GLOO
- image: test-cpu-gloo-py3_8-tf2_9_2-keras2_9_0-torch1_11_0-mxnet1_7_0_p2
HOROVOD_WITHOUT_MPI: 1
HOROVOD_WITH_GLOO: 1
TENSORFLOW: 2.9.2
KERAS: 2.9.0
PYTORCH: 1.11.0
PYTORCH_LIGHTNING: 1.5.9
TORCHVISION: 0.12.0
MXNET: 1.7.0.post2
# this entry sets HOROVOD_WITH_MPI and HOROVOD_WITH_GLOO
- image: test-openmpi-cpu-gloo-py3_8-tf2_10_0-keras2_10_0-torch1_12_1-mxnet1_9_1
HOROVOD_WITH_MPI: 1
HOROVOD_WITH_GLOO: 1
TENSORFLOW: 2.10.0
KERAS: 2.10.0
PYTORCH: 1.12.1
PYTORCH_LIGHTNING: 1.5.9
TORCHVISION: 0.13.1
MXNET: 1.9.1
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: recursive
# Installs the brew packages, Python 3.7.7 via pyenv and the pinned frameworks via pip, then
# installs Horovod from the checkout and verifies it with `horovodrun --check-build`.
# Matrix keys that an entry does not define expand to an empty string here.
# NOTE(review): Python 3.7.7 is installed for every matrix entry, also for the images named
# py3_8 - confirm this is intended.
- name: Build
id: build
env:
HOROVOD_WITH_MPI: ${{ matrix.HOROVOD_WITH_MPI }}
HOROVOD_WITHOUT_MPI: ${{ matrix.HOROVOD_WITHOUT_MPI }}
HOROVOD_WITH_GLOO: ${{ matrix.HOROVOD_WITH_GLOO }}
HOROVOD_WITHOUT_GLOO: ${{ matrix.HOROVOD_WITHOUT_GLOO }}
TENSORFLOW: ${{ matrix.TENSORFLOW }}
KERAS: ${{ matrix.KERAS }}
PYTORCH: ${{ matrix.PYTORCH }}
PYTORCH_LIGHTNING: ${{ matrix.PYTORCH_LIGHTNING }}
TORCHVISION: ${{ matrix.TORCHVISION }}
MXNET: ${{ matrix.MXNET }}
# The python patch in the pyenv install step is to work around an incompatibility introduced in new xcode version in macOS Big Sur. The patch is provided by python team.
# The original discussion is here https://github.com/pyenv/pyenv/issues/1737
run: |
brew reinstall -f zlib bzip2
brew install -f openmpi cmake libuv pyenv coreutils curl
export PATH=$(pyenv root)/shims:$PATH
pyenv uninstall -f 3.7.7
CFLAGS="-I$(brew --prefix bzip2)/include -I$(brew --prefix zlib)/include" LDFLAGS="-L$(brew --prefix zlib)/lib -L$(brew --prefix bzip2)/lib" pyenv install --patch 3.7.7 < <(curl -sSL https://github.com/python/cpython/commit/8ea6353.patch)
pyenv global 3.7.7
python --version
python -m pip install -U pip
pip install tensorflow==${TENSORFLOW} keras==${KERAS}
if [[ ${TENSORFLOW} == 1.* ]] || [[ ${TENSORFLOW} == 2.[012345].* ]]; then pip install "h5py<3" "protobuf~=3.20"; fi
pip install torch==${PYTORCH} pytorch_lightning==${PYTORCH_LIGHTNING} torchvision==${TORCHVISION}
pip install mxnet==${MXNET}
HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir .[test]
horovodrun --check-build
# The three test attempts below are identical except for the run number in the artifacts path.
# Each attempt generates pytest.sh, which xargs invokes once per test file as
# `pytest.sh macos <test file>` under `horovodrun -np 2`; the JUnit file name is built from
# $1, the rank variable and $2. The artifacts directory is published as step output
# artifacts-path. Attempts 1 and 2 set continue-on-error so that a failure only triggers the
# next attempt.
- name: Test [attempt 1 of 3]
id: test-1
continue-on-error: true
if: always() && steps.build.outcome == 'success' && true
run: |
export PATH=$(pyenv root)/shims:$PATH
pyenv global 3.7.7
python --version
artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-1"
mkdir -p "$artifacts_path"
echo "artifacts-path=$artifacts_path" >> $GITHUB_OUTPUT
echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
chmod u+x pytest.sh
cd test/parallel
ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos
# second attempt, only runs when attempt 1 failed
- name: Test [attempt 2 of 3]
id: test-2
continue-on-error: true
if: always() && steps.build.outcome == 'success' && steps.test-1.outcome == 'failure'
run: |
export PATH=$(pyenv root)/shims:$PATH
pyenv global 3.7.7
python --version
artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-2"
mkdir -p "$artifacts_path"
echo "artifacts-path=$artifacts_path" >> $GITHUB_OUTPUT
echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
chmod u+x pytest.sh
cd test/parallel
ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos
# last attempt, only runs when attempt 2 failed; continue-on-error is false, so a failure here
# is not ignored
- name: Test [attempt 3 of 3]
id: test-3
continue-on-error: false
if: always() && steps.build.outcome == 'success' && steps.test-2.outcome == 'failure'
run: |
export PATH=$(pyenv root)/shims:$PATH
pyenv global 3.7.7
python --version
artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-3"
mkdir -p "$artifacts_path"
echo "artifacts-path=$artifacts_path" >> $GITHUB_OUTPUT
echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
chmod u+x pytest.sh
cd test/parallel
ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos
# uploads the artifacts directories reported by the attempts; the output of an attempt that
# did not run is empty
- name: Upload Test Results
uses: actions/upload-artifact@v3
if: always()
with:
name: Unit Test Results - ${{ matrix.image }}-macos
path: |
${{ steps.test-1.outputs.artifacts-path }}
${{ steps.test-2.outputs.artifacts-path }}
${{ steps.test-3.outputs.artifacts-path }}
# Triggers the horovod/horovod Buildkite pipeline in mode "GPU NON HEADS" and exposes the
# `url` output of the trigger action as job output.
# Runs only in the horovod/horovod repository, and for pull requests only when the head
# repository of the pull request is this repository.
buildkite-trigger:
  name: 'Build and Test GPU (trigger Builtkite)'
  runs-on: ubuntu-latest
  needs:
    - init-workflow
    - build-and-test
  if: >
    github.repository == 'horovod/horovod' &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    ( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
  outputs:
    url: ${{ steps.build.outputs.url }}
  steps:
    - id: build
      name: Trigger Buildkite Pipeline
      uses: buildkite/trigger-pipeline-action@v1.3.1
      env:
        PIPELINE: 'horovod/horovod'
        # the action takes COMMIT from GITHUB_SHA, so it is not set here
        BRANCH: '${{ needs.init-workflow.outputs.buildkite-branch-label }} (GPU NON HEADS)'
        # Buildkite fills an empty MESSAGE from the commit message
        MESSAGE: '${{ needs.init-workflow.outputs.buildkite-message }}'
        BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
        BUILD_ENV_VARS: '{"PIPELINE_MODE": "GPU NON HEADS"}'
# Downloads the artifacts of the Buildkite build started by buildkite-trigger, uploads the
# XML files among them as workflow artifact and fails when the download succeeded but the
# reported build state is not 'passed'.
buildkite:
  name: 'Build and Test GPU (download Builtkite)'
  runs-on: ubuntu-latest
  needs:
    - buildkite-trigger
  steps:
    - id: download
      name: Download Buildkite Artifacts
      uses: EnricoMi/download-buildkite-artifact-action@v1
      with:
        buildkite_build_url: ${{ needs.buildkite-trigger.outputs.url }}
        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
        ignore_build_states: 'blocked,canceled,skipped,not_run'
        ignore_job_states: 'timed_out'
        output_path: 'artifacts/Unit Test Results - GPU NON HEADS on Builtkite'
    # always() uploads whatever has been downloaded, even if the download step failed
    - name: Upload Test Results
      if: always()
      uses: actions/upload-artifact@v3
      with:
        name: 'Unit Test Results - GPU NON HEADS on Builtkite'
        path: 'artifacts/Unit Test Results - GPU NON HEADS on Builtkite/**/*.xml'
    # turns a Buildkite build that did not pass into a failure of this job
    - name: Check Buildkite job state
      if: >
        always() &&
        steps.download.conclusion == 'success' &&
        steps.download.outputs.build-state != 'passed'
      run: |
        echo "::warning::Buildkite pipeline did not pass: ${{ needs.buildkite-trigger.outputs.url }}"
        exit 1
# Triggers the horovod/horovod Buildkite pipeline in mode "GPU HEADS" and exposes the `url`
# output of the trigger action as job output.
# Runs only in the horovod/horovod repository, and for pull requests only when the head
# repository of the pull request is this repository.
buildkite-heads-trigger:
  name: 'Build and Test GPU heads (trigger Builtkite)'
  runs-on: ubuntu-latest
  needs:
    - init-workflow
    - build-and-test
  if: >
    github.repository == 'horovod/horovod' &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    ( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
  outputs:
    url: ${{ steps.build.outputs.url }}
  steps:
    - id: build
      name: Trigger Buildkite Pipeline
      uses: buildkite/trigger-pipeline-action@v1.3.1
      env:
        PIPELINE: 'horovod/horovod'
        # the action takes COMMIT from GITHUB_SHA, so it is not set here
        BRANCH: '${{ needs.init-workflow.outputs.buildkite-branch-label }} (GPU HEADS)'
        # Buildkite fills an empty MESSAGE from the commit message
        MESSAGE: '${{ needs.init-workflow.outputs.buildkite-message }}'
        BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
        BUILD_ENV_VARS: '{"PIPELINE_MODE": "GPU HEADS"}'
# Downloads the artifacts of the Buildkite build started by buildkite-heads-trigger, uploads
# the XML files among them as workflow artifact and fails when the download succeeded but the
# reported build state is not 'passed'.
buildkite-heads:
  name: 'Build and Test GPU heads (download Builtkite)'
  runs-on: ubuntu-latest
  needs:
    - buildkite-heads-trigger
  steps:
    - id: download
      name: Download Buildkite Artifacts
      uses: EnricoMi/download-buildkite-artifact-action@v1
      with:
        buildkite_build_url: ${{ needs.buildkite-heads-trigger.outputs.url }}
        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
        ignore_build_states: 'blocked,canceled,skipped,not_run'
        ignore_job_states: 'timed_out'
        output_path: 'artifacts/Unit Test Results - GPU HEADS on Builtkite'
    # always() uploads whatever has been downloaded, even if the download step failed
    - name: Upload Test Results
      if: always()
      uses: actions/upload-artifact@v3
      with:
        name: 'Unit Test Results - GPU HEADS on Builtkite'
        path: 'artifacts/Unit Test Results - GPU HEADS on Builtkite/**/*.xml'
    # turns a Buildkite build that did not pass into a failure of this job
    - name: Check Buildkite job state
      if: >
        always() &&
        steps.download.conclusion == 'success' &&
        steps.download.outputs.build-state != 'passed'
      run: |
        echo "::warning::Buildkite pipeline did not pass: ${{ needs.buildkite-heads-trigger.outputs.url }}"
        exit 1
# Computes the two switches used by docker-build: `run` (build the images at all) and
# `push` (publish them).
# always() lets the condition below be evaluated even when a needed job was skipped or failed;
# the job then runs only if builds and tests are enabled, build-and-test succeeded and
# buildkite either succeeded or was skipped (e.g. the workflow runs for a fork PR).
docker-config:
  name: Configure docker build
  runs-on: ubuntu-latest
  needs:
    - init-workflow
    - build-and-test
    - buildkite
  if: >
    always() &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    needs.build-and-test.result == 'success' &&
    ( needs.buildkite.result == 'success' || needs.buildkite.result == 'skipped' )
  outputs:
    run: ${{ steps.config.outputs.run }}
    push: ${{ steps.config.outputs.push }}
  steps:
    - id: config
      name: Config
      env:
        # true for every event in the Horovod repo; in any other repo true for all events but schedule
        run: ${{ github.repository == 'horovod/horovod' || github.event_name != 'schedule' }}
        # true only in the Horovod repo and only for schedule and push events
        push: ${{ github.repository == 'horovod/horovod' && contains('schedule,push', github.event_name) }}
      # logs the decision and publishes both values as step outputs
      run: |
        echo Repository: ${{ github.repository }}
        echo Event: ${{ github.event_name }}
        echo Run: $run
        echo "run=$run" >> $GITHUB_OUTPUT
        echo Push: $push
        echo "push=$push" >> $GITHUB_OUTPUT
# Builds each docker image of the matrix from docker/<image>/Dockerfile, runs the MNIST examples
# inside the freshly built image and, when docker-config reports push == 'true', pushes the image.
docker-build:
name: Build docker image ${{ matrix.docker-image }} (push=${{ needs.docker-config.outputs.push }})
needs: docker-config
# always() lets this condition be evaluated even if docker-config did not succeed; its `run`
# output is then not 'true' and the job is skipped
if: always() && needs.docker-config.outputs.run == 'true'
runs-on: ubuntu-latest
# we want an ongoing run of this workflow to be canceled by a later commit
# so that there is only one concurrent run of this workflow for each branch
concurrency:
# github.ref means something like refs/heads/master or refs/tags/v0.22.1 or the branch.
# This helps to not cancel concurrent runs on master and a tag that share the same commit
# head_ref refers to the pull request branch so we run only one workflow for the given pull request.
# On master, head_ref is empty, so we use the SHA of the commit, this means
# commits to master will not be cancelled, which is important to ensure
# that every commit to master is full tested and deployed.
group: docker-${{ matrix.docker-image }}-${{ github.ref }}-${{ github.head_ref || github.sha }}
cancel-in-progress: true
strategy:
fail-fast: false
matrix:
docker-image:
- horovod
- horovod-cpu
- horovod-nvtabular
- horovod-ray
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: 'recursive'
# computes the tags and labels that the "Push image" step applies
- name: Docker meta
id: meta
uses: crazy-max/ghaction-docker-meta@v2
with:
# list of Docker images to use as base name for tags
images: |
horovod/${{ matrix.docker-image }}
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
type=sha
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
driver: docker
# credentials are only needed (and only used) when the image is going to be pushed
- name: Login to DockerHub
if: needs.docker-config.outputs.push == 'true'
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Clean up disk space
# deleting these paths frees 38 GB disk space:
# sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
# but this sometimes takes 3-4 minutes
# so we delete only some sub-paths which are known to be quick (10s) and 20 GB
run: |
echo ::group::Disk space before clean up
df -h
echo ::endgroup::
for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
/usr/share/dotnet/shared \
/usr/local/lib/android/sdk/ndk \
/usr/local/lib/android/sdk/build-tools \
/opt/ghc
do
echo ::group::Deleting "$dir"
sudo du -hsc $dir | tail -n1 || true
sudo rm -rf $dir
echo ::endgroup::
done
echo ::group::Disk space after clean up
df -h
echo ::endgroup::
# builds the image and loads it into the local docker daemon as horovod-test; nothing is
# pushed by this step (push: false)
- name: Build image
id: build
uses: docker/build-push-action@v3
timeout-minutes: 60
with:
context: .
file: ./docker/${{ matrix.docker-image }}/Dockerfile
pull: true
push: false
load: true
tags: horovod-test
outputs: type=docker
- name: List image
run: |
docker image ls horovod-test
# creates the container horovod-test from the image and feeds its bash, via stdin, the sed
# commands taken from the "RUN sed" lines of Dockerfile.test.cpu
- name: Prepare container for test
run: |
grep "RUN sed" Dockerfile.test.cpu | sed "s/^RUN //" | docker run -i --name horovod-test horovod-test:latest /bin/bash
# each test step below restarts the prepared container and passes the example command to it on
# stdin; the steps run whenever the build succeeded, independent of the other test steps
- name: Test image (pytorch gloo)
if: always() && steps.build.outcome == 'success'
run: |
docker start -ai horovod-test <<<"python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo"
- name: Test image (tensorflow2 gloo)
if: always() && steps.build.outcome == 'success'
run: |
docker start -ai horovod-test <<<"python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo"
# the mpi variants are not run for the horovod-ray image
- name: Test image (pytorch mpi)
if: always() && steps.build.outcome == 'success' && matrix.docker-image != 'horovod-ray'
run: |
docker start -ai horovod-test <<<"python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
- name: Test image (tensorflow2 mpi)
if: always() && steps.build.outcome == 'success' && matrix.docker-image != 'horovod-ray'
run: |
docker start -ai horovod-test <<<"python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# runs build-push-action again, now with the tags and labels of the meta step and with push
# taken from docker-config; the condition has no always(), so this step is skipped when an
# earlier step failed
- name: Push image
if: needs.docker-config.outputs.push == 'true'
uses: docker/build-push-action@v3
timeout-minutes: 60
with:
context: .
file: ./docker/${{ matrix.docker-image }}/Dockerfile
push: ${{ needs.docker-config.outputs.push }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
# diagnostics only: disk and docker usage at the end of the job, also after failures
- name: Show free space
if: always()
run: |
echo ::group::Disk Space
df -h
echo ::endgroup::
echo ::group::Docker Space
docker system df
echo ::endgroup::
echo ::group::Docker Images
docker images -a
echo ::endgroup::
echo ::group::Docker Container
docker container list -a
echo ::endgroup::
# Checks that pairs of files which are maintained as near-copies of each other stay in sync:
# for each matrix entry, left_file patched with <right_file>.patch must be identical to
# right_file. An optional `init` command is run before the comparison.
sync-files:
  name: "Sync Files (${{ matrix.name }})"
  needs: [init-workflow]
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      include:
        - name: Docs Summary
          left_file: README.rst
          right_file: docs/summary.rst
          # strips every "docs/" from README.rst before the comparison
          init: sed -i -e s/docs\///g README.rst
        - name: Examples Keras Spark3
          left_file: examples/spark/keras/keras_spark_rossmann_run.py
          right_file: examples/spark/keras/keras_spark3_rossmann.py
  steps:
    - name: Checkout
      uses: actions/checkout@v3
    - name: Diffing ${{ matrix.left_file }} with ${{ matrix.right_file }}
      env:
        LEFT: ${{ matrix.left_file }}
        RIGHT: ${{ matrix.right_file }}
        INIT: ${{ matrix.init }}
      run: |
        # INIT is deliberately left unquoted: it has to be split into command and arguments,
        # and an empty INIT (entry without init) expands to nothing
        $INIT
        # file names are quoted so that paths with spaces or glob characters stay one argument
        patch --quiet -p0 "$LEFT" "${RIGHT}.patch" -o "${LEFT}.expected"
        if ! diff -q "${LEFT}.expected" --label "$LEFT" "$RIGHT"
        then
          echo
          echo "::error::Files are out-of-sync: $LEFT vs. $RIGHT"
          echo "Unexpected differences are:"
          diff "${LEFT}.expected" --label "$LEFT" "$RIGHT" || true
          echo
          echo "Use the following as ${RIGHT}.patch to accept those changes:"
          diff "$LEFT" "$RIGHT" || true
          false
        fi