Skip to content

Commit

Permalink
Added L0 diagnostics, corrected scaling test, added temperature test,…
Browse files Browse the repository at this point in the history
… halfduplex measures host-device bandwidth
  • Loading branch information
tbennun committed Feb 7, 2016
1 parent 8fe9420 commit 517fc25
Show file tree
Hide file tree
Showing 8 changed files with 382 additions and 72 deletions.
9 changes: 6 additions & 3 deletions CMakeLists.txt
Expand Up @@ -61,9 +61,12 @@ endif()

#########################################

# Level 1 (microbenchmarks)
# Level 0 (diagnostics)

cuda_add_executable(numgpus src/L0/numgpus.cpp)
cuda_add_executable(devinfo src/L0/devinfo.cpp)

cuda_add_executable(numgpus src/L1/numgpus.cpp)
# Level 1 (microbenchmarks)

cuda_add_executable(halfduplex src/L1/halfduplex.cpp)
target_link_libraries(halfduplex gflags-static ${EXTRA_LIBS})
Expand All @@ -75,7 +78,7 @@ cuda_add_executable(uva src/L1/uva.cu)
target_link_libraries(uva gflags-static ${EXTRA_LIBS})


# Level 2
# Level 2 (micro-applications)

cuda_add_executable(gol src/L2/gol/golsample.cu src/L2/gol/main.cpp)
target_link_libraries(gol gflags-static ${EXTRA_LIBS})
Expand Down
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -5,7 +5,7 @@ MGBench: Multi-GPU Computing Benchmark Suite

This set of applications test the performance, bus speed, power efficiency and correctness of a multi-GPU node.

It is comprised of Level-1 tests (microbenchmarks) and Level-2 tests (micro-applications).
It is comprised of Level-0 tests (diagnostic utilities), Level-1 tests (microbenchmarks), and Level-2 tests (micro-applications).

Requirements
------------
Expand Down
24 changes: 14 additions & 10 deletions TESTS.md
@@ -1,13 +1,21 @@
List of Tests
=============

Level-0
-------

* `numgpus`: Returns the number of available GPUs.

* `devinfo`: Returns information regarding each device and the DMA access between them.


Level-1
-------

* Half-duplex: Tests inter-GPU uni-directional bandwidth by copying data from each GPU to the rest of the GPUs.
* Half-duplex: Tests uni-directional bandwidth by copying data from each GPU to the host/other GPUs.
* Special flags:
* `--from`: Specify only one GPU to copy from (or -1 for all GPUs)
* `--to`: Specify a single target GPU to copy to (or -1 for all GPUs)
* `--from`: Specify only one GPU to copy from (0 for host, 1...N for a specific GPU or -1 for all GPUs)
* `--to`: Specify a single target GPU to copy to (0 for host, 1...N for a specific GPU or -1 for all GPUs)

* Full-duplex: Tests inter-GPU bi-directional bandwidth by exchanging data between GPUs.
* Special flags:
Expand All @@ -31,6 +39,9 @@ Level-2
* Modes:
* Correctness: Runs SGEMM with CPU regression for a small amount of iterations to verify the results.
* Performance: Runs SGEMM without regression, averaging multiplication time over a large amount of repetitions to obtain accurate performance. Scaling should be near-linear.
* Cooling: Runs SGEMM for a specified amount of seconds in order to heat the GPUs.
* Special flags:
* `--heat`: Specifies the number of seconds (instead of repetitions) to run SGEMM kernels consecutively, thereby heating the GPU(s).

* Game of Life (gol): Simple stencil operator that tests multi-GPU correctness as well as inter-GPU communications.
* Modes:
Expand All @@ -40,13 +51,6 @@ Level-2
* Special flags:
* `--save_images`: Saves the two images in case the regression test failed (default: true).


Miscellaneous
-------------

* `numgpus`: A simple application that prints the number of available GPUs


Test Flags
----------

Expand Down
98 changes: 92 additions & 6 deletions run.sh
Expand Up @@ -35,7 +35,53 @@ then
exit 0
fi

#######################################
# Find nvidia-smi for temperature tests
TEMPTEST=0
NVSMI=`which nvidia-smi`
if ! [ -x "$NVSMI" ]
then
NVSMI=`find /usr/local -name 'nvidia-smi' 2> /dev/null`
if ! [ -x "$NVSMI" ]
then
NVSMI=`find -L /etc -name 'nvidia-smi' 2> /dev/null`
if ! [ -x "$NVSMI" ]
then
echo "WARNING: nvidia-smi not found"
else
TEMPTEST=1
fi
else
TEMPTEST=1
fi
else
TEMPTEST=1
fi

if [ $TEMPTEST -eq 1 ]
then
echo "Found nvidia-smi at ${NVSMI}"
fi
#######################################


# Run L0 diagnostics
echo ""
echo "L0 diagnostics"
echo "--------------"

echo "1/2 Computer information"
echo "CPU Info:" > l0-info.log
cat /proc/cpuinfo >> l0-info.log
echo "Memory Info:" >> l0-info.log
cat /proc/meminfo >> l0-info.log

echo "2/2 Device information"
./build/devinfo > l0-devices.log


# Run L1 tests
echo ""
echo "L1 Tests"
echo "--------"

Expand All @@ -58,27 +104,27 @@ echo "6/7 Full-duplex DMA Write"
./build/uva --write --fullduplex > l1-uvawfull.log

echo "7/7 Scaling"
./build/sgemm -n 4096 -k 4096 -m 4096 --repetitions=100 --regression=false --scaling=true > l1-scaling.log
./build/sgemm -n 4096 -k 4096 -m 4096 --repetitions=100 --regression=false --scaling > l1-scaling.log

# Run L2 tests
echo ""
echo "L2 Tests"
echo "--------"

# Matrix multiplication
echo "1/5 Matrix multiplication (correctness)"
echo "1/6 Matrix multiplication (correctness)"
./build/sgemm -n 1024 -k 1024 -m 1024 --repetitions=100 --regression=true > l2-sgemm-correctness.log
echo "2/5 Matrix multiplication (performance)"
echo "2/6 Matrix multiplication (performance)"
./build/sgemm -n 8192 -k 8192 -m 8192 --repetitions=100 --regression=false > l2-sgemm-perf.log

# Stencil operator
echo "3/5 Stencil (correctness)"
echo "3/6 Stencil (correctness)"
./build/gol --repetitions=5 --regression=true > l2-gol-correctness.log
echo "4/5 Stencil (performance)"
echo "4/6 Stencil (performance)"
./build/gol --repetitions=1000 --regression=false > l2-gol-perf.log

# Test each GPU separately
echo "5/5 Stencil (single GPU correctness)"
echo "5/6 Stencil (single GPU correctness)"
echo "" > l2-gol-single.log
i=0
while [ $i -lt $NUMGPUS ]
Expand All @@ -89,4 +135,44 @@ do
i=`expr $i + 1`
done


# Temperature test
if [ $TEMPTEST -eq 1 ]
then
echo "6/6 Cooling"

# Measure initial temperature
echo "Initial temp: " > l2-cooling.log
$NVSMI -q -d TEMPERATURE | grep Current | awk '{print $(NF-1)}' | tr '\n' ' ' >> l2-cooling.log
echo "" >> l2-cooling.log

# Wait 5 minutes, measure again
sleep 300
echo "Temp after 5min: " >> l2-cooling.log
$NVSMI -q -d TEMPERATURE | grep Current | awk '{print $(NF-1)}' | tr '\n' ' ' >> l2-cooling.log
echo "" >> l2-cooling.log

# Heat, measure temperature right after application
./build/sgemm --heat=60 --regression=false --startwith=$NUMGPUS >> l2-cooling.log
echo "Temp after heat: " >> l2-cooling.log
$NVSMI -q -d TEMPERATURE | grep Current | awk '{print $(NF-1)}' | tr '\n' ' ' >> l2-cooling.log
echo "" >> l2-cooling.log

# Cool for 1 minute, measure again
sleep 60
echo "Temp after 1min: " >> l2-cooling.log
$NVSMI -q -d TEMPERATURE | grep Current | awk '{print $(NF-1)}' | tr '\n' ' ' >> l2-cooling.log
echo "" >> l2-cooling.log

# Cool for 4 more minutes, measure again
sleep 240
echo "Temp after 5min: " >> l2-cooling.log
$NVSMI -q -d TEMPERATURE | grep Current | awk '{print $(NF-1)}' | tr '\n' ' ' >> l2-cooling.log
echo "" >> l2-cooling.log

else
echo "6/6 Cooling -- SKIPPED"
echo "SKIPPED" > l2-cooling.log
fi

echo "Done!"
142 changes: 142 additions & 0 deletions src/L0/devinfo.cpp
@@ -0,0 +1,142 @@
// MGBench: Multi-GPU Computing Benchmark Suite
// Copyright (c) 2016, Tal Ben-Nun
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the names of the copyright holders nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

// Reports a failed CUDA runtime call.
//
// Prints the source location, the runtime's textual description of `err`
// and its numeric code to standard output, then reads the runtime's
// last-error state via cudaGetLastError() (which also clears it).
//
//   file - source file name of the failing call site
//   line - source line number of the failing call site
//   err  - status code returned by the failing CUDA call
static void HandleError(const char *file, int line, cudaError_t err)
{
    const char *description = cudaGetErrorString(err);
    const int code = static_cast<int>(err);

    printf("ERROR in %s:%d: %s (%d)\n", file, line, description, code);

    // Result intentionally discarded: the call is made only to reset the
    // runtime's recorded error.
    cudaGetLastError();
}

// CUDA assertions
//
// Evaluates `err` exactly once. If the resulting status is anything other
// than cudaSuccess, the failure is reported through HandleError together with
// the call site's file and line, and the process exits with status 1.
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define CUDA_CHECK(err)                                              \
    do {                                                             \
        cudaError_t cudaCheckStatus_ = (err);                        \
        if (cudaSuccess != cudaCheckStatus_) {                       \
            HandleError(__FILE__, __LINE__, cudaCheckStatus_);       \
            exit(1);                                                 \
        }                                                            \
    } while(0)

// Device capability helper macros
//
// Both macros expand to an expression that reads a variable named `props`
// which must be in scope at the point of use.
//
// NCAP(field, label): yields the string literal "label " (with a trailing
//                     space) when props.field is non-zero, otherwise "".
// CAP(field):         shorthand for NCAP where the printed label is the
//                     member name itself.
#define NCAP(field, label) ((props.field) ? (#label " ") : "")
#define CAP(field) NCAP(field, field)

int main(int argc, char **argv)
{
int ndevs = 0;
if (cudaGetDeviceCount(&ndevs) != cudaSuccess)
return 1;

int version = 0;
CUDA_CHECK(cudaDriverGetVersion(&version));
std::cout << "Driver version: " << (version / 1000) << "."
<< ((version % 100) / 10) << std::endl;

version = 0;
CUDA_CHECK(cudaRuntimeGetVersion(&version));
std::cout << "Runtime version: " << (version / 1000) << "."
<< ((version % 100) / 10) << std::endl;
std::cout << std::endl;

// Print information for each GPU
for (int i = 0; i < ndevs; ++i)
{
CUDA_CHECK(cudaSetDevice(i));
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, i));

std::cout << "GPU " << (i + 1) << ": " << props.name << " ("
<< props.pciDomainID << "/" << props.pciBusID
<< "/" << props.pciDeviceID << ")" << std::endl


<< "Global memory: " << (props.totalGlobalMem/1024.0/1024.0)
<< " MB" << std::endl
<< "Constant memory: " << props.totalConstMem
<< " bytes" << std::endl
<< "Shared memory: " << props.sharedMemPerBlock
<< " bytes" << std::endl
<< "Registers: " << props.regsPerBlock << std::endl
<< "Warp size: " << props.warpSize << std::endl
<< "Multiprocessors: " << props.multiProcessorCount
<< std::endl
<< "Copy engines: " << props.asyncEngineCount << std::endl
<< "Clock rate: " << (props.clockRate / 1e6)
<< " GHz" << std::endl
<< "Threads per MP: " << props.maxThreadsPerMultiProcessor
<< std::endl
<< "Threads per block: " << props.maxThreadsPerBlock
<< std::endl
<< "Max block size: " << props.maxThreadsDim[0] << "x"
<< props.maxThreadsDim[1] << "x" << props.maxThreadsDim[2]
<< std::endl
<< "Max grid size: " << props.maxGridSize[0] << "x"
<< props.maxGridSize[1] << "x" << props.maxGridSize[2]
<< std::endl
<< "Pitch: " << props.memPitch << " bytes" << std::endl;

std::cout << "Caps: " << NCAP(ECCEnabled, ecc)
<< NCAP(deviceOverlap, overlap)
<< NCAP(unifiedAddressing, uva)
<< NCAP(kernelExecTimeoutEnabled, timeout)
<< CAP(integrated) << NCAP(canMapHostMemory, hostdma)
<< CAP(surfaceAlignment) << CAP(tccDriver) << std::endl;
std::cout << std::endl;
}

std::cout << "DMA access: " << std::endl;
int tmp = 0;

// Print top row
printf(" | ");
for (int i = 0; i < ndevs; ++i)
printf("%2d ", i + 1);
printf("\n---+");
for (int i = 0; i < ndevs; ++i)
printf("---");
printf("\n");

for (int i = 0; i < ndevs; ++i)
{
printf("%2d | ", i + 1);
for (int j = 0; j < ndevs; ++j)
{
if (i == j)
{
printf(" x ");
continue;
}

cudaDeviceCanAccessPeer(&tmp, i, j);
printf("%2d ", tmp ? 1 : 0);
}
printf("\n");
}

return 0;
}
File renamed without changes.

0 comments on commit 517fc25

Please sign in to comment.