Added L0 diagnostics, corrected scaling test, added temperature test,…

… halfduplex measures host-device bandwidth
tbennun · Feb 7, 2016 · 517fc25 · 517fc25
1 parent 8fe9420
commit 517fc25
Show file tree

Hide file tree

Showing 8 changed files with 382 additions and 72 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -61,9 +61,12 @@ endif()
 
 #########################################
 
-# Level 1 (microbenchmarks)
+# Level 0 (diagnostics)
+
+cuda_add_executable(numgpus src/L0/numgpus.cpp)
+cuda_add_executable(devinfo src/L0/devinfo.cpp)
 
-cuda_add_executable(numgpus src/L1/numgpus.cpp)
+# Level 1 (microbenchmarks)
 
 cuda_add_executable(halfduplex src/L1/halfduplex.cpp)
 target_link_libraries(halfduplex gflags-static ${EXTRA_LIBS})
@@ -75,7 +78,7 @@ cuda_add_executable(uva src/L1/uva.cu)
 target_link_libraries(uva gflags-static ${EXTRA_LIBS})
 
 
-# Level 2
+# Level 2 (micro-applications)
 
 cuda_add_executable(gol src/L2/gol/golsample.cu src/L2/gol/main.cpp)
 target_link_libraries(gol gflags-static ${EXTRA_LIBS})

diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ MGBench: Multi-GPU Computing Benchmark Suite
 
 This set of applications test the performance, bus speed, power efficiency and correctness of a multi-GPU node.
 
-It is comprised of Level-1 tests (microbenchmarks) and Level-2 tests (micro-applications).
+It is comprised of Level-0 tests (diagnostic utilities), Level-1 tests (microbenchmarks), and Level-2 tests (micro-applications).
 
 Requirements
 ------------

diff --git a/TESTS.md b/TESTS.md
@@ -1,13 +1,21 @@
 List of Tests
 =============
 
+Level-0
+-------
+
+* `numgpus`: Returns the number of available GPUs.
+
+* `devinfo`: Returns information regarding each device and the DMA access between them.
+
+
 Level-1
 -------
 
-* Half-duplex: Tests inter-GPU uni-directional bandwidth by copying data from each GPU to the rest of the GPUs.
+* Half-duplex: Tests uni-directional bandwidth by copying data from each GPU to the host/other GPUs.
   * Special flags:
-    * `--from`: Specify only one GPU to copy from (or -1 for all GPUs)
-    * `--to`: Specify a single target GPU to copy to (or -1 for all GPUs)
+    * `--from`: Specify only one GPU to copy from (0 for host, 1...N for a specific GPU or -1 for all GPUs)
+    * `--to`: Specify a single target GPU to copy to (0 for host, 1...N for a specific GPU or -1 for all GPUs)
 
 * Full-duplex: Tests inter-GPU bi-directional bandwidth by exchanging data between GPUs.
   * Special flags:
@@ -31,6 +39,9 @@ Level-2
   * Modes:
     * Correctness: Runs SGEMM with CPU regression for a small amount of iterations to verify the results.
     * Performance: Runs SGEMM without regression, averaging multiplication time over a large amount of repetitions to obtain accurate performance. Scaling should be near-linear.
+    * Cooling: Runs SGEMM for a specified amount of seconds in order to heat the GPUs.
+  * Special flags:
+    * `--heat`: Specifies the number of seconds (instead of repetitions) to run SGEMM kernels consecutively, thereby heating the GPU(s).
 
 * Game of Life (gol): Simple stencil operator that tests multi-GPU correctness as well as inter-GPU communications.
   * Modes:
@@ -40,13 +51,6 @@ Level-2
   * Special flags:
     * `--save_images`: Saves the two images in case the regression test failed (default: true).
 
-
-Miscellaneous
--------------
-
-* `numgpus`: A simple application that prints the number of available GPUs
-
-
 Test Flags
 ----------
 

diff --git a/run.sh b/run.sh
@@ -35,7 +35,53 @@ then
     exit 0
 fi
 
+#######################################
+# Find nvidia-smi for temperature tests
+#
+# Search order: the PATH (via which), then /usr/local, then /etc (following
+# symbolic links). TEMPTEST is set to 1 only if an executable was found, and
+# NVSMI then holds its path.
+TEMPTEST=0
+NVSMI=`which nvidia-smi`
+if ! [ -x "$NVSMI" ]
+then
+    # find may print several matches (one per line); keep only the first so
+    # that the -x test below receives a single path
+    NVSMI=`find /usr/local -name 'nvidia-smi' 2> /dev/null | head -n 1`
+fi
+if ! [ -x "$NVSMI" ]
+then
+    NVSMI=`find -L /etc -name 'nvidia-smi' 2> /dev/null | head -n 1`
+fi
+
+if [ -x "$NVSMI" ]
+then
+    TEMPTEST=1
+    echo "Found nvidia-smi at ${NVSMI}"
+else
+    echo "WARNING: nvidia-smi not found"
+fi
+#######################################
+
+
+# Run L0 diagnostics
+echo ""
+echo "L0 diagnostics"
+echo "--------------"
+
+# Host description: CPU and memory details as reported under /proc,
+# written to l0-info.log (the file is overwritten on every run)
+echo "1/2 Computer information"
+{
+    echo "CPU Info:"
+    cat /proc/cpuinfo
+    echo "Memory Info:"
+    cat /proc/meminfo
+} > l0-info.log
+
+# GPU description: output of the devinfo utility, written to l0-devices.log
+echo "2/2 Device information"
+./build/devinfo > l0-devices.log
+
+
 # Run L1 tests
+echo ""
 echo "L1 Tests"
 echo "--------"
 
@@ -58,27 +104,27 @@ echo "6/7 Full-duplex DMA Write"
 ./build/uva --write --fullduplex > l1-uvawfull.log
 
 echo "7/7 Scaling"
-./build/sgemm -n 4096 -k 4096 -m 4096 --repetitions=100 --regression=false --scaling=true > l1-scaling.log
+./build/sgemm -n 4096 -k 4096 -m 4096 --repetitions=100 --regression=false --scaling > l1-scaling.log
 
 # Run L2 tests
 echo ""
 echo "L2 Tests"
 echo "--------"
 
 # Matrix multiplication
-echo "1/5 Matrix multiplication (correctness)"
+echo "1/6 Matrix multiplication (correctness)"
 ./build/sgemm -n 1024 -k 1024 -m 1024 --repetitions=100 --regression=true > l2-sgemm-correctness.log
-echo "2/5 Matrix multiplication (performance)"
+echo "2/6 Matrix multiplication (performance)"
 ./build/sgemm -n 8192 -k 8192 -m 8192 --repetitions=100 --regression=false > l2-sgemm-perf.log
 
 # Stencil operator
-echo "3/5 Stencil (correctness)"
+echo "3/6 Stencil (correctness)"
 ./build/gol --repetitions=5 --regression=true > l2-gol-correctness.log
-echo "4/5 Stencil (performance)"
+echo "4/6 Stencil (performance)"
 ./build/gol --repetitions=1000 --regression=false > l2-gol-perf.log
 
 # Test each GPU separately
-echo "5/5 Stencil (single GPU correctness)"
+echo "5/6 Stencil (single GPU correctness)"
 echo "" > l2-gol-single.log
 i=0
 while [ $i -lt $NUMGPUS ]
@@ -89,4 +135,44 @@ do
     i=`expr $i + 1`
 done
 
+
+# Temperature test
+
+# Appends one measurement to l2-cooling.log: the label given as $1 on its own
+# line, followed by one line holding the second-to-last field of every
+# nvidia-smi temperature line that contains "Current", separated by spaces.
+# NVSMI is quoted so that a path containing whitespace still works.
+log_gpu_temps()
+{
+    echo "$1" >> l2-cooling.log
+    "$NVSMI" -q -d TEMPERATURE | grep Current | awk '{print $(NF-1)}' | tr '\n' ' ' >> l2-cooling.log
+    echo "" >> l2-cooling.log
+}
+
+if [ $TEMPTEST -eq 1 ]
+then
+    echo "6/6 Cooling"
+
+    # Start from an empty log, then measure the initial temperature
+    : > l2-cooling.log
+    log_gpu_temps "Initial temp: "
+
+    # Wait 5 minutes, measure again
+    sleep 300
+    log_gpu_temps "Temp after 5min: "
+
+    # Heat for 60 seconds, measure temperature right after application
+    ./build/sgemm --heat=60 --regression=false --startwith=$NUMGPUS >> l2-cooling.log
+    log_gpu_temps "Temp after heat: "
+
+    # Cool for 1 minute, measure again
+    sleep 60
+    log_gpu_temps "Temp after 1min: "
+
+    # Cool for 4 more minutes (5 minutes in total since heating), measure again
+    sleep 240
+    log_gpu_temps "Temp after 5min: "
+
+else
+    echo "6/6 Cooling -- SKIPPED"
+    echo "SKIPPED" > l2-cooling.log
+fi
+
 echo "Done!"
diff --git a/src/L0/devinfo.cpp b/src/L0/devinfo.cpp
@@ -0,0 +1,142 @@
+// MGBench: Multi-GPU Computing Benchmark Suite
+// Copyright (c) 2016, Tal Ben-Nun
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the names of the copyright holders nor the names of its 
+//   contributors may be used to endorse or promote products derived from this
+//   software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include <cuda_runtime.h>
+
+// Prints a description of a failed CUDA runtime call to standard output.
+//   file, line: source location of the failing call
+//   err:        error code returned by the call
+static void HandleError(const char *file, int line, cudaError_t err)
+{
+    const char *description = cudaGetErrorString(err);
+    printf("ERROR in %s:%d: %s (%d)\n", file, line, description, err);
+
+    // Return value intentionally unused: cudaGetLastError() reads and clears
+    // the runtime's last-error state
+    cudaGetLastError();
+}
+
+// CUDA assertions: evaluates the given runtime call exactly once; if it did
+// not return cudaSuccess, reports the call site through HandleError and
+// terminates the process with exit code 1
+#define CUDA_CHECK(err)                                \
+    do {                                               \
+        cudaError_t errr = (err);                      \
+        if(errr != cudaSuccess) {                      \
+            HandleError(__FILE__, __LINE__, errr);     \
+            exit(1);                                   \
+        }                                              \
+    } while(0)
+
+// Device capability helper macros. Both expect a variable named `props` to be
+// in scope at the point of use.
+//   NCAP(cap, name): yields "name " if props.cap is non-zero, "" otherwise
+//   CAP(cap):        same as NCAP, using the field name itself as the label
+#define CAP(cap) NCAP(cap, cap)
+#define NCAP(cap, name) ((props.cap) ? (#name " ") : "")
+
+// Prints the CUDA driver and runtime versions, a property summary for every
+// visible GPU, and a table of peer (DMA) accessibility between GPUs.
+// GPUs are numbered starting from 1 in the output.
+// Returns 0 on success; any failing CUDA call is reported and terminates the
+// process with exit code 1 (via CUDA_CHECK).
+int main(int argc, char **argv)
+{
+    // Checked like every other runtime call, so that a failure is reported
+    // instead of exiting silently (the exit code stays 1)
+    int ndevs = 0;
+    CUDA_CHECK(cudaGetDeviceCount(&ndevs));
+
+    // Versions are encoded as 1000 * major + 10 * minor, hence the minor
+    // component is (version % 1000) / 10
+    int version = 0;
+    CUDA_CHECK(cudaDriverGetVersion(&version));
+    std::cout << "Driver version: " << (version / 1000) << "."
+              << ((version % 1000) / 10) << std::endl;
+
+    version = 0;
+    CUDA_CHECK(cudaRuntimeGetVersion(&version));
+    std::cout << "Runtime version: " << (version / 1000) << "."
+              << ((version % 1000) / 10) << std::endl;
+    std::cout << std::endl;
+
+    // Print information for each GPU
+    for (int i = 0; i < ndevs; ++i)
+    {
+        CUDA_CHECK(cudaSetDevice(i));
+        cudaDeviceProp props;
+        CUDA_CHECK(cudaGetDeviceProperties(&props, i));
+
+        // Name and PCI location (domain/bus/device), followed by the
+        // memory, execution and launch limits of the device
+        std::cout << "GPU " << (i + 1) << ": " << props.name << " ("
+                  << props.pciDomainID << "/" << props.pciBusID
+                  << "/" << props.pciDeviceID << ")" << std::endl
+                  << "Global memory: " << (props.totalGlobalMem/1024.0/1024.0)
+                  << " MB" << std::endl
+                  << "Constant memory: " << props.totalConstMem
+                  << " bytes" << std::endl
+                  << "Shared memory: " << props.sharedMemPerBlock
+                  << " bytes" << std::endl
+                  << "Registers: " << props.regsPerBlock << std::endl
+                  << "Warp size: " << props.warpSize << std::endl
+                  << "Multiprocessors: " << props.multiProcessorCount
+                  << std::endl
+                  << "Copy engines: " << props.asyncEngineCount << std::endl
+                  << "Clock rate: " << (props.clockRate / 1e6)
+                  << " GHz" << std::endl
+                  << "Threads per MP: " << props.maxThreadsPerMultiProcessor
+                  << std::endl
+                  << "Threads per block: " << props.maxThreadsPerBlock
+                  << std::endl
+                  << "Max block size: " << props.maxThreadsDim[0] << "x"
+                  << props.maxThreadsDim[1] << "x" << props.maxThreadsDim[2]
+                  << std::endl
+                  << "Max grid size: " << props.maxGridSize[0] << "x"
+                  << props.maxGridSize[1] << "x" << props.maxGridSize[2]
+                  << std::endl
+                  << "Pitch: " << props.memPitch << " bytes" << std::endl;
+
+        // One label per property field that is non-zero (see CAP / NCAP)
+        std::cout << "Caps: " << NCAP(ECCEnabled, ecc)
+                  << NCAP(deviceOverlap, overlap)
+                  << NCAP(unifiedAddressing, uva)
+                  << NCAP(kernelExecTimeoutEnabled, timeout)
+                  << CAP(integrated) << NCAP(canMapHostMemory, hostdma)
+                  << CAP(surfaceAlignment) << CAP(tccDriver) << std::endl;
+        std::cout << std::endl;
+    }
+
+    std::cout << "DMA access: " << std::endl;
+
+    // Print top row
+    printf("   | ");
+    for (int i = 0; i < ndevs; ++i)
+        printf("%2d ", i + 1);
+    printf("\n---+");
+    for (int i = 0; i < ndevs; ++i)
+        printf("---");
+    printf("\n");
+
+    // Row i, column j: 1 if cudaDeviceCanAccessPeer reports that device i can
+    // access device j, 0 otherwise; the diagonal is marked with "x"
+    for (int i = 0; i < ndevs; ++i)
+    {
+        printf("%2d | ", i + 1);
+        for (int j = 0; j < ndevs; ++j)
+        {
+            if (i == j)
+            {
+                printf(" x ");
+                continue;
+            }
+
+            // The query is checked so that a failed call can never leave a
+            // stale value from a previous pair in the table
+            int canAccess = 0;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccess, i, j));
+            printf("%2d ", canAccess ? 1 : 0);
+        }
+        printf("\n");
+    }
+
+    return 0;
+}
diff --git a/src/L1/numgpus.cpp → src/L0/numgpus.cpp b/src/L1/numgpus.cpp → src/L0/numgpus.cpp