Merge pull request tensorflow#28 from ROCmSoftwarePlatform/deven_switch_to_eigen_fork

Switch to using the ROCm fork for eigen.
Cerebras · Jun 21, 2018 · 10c273b · 10c273b
2 parents 844747c + b27878e
commit 10c273b
Show file tree

Hide file tree

Showing 11 changed files with 31 additions and 12,666 deletions.
diff --git a/build b/build
@@ -11,4 +11,4 @@
 pip uninstall -y tensorflow || true
 bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures &&
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg &&
-pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-cp27-cp27mu-linux_x86_64.whl
+pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-cp27-cp27mu-linux_x86_64.whl
diff --git a/build_python3 b/build_python3
@@ -11,4 +11,4 @@
 pip3 uninstall -y tensorflow || true
 bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures &&
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg &&
-pip3 install /tmp/tensorflow_pkg/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
+pip3 install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -103,7 +103,7 @@ limitations under the License.
   }
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
@@ -135,7 +135,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
 

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
@@ -94,7 +94,7 @@ typedef unsigned __int64 uint64_t;
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
@@ -109,7 +109,7 @@ typedef unsigned __int64 uint64_t;
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
@@ -128,7 +128,7 @@ typedef unsigned __int64 uint64_t;
 
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"

diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -86,14 +86,14 @@ namespace tensorflow {
 
 typedef cudaStream_t gpuStream_t;
 typedef cudaDeviceProp gpuDeviceProp_t;
-#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kCudaScratchSize)
+#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
 using se::cuda::ScopedActivateExecutorContext;
 
 #elif TENSORFLOW_USE_ROCM
 
 typedef hipStream_t gpuStream_t;
 typedef hipDeviceProp_t gpuDeviceProp_t;
-#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kHipScratchSize)
+#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
 using se::rocm::ScopedActivateExecutorContext;
 
 #endif

diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
@@ -61,17 +61,10 @@ template <typename T>
 struct CheckNumericsLaunch {
   void Run(const GPUDevice &d, const T *data, int size,
            int abnormal_detected[2]) {
-#if GOOGLE_CUDA
-    const int32 block_size = d.maxCudaThreadsPerBlock();
+    const int32 block_size = d.maxGpuThreadsPerBlock();
     const int32 num_blocks =
-        (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+        (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
         block_size;
-#elif TENSORFLOW_USE_ROCM
-    const int32 block_size = d.maxHipThreadsPerBlock();
-    const int32 num_blocks =
-        (d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor()) /
-        block_size;
-#endif
 
     GPU_LAUNCH_KERNEL(CheckNumericsKernel<T>,
         dim3(num_blocks), dim3(block_size), 0, d.stream(),

diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -711,11 +711,7 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& device,
   const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
                                       kKnownDepthMultiplier < 0
                                   ? std::numeric_limits<int>::max()
-#if GOOGLE_CUDA
-                                  : device.getNumCudaMultiProcessors();
-#elif TENSORFLOW_USE_ROCM
-                                  : device.getNumHipMultiProcessors();
-#endif
+                                  : device.getNumGpuMultiProcessors();
   GPU_LAUNCH_KERNEL(kernel,
            dim3(std::min(max_block_count, config.block_count)),
            dim3(config.thread_per_block), 0, device.stream(),

diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -217,17 +217,10 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
     OpKernelContext*, const GPUDevice& d, random::PhiloxRandom gen,
     typename Distribution::ResultElementType* data, int64 size,
     Distribution dist) {
-#if GOOGLE_CUDA
-  const int32 block_size = d.maxCudaThreadsPerBlock();
+  const int32 block_size = d.maxGpuThreadsPerBlock();
   const int32 num_blocks =
-      (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
       block_size;
-#elif TENSORFLOW_USE_ROCM
-  const int32 block_size = d.maxHipThreadsPerBlock();
-  const int32 num_blocks =
-      (d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor()) /
-      block_size;
-#endif
 
   GPU_LAUNCH_KERNEL(FillPhiloxRandomKernelLaunch<Distribution>,
       dim3(num_blocks), dim3(block_size), 0, d.stream(),

diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
@@ -128,23 +128,13 @@ inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   GpuLaunchConfig config;
   const int virtual_thread_count = work_element_count;
 
-#if GOOGLE_CUDA
-  const int physical_thread_count = std::min(
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
-      virtual_thread_count);
-  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
-  const int block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumCudaMultiProcessors());
-#elif TENSORFLOW_USE_ROCM
   const int physical_thread_count = std::min(
-      d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor(),
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       virtual_thread_count);
-  const int thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
+  const int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
   const int block_count =
       std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumHipMultiProcessors());
-#endif
+               d.getNumGpuMultiProcessors());
 
   config.virtual_thread_count = virtual_thread_count;
   config.thread_per_block = thread_per_block;
@@ -180,12 +170,12 @@ inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   //CHECK_EQ(err, hipSuccess);
 
   const int physical_thread_count = std::min(
-      d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor(),
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       work_element_count);
-  thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
+  thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
   block_count =
       std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumHipMultiProcessors());
+               d.getNumGpuMultiProcessors());
 #endif
 
   block_count =
@@ -224,12 +214,12 @@ inline GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   //CHECK_EQ(err, hipSuccess);
 
   const int physical_thread_count = std::min(
-      d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor(),
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       work_element_count);
-  int thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
+  int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
   block_count =
       std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumHipMultiProcessors());
+               d.getNumGpuMultiProcessors());
 #endif
 
   config.virtual_thread_count = work_element_count;
@@ -257,13 +247,8 @@ inline Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
   // ok to round down here and just do more loops in the kernel
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
 
-#if GOOGLE_CUDA
   const int physical_thread_count =
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
-#elif TENSORFLOW_USE_ROCM
-  const int physical_thread_count =
-      d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor();
-#endif
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
 
   const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
 
@@ -326,11 +311,11 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
   //CHECK_EQ(err, hipSuccess);
 
   const int physical_thread_count =
-      d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor();
-  thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
+  thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
   block_count =
       std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumHipMultiProcessors());
+               d.getNumGpuMultiProcessors());
 #endif
 
   int threadsx = std::min({xdim, thread_per_block, xthreadlimit});

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
@@ -109,13 +109,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/267806ed9b4f.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/267806ed9b4f.tar.gz",
+      	"https://mirror.bazel.build/github.com/ROCmSoftwarePlatform/eigen-upstream/archive/0ac71664279a69e560c86aca22171ea6dba00b19.zip",
+	"https://github.com/ROCmSoftwarePlatform/eigen-upstream/archive/0ac71664279a69e560c86aca22171ea6dba00b19.zip",	
       ],
-      sha256 = "ade57357093463cab9e4e51cd5749c81483a75451b1471a3ebc73f9c1d14043b",
-      strip_prefix = "eigen-eigen-267806ed9b4f",
+      sha256 = "2c8a288058fc328327c6f2327863ba82fdc0cfc871591ad591ff29ed9ff1f06e",
+      strip_prefix = "eigen-upstream-0ac71664279a69e560c86aca22171ea6dba00b19",
       build_file = clean_dep("//third_party:eigen.BUILD"),
-      patch_file = clean_dep("//third_party:eigen_fix_gpu_compilation.patch")
   )
 
   tf_http_archive(