Skip to content

Commit

Permalink
Merge pull request tensorflow#28 from ROCmSoftwarePlatform/deven_switch_to_eigen_fork
Browse files Browse the repository at this point in the history

Switch to using the ROCm fork for eigen.
  • Loading branch information
whchung committed Jun 21, 2018
2 parents 844747c + b27878e commit 10c273b
Show file tree
Hide file tree
Showing 11 changed files with 31 additions and 12,666 deletions.
2 changes: 1 addition & 1 deletion build
Expand Up @@ -11,4 +11,4 @@
pip uninstall -y tensorflow || true
bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures &&
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg &&
pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-cp27-cp27mu-linux_x86_64.whl
pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-cp27-cp27mu-linux_x86_64.whl
2 changes: 1 addition & 1 deletion build_python3
Expand Up @@ -11,4 +11,4 @@
pip3 uninstall -y tensorflow || true
bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures &&
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg &&
pip3 install /tmp/tensorflow_pkg/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
pip3 install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
Expand Up @@ -103,7 +103,7 @@ limitations under the License.
}

#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
Expand Down Expand Up @@ -135,7 +135,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"

Expand Down
Expand Up @@ -94,7 +94,7 @@ typedef unsigned __int64 uint64_t;
#include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
Expand All @@ -109,7 +109,7 @@ typedef unsigned __int64 uint64_t;
#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
Expand All @@ -128,7 +128,7 @@ typedef unsigned __int64 uint64_t;


#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
Expand Down
4 changes: 2 additions & 2 deletions tensorflow/core/common_runtime/gpu/gpu_device.cc
Expand Up @@ -86,14 +86,14 @@ namespace tensorflow {

typedef cudaStream_t gpuStream_t;
typedef cudaDeviceProp gpuDeviceProp_t;
#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kCudaScratchSize)
#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
using se::cuda::ScopedActivateExecutorContext;

#elif TENSORFLOW_USE_ROCM

typedef hipStream_t gpuStream_t;
typedef hipDeviceProp_t gpuDeviceProp_t;
#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kHipScratchSize)
#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
using se::rocm::ScopedActivateExecutorContext;

#endif
Expand Down
11 changes: 2 additions & 9 deletions tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
Expand Up @@ -61,17 +61,10 @@ template <typename T>
struct CheckNumericsLaunch {
void Run(const GPUDevice &d, const T *data, int size,
int abnormal_detected[2]) {
#if GOOGLE_CUDA
const int32 block_size = d.maxCudaThreadsPerBlock();
const int32 block_size = d.maxGpuThreadsPerBlock();
const int32 num_blocks =
(d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
(d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
block_size;
#elif TENSORFLOW_USE_ROCM
const int32 block_size = d.maxHipThreadsPerBlock();
const int32 num_blocks =
(d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor()) /
block_size;
#endif

GPU_LAUNCH_KERNEL(CheckNumericsKernel<T>,
dim3(num_blocks), dim3(block_size), 0, d.stream(),
Expand Down
6 changes: 1 addition & 5 deletions tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
Expand Up @@ -711,11 +711,7 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& device,
const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
kKnownDepthMultiplier < 0
? std::numeric_limits<int>::max()
#if GOOGLE_CUDA
: device.getNumCudaMultiProcessors();
#elif TENSORFLOW_USE_ROCM
: device.getNumHipMultiProcessors();
#endif
: device.getNumGpuMultiProcessors();
GPU_LAUNCH_KERNEL(kernel,
dim3(std::min(max_block_count, config.block_count)),
dim3(config.thread_per_block), 0, device.stream(),
Expand Down
11 changes: 2 additions & 9 deletions tensorflow/core/kernels/random_op_gpu.cu.cc
Expand Up @@ -217,17 +217,10 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
OpKernelContext*, const GPUDevice& d, random::PhiloxRandom gen,
typename Distribution::ResultElementType* data, int64 size,
Distribution dist) {
#if GOOGLE_CUDA
const int32 block_size = d.maxCudaThreadsPerBlock();
const int32 block_size = d.maxGpuThreadsPerBlock();
const int32 num_blocks =
(d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
(d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
block_size;
#elif TENSORFLOW_USE_ROCM
const int32 block_size = d.maxHipThreadsPerBlock();
const int32 num_blocks =
(d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor()) /
block_size;
#endif

GPU_LAUNCH_KERNEL(FillPhiloxRandomKernelLaunch<Distribution>,
dim3(num_blocks), dim3(block_size), 0, d.stream(),
Expand Down
41 changes: 13 additions & 28 deletions tensorflow/core/util/gpu_launch_config.h
Expand Up @@ -128,23 +128,13 @@ inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
GpuLaunchConfig config;
const int virtual_thread_count = work_element_count;

#if GOOGLE_CUDA
const int physical_thread_count = std::min(
d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
virtual_thread_count);
const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
const int block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumCudaMultiProcessors());
#elif TENSORFLOW_USE_ROCM
const int physical_thread_count = std::min(
d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor(),
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
virtual_thread_count);
const int thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
const int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
const int block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumHipMultiProcessors());
#endif
d.getNumGpuMultiProcessors());

config.virtual_thread_count = virtual_thread_count;
config.thread_per_block = thread_per_block;
Expand Down Expand Up @@ -180,12 +170,12 @@ inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
//CHECK_EQ(err, hipSuccess);

const int physical_thread_count = std::min(
d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor(),
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
work_element_count);
thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumHipMultiProcessors());
d.getNumGpuMultiProcessors());
#endif

block_count =
Expand Down Expand Up @@ -224,12 +214,12 @@ inline GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
//CHECK_EQ(err, hipSuccess);

const int physical_thread_count = std::min(
d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor(),
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
work_element_count);
int thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumHipMultiProcessors());
d.getNumGpuMultiProcessors());
#endif

config.virtual_thread_count = work_element_count;
Expand Down Expand Up @@ -257,13 +247,8 @@ inline Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
// ok to round down here and just do more loops in the kernel
int block_rows = std::max(kThreadsPerBlock / block_cols, 1);

#if GOOGLE_CUDA
const int physical_thread_count =
d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
#elif TENSORFLOW_USE_ROCM
const int physical_thread_count =
d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor();
#endif
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();

const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);

Expand Down Expand Up @@ -326,11 +311,11 @@ inline Gpu3DLaunchConfig GetGpu3DLaunchConfig(
//CHECK_EQ(err, hipSuccess);

const int physical_thread_count =
d.getNumHipMultiProcessors() * d.maxHipThreadsPerMultiProcessor();
thread_per_block = std::min(1024, d.maxHipThreadsPerBlock());
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
block_count =
std::min(DivUp(physical_thread_count, thread_per_block),
d.getNumHipMultiProcessors());
d.getNumGpuMultiProcessors());
#endif

int threadsx = std::min({xdim, thread_per_block, xthreadlimit});
Expand Down
9 changes: 4 additions & 5 deletions tensorflow/workspace.bzl
Expand Up @@ -109,13 +109,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "eigen_archive",
urls = [
"https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/267806ed9b4f.tar.gz",
"https://bitbucket.org/eigen/eigen/get/267806ed9b4f.tar.gz",
"https://mirror.bazel.build/github.com/ROCmSoftwarePlatform/eigen-upstream/archive/0ac71664279a69e560c86aca22171ea6dba00b19.zip",
"https://github.com/ROCmSoftwarePlatform/eigen-upstream/archive/0ac71664279a69e560c86aca22171ea6dba00b19.zip",
],
sha256 = "ade57357093463cab9e4e51cd5749c81483a75451b1471a3ebc73f9c1d14043b",
strip_prefix = "eigen-eigen-267806ed9b4f",
sha256 = "2c8a288058fc328327c6f2327863ba82fdc0cfc871591ad591ff29ed9ff1f06e",
strip_prefix = "eigen-upstream-0ac71664279a69e560c86aca22171ea6dba00b19",
build_file = clean_dep("//third_party:eigen.BUILD"),
patch_file = clean_dep("//third_party:eigen_fix_gpu_compilation.patch")
)

tf_http_archive(
Expand Down

0 comments on commit 10c273b

Please sign in to comment.