Skip to content

Commit

Permalink
Added support for Arm Compute Library GEMM (tensorflow#170)
Browse files Browse the repository at this point in the history
* Add -Wno-c++11-narrowing to ComputeCpp device compiler flags to avoid build errors on 32-bit targets.

* Added SYCL support to DeviceSpec.parse_from_string - fixes a regression in running the Resnet sample from the TensorFlow models repository with SYCL.

* Bumped Eigen version.

* [OpenCL] Adds option to disable SYCL vectorization (tensorflow#161)

Adds an option to the configure script to disable SYCL vectorization.
This also rewrites and cleans up the computecpp.tpl build script, though
the actual behaviour has not changed.

* [OpenCL] Fixes Variable Resource op for SYCL (tensorflow#162)

Recent changes to the VariableResource ops were broken for SYCL. This
fixes the errors introduced by those changes.

* [OpenCL] Alignment fixed in Eigen

Don't need to use the alignment workaround any more, as the underlying
problem is fixed in Eigen.

* [OpenCL] Adds Eigen changes for new RC

* [OpenCL] Adds support for SYCL devices to nn_ops_test

* [OpenCL] Fixes multiple registrations of same op

The registration of `ReadVariableOp` does not depend on the datatype, so
we were registering more than one of the same op.

* [OpenCL] Adds naive forward pass Conv2D kernel

Provides a very naive unoptimised forward convolution SYCL kernel.

* [OpenCL] Adds naive backprop for SYCL Conv2D

Adds both filter and input backprop

* [OpenCL] Fixes multiple registrations of same op (tensorflow#163)

The registration of `ReadVariableOp` does not depend on the datatype, so
we were registering more than one of the same op.

* [ACL] Adding ARM Compute Library

* [ACL] Adds gemm code

* [ACL] Adds ARM_NO_EXCEPTIONS

* [ACL] Don't register half for ARM

* [ACL] Adds linking to OpenCL

* Tidied up formatting of ACL integration.

* Bug fixes to ARM Compute Library GEMM integration into matmul, from Duncan McBain.

* Fixed typos in configure.py help messages.

* Reverted formatting and logging changes that aren't related to ACL.
  • Loading branch information
alistair-low authored and Luke Iwanski committed Oct 20, 2017
1 parent 6125d3a commit 9586bef
Show file tree
Hide file tree
Showing 16 changed files with 253 additions and 3 deletions.
10 changes: 10 additions & 0 deletions configure.py
Expand Up @@ -949,6 +949,15 @@ def set_mkl():
'downloading, please set the environment variable \"TF_MKL_ROOT\" every '
'time before build.')

def set_acl():
  """Write the bazelrc entries needed to build with ARM Compute Library."""
  # All flags are gated behind --config=acl, so default builds are unaffected.
  acl_bazelrc_lines = [
      'build:acl --define using_acl=true',
      'build:acl -c opt',
      'build:acl --copt="-DARM_COMPUTE_CL"',
      'build:acl --copt="-DARM_NO_EXCEPTIONS"',
  ]
  for line in acl_bazelrc_lines:
    write_to_bazelrc(line)
  print('Add "--config=acl" to your bazel command to build with ARM '
        'Compute Library support.\nPlease set the environment variable '
        '\"TF_ACL_ROOT\" every time before build.')

def set_monolithic():
# Add --config=monolithic to your bazel command to use a mostly-static
Expand Down Expand Up @@ -1030,6 +1039,7 @@ def main():

set_cc_opt_flags(environ_cp)
set_mkl()
set_acl()
set_monolithic()


Expand Down
9 changes: 9 additions & 0 deletions tensorflow/BUILD
Expand Up @@ -526,12 +526,21 @@ load(
"if_mkl",
)

load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)

filegroup(
name = "intel_binary_blob",
data = if_mkl(
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
)

Expand Down
16 changes: 16 additions & 0 deletions tensorflow/core/BUILD
Expand Up @@ -145,6 +145,10 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)

# -----------------------------------------------------------------------------
# Public targets
Expand Down Expand Up @@ -1775,6 +1779,10 @@ tf_cuda_library(
"//third_party/mkl:intel_binary_blob",
"@mkl_dnn//:mkl_dnn",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
alwayslink = 1,
)
Expand Down Expand Up @@ -1996,6 +2004,10 @@ tf_cuda_library(
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
alwayslink = 1,
)
Expand Down Expand Up @@ -2040,6 +2052,10 @@ tf_cuda_library(
"//third_party/mkl:intel_binary_blob",
"@mkl_dnn//:mkl_dnn",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob"
],
) + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
alwayslink = 1,
)
Expand Down
6 changes: 6 additions & 0 deletions tensorflow/core/kernels/BUILD
Expand Up @@ -50,6 +50,10 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")

config_setting(
Expand Down Expand Up @@ -2600,6 +2604,8 @@ tf_kernel_library(
}) + if_mkl([
"//third_party/mkl:intel_binary_blob",
"@mkl_dnn//:mkl_dnn",
]) + if_acl([
"//third_party/acl:intel_binary_blob",
]) + if_cuda([
"//tensorflow/core/platform/default/build_config:cublas_plugin",
]),
Expand Down
54 changes: 54 additions & 0 deletions tensorflow/core/kernels/matmul_op.cc
Expand Up @@ -30,6 +30,13 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor.h"
#endif // GOOGLE_CUDA

#ifdef ARM_COMPUTE_CL
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "utils/Utils.h"
#endif // ARM_COMPUTE_CL

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
Expand Down Expand Up @@ -492,9 +499,54 @@ class MatMulOp : public OpKernel {
f(ctx->eigen_device<Device>(), out->flat<T>());
return;
}
#ifdef ARM_COMPUTE_CL
    // Offload the matrix multiply to the Arm Compute Library (ACL) OpenCL
    // backend instead of the regular LaunchMatMul path.
    //
    // NOTE(review): the ACL tensors are created as DataType::F32, so this
    // path is only correct for T == float, yet REGISTER_CPU below is also
    // instantiated for double/int32/complex — confirm whether those types
    // should fall through to LaunchMatMul instead.
    // NOTE(review): default_init() runs on every Compute() call; consider
    // initialising the CLScheduler once per process.
    arm_compute::CLScheduler::get().default_init();
    arm_compute::CLGEMM arm_gemm;
    arm_compute::CLTensor arm_a, arm_b, arm_out;

    // `a` and `b` are deliberately swapped (arm_a <- b, arm_b <- a) and the
    // copy loops below index transposed — presumably to bridge TensorFlow's
    // row-major layout and ACL's expectations; TODO confirm.
    // static_cast avoids C++11 narrowing errors when brace-initialising the
    // unsigned TensorShape from int64 dim_size() values (previously hidden
    // behind -Wno-c++11-narrowing).
    const arm_compute::TensorShape
        shape_a{static_cast<size_t>(b.shape().dim_size(0)),
                static_cast<size_t>(b.shape().dim_size(1))},
        shape_b{static_cast<size_t>(a.shape().dim_size(0)),
                static_cast<size_t>(a.shape().dim_size(1))},
        shape_out{static_cast<size_t>(out->shape().dim_size(0)),
                  static_cast<size_t>(out->shape().dim_size(1))};
    arm_a.allocator()->init(
        arm_compute::TensorInfo(shape_a, 1, arm_compute::DataType::F32));
    arm_b.allocator()->init(
        arm_compute::TensorInfo(shape_b, 1, arm_compute::DataType::F32));
    arm_out.allocator()->init(
        arm_compute::TensorInfo(shape_out, 1, arm_compute::DataType::F32));

    // C = alpha * A * B + beta * C with alpha = beta = 1 and no bias tensor.
    arm_gemm.configure(&arm_a, &arm_b, nullptr, &arm_out, 1.0f, 1.0f);

    arm_a.allocator()->allocate();
    arm_b.allocator()->allocate();
    arm_out.allocator()->allocate();

    // Copies a TensorFlow tensor into a mapped ACL tensor, transposing the
    // element order (see the shape swap above).
    auto fill_with_window =
        [](const Tensor& tf_tensor, arm_compute::CLTensor& arm_tensor) {
          arm_tensor.map(true);  // blocking map of the OpenCL buffer
          auto tensor_flat = tf_tensor.flat<T>();
          arm_compute::Window win;
          win.use_tensor_dimensions(arm_tensor.info()->tensor_shape());
          arm_compute::Iterator it(&arm_tensor, win);
          arm_compute::execute_window_loop(
              win,
              [&](arm_compute::Coordinates& c) {
                *reinterpret_cast<T*>(it.ptr()) =
                    tensor_flat.data()[c.y() * tf_tensor.shape().dim_size(0) +
                                       c.x()];
              },
              it);
          arm_tensor.unmap();
        };

    fill_with_window(b, arm_a);
    fill_with_window(a, arm_b);
    arm_gemm.run();

    // Read the GEMM result back from the mapped CL tensor into `out`.
    arm_compute::Window out_win;
    out_win.use_tensor_dimensions(arm_out.info()->tensor_shape());
    arm_out.map(true);
    arm_compute::Iterator out_it(&arm_out, out_win);
    auto eigen_out = out->flat<T>();
    arm_compute::execute_window_loop(
        out_win,
        [&](arm_compute::Coordinates& c) {
          eigen_out.data()[c.y() * out->shape().dim_size(0) + c.x()] =
              *reinterpret_cast<float*>(out_it.ptr());
        },
        out_it);
    arm_out.unmap();
#else
    LaunchMatMul<Device, T, USE_CUBLAS>::launch(
        ctx, a, b, dim_pair, &algorithms_, use_autotune_, out);
#endif  // ARM_COMPUTE_CL
}

private:
Expand Down Expand Up @@ -562,7 +614,9 @@ TF_CALL_int32(REGISTER_CPU);
#else
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#ifndef ARM_COMPUTE_CL
TF_CALL_half(REGISTER_CPU);
#endif // ARM_COMPUTE_CL

TF_CALL_int32(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
Expand Down
6 changes: 5 additions & 1 deletion tensorflow/core/platform/default/build_config.bzl
Expand Up @@ -10,6 +10,10 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)

# Appends a suffix to a list of deps.
def tf_deps(deps, suffix):
Expand Down Expand Up @@ -538,5 +542,5 @@ def tf_additional_binary_deps():
] + if_mkl(
[
"//third_party/mkl:intel_binary_blob",
],
]) + if_acl(["//third_party/acl:intel_binary_blob",]
)
11 changes: 11 additions & 0 deletions tensorflow/tensorflow.bzl
Expand Up @@ -22,6 +22,9 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",)

load(
"//third_party/acl:build_defs.bzl",
"if_acl",)

def full_path(relative_paths):
return [PACKAGE_NAME + "/" + relative for relative in relative_paths]
Expand Down Expand Up @@ -286,6 +289,10 @@ def tf_cc_binary(name,
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
linkopts=linkopts + _rpath_linkopts(name),
**kwargs)
Expand Down Expand Up @@ -537,6 +544,10 @@ def tf_cc_test(name,
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
# Nested select() statements seem not to be supported when passed to
# linkstatic, and we already have a cuda select() passed in to this
Expand Down
7 changes: 7 additions & 0 deletions tensorflow/tools/lib_package/BUILD
Expand Up @@ -6,6 +6,7 @@ package(default_visibility = ["//visibility:private"])
load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
load("//tensorflow:tensorflow.bzl", "tf_binary_additional_srcs")
load("//third_party/mkl:build_defs.bzl", "if_mkl")
load("//third_party/acl:build_defs.bzl", "if_acl")

genrule(
name = "libtensorflow_proto",
Expand Down Expand Up @@ -119,6 +120,9 @@ genrule(
] + if_mkl([
"//third_party/mkl:LICENSE",
"@mkl//:LICENSE",
]) + if_acl([
"//third_party/acl:LICENSE",
"@acl//:LICENSE",
]),
outs = ["include/tensorflow/c/LICENSE"],
cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
Expand Down Expand Up @@ -154,6 +158,9 @@ genrule(
] + if_mkl([
"//third_party/mkl:LICENSE",
"@mkl//:LICENSE",
]) + if_acl([
"//third_party/acl:LICENSE",
"@acl//:LICENSE",
]),
outs = ["include/tensorflow/jni/LICENSE"],
cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
Expand Down
6 changes: 5 additions & 1 deletion tensorflow/tools/pip_package/BUILD
Expand Up @@ -9,6 +9,7 @@ load(
"transitive_hdrs",
)
load("//third_party/mkl:build_defs.bzl", "if_mkl")
load("//third_party/acl:build_defs.bzl", "if_acl")
load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")

# This returns a list of headers of all public header libraries (e.g.,
Expand Down Expand Up @@ -131,6 +132,8 @@ filegroup(
] + if_mkl([
"//third_party/mkl:LICENSE",
"@mkl//:LICENSE",
]) + if_acl([
"//third_party/acl:LICENSE",
]) + if_not_windows([
"@nccl_archive//:LICENSE.txt",
]) + tf_additional_license_deps(),
Expand Down Expand Up @@ -182,5 +185,6 @@ sh_binary(
"//tensorflow/python:test_ops",
"//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
],
}) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
}) + if_mkl(["//third_party/mkl:intel_binary_blob"]
) + if_acl(["//third_party/acl:intel_binary_blob"]),
)
11 changes: 11 additions & 0 deletions tensorflow/workspace.bzl
Expand Up @@ -3,6 +3,7 @@
load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
load("//third_party/mkl:build_defs.bzl", "mkl_repository")
load("//third_party/acl:build_defs.bzl", "acl_repository")
load("@io_bazel_rules_closure//closure/private:java_import_external.bzl",
"java_import_external")
load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
Expand Down Expand Up @@ -166,6 +167,16 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
repository = tf_repo_name,
)

acl_repository(
name = "acl",
urls = [
"https://github.com/lukeiwanski/ComputeLibrary/archive/feature/no_exceptions.zip",
],
strip_prefix = "ComputeLibrary-feature-no_exceptions",
build_file = str(Label("//third_party/acl:acl.BUILD")),
repository = tf_repo_name,
)

if path_prefix:
print("path_prefix was specified to tf_workspace but is no longer used " +
"and will be removed in the future.")
Expand Down
26 changes: 26 additions & 0 deletions third_party/acl/BUILD
@@ -0,0 +1,26 @@
# BUILD file for TensorFlow's Arm Compute Library (ACL) integration.
licenses(["notice"])  # MIT License

exports_files(["LICENSE"])

# Matches builds configured with `--config=acl`; configure.py writes
# `build:acl --define using_acl=true` into the bazelrc.
config_setting(
    name = "using_acl",
    values = {
        "define": "using_acl=true",
    },
    visibility = ["//visibility:public"],
)

load(
    "//third_party/acl:build_defs.bzl",
    "if_acl",
)

# Shared-library blob plus headers for linking TensorFlow against ACL.
# NOTE(review): the target keeps the name `intel_binary_blob` to mirror the
# //third_party/mkl target it is substituted for in shared build rules, even
# though ACL is an Arm library — confirm whether a rename is worth the churn.
cc_library(
    name = "intel_binary_blob",
    # Empty unless built with --config=acl, so non-ACL builds link nothing.
    srcs = if_acl([
        "@acl//:libarm_compute.so",
        "@acl//:libOpenCL.so",
    ]),
    visibility = ["//visibility:public"],
    deps = ["@acl//:acl_headers"],
)
Empty file added third_party/acl/LICENSE
Empty file.
30 changes: 30 additions & 0 deletions third_party/acl/acl.BUILD
@@ -0,0 +1,30 @@
# Template BUILD file applied to the downloaded Arm Compute Library archive
# (see acl_repository(name = "acl", ...) in tensorflow/workspace.bzl).
licenses(["notice"])  # MIT

exports_files(["license.txt"])

filegroup(
    name = "LICENSE",
    srcs = [
        "license.txt",
    ],
    visibility = ["//visibility:public"],
)

# Header-only target consumed by //third_party/acl:intel_binary_blob.
# Headers are listed in `hdrs` (not `srcs`): cc_library headers in `srcs`
# are private to the target, so dependents could not include them in
# sandboxed builds.
cc_library(
    name = "acl_headers",
    hdrs = glob(["**/*.h"]),
    includes = [
        ".",
        "include",
        "arm_compute",
        "support",
        "utils",
    ],
    visibility = ["//visibility:public"],
)

# Prebuilt ACL core library shipped in the archive's lib/ directory.
filegroup(
    name = "libarm_compute.so",
    srcs = ["lib/libarm_compute.so"],
    visibility = ["//visibility:public"],
)

# OpenCL loader bundled with ACL; linked alongside libarm_compute.so.
filegroup(
    name = "libOpenCL.so",
    srcs = ["lib/libOpenCL.so"],
    visibility = ["//visibility:public"],
)

0 comments on commit 9586bef

Please sign in to comment.