Skip to content

Commit

Permalink
Added support for Arm Compute Library GEMM (tensorflow#170)
Browse files Browse the repository at this point in the history
* Add -Wno-c++11-narrowing to ComputeCpp device compiler flags to avoid build errors on 32-bit targets.

* Added SYCL support to DeviceSpec.parse_from_string - fixes a regression in running the Resnet sample from the TensorFlow models repository with SYCL.

* Bumped Eigen version.

* [OpenCL] Adds option to disable SYCL vectorization (tensorflow#161)

Adds an option to the configure script to disable SYCL vectorization.
This also rewrites and cleans up the computecpp.tpl build script, though
the actual behaviour has not changed.

* [OpenCL] Fixes Variable Resource op for SYCL (tensorflow#162)

Recent changes to the VariableResource ops were broken for SYCL. This
fixes the errors introduced by those changes.

* [OpenCL] Alignment fixed in Eigen

Don't need to use the alignment workaround any more, as the underlying
problem is fixed in Eigen.

* [OpenCL] Adds Eigen changes for new RC

* [OpenCL] Adds support for SYCL devices to nn_ops_test

* [OpenCL] Fixes multiple registrations of same op

The registration of `ReadVariableOp` does not depend on the datatype, so
we were registering more than one of the same op.

* [OpenCL] Adds naive forward pass Conv2D kernel

Provides a very naive unoptimised forward convolution SYCL kernel.

* [OpenCL] Adds naive backprop for SYCL Conv2D

Adds both filter and input backprop

* [OpenCL] Fixes multiple registrations of same op (tensorflow#163)

The registration of `ReadVariableOp` does not depend on the datatype, so
we were registering more than one of the same op.

* [ACL] Adding ARM Compute Library

* [ACL] Adds gemm code

* [ACL] Adds ARM_NO_EXCEPTIONS

* [ACL] Don't register half for ARM

* [ACL] Adds linking to OpenCL

* Tidied up formatting of ACL integration.

* Bug fixes to ARM Compute Library GEMM integration into matmul, from Duncan McBain.

* Fixed typos in configure.py help messages.

* Reverted formatting and logging changes that aren't related to ACL.
  • Loading branch information
alistair-low authored and Luke Iwanski committed Oct 20, 2017
1 parent 6125d3a commit 9586bef
Show file tree
Hide file tree
Showing 16 changed files with 253 additions and 3 deletions.
10 changes: 10 additions & 0 deletions configure.py
Expand Up @@ -949,6 +949,15 @@ def set_mkl():
'downloading, please set the environment variable \"TF_MKL_ROOT\" every '
'time before build.')

def set_acl():
  """Write the bazelrc entries needed to build with ARM Compute Library."""
  # All flags are gated behind --config=acl, so default builds are unaffected.
  acl_bazelrc_lines = [
      'build:acl --define using_acl=true',
      'build:acl -c opt',
      'build:acl --copt="-DARM_COMPUTE_CL"',
      'build:acl --copt="-DARM_NO_EXCEPTIONS"',
  ]
  for line in acl_bazelrc_lines:
    write_to_bazelrc(line)
  print('Add "--config=acl" to your bazel command to build with ARM '
        'Compute Library support.\nPlease set the environment variable '
        '\"TF_ACL_ROOT\" every time before build.')

def set_monolithic():
# Add --config=monolithic to your bazel command to use a mostly-static
Expand Down Expand Up @@ -1030,6 +1039,7 @@ def main():

set_cc_opt_flags(environ_cp)
set_mkl()
set_acl()
set_monolithic()


Expand Down
9 changes: 9 additions & 0 deletions tensorflow/BUILD
Expand Up @@ -526,12 +526,21 @@ load(
"if_mkl",
)

load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)

filegroup(
name = "intel_binary_blob",
data = if_mkl(
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
)

Expand Down
16 changes: 16 additions & 0 deletions tensorflow/core/BUILD
Expand Up @@ -145,6 +145,10 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)

# -----------------------------------------------------------------------------
# Public targets
Expand Down Expand Up @@ -1775,6 +1779,10 @@ tf_cuda_library(
"//third_party/mkl:intel_binary_blob",
"@mkl_dnn//:mkl_dnn",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
alwayslink = 1,
)
Expand Down Expand Up @@ -1996,6 +2004,10 @@ tf_cuda_library(
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
alwayslink = 1,
)
Expand Down Expand Up @@ -2040,6 +2052,10 @@ tf_cuda_library(
"//third_party/mkl:intel_binary_blob",
"@mkl_dnn//:mkl_dnn",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob"
],
) + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
alwayslink = 1,
)
Expand Down
6 changes: 6 additions & 0 deletions tensorflow/core/kernels/BUILD
Expand Up @@ -50,6 +50,10 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")

config_setting(
Expand Down Expand Up @@ -2600,6 +2604,8 @@ tf_kernel_library(
}) + if_mkl([
"//third_party/mkl:intel_binary_blob",
"@mkl_dnn//:mkl_dnn",
]) + if_acl([
"//third_party/acl:intel_binary_blob",
]) + if_cuda([
"//tensorflow/core/platform/default/build_config:cublas_plugin",
]),
Expand Down
54 changes: 54 additions & 0 deletions tensorflow/core/kernels/matmul_op.cc
Expand Up @@ -30,6 +30,13 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor.h"
#endif // GOOGLE_CUDA

#ifdef ARM_COMPUTE_CL
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "utils/Utils.h"
#endif // ARM_COMPUTE_CL

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
Expand Down Expand Up @@ -492,9 +499,54 @@ class MatMulOp : public OpKernel {
f(ctx->eigen_device<Device>(), out->flat<T>());
return;
}
#ifdef ARM_COMPUTE_CL
    // Offload the matrix multiply to the Arm Compute Library (ACL) OpenCL
    // backend instead of the regular LaunchMatMul path.
    //
    // NOTE(review): the ACL tensors are created as DataType::F32, so this
    // path is only correct for T == float, yet REGISTER_CPU below is also
    // instantiated for double/int32/complex — confirm whether those types
    // should fall through to LaunchMatMul instead.
    // NOTE(review): default_init() runs on every Compute() call; consider
    // initialising the CLScheduler once per process.
    arm_compute::CLScheduler::get().default_init();
    arm_compute::CLGEMM arm_gemm;
    arm_compute::CLTensor arm_a, arm_b, arm_out;

    // `a` and `b` are deliberately swapped (arm_a <- b, arm_b <- a) and the
    // copy loops below index transposed — presumably to bridge TensorFlow's
    // row-major layout and ACL's expectations; TODO confirm.
    // static_cast avoids C++11 narrowing errors when brace-initialising the
    // unsigned TensorShape from int64 dim_size() values (previously hidden
    // behind -Wno-c++11-narrowing).
    const arm_compute::TensorShape
        shape_a{static_cast<size_t>(b.shape().dim_size(0)),
                static_cast<size_t>(b.shape().dim_size(1))},
        shape_b{static_cast<size_t>(a.shape().dim_size(0)),
                static_cast<size_t>(a.shape().dim_size(1))},
        shape_out{static_cast<size_t>(out->shape().dim_size(0)),
                  static_cast<size_t>(out->shape().dim_size(1))};
    arm_a.allocator()->init(
        arm_compute::TensorInfo(shape_a, 1, arm_compute::DataType::F32));
    arm_b.allocator()->init(
        arm_compute::TensorInfo(shape_b, 1, arm_compute::DataType::F32));
    arm_out.allocator()->init(
        arm_compute::TensorInfo(shape_out, 1, arm_compute::DataType::F32));

    // C = alpha * A * B + beta * C with alpha = beta = 1 and no bias tensor.
    arm_gemm.configure(&arm_a, &arm_b, nullptr, &arm_out, 1.0f, 1.0f);

    arm_a.allocator()->allocate();
    arm_b.allocator()->allocate();
    arm_out.allocator()->allocate();

    // Copies a TensorFlow tensor into a mapped ACL tensor, transposing the
    // element order (see the shape swap above).
    auto fill_with_window =
        [](const Tensor& tf_tensor, arm_compute::CLTensor& arm_tensor) {
          arm_tensor.map(true);  // blocking map of the OpenCL buffer
          auto tensor_flat = tf_tensor.flat<T>();
          arm_compute::Window win;
          win.use_tensor_dimensions(arm_tensor.info()->tensor_shape());
          arm_compute::Iterator it(&arm_tensor, win);
          arm_compute::execute_window_loop(
              win,
              [&](arm_compute::Coordinates& c) {
                *reinterpret_cast<T*>(it.ptr()) =
                    tensor_flat.data()[c.y() * tf_tensor.shape().dim_size(0) +
                                       c.x()];
              },
              it);
          arm_tensor.unmap();
        };

    fill_with_window(b, arm_a);
    fill_with_window(a, arm_b);
    arm_gemm.run();

    // Read the GEMM result back from the mapped CL tensor into `out`.
    arm_compute::Window out_win;
    out_win.use_tensor_dimensions(arm_out.info()->tensor_shape());
    arm_out.map(true);
    arm_compute::Iterator out_it(&arm_out, out_win);
    auto eigen_out = out->flat<T>();
    arm_compute::execute_window_loop(
        out_win,
        [&](arm_compute::Coordinates& c) {
          eigen_out.data()[c.y() * out->shape().dim_size(0) + c.x()] =
              *reinterpret_cast<float*>(out_it.ptr());
        },
        out_it);
    arm_out.unmap();
#else
    LaunchMatMul<Device, T, USE_CUBLAS>::launch(
        ctx, a, b, dim_pair, &algorithms_, use_autotune_, out);
#endif  // ARM_COMPUTE_CL
}

private:
Expand Down Expand Up @@ -562,7 +614,9 @@ TF_CALL_int32(REGISTER_CPU);
#else
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#ifndef ARM_COMPUTE_CL
TF_CALL_half(REGISTER_CPU);
#endif // ARM_COMPUTE_CL

TF_CALL_int32(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
Expand Down
6 changes: 5 additions & 1 deletion tensorflow/core/platform/default/build_config.bzl
Expand Up @@ -10,6 +10,10 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
load(
"//third_party/acl:build_defs.bzl",
"if_acl",
)

# Appends a suffix to a list of deps.
def tf_deps(deps, suffix):
Expand Down Expand Up @@ -538,5 +542,5 @@ def tf_additional_binary_deps():
] + if_mkl(
[
"//third_party/mkl:intel_binary_blob",
],
]) + if_acl(["//third_party/acl:intel_binary_blob",]
)
11 changes: 11 additions & 0 deletions tensorflow/tensorflow.bzl
Expand Up @@ -22,6 +22,9 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",)

load(
"//third_party/acl:build_defs.bzl",
"if_acl",)

def full_path(relative_paths):
return [PACKAGE_NAME + "/" + relative for relative in relative_paths]
Expand Down Expand Up @@ -286,6 +289,10 @@ def tf_cc_binary(name,
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
linkopts=linkopts + _rpath_linkopts(name),
**kwargs)
Expand Down Expand Up @@ -537,6 +544,10 @@ def tf_cc_test(name,
[
"//third_party/mkl:intel_binary_blob",
],
) + if_acl(
[
"//third_party/acl:intel_binary_blob",
],
),
# Nested select() statements seem not to be supported when passed to
# linkstatic, and we already have a cuda select() passed in to this
Expand Down
7 changes: 7 additions & 0 deletions tensorflow/tools/lib_package/BUILD
Expand Up @@ -6,6 +6,7 @@ package(default_visibility = ["//visibility:private"])
load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
load("//tensorflow:tensorflow.bzl", "tf_binary_additional_srcs")
load("//third_party/mkl:build_defs.bzl", "if_mkl")
load("//third_party/acl:build_defs.bzl", "if_acl")

genrule(
name = "libtensorflow_proto",
Expand Down Expand Up @@ -119,6 +120,9 @@ genrule(
] + if_mkl([
"//third_party/mkl:LICENSE",
"@mkl//:LICENSE",
]) + if_acl([
"//third_party/acl:LICENSE",
"@acl//:LICENSE",
]),
outs = ["include/tensorflow/c/LICENSE"],
cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
Expand Down Expand Up @@ -154,6 +158,9 @@ genrule(
] + if_mkl([
"//third_party/mkl:LICENSE",
"@mkl//:LICENSE",
]) + if_acl([
"//third_party/acl:LICENSE",
"@acl//:LICENSE",
]),
outs = ["include/tensorflow/jni/LICENSE"],
cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
Expand Down
6 changes: 5 additions & 1 deletion tensorflow/tools/pip_package/BUILD
Expand Up @@ -9,6 +9,7 @@ load(
"transitive_hdrs",
)
load("//third_party/mkl:build_defs.bzl", "if_mkl")
load("//third_party/acl:build_defs.bzl", "if_acl")
load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")

# This returns a list of headers of all public header libraries (e.g.,
Expand Down Expand Up @@ -131,6 +132,8 @@ filegroup(
] + if_mkl([
"//third_party/mkl:LICENSE",
"@mkl//:LICENSE",
]) + if_acl([
"//third_party/acl:LICENSE",
]) + if_not_windows([
"@nccl_archive//:LICENSE.txt",
]) + tf_additional_license_deps(),
Expand Down Expand Up @@ -182,5 +185,6 @@ sh_binary(
"//tensorflow/python:test_ops",
"//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
],
}) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
}) + if_mkl(["//third_party/mkl:intel_binary_blob"]
) + if_acl(["//third_party/acl:intel_binary_blob"]),
)
11 changes: 11 additions & 0 deletions tensorflow/workspace.bzl
Expand Up @@ -3,6 +3,7 @@
load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
load("//third_party/mkl:build_defs.bzl", "mkl_repository")
load("//third_party/acl:build_defs.bzl", "acl_repository")
load("@io_bazel_rules_closure//closure/private:java_import_external.bzl",
"java_import_external")
load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
Expand Down Expand Up @@ -166,6 +167,16 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
repository = tf_repo_name,
)

acl_repository(
name = "acl",
urls = [
"https://github.com/lukeiwanski/ComputeLibrary/archive/feature/no_exceptions.zip",
],
strip_prefix = "ComputeLibrary-feature-no_exceptions",
build_file = str(Label("//third_party/acl:acl.BUILD")),
repository = tf_repo_name,
)

if path_prefix:
print("path_prefix was specified to tf_workspace but is no longer used " +
"and will be removed in the future.")
Expand Down
26 changes: 26 additions & 0 deletions third_party/acl/BUILD
@@ -0,0 +1,26 @@
# BUILD file for TensorFlow's Arm Compute Library (ACL) integration.
licenses(["notice"])  # MIT License

exports_files(["LICENSE"])

# Matches builds configured with `--config=acl`; configure.py writes
# `build:acl --define using_acl=true` into the bazelrc.
config_setting(
    name = "using_acl",
    values = {
        "define": "using_acl=true",
    },
    visibility = ["//visibility:public"],
)

load(
    "//third_party/acl:build_defs.bzl",
    "if_acl",
)

# Shared-library blob plus headers for linking TensorFlow against ACL.
# NOTE(review): the target keeps the name `intel_binary_blob` to mirror the
# //third_party/mkl target it is substituted for in shared build rules, even
# though ACL is an Arm library — confirm whether a rename is worth the churn.
cc_library(
    name = "intel_binary_blob",
    # Empty unless built with --config=acl, so non-ACL builds link nothing.
    srcs = if_acl([
        "@acl//:libarm_compute.so",
        "@acl//:libOpenCL.so",
    ]),
    visibility = ["//visibility:public"],
    deps = ["@acl//:acl_headers"],
)
Empty file added third_party/acl/LICENSE
Empty file.
30 changes: 30 additions & 0 deletions third_party/acl/acl.BUILD
@@ -0,0 +1,30 @@
# Template BUILD file applied to the downloaded Arm Compute Library archive
# (see acl_repository(name = "acl", ...) in tensorflow/workspace.bzl).
licenses(["notice"])  # MIT

exports_files(["license.txt"])

filegroup(
    name = "LICENSE",
    srcs = [
        "license.txt",
    ],
    visibility = ["//visibility:public"],
)

# Header-only target consumed by //third_party/acl:intel_binary_blob.
# Headers are listed in `hdrs` (not `srcs`): cc_library headers in `srcs`
# are private to the target, so dependents could not include them in
# sandboxed builds.
cc_library(
    name = "acl_headers",
    hdrs = glob(["**/*.h"]),
    includes = [
        ".",
        "include",
        "arm_compute",
        "support",
        "utils",
    ],
    visibility = ["//visibility:public"],
)

# Prebuilt ACL core library shipped in the archive's lib/ directory.
filegroup(
    name = "libarm_compute.so",
    srcs = ["lib/libarm_compute.so"],
    visibility = ["//visibility:public"],
)

# OpenCL loader bundled with ACL; linked alongside libarm_compute.so.
filegroup(
    name = "libOpenCL.so",
    srcs = ["lib/libOpenCL.so"],
    visibility = ["//visibility:public"],
)

0 comments on commit 9586bef

Please sign in to comment.