From 62eed8bb6f9c01c2cb7899e634845c6bf29e427e Mon Sep 17 00:00:00 2001
From: shuw
Date: Fri, 29 Apr 2022 20:36:46 -0700
Subject: [PATCH 001/259] Disable where ops in xla auto-clustering
---
.../compiler/jit/compilability_check_util.cc | 8 +++++
.../compiler/jit/compilability_check_util.h | 3 ++
tensorflow/compiler/jit/flags.cc | 6 ++++
tensorflow/compiler/jit/flags.h | 3 ++
.../compiler/jit/mark_for_compilation_pass.cc | 33 +++++++++++++++++++
.../kernel_tests/array_ops/array_ops_test.py | 22 +++++++++++--
6 files changed, 73 insertions(+), 2 deletions(-)
diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc
index 9507a2f003a753..37effcfcbd768f 100644
--- a/tensorflow/compiler/jit/compilability_check_util.cc
+++ b/tensorflow/compiler/jit/compilability_check_util.cc
@@ -488,6 +488,14 @@ bool RecursiveCompilabilityChecker::IsCompilableNode(
return false;
}
+ if (!op_filter_.allow_where_op && node.type_string() == "Where") {
+ absl::string_view uncompilable_reason = "Where op";
+ MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace,
+ encapsulating_function, uncompilable_nodes);
+ LogNotCompilable(node, uncompilable_reason);
+ return false;
+ }
+
if (!op_filter_.allow_unique_op && node.type_string() == "Unique") {
absl::string_view uncompilable_reason = "Unique op";
MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace,
diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h
index 8435e8eea1af9d..2b31e575779bce 100644
--- a/tensorflow/compiler/jit/compilability_check_util.h
+++ b/tensorflow/compiler/jit/compilability_check_util.h
@@ -136,6 +136,9 @@ class RecursiveCompilabilityChecker {
// Whether to allow the compilation of CollectiveReduceV2Op.
bool allow_collective_reduce_v2 = true;
+ // Whether to allow the compilation of WhereOp.
+ bool allow_where_op = true;
+
// Whether to allow the compilation of UniqueOp. Compilation of the UniqueOp
// generates output with bounded dynamic shape that may cause failures with
// auto clustering.
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index 4dd95dc776c8ff..b425ae09a05183 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -114,6 +114,12 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
" BN: TF FusedBatchNorm* operations."
" FUSIBLE: All TF operations that XLA can fuse (All the above). "
"You can also put any TF operation name, e.g. 'FUSIBLE,MatMul'."),
+ Flag("tf_xla_cluster_remove_from_excludelist",
+ &mark_for_compilation_flags->tf_xla_cluster_remove_from_excludelist,
+ "(experimental) "
+ "Remove the operations in auto-clustering excludelist. "
+ "If multiple, separate them with commas."
+ " Where, Unique, CollectiveReduceV2."),
Flag("tf_xla_clustering_debug",
&mark_for_compilation_flags->tf_xla_clustering_debug,
"Dump graphs during XLA compilation."),
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index 7a1b1e7763e83f..d85c9560e1ef24 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -61,6 +61,9 @@ struct MarkForCompilationPassFlags {
// If non-empty, limit XLA clustering to the following TF operations.
string tf_xla_ops_to_cluster;
+ // If non-empty, remove following operations from XLA clustering excludelist.
+ string tf_xla_cluster_remove_from_excludelist;
+
// Dump graphs during XLA compilation.
bool tf_xla_clustering_debug;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 59bea5e8711c81..a73e12796a1ad7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -1189,6 +1189,23 @@ StatusOr<bool> IsIdentityDrivingConstsInLoop(Node* node) {
return true;
}
+absl::flat_hash_set<string> GetOrCreateClusterRemoveFromExcludelistOpList() {
+ MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
+ absl::flat_hash_set<string> removelist;
+ for (auto s : absl::StrSplit(flags->tf_xla_cluster_remove_from_excludelist, ',')) {
+ if (!s.empty()) {
+ removelist.insert(string(s)) ;
+ }
+ }
+ if (VLOG_IS_ON(2) && !removelist.empty()) {
+ std::vector<string> vremovelist(removelist.begin(), removelist.end());
+ absl::c_sort(vremovelist);
+ VLOG(2) << "XLA clustering will remove following TF operations from excludelist: "
+ << absl::StrJoin(vremovelist, " ");
+ }
+ return removelist;
+}
+
absl::flat_hash_set<string> GetOrCreateAllowlist() {
absl::flat_hash_map<string, std::vector<string>>* allowlist_table =
tensorflow::GetAllowlistTable();
@@ -1289,13 +1306,29 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() {
continue;
}
+ auto cluster_remove_op_list = GetOrCreateClusterRemoveFromExcludelistOpList();
RecursiveCompilabilityChecker::OperationFilter filter =
CreateOperationFilter(*registration);
filter.require_always_compilable = true;
filter.allow_string_consts = false;
filter.allow_collective_reduce_v2 = false;
+ filter.allow_where_op = false;
filter.allow_unique_op = false;
+ for (const auto& s : cluster_remove_op_list) {
+ if (s == "Where") {
+ filter.allow_where_op = true;
+ } else if (s == "Unique") {
+ filter.allow_unique_op = true;
+ } else if (s == "CollectiveReduceV2") {
+ filter.allow_collective_reduce_v2 = true;
+ } else {
+ return errors::InvalidArgument(
+ "The operation '", s,
+ "' passed to --tf_xla_cluster_allow_ops is not supported by XLA.");
+ }
+ }
+
RecursiveCompilabilityChecker checker(
filter, DeviceType{registration->compilation_device_name});
diff --git a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
index cabd4bddc22b9a..334bd069dd10f3 100644
--- a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
@@ -13,6 +13,8 @@
# limitations under the License.
# ==============================================================================
"""Tests for array_ops."""
+import argparse
+import os
import re
import time
import unittest
@@ -1063,10 +1065,26 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase,
parameterized.TestCase):
"""Test that strided slice's custom gradient produces correct gradients."""
+ def conditional_decorator(self, dec, is_disable):
+ def decorator(func):
+ if is_disable:
+ return func
+ return dec(func)
+ return decorator
+
+ is_disable_where_op_auto_cluster = True
+ tf_xla_flags= os.environ.get('TF_XLA_FLAGS')
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--tf_xla_cluster_remove_from_excludelist')
+ if tf_xla_flags is not None:
+ args = parser.parse_args(tf_xla_flags.split())
+ candidates = vars(args)['tf_xla_cluster_remove_from_excludelist'].split(',')
+ is_disable_where_op_auto_cluster = 'Where' not in candidates
+
@parameterized.parameters(set((True, context.executing_eagerly())))
- @test_util.disable_xla(
+ @conditional_decorator(self, test_util.disable_xla(
"b/210077724: Auto-clustering with where op isn't supported. Has loose "
- "output shape bounds")
+ "output shape bounds"), is_disable_where_op_auto_cluster)
def testGradient(self, use_tape):
with test_util.device(use_gpu=True):
var = variables.Variable(
From 6a030ba980bc7cc700d5830b7aa08bee00dd8bb1 Mon Sep 17 00:00:00 2001
From: shuw
Date: Wed, 4 May 2022 11:02:15 -0700
Subject: [PATCH 002/259] Reverse ops exclusion logic
---
tensorflow/compiler/jit/flags.cc | 8 ++---
tensorflow/compiler/jit/flags.h | 2 +-
.../compiler/jit/mark_for_compilation_pass.cc | 35 +++++++++----------
.../kernel_tests/array_ops/array_ops_test.py | 6 ++--
4 files changed, 24 insertions(+), 27 deletions(-)
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index b425ae09a05183..7134a43afcaea2 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -114,12 +114,12 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
" BN: TF FusedBatchNorm* operations."
" FUSIBLE: All TF operations that XLA can fuse (All the above). "
"You can also put any TF operation name, e.g. 'FUSIBLE,MatMul'."),
- Flag("tf_xla_cluster_remove_from_excludelist",
- &mark_for_compilation_flags->tf_xla_cluster_remove_from_excludelist,
+ Flag("tf_xla_cluster_exclude_ops",
+ &mark_for_compilation_flags->tf_xla_cluster_exclude_ops,
"(experimental) "
- "Remove the operations in auto-clustering excludelist. "
+ "Exclude the operations from auto-clustering. "
"If multiple, separate them with commas."
- " Where, Unique, CollectiveReduceV2."),
+ " Where, Some_other_ops"),
Flag("tf_xla_clustering_debug",
&mark_for_compilation_flags->tf_xla_clustering_debug,
"Dump graphs during XLA compilation."),
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index d85c9560e1ef24..b4900a48e0785f 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -62,7 +62,7 @@ struct MarkForCompilationPassFlags {
string tf_xla_ops_to_cluster;
// If non-empty, remove following operations from XLA clustering excludelist.
- string tf_xla_cluster_remove_from_excludelist;
+ string tf_xla_cluster_exclude_ops;
// Dump graphs during XLA compilation.
bool tf_xla_clustering_debug;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index a73e12796a1ad7..48cb89a3676e07 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -1189,21 +1189,22 @@ StatusOr<bool> IsIdentityDrivingConstsInLoop(Node* node) {
return true;
}
-absl::flat_hash_set<string> GetOrCreateClusterRemoveFromExcludelistOpList() {
+absl::flat_hash_set<string> GetOrCreateClusterExcludeList() {
MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
- absl::flat_hash_set<string> removelist;
- for (auto s : absl::StrSplit(flags->tf_xla_cluster_remove_from_excludelist, ',')) {
+ absl::flat_hash_set<string> excludelist;
+ for (auto s : absl::StrSplit(flags->tf_xla_cluster_exclude_ops, ',')) {
if (!s.empty()) {
- removelist.insert(string(s)) ;
+ excludelist.insert(string(s)) ;
}
}
- if (VLOG_IS_ON(2) && !removelist.empty()) {
- std::vector<string> vremovelist(removelist.begin(), removelist.end());
- absl::c_sort(vremovelist);
- VLOG(2) << "XLA clustering will remove following TF operations from excludelist: "
- << absl::StrJoin(vremovelist, " ");
+ if (VLOG_IS_ON(2) && !excludelist.empty()) {
+ std::vector<string> vexcludelist(excludelist.begin(), excludelist.end());
+ absl::c_sort(vexcludelist);
+ VLOG(2) << "XLA clustering will exclude following TF operations from auto "
+ "clustering: "
+ << absl::StrJoin(vexcludelist, " ");
}
- return removelist;
+ return excludelist;
}
absl::flat_hash_set<string> GetOrCreateAllowlist() {
@@ -1306,26 +1307,22 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() {
continue;
}
- auto cluster_remove_op_list = GetOrCreateClusterRemoveFromExcludelistOpList();
+ auto cluster_exclude_op_list = GetOrCreateClusterExcludeList();
RecursiveCompilabilityChecker::OperationFilter filter =
CreateOperationFilter(*registration);
filter.require_always_compilable = true;
filter.allow_string_consts = false;
filter.allow_collective_reduce_v2 = false;
- filter.allow_where_op = false;
filter.allow_unique_op = false;
+ filter.allow_where_op = true;
- for (const auto& s : cluster_remove_op_list) {
+ for (const auto& s : cluster_exclude_op_list) {
if (s == "Where") {
- filter.allow_where_op = true;
- } else if (s == "Unique") {
- filter.allow_unique_op = true;
- } else if (s == "CollectiveReduceV2") {
- filter.allow_collective_reduce_v2 = true;
+ filter.allow_where_op = false;
} else {
return errors::InvalidArgument(
"The operation '", s,
- "' passed to --tf_xla_cluster_allow_ops is not supported by XLA.");
+ "' passed to --tf_xla_cluster_exclude_ops is not supported by XLA.");
}
}
diff --git a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
index 334bd069dd10f3..410c8b3fa99632 100644
--- a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
@@ -1075,11 +1075,11 @@ def decorator(func):
is_disable_where_op_auto_cluster = True
tf_xla_flags= os.environ.get('TF_XLA_FLAGS')
parser = argparse.ArgumentParser()
- parser.add_argument('--tf_xla_cluster_remove_from_excludelist')
+ parser.add_argument('--tf_xla_cluster_exclude_ops')
if tf_xla_flags is not None:
args = parser.parse_args(tf_xla_flags.split())
- candidates = vars(args)['tf_xla_cluster_remove_from_excludelist'].split(',')
- is_disable_where_op_auto_cluster = 'Where' not in candidates
+ candidates = vars(args)['tf_xla_cluster_exclude_ops'].split(',')
+ is_disable_where_op_auto_cluster = 'Where' in candidates
@parameterized.parameters(set((True, context.executing_eagerly())))
@conditional_decorator(self, test_util.disable_xla(
From 73090fd74687ac8dd84de4fd3a51b257f5ffc144 Mon Sep 17 00:00:00 2001
From: shuw
Date: Tue, 10 May 2022 08:58:15 -0700
Subject: [PATCH 003/259] Revert array_ops_test
---
.../kernel_tests/array_ops/array_ops_test.py | 22 ++-----------------
1 file changed, 2 insertions(+), 20 deletions(-)
diff --git a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
index 410c8b3fa99632..cabd4bddc22b9a 100644
--- a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
@@ -13,8 +13,6 @@
# limitations under the License.
# ==============================================================================
"""Tests for array_ops."""
-import argparse
-import os
import re
import time
import unittest
@@ -1065,26 +1063,10 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase,
parameterized.TestCase):
"""Test that strided slice's custom gradient produces correct gradients."""
- def conditional_decorator(self, dec, is_disable):
- def decorator(func):
- if is_disable:
- return func
- return dec(func)
- return decorator
-
- is_disable_where_op_auto_cluster = True
- tf_xla_flags= os.environ.get('TF_XLA_FLAGS')
- parser = argparse.ArgumentParser()
- parser.add_argument('--tf_xla_cluster_exclude_ops')
- if tf_xla_flags is not None:
- args = parser.parse_args(tf_xla_flags.split())
- candidates = vars(args)['tf_xla_cluster_exclude_ops'].split(',')
- is_disable_where_op_auto_cluster = 'Where' in candidates
-
@parameterized.parameters(set((True, context.executing_eagerly())))
- @conditional_decorator(self, test_util.disable_xla(
+ @test_util.disable_xla(
"b/210077724: Auto-clustering with where op isn't supported. Has loose "
- "output shape bounds"), is_disable_where_op_auto_cluster)
+ "output shape bounds")
def testGradient(self, use_tape):
with test_util.device(use_gpu=True):
var = variables.Variable(
From 6421564703e17be6ec9e25e87cb6a8012748a09c Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 26 May 2022 16:35:20 +0000
Subject: [PATCH 004/259] TF2XLA Bincount initial test and dummy kernel
---
tensorflow/compiler/tf2xla/kernels/BUILD | 1 +
.../compiler/tf2xla/kernels/bincount_op.cc | 63 +++++++++++++++++++
.../python/eager/def_function_xla_jit_test.py | 13 ++++
3 files changed, 77 insertions(+)
create mode 100644 tensorflow/compiler/tf2xla/kernels/bincount_op.cc
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index aa670d8332d5c1..f47aa62ac74162 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -33,6 +33,7 @@ tf_kernel_library(
"beta_op.cc",
"bias_ops.cc",
"binary_ops.cc",
+ "bincount_op.cc",
"broadcast_to_op.cc",
"bucketize_op.cc",
"cast_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
new file mode 100644
index 00000000000000..3f5d640d131938
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -0,0 +1,63 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include
+
+#include
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/comparison_util.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/ops_util.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/tpu/tpu_defs.h"
+
+namespace tensorflow {
+namespace {
+
+class BincountOp : public XlaOpKernel {
+ public:
+ explicit BincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+ DataType dtype;
+ }
+
+ void Compile(XlaOpKernelContext* ctx) override {
+ xla::XlaOp input = ctx->Input(0);
+
+ ctx->SetOutput(0, input);
+ }
+};
+
+REGISTER_XLA_OP(Name("Bincount"), BincountOp);
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py
index 4f971bb73284bf..22538210468b17 100644
--- a/tensorflow/python/eager/def_function_xla_jit_test.py
+++ b/tensorflow/python/eager/def_function_xla_jit_test.py
@@ -27,6 +27,7 @@
from tensorflow.python.framework import tensor_spec
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import bincount_ops
from tensorflow.python.ops import collective_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import control_flow_util
@@ -693,6 +694,18 @@ def f(x):
self.assertAllClose(f(constant_op.constant([3.1, 3.2, 3.2])), [3.1, 3.2])
+ @test_util.disable_mlir_bridge('TODO(b/199737685): MLIR bridge does not'
+ 'support tf.unique via jit_compile')
+ def testBincountCompilability(self):
+ self.skipTest("(TODO) There is only a dummy kernel")
+ with ops.device('device:{}:0'.format(self.device)):
+
+ @def_function.function(jit_compile=True)
+ def f(x):
+ return bincount_ops.bincount(x)
+ x =constant_op.constant([1, 1, 2, 3, 2, 4, 4, 5])
+ self.assertAllClose(f(x), [0, 2, 2, 1, 2, 1])
+
def testUpdateVariableMemoryUsage(self):
with ops.device('device:{}:0'.format(self.device)):
From 61b1b45e2be5812fa7e420d6d83b03a74e80f565 Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 26 May 2022 18:53:49 +0000
Subject: [PATCH 005/259] Add tf2xla direct kernel test
---
tensorflow/compiler/tests/BUILD | 19 +++++++++
tensorflow/compiler/tests/bincount_op_test.py | 42 +++++++++++++++++++
2 files changed, 61 insertions(+)
create mode 100644 tensorflow/compiler/tests/bincount_op_test.py
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index c1fa93127fd235..99ea23ad1967a1 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -2143,6 +2143,25 @@ tf_xla_py_test(
],
)
+tf_xla_py_test(
+ name = "bincount_op_test",
+ size = "small",
+ srcs = ["bincount_op_test.py"],
+ enable_mlir_bridge = False,
+ tags = [
+ "no_pip",
+ "optonly",
+ ],
+ deps = [
+ ":xla_test",
+ "//tensorflow/python:bincount_ops",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:errors",
+ "//tensorflow/python:framework",
+ "//tensorflow/python/compiler/xla:compiler_py",
+ ],
+)
+
tf_xla_py_test(
name = "where_op_test",
size = "small",
diff --git a/tensorflow/compiler/tests/bincount_op_test.py b/tensorflow/compiler/tests/bincount_op_test.py
new file mode 100644
index 00000000000000..ce0d5266cb5489
--- /dev/null
+++ b/tensorflow/compiler/tests/bincount_op_test.py
@@ -0,0 +1,42 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for where op."""
+
+# pylint: disable=g-direct-tensorflow-import
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import bincount_ops
+from tensorflow.python.platform import test
+# pylint: enable=g-direct-tensorflow-import
+
+
+class WhereOpTest(xla_test.XLATestCase):
+
+ def testBincount(self):
+ """Test first form of where (return indices)."""
+
+ with self.session() as sess:
+ with self.test_scope():
+ x = array_ops.placeholder(dtypes.int32)
+ values = bincount_ops.bincount(x)
+
+ # Output of the computation is dynamic.
+ feed = [1, 1, 2, 3, 2, 4, 4, 5]
+ self.assertAllEqual([0, 2, 2, 1, 2, 1],
+ sess.run(values, {x: feed}))
+
+
+if __name__ == "__main__":
+ test.main()
From 7d604334c0db9a6e3478cb326db906c38d0f146a Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 26 May 2022 18:56:01 +0000
Subject: [PATCH 006/259] Add dummy kernel direct test
---
tensorflow/compiler/tests/bincount_op_test.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/tensorflow/compiler/tests/bincount_op_test.py b/tensorflow/compiler/tests/bincount_op_test.py
index ce0d5266cb5489..8e73263c8909a6 100644
--- a/tensorflow/compiler/tests/bincount_op_test.py
+++ b/tensorflow/compiler/tests/bincount_op_test.py
@@ -25,6 +25,7 @@
class WhereOpTest(xla_test.XLATestCase):
def testBincount(self):
+ self.skipTest("TODO: this a dummy kernel")
"""Test first form of where (return indices)."""
with self.session() as sess:
From 0dc3397de7012bf2d354825803aa5ae11452eb45 Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 26 May 2022 19:02:28 +0000
Subject: [PATCH 007/259] Dep fix
---
tensorflow/compiler/tests/BUILD | 1 +
tensorflow/compiler/tests/bincount_op_test.py | 3 +--
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 99ea23ad1967a1..595b0c2cccb8c0 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -2154,6 +2154,7 @@ tf_xla_py_test(
],
deps = [
":xla_test",
+ "//tensorflow/python:array_ops",
"//tensorflow/python:bincount_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:errors",
diff --git a/tensorflow/compiler/tests/bincount_op_test.py b/tensorflow/compiler/tests/bincount_op_test.py
index 8e73263c8909a6..150ce428b200dc 100644
--- a/tensorflow/compiler/tests/bincount_op_test.py
+++ b/tensorflow/compiler/tests/bincount_op_test.py
@@ -17,6 +17,7 @@
# pylint: disable=g-direct-tensorflow-import
from tensorflow.compiler.tests import xla_test
from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import bincount_ops
from tensorflow.python.platform import test
# pylint: enable=g-direct-tensorflow-import
@@ -26,8 +27,6 @@ class WhereOpTest(xla_test.XLATestCase):
def testBincount(self):
self.skipTest("TODO: this a dummy kernel")
- """Test first form of where (return indices)."""
-
with self.session() as sess:
with self.test_scope():
x = array_ops.placeholder(dtypes.int32)
From e8948092508eb7255f7ea80b301bb603b62f7df2 Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 26 May 2022 19:03:27 +0000
Subject: [PATCH 008/259] Fix typo
---
tensorflow/python/eager/def_function_xla_jit_test.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py
index 22538210468b17..9ccc98ae2ca0ef 100644
--- a/tensorflow/python/eager/def_function_xla_jit_test.py
+++ b/tensorflow/python/eager/def_function_xla_jit_test.py
@@ -695,7 +695,7 @@ def f(x):
self.assertAllClose(f(constant_op.constant([3.1, 3.2, 3.2])), [3.1, 3.2])
@test_util.disable_mlir_bridge('TODO(b/199737685): MLIR bridge does not'
- 'support tf.unique via jit_compile')
+ 'support tf.bincount via jit_compile')
def testBincountCompilability(self):
self.skipTest("(TODO) There is only a dummy kernel")
with ops.device('device:{}:0'.format(self.device)):
From 28d776fdba97f3739135c63cf6c5cc68970ee944 Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 26 May 2022 19:08:05 +0000
Subject: [PATCH 009/259] Minimize the import for the template
---
.../compiler/tf2xla/kernels/bincount_op.cc | 26 +------------------
1 file changed, 1 insertion(+), 25 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 3f5d640d131938..a5b3570a5d344e 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -13,37 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include
-
-#include
-
-#include "absl/types/optional.h"
-#include "tensorflow/compiler/tf2xla/literal_util.h"
-#include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/lib/comparators.h"
-#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/comparison_util.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/shape.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/ops_util.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/tpu/tpu_defs.h"
namespace tensorflow {
namespace {
+// TODO: This is only a dummy kernel
class BincountOp : public XlaOpKernel {
public:
explicit BincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
From 790e14f61884a54a317c237fe549b67ebdf21618 Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 26 May 2022 19:41:09 +0000
Subject: [PATCH 010/259] Fix name
---
tensorflow/compiler/tests/bincount_op_test.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tensorflow/compiler/tests/bincount_op_test.py b/tensorflow/compiler/tests/bincount_op_test.py
index 150ce428b200dc..bae58f583a8d91 100644
--- a/tensorflow/compiler/tests/bincount_op_test.py
+++ b/tensorflow/compiler/tests/bincount_op_test.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-"""Tests for where op."""
+"""Tests for bincount op."""
# pylint: disable=g-direct-tensorflow-import
from tensorflow.compiler.tests import xla_test
@@ -23,7 +23,7 @@
# pylint: enable=g-direct-tensorflow-import
-class WhereOpTest(xla_test.XLATestCase):
+class BincountOpTest(xla_test.XLATestCase):
def testBincount(self):
self.skipTest("TODO: this a dummy kernel")
From 325f49be5c02f6b9da94b2fb95f8242a00480475 Mon Sep 17 00:00:00 2001
From: bhack
Date: Sun, 29 May 2022 22:31:14 +0000
Subject: [PATCH 011/259] Dumb implementation for the basic test
---
tensorflow/compiler/tests/bincount_op_test.py | 1 -
.../compiler/tf2xla/kernels/bincount_op.cc | 63 ++++++++++++++++++-
.../python/eager/def_function_xla_jit_test.py | 1 -
3 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/tensorflow/compiler/tests/bincount_op_test.py b/tensorflow/compiler/tests/bincount_op_test.py
index bae58f583a8d91..7e2a388139633c 100644
--- a/tensorflow/compiler/tests/bincount_op_test.py
+++ b/tensorflow/compiler/tests/bincount_op_test.py
@@ -26,7 +26,6 @@
class BincountOpTest(xla_test.XLATestCase):
def testBincount(self):
- self.skipTest("TODO: this a dummy kernel")
with self.session() as sess:
with self.test_scope():
x = array_ops.placeholder(dtypes.int32)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index a5b3570a5d344e..bd996544f818f1 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -13,8 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
namespace tensorflow {
namespace {
@@ -25,11 +31,64 @@ class BincountOp : public XlaOpKernel {
explicit BincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
DataType dtype;
}
-
+
void Compile(XlaOpKernelContext* ctx) override {
+ // Dumb implementation for the simplest test case
xla::XlaOp input = ctx->Input(0);
+ auto max = xla::ReduceAll(
+ input, xla::Zero(ctx->builder(), xla::S32),
+ xla::CreateScalarMaxComputation(xla::S32, ctx->builder()));
+ // TODO: it need to be max
+ int64_t output_size = 6;
+ StatusOr input_shape_or = ctx->builder()->GetShape(input);
+ OP_REQUIRES_OK(ctx, input_shape_or.status());
+ auto input_shape = input_shape_or.ValueOrDie();
+ int64_t size = input_shape.dimensions(0);
+
+ auto counter_shape = xla::ShapeUtil::MakeShape(xla::S32, {});
+ const xla::Shape data_shape = xla::ShapeUtil::MakeShape(xla::S32, {size});
+
+ const xla::Shape output_shape = xla::ShapeUtil::MakeShape(xla::S32, {output_size});
+
+ auto loop_shape = xla::ShapeUtil::MakeTupleShape(
+ {counter_shape, data_shape, output_shape});
+
+ // Create a computation for the condition
+ xla::XlaComputation condition;
+ {
+ std::unique_ptr builder =
+ ctx->builder()->CreateSubBuilder("condition");
+ auto param = xla::Parameter(builder.get(), 0, loop_shape, "param");
+ auto counter = xla::GetTupleElement(param, 0);
+ xla::Gt(xla::ConstantR0(builder.get(), size), counter);
+ condition = builder->Build().ConsumeValueOrDie();
+ }
- ctx->SetOutput(0, input);
+ // Create a computation for the body
+ xla::XlaComputation body;
+ {
+ std::unique_ptr builder =
+ ctx->builder()->CreateSubBuilder("body");
+ auto param = Parameter(builder.get(), 0, loop_shape, "param");
+ auto counter = xla::GetTupleElement(param, 0);
+ auto data_stack = xla::GetTupleElement(param, 1);
+ auto accum_stack = xla::GetTupleElement(param, 2);
+ auto data = xla::DynamicSlice(data_stack, {counter}, {1});
+ auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
+ accum = accum + xla::One(builder.get(), xla::S32);
+ accum_stack = xla::DynamicUpdateSlice(
+ accum_stack, xla::Reshape(accum, {1}), {data});
+ counter = counter + xla::One(builder.get(), xla::S32);
+ xla::Tuple(builder.get(), {counter, data_stack, accum_stack});
+ body = builder->Build().ConsumeValueOrDie();
+ }
+ // Create a While node with computations for the condition and the body.
+ auto zero = xla::Zero(ctx->builder(), xla::S32);
+ auto zero_broadcast = xla::Broadcast(zero, {output_size});
+ auto init = xla::Tuple(ctx->builder(), {zero, input, zero_broadcast});
+ auto result = xla::While(condition, body, init);
+ auto output = xla::GetTupleElement(result,2);
+ ctx->SetOutput(0, output);
}
};
diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py
index 9ccc98ae2ca0ef..4c4acb26fbd91f 100644
--- a/tensorflow/python/eager/def_function_xla_jit_test.py
+++ b/tensorflow/python/eager/def_function_xla_jit_test.py
@@ -697,7 +697,6 @@ def f(x):
@test_util.disable_mlir_bridge('TODO(b/199737685): MLIR bridge does not'
'support tf.bincount via jit_compile')
def testBincountCompilability(self):
- self.skipTest("(TODO) There is only a dummy kernel")
with ops.device('device:{}:0'.format(self.device)):
@def_function.function(jit_compile=True)
From acf9b6c9ee20d3bd6000981d06e8cfcfbb3f5c88 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 30 May 2022 14:52:18 +0000
Subject: [PATCH 012/259] Get the output size at compile time
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index bd996544f818f1..37e5886382ad1f 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -35,16 +35,12 @@ class BincountOp : public XlaOpKernel {
void Compile(XlaOpKernelContext* ctx) override {
// Dumb implementation for the simplest test case
xla::XlaOp input = ctx->Input(0);
- auto max = xla::ReduceAll(
- input, xla::Zero(ctx->builder(), xla::S32),
- xla::CreateScalarMaxComputation(xla::S32, ctx->builder()));
- // TODO: it need to be max
- int64_t output_size = 6;
+ int64_t output_size;
+ ctx->ConstantInputAsIntScalar("size",&output_size);
StatusOr input_shape_or = ctx->builder()->GetShape(input);
OP_REQUIRES_OK(ctx, input_shape_or.status());
auto input_shape = input_shape_or.ValueOrDie();
int64_t size = input_shape.dimensions(0);
-
auto counter_shape = xla::ShapeUtil::MakeShape(xla::S32, {});
const xla::Shape data_shape = xla::ShapeUtil::MakeShape(xla::S32, {size});
@@ -92,7 +88,7 @@ class BincountOp : public XlaOpKernel {
}
};
-REGISTER_XLA_OP(Name("Bincount"), BincountOp);
+REGISTER_XLA_OP(Name("Bincount").CompileTimeConstantInput("size"), BincountOp);
} // namespace
} // namespace tensorflow
From da6e95ff004f825eded1e31e3c45b3a220a16004 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 30 May 2022 22:39:46 +0000
Subject: [PATCH 013/259] Move the test and add an extra failing one
---
tensorflow/compiler/tests/bincount_op_test.py | 41 -------------------
.../python/eager/def_function_xla_jit_test.py | 12 ------
tensorflow/python/ops/bincount_ops_test.py | 38 +++++++++++++++++
3 files changed, 38 insertions(+), 53 deletions(-)
delete mode 100644 tensorflow/compiler/tests/bincount_op_test.py
diff --git a/tensorflow/compiler/tests/bincount_op_test.py b/tensorflow/compiler/tests/bincount_op_test.py
deleted file mode 100644
index 7e2a388139633c..00000000000000
--- a/tensorflow/compiler/tests/bincount_op_test.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for bincount op."""
-
-# pylint: disable=g-direct-tensorflow-import
-from tensorflow.compiler.tests import xla_test
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import bincount_ops
-from tensorflow.python.platform import test
-# pylint: enable=g-direct-tensorflow-import
-
-
-class BincountOpTest(xla_test.XLATestCase):
-
- def testBincount(self):
- with self.session() as sess:
- with self.test_scope():
- x = array_ops.placeholder(dtypes.int32)
- values = bincount_ops.bincount(x)
-
- # Output of the computation is dynamic.
- feed = [1, 1, 2, 3, 2, 4, 4, 5]
- self.assertAllEqual([0, 2, 2, 1, 2, 1],
- sess.run(values, {x: feed}))
-
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py
index 4c4acb26fbd91f..4f971bb73284bf 100644
--- a/tensorflow/python/eager/def_function_xla_jit_test.py
+++ b/tensorflow/python/eager/def_function_xla_jit_test.py
@@ -27,7 +27,6 @@
from tensorflow.python.framework import tensor_spec
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import bincount_ops
from tensorflow.python.ops import collective_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import control_flow_util
@@ -694,17 +693,6 @@ def f(x):
self.assertAllClose(f(constant_op.constant([3.1, 3.2, 3.2])), [3.1, 3.2])
- @test_util.disable_mlir_bridge('TODO(b/199737685): MLIR bridge does not'
- 'support tf.bincount via jit_compile')
- def testBincountCompilability(self):
- with ops.device('device:{}:0'.format(self.device)):
-
- @def_function.function(jit_compile=True)
- def f(x):
- return bincount_ops.bincount(x)
- x =constant_op.constant([1, 1, 2, 3, 2, 4, 4, 5])
- self.assertAllClose(f(x), [0, 2, 2, 1, 2, 1])
-
def testUpdateVariableMemoryUsage(self):
with ops.device('device:{}:0'.format(self.device)):
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index de7d1423870d76..bdcd8f27176590 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -18,6 +18,7 @@
import numpy as np
from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
@@ -163,6 +164,43 @@ def test_dense_input(self,
self.assertAllEqual(expected_values, y.values)
self.assertAllEqual(expected_shape, y.dense_shape)
+ @parameterized.named_parameters(
+ {
+ "testcase_name": "_baseline_test",
+ "x": np.array([1, 1, 2, 3, 2, 4, 4, 5], dtype=np.int32),
+ "expected_values": [0, 2, 2, 1, 2, 1],
+ "expected_shape": [6]
+ }, {
+ "testcase_name": "_no_maxlength",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [1, 1, 1, 2, 1],
+ "expected_shape": [2, 6]
+ })
+ def test_compiled_dense_input(self,
+ x,
+ expected_values,
+ expected_shape,
+ minlength=None,
+ maxlength=None,
+ binary_output=False,
+ weights=None,
+ axis=-1):
+ @def_function.function(jit_compile=True)
+ def f():
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis
+ )
+ return y
+ y = f()
+ self.assertAllEqual(expected_values, y)
+ #self.assertAllEqual(expected_shape, y)
+
+
@parameterized.named_parameters(
{
"testcase_name":
From a4c387aaafc044f4977d98ffd59f11a5468e7974 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 30 May 2022 22:41:19 +0000
Subject: [PATCH 014/259] Align spaces
---
tensorflow/python/ops/bincount_ops_test.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index bdcd8f27176590..47426d171785ec 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -169,12 +169,12 @@ def test_dense_input(self,
"testcase_name": "_baseline_test",
"x": np.array([1, 1, 2, 3, 2, 4, 4, 5], dtype=np.int32),
"expected_values": [0, 2, 2, 1, 2, 1],
- "expected_shape": [6]
+ "expected_shape": [6]
}, {
"testcase_name": "_no_maxlength",
"x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
"expected_values": [1, 1, 1, 2, 1],
- "expected_shape": [2, 6]
+ "expected_shape": [2, 6]
})
def test_compiled_dense_input(self,
x,
From 81024e96909dabaabcabadfc2326f3e48d28f2b7 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 30 May 2022 23:06:47 +0000
Subject: [PATCH 015/259] Rename test
---
tensorflow/python/ops/bincount_ops_test.py | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 47426d171785ec..1b2b22122f29e7 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -169,17 +169,14 @@ def test_dense_input(self,
"testcase_name": "_baseline_test",
"x": np.array([1, 1, 2, 3, 2, 4, 4, 5], dtype=np.int32),
"expected_values": [0, 2, 2, 1, 2, 1],
- "expected_shape": [6]
}, {
"testcase_name": "_no_maxlength",
"x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
"expected_values": [1, 1, 1, 2, 1],
- "expected_shape": [2, 6]
})
- def test_compiled_dense_input(self,
+ def test_compiled_dense(self,
x,
expected_values,
- expected_shape,
minlength=None,
maxlength=None,
binary_output=False,
@@ -198,7 +195,6 @@ def f():
return y
y = f()
self.assertAllEqual(expected_values, y)
- #self.assertAllEqual(expected_shape, y)
@parameterized.named_parameters(
From 2f75d39e7256769f13d5916d34b6b68989cedb4b Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 30 May 2022 23:07:25 +0000
Subject: [PATCH 016/259] Rename test case
---
tensorflow/python/ops/bincount_ops_test.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 1b2b22122f29e7..d338addc29bf79 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -166,7 +166,7 @@ def test_dense_input(self,
@parameterized.named_parameters(
{
- "testcase_name": "_baseline_test",
+ "testcase_name": "_baseline",
"x": np.array([1, 1, 2, 3, 2, 4, 4, 5], dtype=np.int32),
"expected_values": [0, 2, 2, 1, 2, 1],
}, {
From 2ac5fae8d735b7e32ccb19c7729425c277b3f3b5 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 30 May 2022 23:14:49 +0000
Subject: [PATCH 017/259] Fix test
---
tensorflow/python/ops/bincount_ops_test.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index d338addc29bf79..87272a7b381d96 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -172,7 +172,7 @@ def test_dense_input(self,
}, {
"testcase_name": "_no_maxlength",
"x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [1, 1, 1, 2, 1],
+ "expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]],
})
def test_compiled_dense(self,
x,
From 42af8b43c9150bf51ad34b51ce468896b440cf5f Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 30 May 2022 23:25:08 +0000
Subject: [PATCH 018/259] Add other tests
---
tensorflow/python/ops/bincount_ops_test.py | 23 ++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 87272a7b381d96..d26f63077df201 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -166,13 +166,24 @@ def test_dense_input(self,
@parameterized.named_parameters(
{
- "testcase_name": "_baseline",
- "x": np.array([1, 1, 2, 3, 2, 4, 4, 5], dtype=np.int32),
- "expected_values": [0, 2, 2, 1, 2, 1],
+ "testcase_name": "_baseline",
+ "x": np.array([1, 1, 2, 3, 2, 4, 4, 5], dtype=np.int32),
+ "expected_values": [0, 2, 2, 1, 2, 1]
}, {
- "testcase_name": "_no_maxlength",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]],
+ "testcase_name": "_no_maxlength",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
+ }, {
+ "testcase_name": "_maxlength",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "maxlength": 7,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
+ }, {
+ "testcase_name": "_minlength",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 9,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
+ [1, 0, 0, 0, 2, 0, 0, 1, 0]]
})
def test_compiled_dense(self,
x,
From 54835bcbd13bbb09bbb3e6d21a5a3c109b057653 Mon Sep 17 00:00:00 2001
From: bhack
Date: Tue, 31 May 2022 16:18:55 +0000
Subject: [PATCH 019/259] Rename XLA op to DenseBincount and add a failing test
---
.../compiler/tf2xla/kernels/bincount_op.cc | 6 ++--
tensorflow/python/ops/bincount_ops_test.py | 29 +++++++++----------
2 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 37e5886382ad1f..dfafa37bff3c3a 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -26,9 +26,9 @@ namespace tensorflow {
namespace {
// TODO: This is only a dummy kernel
-class BincountOp : public XlaOpKernel {
+class DenseBincountOp : public XlaOpKernel {
public:
- explicit BincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+ explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
DataType dtype;
}
@@ -88,7 +88,7 @@ class BincountOp : public XlaOpKernel {
}
};
-REGISTER_XLA_OP(Name("Bincount").CompileTimeConstantInput("size"), BincountOp);
+REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"), DenseBincountOp);
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index d26f63077df201..b830988d031e13 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -30,7 +30,6 @@
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.platform import test
-
class TestSparseCount(test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(
@@ -173,18 +172,8 @@ def test_dense_input(self,
"testcase_name": "_no_maxlength",
"x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
"expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
- }, {
- "testcase_name": "_maxlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "maxlength": 7,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
- }, {
- "testcase_name": "_minlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 9,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 2, 0, 0, 1, 0]]
- })
+ },)
+
def test_compiled_dense(self,
x,
expected_values,
@@ -194,7 +183,12 @@ def test_compiled_dense(self,
weights=None,
axis=-1):
@def_function.function(jit_compile=True)
- def f():
+ def f(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis):
y = bincount_ops.bincount(
x,
weights=weights,
@@ -204,7 +198,12 @@ def f():
axis=axis
)
return y
- y = f()
+ y = f(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis)
self.assertAllEqual(expected_values, y)
From 9a75d11a026cd9b73bb2fa518cfd96778eaea019 Mon Sep 17 00:00:00 2001
From: bhack
Date: Wed, 1 Jun 2022 01:42:00 +0000
Subject: [PATCH 020/259] Add rank2 code
---
.../compiler/tf2xla/kernels/bincount_op.cc | 36 ++++++++++++++-----
1 file changed, 28 insertions(+), 8 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index dfafa37bff3c3a..af66ef441a8982 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -36,15 +36,21 @@ class DenseBincountOp : public XlaOpKernel {
// Dumb implementation for the simplest test case
xla::XlaOp input = ctx->Input(0);
int64_t output_size;
- ctx->ConstantInputAsIntScalar("size",&output_size);
+ ctx->ConstantInputAsIntScalar("size", &output_size);
StatusOr input_shape_or = ctx->builder()->GetShape(input);
OP_REQUIRES_OK(ctx, input_shape_or.status());
auto input_shape = input_shape_or.ValueOrDie();
- int64_t size = input_shape.dimensions(0);
+ auto size = input_shape.dimensions(0);
+ auto dim = 1;
+ auto rank = input_shape.rank();
auto counter_shape = xla::ShapeUtil::MakeShape(xla::S32, {});
- const xla::Shape data_shape = xla::ShapeUtil::MakeShape(xla::S32, {size});
+ const xla::Shape data_shape = xla::ShapeUtil::MakeShape(xla::S32, {input_shape.dimensions()});
- const xla::Shape output_shape = xla::ShapeUtil::MakeShape(xla::S32, {output_size});
+ xla::Shape output_shape = xla::ShapeUtil::MakeShape(xla::S32, {output_size});
+ if (rank == 2) {
+ output_shape = xla::ShapeUtil::MakeShape(xla::S32, {rank, output_size});
+ dim = input_shape.dimensions(1);
+ }
auto loop_shape = xla::ShapeUtil::MakeTupleShape(
{counter_shape, data_shape, output_shape});
@@ -56,31 +62,45 @@ class DenseBincountOp : public XlaOpKernel {
ctx->builder()->CreateSubBuilder("condition");
auto param = xla::Parameter(builder.get(), 0, loop_shape, "param");
auto counter = xla::GetTupleElement(param, 0);
- xla::Gt(xla::ConstantR0(builder.get(), size), counter);
+ xla::Gt(xla::ConstantR0(builder.get(), size*dim), counter);
condition = builder->Build().ConsumeValueOrDie();
}
-
+
// Create a computation for the body
xla::XlaComputation body;
{
std::unique_ptr builder =
- ctx->builder()->CreateSubBuilder("body");
+ ctx->builder()->CreateSubBuilder("body");
auto param = Parameter(builder.get(), 0, loop_shape, "param");
auto counter = xla::GetTupleElement(param, 0);
auto data_stack = xla::GetTupleElement(param, 1);
auto accum_stack = xla::GetTupleElement(param, 2);
+ if (rank == 1) {
auto data = xla::DynamicSlice(data_stack, {counter}, {1});
auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
accum = accum + xla::One(builder.get(), xla::S32);
accum_stack = xla::DynamicUpdateSlice(
accum_stack, xla::Reshape(accum, {1}), {data});
+ }
+ else {
+ auto dim_xla = xla::ConstantR0(builder.get(), dim);
+ auto idx_1 = xla::Div(counter, dim_xla);
+ auto idx_2 = counter % dim_xla;
+ auto data = xla::DynamicSlice(data_stack, {idx_1, idx_2}, {1, 1});
+ auto data_scalar = xla::Reshape(data, {0,1}, {});
+ auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
+ accum = accum + xla::One(builder.get(), xla::S32);
+ accum_stack = xla::DynamicUpdateSlice(
+ accum_stack, xla::Reshape(accum, {1, 1}), {idx_1, data_scalar});
+ }
counter = counter + xla::One(builder.get(), xla::S32);
xla::Tuple(builder.get(), {counter, data_stack, accum_stack});
body = builder->Build().ConsumeValueOrDie();
}
+
// Create a While node with computations for the condition and the body.
auto zero = xla::Zero(ctx->builder(), xla::S32);
- auto zero_broadcast = xla::Broadcast(zero, {output_size});
+ auto zero_broadcast = xla::Broadcast(zero, {output_shape.dimensions()});
auto init = xla::Tuple(ctx->builder(), {zero, input, zero_broadcast});
auto result = xla::While(condition, body, init);
auto output = xla::GetTupleElement(result,2);
From 7f73eacc9f9ef42e0389e740a1733db0f99ce2a5 Mon Sep 17 00:00:00 2001
From: bhack
Date: Wed, 1 Jun 2022 19:09:39 +0000
Subject: [PATCH 021/259] Add conditional update
---
.../compiler/tf2xla/kernels/bincount_op.cc | 115 +++++++++++++++---
tensorflow/python/ops/bincount_ops_test.py | 8 +-
2 files changed, 102 insertions(+), 21 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index af66ef441a8982..53445ed94228a2 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -70,29 +70,106 @@ class DenseBincountOp : public XlaOpKernel {
xla::XlaComputation body;
{
std::unique_ptr builder =
- ctx->builder()->CreateSubBuilder("body");
+ ctx->builder()->CreateSubBuilder("body");
auto param = Parameter(builder.get(), 0, loop_shape, "param");
auto counter = xla::GetTupleElement(param, 0);
auto data_stack = xla::GetTupleElement(param, 1);
auto accum_stack = xla::GetTupleElement(param, 2);
- if (rank == 1) {
- auto data = xla::DynamicSlice(data_stack, {counter}, {1});
- auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
- accum = accum + xla::One(builder.get(), xla::S32);
- accum_stack = xla::DynamicUpdateSlice(
- accum_stack, xla::Reshape(accum, {1}), {data});
- }
- else {
- auto dim_xla = xla::ConstantR0(builder.get(), dim);
- auto idx_1 = xla::Div(counter, dim_xla);
- auto idx_2 = counter % dim_xla;
- auto data = xla::DynamicSlice(data_stack, {idx_1, idx_2}, {1, 1});
- auto data_scalar = xla::Reshape(data, {0,1}, {});
- auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
- accum = accum + xla::One(builder.get(), xla::S32);
- accum_stack = xla::DynamicUpdateSlice(
- accum_stack, xla::Reshape(accum, {1, 1}), {idx_1, data_scalar});
- }
+
+ if (rank == 1) {
+ auto data = xla::DynamicSlice(data_stack, {counter}, {1});
+ auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
+ auto data_scalar = xla::Reshape(data, {0}, {});
+
+ auto condition_shape = xla::ShapeUtil::MakeTupleShape(
+ {counter_shape, counter_shape, output_shape});
+
+ xla::XlaComputation update;
+ {
+ std::unique_ptr true_builder =
+ builder->CreateSubBuilder("update");
+ auto param = Parameter(true_builder.get(), 0, condition_shape, "param");
+ auto data_scalar = xla::GetTupleElement(param, 0);
+ auto accum = xla::GetTupleElement(param, 1);
+ auto accum_stack = xla::GetTupleElement(param, 2);
+ accum = accum + xla::One(true_builder.get(), xla::S32);
+ accum_stack = xla::DynamicUpdateSlice(
+ accum_stack, xla::Reshape(accum, {1}), {data_scalar});
+ xla::Tuple(true_builder.get(), {accum, accum_stack});
+ auto update = true_builder->Build().ValueOrDie();
+ }
+
+ xla::XlaComputation no_update;
+ {
+ std::unique_ptr false_builder =
+ builder->CreateSubBuilder("no_update");
+ auto param = Parameter(false_builder.get(), 0, condition_shape, "param");
+ auto data = xla::GetTupleElement(param, 0);
+ auto accum = xla::GetTupleElement(param, 1);
+ auto accum_stack = xla::GetTupleElement(param, 2);
+ xla::Tuple(false_builder.get(), {accum, accum_stack});
+ auto no_update = false_builder->Build().ValueOrDie();
+ }
+
+ std::unique_ptr cond_builder =
+ builder->CreateSubBuilder("cond");
+ auto output_size_xla = xla::ConstantR0(cond_builder.get(), output_size);
+ auto pred = xla::Lt(data_scalar, output_size_xla);
+ auto tuple = xla::Tuple(cond_builder.get(), {data_scalar, accum, accum_stack});
+ auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
+ accum = xla::GetTupleElement(cond, 0);
+ accum_stack = xla::GetTupleElement(cond, 1);
+ }
+ else {
+ auto condition_shape = xla::ShapeUtil::MakeTupleShape(
+ {counter_shape, counter_shape, output_shape, counter_shape});
+
+ auto dim_xla = xla::ConstantR0(builder.get(), dim);
+ auto idx_1 = xla::Div(counter, dim_xla);
+ auto idx_2 = counter % dim_xla;
+ auto data = xla::DynamicSlice(data_stack, {idx_1, idx_2}, {1, 1});
+ auto data_scalar = xla::Reshape(data, {0,1}, {});
+ auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
+
+ xla::XlaComputation update;
+ {
+ std::unique_ptr true_builder =
+ builder->CreateSubBuilder("update_rank2");
+ auto param = Parameter(true_builder.get(), 0, condition_shape, "param");
+
+ auto data_scalar = xla::GetTupleElement(param, 0);
+ auto idx_1 = xla::GetTupleElement(param, 1);
+ auto accum_stack = xla::GetTupleElement(param, 2);
+ auto accum = xla::GetTupleElement(param, 3);
+ accum = accum + xla::One(true_builder.get(), xla::S32);
+ accum_stack = xla::DynamicUpdateSlice(
+ accum_stack, xla::Reshape(accum, {1, 1}), {idx_1, data_scalar});
+ xla::Tuple(true_builder.get(), {accum, accum_stack});
+ auto update = true_builder->Build().ValueOrDie();
+ }
+
+ xla::XlaComputation no_update;
+ {
+ std::unique_ptr false_builder =
+ builder->CreateSubBuilder("no_update_rank2");
+ auto param = Parameter(false_builder.get(), 0, condition_shape, "param");
+ auto data_scalar = xla::GetTupleElement(param, 0);
+ auto idx_1 = xla::GetTupleElement(param, 1);
+ auto accum_stack = xla::GetTupleElement(param, 2);
+ auto accum = xla::GetTupleElement(param, 3);
+ xla::Tuple(false_builder.get(), {accum, accum_stack});
+ auto no_update = false_builder->Build().ValueOrDie();
+ }
+ std::unique_ptr cond_builder =
+ builder->CreateSubBuilder("cond_rank2");
+ auto output_size_xla = xla::ConstantR0(builder.get(), output_size);
+
+ auto pred = xla::Lt(data_scalar, output_size_xla);
+ auto tuple = xla::Tuple(cond_builder.get(), {data_scalar, idx_1, accum_stack, accum});
+ auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
+ accum = xla::GetTupleElement(cond, 0);
+ accum_stack = xla::GetTupleElement(cond, 1);
+ }
counter = counter + xla::One(builder.get(), xla::S32);
xla::Tuple(builder.get(), {counter, data_stack, accum_stack});
body = builder->Build().ConsumeValueOrDie();
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index b830988d031e13..e2844069301ff2 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -172,8 +172,12 @@ def test_dense_input(self,
"testcase_name": "_no_maxlength",
"x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
"expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
- },)
-
+ }, {
+ "testcase_name": "_maxlength",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "maxlength": 7,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
+ })
def test_compiled_dense(self,
x,
expected_values,
From 5662fe3acecdbefba552c77b6accab2677889b8b Mon Sep 17 00:00:00 2001
From: bhack
Date: Wed, 1 Jun 2022 19:48:14 +0000
Subject: [PATCH 022/259] Small fix
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 53445ed94228a2..d3c0943a5f9f76 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -96,7 +96,7 @@ class DenseBincountOp : public XlaOpKernel {
accum_stack = xla::DynamicUpdateSlice(
accum_stack, xla::Reshape(accum, {1}), {data_scalar});
xla::Tuple(true_builder.get(), {accum, accum_stack});
- auto update = true_builder->Build().ValueOrDie();
+ update = true_builder->Build().ValueOrDie();
}
xla::XlaComputation no_update;
@@ -108,7 +108,7 @@ class DenseBincountOp : public XlaOpKernel {
auto accum = xla::GetTupleElement(param, 1);
auto accum_stack = xla::GetTupleElement(param, 2);
xla::Tuple(false_builder.get(), {accum, accum_stack});
- auto no_update = false_builder->Build().ValueOrDie();
+ no_update = false_builder->Build().ValueOrDie();
}
std::unique_ptr cond_builder =
@@ -145,7 +145,7 @@ class DenseBincountOp : public XlaOpKernel {
accum_stack = xla::DynamicUpdateSlice(
accum_stack, xla::Reshape(accum, {1, 1}), {idx_1, data_scalar});
xla::Tuple(true_builder.get(), {accum, accum_stack});
- auto update = true_builder->Build().ValueOrDie();
+ update = true_builder->Build().ValueOrDie();
}
xla::XlaComputation no_update;
@@ -158,7 +158,7 @@ class DenseBincountOp : public XlaOpKernel {
auto accum_stack = xla::GetTupleElement(param, 2);
auto accum = xla::GetTupleElement(param, 3);
xla::Tuple(false_builder.get(), {accum, accum_stack});
- auto no_update = false_builder->Build().ValueOrDie();
+ no_update = false_builder->Build().ValueOrDie();
}
std::unique_ptr cond_builder =
builder->CreateSubBuilder("cond_rank2");
From eaf92ec5c788b4cd497bc6c3f3d6b366283f5408 Mon Sep 17 00:00:00 2001
From: bhack
Date: Wed, 1 Jun 2022 20:39:21 +0000
Subject: [PATCH 023/259] Fix
---
.../compiler/tf2xla/kernels/bincount_op.cc | 20 +++++++++----------
tensorflow/python/ops/bincount_ops_test.py | 12 +++++++++++
2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index d3c0943a5f9f76..a945e8431d3d71 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -79,6 +79,7 @@ class DenseBincountOp : public XlaOpKernel {
if (rank == 1) {
auto data = xla::DynamicSlice(data_stack, {counter}, {1});
auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
+ accum = xla::Reshape(accum, {0}, {});
auto data_scalar = xla::Reshape(data, {0}, {});
auto condition_shape = xla::ShapeUtil::MakeTupleShape(
@@ -87,7 +88,7 @@ class DenseBincountOp : public XlaOpKernel {
xla::XlaComputation update;
{
std::unique_ptr true_builder =
- builder->CreateSubBuilder("update");
+ ctx->builder()->CreateSubBuilder("update");
auto param = Parameter(true_builder.get(), 0, condition_shape, "param");
auto data_scalar = xla::GetTupleElement(param, 0);
auto accum = xla::GetTupleElement(param, 1);
@@ -102,7 +103,7 @@ class DenseBincountOp : public XlaOpKernel {
xla::XlaComputation no_update;
{
std::unique_ptr false_builder =
- builder->CreateSubBuilder("no_update");
+ ctx->builder()->CreateSubBuilder("no_update");
auto param = Parameter(false_builder.get(), 0, condition_shape, "param");
auto data = xla::GetTupleElement(param, 0);
auto accum = xla::GetTupleElement(param, 1);
@@ -111,11 +112,9 @@ class DenseBincountOp : public XlaOpKernel {
no_update = false_builder->Build().ValueOrDie();
}
- std::unique_ptr cond_builder =
- builder->CreateSubBuilder("cond");
- auto output_size_xla = xla::ConstantR0(cond_builder.get(), output_size);
+ auto output_size_xla = xla::ConstantR0(builder.get(), output_size);
auto pred = xla::Lt(data_scalar, output_size_xla);
- auto tuple = xla::Tuple(cond_builder.get(), {data_scalar, accum, accum_stack});
+ auto tuple = xla::Tuple(builder.get(), {data_scalar, accum, accum_stack});
auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
accum = xla::GetTupleElement(cond, 0);
accum_stack = xla::GetTupleElement(cond, 1);
@@ -130,6 +129,7 @@ class DenseBincountOp : public XlaOpKernel {
auto data = xla::DynamicSlice(data_stack, {idx_1, idx_2}, {1, 1});
auto data_scalar = xla::Reshape(data, {0,1}, {});
auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
+ accum = xla::Reshape(accum, {0,1}, {});
xla::XlaComputation update;
{
@@ -160,12 +160,10 @@ class DenseBincountOp : public XlaOpKernel {
xla::Tuple(false_builder.get(), {accum, accum_stack});
no_update = false_builder->Build().ValueOrDie();
}
- std::unique_ptr cond_builder =
- builder->CreateSubBuilder("cond_rank2");
- auto output_size_xla = xla::ConstantR0(builder.get(), output_size);
+ auto limit = xla::ConstantR0(builder.get(), output_size);
- auto pred = xla::Lt(data_scalar, output_size_xla);
- auto tuple = xla::Tuple(cond_builder.get(), {data_scalar, idx_1, accum_stack, accum});
+ auto pred = xla::Lt(data_scalar, limit);
+ auto tuple = xla::Tuple(builder.get(), {data_scalar, idx_1, accum_stack, accum});
auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
accum = xla::GetTupleElement(cond, 0);
accum_stack = xla::GetTupleElement(cond, 1);
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index e2844069301ff2..8c482db559c7d3 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -177,6 +177,18 @@ def test_dense_input(self,
"x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
"maxlength": 7,
"expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
+ }, {
+ "testcase_name": "_minlength",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 9,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
+ [1, 0, 0, 0, 2, 0, 0, 1, 0]]
+ }, {
+ "testcase_name": "_minlength_larger_values",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 3,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
+ [1, 0, 0, 0, 2, 0, 0, 1]]
})
def test_compiled_dense(self,
x,
From bb0cbfe1776c6acec97f5dedb8f6ee94c2fdcfca Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 2 Jun 2022 16:42:57 +0000
Subject: [PATCH 024/259] Handle weights and binary_output
---
.../compiler/tf2xla/kernels/bincount_op.cc | 100 +++++++++++++-----
tensorflow/python/ops/bincount_ops_test.py | 64 ++++++++++-
2 files changed, 133 insertions(+), 31 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index a945e8431d3d71..ad8e5a029c8372 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/compiler/xla/client/lib/comparators.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
@@ -27,14 +28,32 @@ namespace {
// TODO: This is only a dummy kernel
class DenseBincountOp : public XlaOpKernel {
+ private:
+ bool binary_output_;
public:
explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
- DataType dtype;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_));
}
void Compile(XlaOpKernelContext* ctx) override {
// Dumb implementation for the simplest test case
xla::XlaOp input = ctx->Input(0);
+ xla::XlaOp weights = ctx->Input(2);
+ StatusOr weights_shape_or = ctx->builder()->GetShape(weights);
+ OP_REQUIRES_OK(ctx, weights_shape_or.status());
+ auto weights_shape = weights_shape_or.ValueOrDie();
+ auto weights_size = weights_shape.dimensions(0);
+ auto input_xla_type = ctx->input_xla_type(0);
+ xla::PrimitiveType dtype;
+ bool has_weight;
+ if (weights_size){
+ has_weight = true;
+ dtype = ctx->input_xla_type(2);
+ }
+ else {
+ has_weight = false;
+ dtype = input_xla_type;
+ }
int64_t output_size;
ctx->ConstantInputAsIntScalar("size", &output_size);
StatusOr input_shape_or = ctx->builder()->GetShape(input);
@@ -44,16 +63,16 @@ class DenseBincountOp : public XlaOpKernel {
auto dim = 1;
auto rank = input_shape.rank();
auto counter_shape = xla::ShapeUtil::MakeShape(xla::S32, {});
- const xla::Shape data_shape = xla::ShapeUtil::MakeShape(xla::S32, {input_shape.dimensions()});
+ const xla::Shape data_shape = xla::ShapeUtil::MakeShape(input_xla_type, {input_shape.dimensions()});
- xla::Shape output_shape = xla::ShapeUtil::MakeShape(xla::S32, {output_size});
+ xla::Shape output_shape = xla::ShapeUtil::MakeShape(dtype, {output_size});
if (rank == 2) {
- output_shape = xla::ShapeUtil::MakeShape(xla::S32, {rank, output_size});
+ output_shape = xla::ShapeUtil::MakeShape(dtype, {rank, output_size});
dim = input_shape.dimensions(1);
}
auto loop_shape = xla::ShapeUtil::MakeTupleShape(
- {counter_shape, data_shape, output_shape});
+ {counter_shape, data_shape, output_shape, weights_shape});
// Create a computation for the condition
xla::XlaComputation condition;
@@ -75,15 +94,18 @@ class DenseBincountOp : public XlaOpKernel {
auto counter = xla::GetTupleElement(param, 0);
auto data_stack = xla::GetTupleElement(param, 1);
auto accum_stack = xla::GetTupleElement(param, 2);
-
+ auto weights = xla::GetTupleElement(param, 3);
+ auto accum_shape = xla::ShapeUtil::MakeShape(dtype, {});
+
if (rank == 1) {
auto data = xla::DynamicSlice(data_stack, {counter}, {1});
auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
accum = xla::Reshape(accum, {0}, {});
+ accum = xla::ConvertElementType(accum, dtype);
auto data_scalar = xla::Reshape(data, {0}, {});
auto condition_shape = xla::ShapeUtil::MakeTupleShape(
- {counter_shape, counter_shape, output_shape});
+ {counter_shape, counter_shape, accum_shape, output_shape, weights_shape});
xla::XlaComputation update;
{
@@ -91,9 +113,21 @@ class DenseBincountOp : public XlaOpKernel {
ctx->builder()->CreateSubBuilder("update");
auto param = Parameter(true_builder.get(), 0, condition_shape, "param");
auto data_scalar = xla::GetTupleElement(param, 0);
- auto accum = xla::GetTupleElement(param, 1);
- auto accum_stack = xla::GetTupleElement(param, 2);
- accum = accum + xla::One(true_builder.get(), xla::S32);
+ auto counter = xla::GetTupleElement(param, 1);
+ auto accum = xla::GetTupleElement(param, 2);
+ auto accum_stack = xla::GetTupleElement(param, 3);
+ auto weights = xla::GetTupleElement(param, 4);
+ if (binary_output_){
+ accum = xla::One(true_builder.get(), dtype);
+ }
+ else if (! has_weight) {
+ accum = accum + xla::One(true_builder.get(), dtype);
+ }
+ else {
+ auto weight = xla::DynamicSlice(weights, {counter}, {1});
+ weight = xla::Reshape(weight, {0}, {});
+ accum = accum + weight;
+ }
accum_stack = xla::DynamicUpdateSlice(
accum_stack, xla::Reshape(accum, {1}), {data_scalar});
xla::Tuple(true_builder.get(), {accum, accum_stack});
@@ -106,22 +140,25 @@ class DenseBincountOp : public XlaOpKernel {
ctx->builder()->CreateSubBuilder("no_update");
auto param = Parameter(false_builder.get(), 0, condition_shape, "param");
auto data = xla::GetTupleElement(param, 0);
- auto accum = xla::GetTupleElement(param, 1);
- auto accum_stack = xla::GetTupleElement(param, 2);
+ auto count = xla::GetTupleElement(param, 1);
+ auto accum = xla::GetTupleElement(param, 2);
+ auto accum_stack = xla::GetTupleElement(param, 3);
xla::Tuple(false_builder.get(), {accum, accum_stack});
no_update = false_builder->Build().ValueOrDie();
}
auto output_size_xla = xla::ConstantR0(builder.get(), output_size);
auto pred = xla::Lt(data_scalar, output_size_xla);
- auto tuple = xla::Tuple(builder.get(), {data_scalar, accum, accum_stack});
+ auto tuple = xla::Tuple(builder.get(), {data_scalar, counter, accum, accum_stack, weights});
auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
accum = xla::GetTupleElement(cond, 0);
accum_stack = xla::GetTupleElement(cond, 1);
+
}
else {
auto condition_shape = xla::ShapeUtil::MakeTupleShape(
- {counter_shape, counter_shape, output_shape, counter_shape});
+ {counter_shape, counter_shape, counter_shape, output_shape,
+ accum_shape, weights_shape});
auto dim_xla = xla::ConstantR0(builder.get(), dim);
auto idx_1 = xla::Div(counter, dim_xla);
@@ -130,7 +167,7 @@ class DenseBincountOp : public XlaOpKernel {
auto data_scalar = xla::Reshape(data, {0,1}, {});
auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
accum = xla::Reshape(accum, {0,1}, {});
-
+ accum = xla::ConvertElementType(accum, dtype);
xla::XlaComputation update;
{
std::unique_ptr true_builder =
@@ -139,9 +176,21 @@ class DenseBincountOp : public XlaOpKernel {
auto data_scalar = xla::GetTupleElement(param, 0);
auto idx_1 = xla::GetTupleElement(param, 1);
- auto accum_stack = xla::GetTupleElement(param, 2);
- auto accum = xla::GetTupleElement(param, 3);
- accum = accum + xla::One(true_builder.get(), xla::S32);
+ auto idx_2 = xla::GetTupleElement(param, 2);
+ auto accum_stack = xla::GetTupleElement(param, 3);
+ auto accum = xla::GetTupleElement(param, 4);
+ auto weights = xla::GetTupleElement(param, 5);
+ if (binary_output_){
+ accum = xla::One(true_builder.get(), dtype);
+ }
+ else if (! has_weight) {
+ accum = accum + xla::One(true_builder.get(), dtype);
+ }
+ else {
+ auto weight = xla::DynamicSlice(weights, {idx_1, idx_2}, {1, 1});
+ auto weigth_scalar = xla::Reshape(weight, {0,1}, {});
+ accum = accum + weigth_scalar;
+ }
accum_stack = xla::DynamicUpdateSlice(
accum_stack, xla::Reshape(accum, {1, 1}), {idx_1, data_scalar});
xla::Tuple(true_builder.get(), {accum, accum_stack});
@@ -153,30 +202,29 @@ class DenseBincountOp : public XlaOpKernel {
std::unique_ptr false_builder =
builder->CreateSubBuilder("no_update_rank2");
auto param = Parameter(false_builder.get(), 0, condition_shape, "param");
- auto data_scalar = xla::GetTupleElement(param, 0);
- auto idx_1 = xla::GetTupleElement(param, 1);
- auto accum_stack = xla::GetTupleElement(param, 2);
- auto accum = xla::GetTupleElement(param, 3);
+ auto accum_stack = xla::GetTupleElement(param, 3);
+ auto accum = xla::GetTupleElement(param, 4);
xla::Tuple(false_builder.get(), {accum, accum_stack});
no_update = false_builder->Build().ValueOrDie();
}
auto limit = xla::ConstantR0(builder.get(), output_size);
auto pred = xla::Lt(data_scalar, limit);
- auto tuple = xla::Tuple(builder.get(), {data_scalar, idx_1, accum_stack, accum});
+ auto tuple = xla::Tuple(builder.get(), {data_scalar, idx_1, idx_2, accum_stack, accum, weights});
auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
accum = xla::GetTupleElement(cond, 0);
accum_stack = xla::GetTupleElement(cond, 1);
}
counter = counter + xla::One(builder.get(), xla::S32);
- xla::Tuple(builder.get(), {counter, data_stack, accum_stack});
+ xla::Tuple(builder.get(), {counter, data_stack, accum_stack, weights});
body = builder->Build().ConsumeValueOrDie();
}
// Create a While node with computations for the condition and the body.
auto zero = xla::Zero(ctx->builder(), xla::S32);
- auto zero_broadcast = xla::Broadcast(zero, {output_shape.dimensions()});
- auto init = xla::Tuple(ctx->builder(), {zero, input, zero_broadcast});
+ auto zero_out = xla::Zero(ctx->builder(), dtype);
+ auto zero_broadcast = xla::Broadcast(zero_out, {output_shape.dimensions()});
+ auto init = xla::Tuple(ctx->builder(), {zero, input, zero_broadcast, weights});
auto result = xla::While(condition, body, init);
auto output = xla::GetTupleElement(result,2);
ctx->SetOutput(0, output);
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 8c482db559c7d3..92ee1ee8159159 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -165,10 +165,6 @@ def test_dense_input(self,
@parameterized.named_parameters(
{
- "testcase_name": "_baseline",
- "x": np.array([1, 1, 2, 3, 2, 4, 4, 5], dtype=np.int32),
- "expected_values": [0, 2, 2, 1, 2, 1]
- }, {
"testcase_name": "_no_maxlength",
"x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
"expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
@@ -189,7 +185,65 @@ def test_dense_input(self,
"minlength": 3,
"expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
[1, 0, 0, 0, 2, 0, 0, 1]]
- })
+ }, {
+ "testcase_name": "_no_maxlength_binary",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [[0, 1, 1, 1, 0, 0],
+ [0, 0, 0, 0, 1, 1]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_maxlength_binary",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "maxlength": 7,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0],
+ [1, 0, 0, 0, 1, 0, 0]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_minlength_binary",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 9,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
+ [1, 0, 0, 0, 1, 0, 0, 1, 0]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_minlength_larger_values_binary",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 3,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
+ [1, 0, 0, 0, 1, 0, 0, 1]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_no_maxlength_weights",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [[0. , 2. , 1. , 0.5, 0. , 0. ],
+ [0. , 0. , 0. , 0. , 9. , 3. ]],
+ "weights": [[0.5, 1, 2], [3, 4, 5]]
+ }, {
+ "testcase_name": "_1d",
+ "x": np.array([3, 2, 1, 1], dtype=np.int32),
+ "expected_values": [0, 2, 1, 1]
+ }, {
+ "testcase_name": "_1d_binary",
+ "x": np.array([3, 2, 1, 1], dtype=np.int32),
+ "expected_values": [0, 1, 1, 1],
+ "binary_output": True
+ }, {
+ "testcase_name": "_1d_no_maxlenght_weights",
+ "x": np.array([3, 2, 1, 5, 4, 4], dtype=np.int32),
+ "weights": [0.5, 1, 2, 3, 4, 5],
+ "expected_values": [0. , 2. , 1. , 0.5, 9. , 3. ]
+ }, #{
+ # This is going to fail
+ # INVALID_ARGUMENT: Detected unsupported operations when trying to compile graph...
+ # Bincount (No registered 'Bincount' OpKernel for XLA_CPU_JIT devices compatible
+ # with node {{node bincount/Bincount}}){{node bincount/Bincount}}`
+ #
+ # "testcase_name": "_all_axes",
+ # "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ # "expected_values": [0, 4, 4, 5],
+ # "axis": None
+ #}
+ )
def test_compiled_dense(self,
x,
expected_values,
From 146ef2717f2751a358520723a89518c939f2b4cf Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 2 Jun 2022 17:34:31 +0000
Subject: [PATCH 025/259] Modify the failing test
---
tensorflow/python/ops/bincount_ops_test.py | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 92ee1ee8159159..c9b330b8d2c972 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -232,18 +232,12 @@ def test_dense_input(self,
"x": np.array([3, 2, 1, 5, 4, 4], dtype=np.int32),
"weights": [0.5, 1, 2, 3, 4, 5],
"expected_values": [0. , 2. , 1. , 0.5, 9. , 3. ]
- }, #{
- # This is going to fail
- # INVALID_ARGUMENT: Detected unsupported operations when trying to compile graph...
- # Bincount (No registered 'Bincount' OpKernel for XLA_CPU_JIT devices compatible
- # with node {{node bincount/Bincount}}){{node bincount/Bincount}}`
- #
- # "testcase_name": "_all_axes",
- # "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- # "expected_values": [0, 4, 4, 5],
- # "axis": None
- #}
- )
+ }, {
+ "testcase_name": "_all_axes",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [0, 1, 1, 1, 2, 1],
+ "axis": 0 # With None (recursive call) -> Bincount (No registered 'Bincount'
+ })
def test_compiled_dense(self,
x,
expected_values,
From fee13938e257eb3f267e6169ecdb19f53fa8782f Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 16:42:53 +0000
Subject: [PATCH 026/259] Try with pycuda_tests
---
tensorflow/python/BUILD | 22 +++----
tensorflow/python/ops/bincount_ops_test.py | 74 ++++++++++++++++++++++
2 files changed, 85 insertions(+), 11 deletions(-)
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index d7a3355986bcf8..f90e1328c91c3d 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1435,17 +1435,6 @@ py_library(
],
)
-tf_py_test(
- name = "bincount_ops_test",
- size = "small",
- srcs = ["ops/bincount_ops_test.py"],
- python_version = "PY3",
- deps = [
- ":bincount_ops",
- ":platform_test",
- ],
-)
-
py_library(
name = "ctc_ops",
srcs = ["ops/ctc_ops.py"],
@@ -2688,6 +2677,17 @@ py_library(
],
)
+cuda_py_test(
+ name = "bincount_ops_test",
+ size = "small",
+ srcs = ["ops/bincount_ops_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":bincount_ops",
+ ":platform_test",
+ ],
+)
+
cuda_py_test(
name = "bitwise_ops_test",
size = "small",
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index c9b330b8d2c972..9eb6b91de3403e 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -15,6 +15,8 @@
"""Tests for bincount ops."""
from absl.testing import parameterized
+
+import timeit
import numpy as np
from tensorflow.python.eager import context
@@ -238,6 +240,7 @@ def test_dense_input(self,
"expected_values": [0, 1, 1, 1, 2, 1],
"axis": 0 # With None (recursive call) -> Bincount (No registered 'Bincount'
})
+ @test_util.disable_mlir_bridge('TODO: ?')
def test_compiled_dense(self,
x,
expected_values,
@@ -269,7 +272,78 @@ def f(x,
binary_output=binary_output,
axis=axis)
self.assertAllEqual(expected_values, y)
+ @parameterized.named_parameters(
+ {
+ "testcase_name": "_no_maxlength_small",
+ "x": np.random.randint(100, size=(200, 200), dtype=np.int32)
+ }, {
+ "testcase_name": "_no_maxlength_medium",
+ "x": np.random.randint(200, size=(500, 500), dtype=np.int32)
+ }, {
+ "testcase_name": "_no_maxlength_large",
+ "x": np.random.randint(500, size=(1000, 1000), dtype=np.int32)
+ })
+ @test_util.disable_mlir_bridge('TODO: ?')
+ def test_compiled_dense_perf(self,
+ x,
+ minlength=None,
+ maxlength=None,
+ binary_output=False,
+ weights=None,
+ axis=-1):
+
+ @def_function.function(jit_compile=True)
+ def f_compiled(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis):
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis
+ )
+ return y
+
+ def f(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis):
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis
+ )
+ return y
+
+ lambda_f = lambda: f(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis)
+ lambda_fc = lambda: f_compiled(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis)
+ # warm-up
+ lambda_f(); lambda_fc()
+ not_compiled = timeit.timeit(lambda_f, number=100)
+ compiled = timeit.timeit(lambda_fc, number=100)
+ print("XLA Compiled: %f Notcompiled: %f" % (compiled , not_compiled))
+ self.assertLess(not_compiled, compiled)
@parameterized.named_parameters(
{
From b1da6c1cfc82b21c32de25d42074126b0bbaa85e Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 17:09:55 +0000
Subject: [PATCH 027/259] Remove old test
---
tensorflow/compiler/tests/BUILD | 20 --------------------
1 file changed, 20 deletions(-)
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 595b0c2cccb8c0..c1fa93127fd235 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -2143,26 +2143,6 @@ tf_xla_py_test(
],
)
-tf_xla_py_test(
- name = "bincount_op_test",
- size = "small",
- srcs = ["bincount_op_test.py"],
- enable_mlir_bridge = False,
- tags = [
- "no_pip",
- "optonly",
- ],
- deps = [
- ":xla_test",
- "//tensorflow/python:array_ops",
- "//tensorflow/python:bincount_ops",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:errors",
- "//tensorflow/python:framework",
- "//tensorflow/python/compiler/xla:compiler_py",
- ],
-)
-
tf_xla_py_test(
name = "where_op_test",
size = "small",
From 21a53cf21f7139cd95f899fd41846e4fb6ad2623 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 17:29:30 +0000
Subject: [PATCH 028/259] Wrap the uncompiled baseline in tf.function and fix the perf assertion direction
---
tensorflow/python/ops/bincount_ops_test.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 9eb6b91de3403e..d4127d3d8af490 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -308,7 +308,8 @@ def f_compiled(x,
axis=axis
)
return y
-
+
+ @def_function.function(jit_compile=False)
def f(x,
weights=weights,
minlength=minlength,
@@ -343,7 +344,7 @@ def f(x,
not_compiled = timeit.timeit(lambda_f, number=100)
compiled = timeit.timeit(lambda_fc, number=100)
print("XLA Compiled: %f Notcompiled: %f" % (compiled , not_compiled))
- self.assertLess(not_compiled, compiled)
+ self.assertLess(compiled, not_compiled)
@parameterized.named_parameters(
{
From df5138fd114bd3b6f58f882b829c72a781f871fe Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 21:38:22 +0000
Subject: [PATCH 029/259] Use S64 loop counters and convert data_scalar before bound comparisons
---
.../compiler/tf2xla/kernels/bincount_op.cc | 23 ++++++++++++-------
tensorflow/python/ops/bincount_ops_test.py | 12 +++++-----
2 files changed, 21 insertions(+), 14 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index ad8e5a029c8372..f00900fb043ed3 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -62,7 +62,7 @@ class DenseBincountOp : public XlaOpKernel {
auto size = input_shape.dimensions(0);
auto dim = 1;
auto rank = input_shape.rank();
- auto counter_shape = xla::ShapeUtil::MakeShape(xla::S32, {});
+ auto counter_shape = xla::ShapeUtil::MakeShape(xla::S64, {});
const xla::Shape data_shape = xla::ShapeUtil::MakeShape(input_xla_type, {input_shape.dimensions()});
xla::Shape output_shape = xla::ShapeUtil::MakeShape(dtype, {output_size});
@@ -81,7 +81,7 @@ class DenseBincountOp : public XlaOpKernel {
ctx->builder()->CreateSubBuilder("condition");
auto param = xla::Parameter(builder.get(), 0, loop_shape, "param");
auto counter = xla::GetTupleElement(param, 0);
- xla::Gt(xla::ConstantR0(builder.get(), size*dim), counter);
+ xla::Gt(xla::ConstantR0(builder.get(), size*dim), counter);
condition = builder->Build().ConsumeValueOrDie();
}
@@ -117,6 +117,7 @@ class DenseBincountOp : public XlaOpKernel {
auto accum = xla::GetTupleElement(param, 2);
auto accum_stack = xla::GetTupleElement(param, 3);
auto weights = xla::GetTupleElement(param, 4);
+
if (binary_output_){
accum = xla::One(true_builder.get(), dtype);
}
@@ -147,7 +148,8 @@ class DenseBincountOp : public XlaOpKernel {
no_update = false_builder->Build().ValueOrDie();
}
- auto output_size_xla = xla::ConstantR0(builder.get(), output_size);
+ auto output_size_xla = xla::ConstantR0(builder.get(), output_size);
+ data_scalar = xla::ConvertElementType(data_scalar, xla::S64);
auto pred = xla::Lt(data_scalar, output_size_xla);
auto tuple = xla::Tuple(builder.get(), {data_scalar, counter, accum, accum_stack, weights});
auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
@@ -156,15 +158,17 @@ class DenseBincountOp : public XlaOpKernel {
}
else {
+
auto condition_shape = xla::ShapeUtil::MakeTupleShape(
{counter_shape, counter_shape, counter_shape, output_shape,
accum_shape, weights_shape});
- auto dim_xla = xla::ConstantR0(builder.get(), dim);
+ auto dim_xla = xla::ConstantR0(builder.get(), dim);
auto idx_1 = xla::Div(counter, dim_xla);
auto idx_2 = counter % dim_xla;
auto data = xla::DynamicSlice(data_stack, {idx_1, idx_2}, {1, 1});
auto data_scalar = xla::Reshape(data, {0,1}, {});
+ data_scalar = xla::ConvertElementType(data_scalar, xla::S64);
auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
accum = xla::Reshape(accum, {0,1}, {});
accum = xla::ConvertElementType(accum, dtype);
@@ -180,6 +184,7 @@ class DenseBincountOp : public XlaOpKernel {
auto accum_stack = xla::GetTupleElement(param, 3);
auto accum = xla::GetTupleElement(param, 4);
auto weights = xla::GetTupleElement(param, 5);
+
if (binary_output_){
accum = xla::One(true_builder.get(), dtype);
}
@@ -207,21 +212,23 @@ class DenseBincountOp : public XlaOpKernel {
xla::Tuple(false_builder.get(), {accum, accum_stack});
no_update = false_builder->Build().ValueOrDie();
}
- auto limit = xla::ConstantR0(builder.get(), output_size);
-
+ auto limit = xla::ConstantR0(builder.get(), output_size);
+ data_scalar = xla::ConvertElementType(data_scalar, xla::S64);
+
+
auto pred = xla::Lt(data_scalar, limit);
auto tuple = xla::Tuple(builder.get(), {data_scalar, idx_1, idx_2, accum_stack, accum, weights});
auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
accum = xla::GetTupleElement(cond, 0);
accum_stack = xla::GetTupleElement(cond, 1);
}
- counter = counter + xla::One(builder.get(), xla::S32);
+ counter = counter + xla::One(builder.get(), xla::S64);
xla::Tuple(builder.get(), {counter, data_stack, accum_stack, weights});
body = builder->Build().ConsumeValueOrDie();
}
// Create a While node with computations for the condition and the body.
- auto zero = xla::Zero(ctx->builder(), xla::S32);
+ auto zero = xla::Zero(ctx->builder(), xla::S64);
auto zero_out = xla::Zero(ctx->builder(), dtype);
auto zero_broadcast = xla::Broadcast(zero_out, {output_shape.dimensions()});
auto init = xla::Tuple(ctx->builder(), {zero, input, zero_broadcast, weights});
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index d4127d3d8af490..fd63879c78579e 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -275,13 +275,13 @@ def f(x,
@parameterized.named_parameters(
{
"testcase_name": "_no_maxlength_small",
- "x": np.random.randint(100, size=(200, 200), dtype=np.int32)
+ "x": np.random.randint(200, size=(200, 200), dtype=np.int32)
}, {
"testcase_name": "_no_maxlength_medium",
- "x": np.random.randint(200, size=(500, 500), dtype=np.int32)
+ "x": np.random.randint(500, size=(500, 500), dtype=np.int32)
}, {
"testcase_name": "_no_maxlength_large",
- "x": np.random.randint(500, size=(1000, 1000), dtype=np.int32)
+ "x": np.random.randint(100, size=(1000, 1000), dtype=np.int32)
})
@test_util.disable_mlir_bridge('TODO: ?')
def test_compiled_dense_perf(self,
@@ -308,7 +308,7 @@ def f_compiled(x,
axis=axis
)
return y
-
+
@def_function.function(jit_compile=False)
def f(x,
weights=weights,
@@ -341,8 +341,8 @@ def f(x,
axis=axis)
# warm-up
lambda_f(); lambda_fc()
- not_compiled = timeit.timeit(lambda_f, number=100)
- compiled = timeit.timeit(lambda_fc, number=100)
+ not_compiled = timeit.timeit(lambda_f, number=10)
+ compiled = timeit.timeit(lambda_fc, number=10)
print("XLA Compiled: %f Notcompiled: %f" % (compiled , not_compiled))
self.assertLess(compiled, not_compiled)
From 99a24ca8c85721d87121e6a643ba23afdfce4a00 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 21:40:26 +0000
Subject: [PATCH 030/259] Fix string
---
tensorflow/python/ops/bincount_ops_test.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index fd63879c78579e..6f334afee6bb89 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -343,7 +343,8 @@ def f(x,
lambda_f(); lambda_fc()
not_compiled = timeit.timeit(lambda_f, number=10)
compiled = timeit.timeit(lambda_fc, number=10)
- print("XLA Compiled: %f Notcompiled: %f" % (compiled , not_compiled))
+ print("XLA JIT -> compiled: %f | not compiled: %f" %
+ (compiled , not_compiled))
self.assertLess(compiled, not_compiled)
@parameterized.named_parameters(
From 22e1eb6b9fd01d1efcc487aa424988b9bf5bbca4 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 22:05:37 +0000
Subject: [PATCH 031/259] remove extra casting
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 2 --
1 file changed, 2 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index f00900fb043ed3..d728e99c94a587 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -101,7 +101,6 @@ class DenseBincountOp : public XlaOpKernel {
auto data = xla::DynamicSlice(data_stack, {counter}, {1});
auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
accum = xla::Reshape(accum, {0}, {});
- accum = xla::ConvertElementType(accum, dtype);
auto data_scalar = xla::Reshape(data, {0}, {});
auto condition_shape = xla::ShapeUtil::MakeTupleShape(
@@ -171,7 +170,6 @@ class DenseBincountOp : public XlaOpKernel {
data_scalar = xla::ConvertElementType(data_scalar, xla::S64);
auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
accum = xla::Reshape(accum, {0,1}, {});
- accum = xla::ConvertElementType(accum, dtype);
xla::XlaComputation update;
{
std::unique_ptr true_builder =
From b7cf9b35b3d716c7d3f113f33513cd8740da4a4a Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 22:52:52 +0000
Subject: [PATCH 032/259] Add DenseBincount to the XLA auto-clustering allowlist
---
tensorflow/compiler/jit/mark_for_compilation_pass.cc | 1 +
1 file changed, 1 insertion(+)
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 59bea5e8711c81..a96ec97411a465 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -1983,6 +1983,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() {
"Cross",
"Cumprod",
"Cumsum",
+ "DenseBincount".
"DataFormatDimMap",
"DataFormatVecPermute",
"DepthToSpace",
From 90633091a6ad99352ed6c572994be0dbaa1f5733 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 3 Jun 2022 22:57:52 +0000
Subject: [PATCH 033/259] Fix typo
---
tensorflow/compiler/jit/mark_for_compilation_pass.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index a96ec97411a465..c473e3d4689a36 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -1983,7 +1983,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() {
"Cross",
"Cumprod",
"Cumsum",
- "DenseBincount".
+ "DenseBincount",
"DataFormatDimMap",
"DataFormatVecPermute",
"DepthToSpace",
From a39a359ddec4c442314ea9e1749b71f03021a6f2 Mon Sep 17 00:00:00 2001
From: bhack
Date: Sat, 4 Jun 2022 00:34:15 +0000
Subject: [PATCH 034/259] Fix rank-2 output shape to use the input row count instead of the rank
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index d728e99c94a587..558dc3e4114789 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -67,7 +67,7 @@ class DenseBincountOp : public XlaOpKernel {
xla::Shape output_shape = xla::ShapeUtil::MakeShape(dtype, {output_size});
if (rank == 2) {
- output_shape = xla::ShapeUtil::MakeShape(dtype, {rank, output_size});
+ output_shape = xla::ShapeUtil::MakeShape(dtype, {size, output_size});
dim = input_shape.dimensions(1);
}
From cdc2e5c69864e5a4de9f8b5adbd049cbff47f233 Mon Sep 17 00:00:00 2001
From: shuw
Date: Mon, 6 Jun 2022 18:55:55 -0700
Subject: [PATCH 035/259] Add unittest to verify tf.where is disabled
---
.../compiler/xla/xla_disable_op_test.py | 112 ++++++++++++++++++
1 file changed, 112 insertions(+)
create mode 100644 tensorflow/python/compiler/xla/xla_disable_op_test.py
diff --git a/tensorflow/python/compiler/xla/xla_disable_op_test.py b/tensorflow/python/compiler/xla/xla_disable_op_test.py
new file mode 100644
index 00000000000000..0c972d328f3212
--- /dev/null
+++ b/tensorflow/python/compiler/xla/xla_disable_op_test.py
@@ -0,0 +1,112 @@
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for checking if tf.where op is excluded from XLA auto-clustering."""
+
+import functools
+import numpy as np
+import os
+import subprocess
+
+from tensorflow.compat.v1 import config
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import sort_ops
+from tensorflow.python.ops import sort_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import test
+
+
+cmds_linux = {
+ "grep_where": (
+ "grep 'Where' /tmp/xla_logs/* && rm -rf /tmp/xla_logs/"),
+}
+
+def run_shell_cmd(args):
+ """Executes shell commands and returns output.
+
+ Args:
+ args: String of shell commands to run.
+
+ Returns:
+ Tuple output (stdoutdata, stderrdata) from running the shell commands.
+ """
+ proc = subprocess.Popen(
+ args,
+ shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT
+ )
+ return proc.communicate()
+
+class XlaDisableOpTest(test.TestCase):
+
+ @def_function.function()
+ def _runEvaluation(self, x, y, predictions):
+ dummy_loss = 0.9
+ predictions = array_ops.reshape(predictions, [-1])
+ display_ids = x
+ display_ids = array_ops.reshape(display_ids, [-1])
+ labels = array_ops.reshape(y, [-1])
+ sorted_ids = sort_ops.argsort(display_ids)
+ display_ids = array_ops.gather(display_ids, indices=sorted_ids)
+ predictions = array_ops.gather(predictions, indices=sorted_ids)
+ labels = array_ops.gather(labels, indices=sorted_ids)
+ _, display_ids_idx, display_ids_ads_count = array_ops.unique_with_counts(
+ display_ids, out_idx=dtypes.int64)
+ pad_length = 30 - math_ops.reduce_max(display_ids_ads_count)
+ preds = ragged_tensor.RaggedTensor.from_value_rowids(
+ predictions, display_ids_idx).to_tensor()
+ labels = ragged_tensor.RaggedTensor.from_value_rowids(
+ labels, display_ids_idx).to_tensor()
+ labels_mask = math_ops.reduce_max(labels, 1)
+ preds_masked = array_ops.boolean_mask(preds, labels_mask)
+ labels_masked = array_ops.boolean_mask(labels, labels_mask)
+ labels_masked = math_ops.argmax(labels_masked, axis=1, output_type=dtypes.int32)
+ labels_masked = array_ops.reshape(labels_masked, [-1, 1])
+
+ preds_masked = array_ops.pad(preds_masked, [(0, 0), (0, pad_length)])
+ _, predictions_idx = nn_ops.top_k(preds_masked, 12)
+ indices = math_ops.equal(predictions_idx, labels_masked)
+ return math_ops.cast(array_ops.shape(indices)[0], dtypes.float64)
+
+ def testRunEval(self):
+ dim_prediction = 1024
+ config.optimizer.set_jit(True)
+ pre = np.random.random((dim_prediction, 1))
+ y_tmp = np.zeros((dim_prediction, 1), dtype=float)
+
+ num_ones = np.random.randint(1, dim_prediction+1, 1)
+ id_one = np.random.randint(0, dim_prediction, num_ones)
+ for i in id_one:
+ y_tmp[i][0] = 1.
+ x_tmp = np.random.randint(0, dim_prediction,
+ (dim_prediction, 1), dtype=np.int64)
+ display_id_counter = self._runEvaluation(x_tmp, y_tmp, pre)
+
+ out, err = run_shell_cmd(cmds_linux['grep_where'])
+ self.assertEqual(err, None)
+ self.assertEqual(len(out), 0)
+
+
+if __name__ == '__main__':
+ os.environ['XLA_FLAGS'] = "--xla_dump_to=/tmp/xla_logs"
+ os.environ['TF_XLA_FLAGS'] = "--tf_xla_cluster_exclude_ops=Where"
+ test.main()
From 891b60b7121a84b0dd806d17c7b1956d4eff7e04 Mon Sep 17 00:00:00 2001
From: bhack
Date: Tue, 7 Jun 2022 18:52:58 +0000
Subject: [PATCH 036/259] new impl
---
.../compiler/tf2xla/kernels/bincount_op.cc | 160 +-----------------
1 file changed, 1 insertion(+), 159 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 558dc3e4114789..54987b84f6a4a3 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -74,165 +74,7 @@ class DenseBincountOp : public XlaOpKernel {
auto loop_shape = xla::ShapeUtil::MakeTupleShape(
{counter_shape, data_shape, output_shape, weights_shape});
- // Create a computation for the condition
- xla::XlaComputation condition;
- {
- std::unique_ptr builder =
- ctx->builder()->CreateSubBuilder("condition");
- auto param = xla::Parameter(builder.get(), 0, loop_shape, "param");
- auto counter = xla::GetTupleElement(param, 0);
- xla::Gt(xla::ConstantR0(builder.get(), size*dim), counter);
- condition = builder->Build().ConsumeValueOrDie();
- }
-
- // Create a computation for the body
- xla::XlaComputation body;
- {
- std::unique_ptr builder =
- ctx->builder()->CreateSubBuilder("body");
- auto param = Parameter(builder.get(), 0, loop_shape, "param");
- auto counter = xla::GetTupleElement(param, 0);
- auto data_stack = xla::GetTupleElement(param, 1);
- auto accum_stack = xla::GetTupleElement(param, 2);
- auto weights = xla::GetTupleElement(param, 3);
- auto accum_shape = xla::ShapeUtil::MakeShape(dtype, {});
-
- if (rank == 1) {
- auto data = xla::DynamicSlice(data_stack, {counter}, {1});
- auto accum = xla::DynamicSlice(accum_stack, {data}, {1});
- accum = xla::Reshape(accum, {0}, {});
- auto data_scalar = xla::Reshape(data, {0}, {});
-
- auto condition_shape = xla::ShapeUtil::MakeTupleShape(
- {counter_shape, counter_shape, accum_shape, output_shape, weights_shape});
-
- xla::XlaComputation update;
- {
- std::unique_ptr true_builder =
- ctx->builder()->CreateSubBuilder("update");
- auto param = Parameter(true_builder.get(), 0, condition_shape, "param");
- auto data_scalar = xla::GetTupleElement(param, 0);
- auto counter = xla::GetTupleElement(param, 1);
- auto accum = xla::GetTupleElement(param, 2);
- auto accum_stack = xla::GetTupleElement(param, 3);
- auto weights = xla::GetTupleElement(param, 4);
-
- if (binary_output_){
- accum = xla::One(true_builder.get(), dtype);
- }
- else if (! has_weight) {
- accum = accum + xla::One(true_builder.get(), dtype);
- }
- else {
- auto weight = xla::DynamicSlice(weights, {counter}, {1});
- weight = xla::Reshape(weight, {0}, {});
- accum = accum + weight;
- }
- accum_stack = xla::DynamicUpdateSlice(
- accum_stack, xla::Reshape(accum, {1}), {data_scalar});
- xla::Tuple(true_builder.get(), {accum, accum_stack});
- update = true_builder->Build().ValueOrDie();
- }
-
- xla::XlaComputation no_update;
- {
- std::unique_ptr false_builder =
- ctx->builder()->CreateSubBuilder("no_update");
- auto param = Parameter(false_builder.get(), 0, condition_shape, "param");
- auto data = xla::GetTupleElement(param, 0);
- auto count = xla::GetTupleElement(param, 1);
- auto accum = xla::GetTupleElement(param, 2);
- auto accum_stack = xla::GetTupleElement(param, 3);
- xla::Tuple(false_builder.get(), {accum, accum_stack});
- no_update = false_builder->Build().ValueOrDie();
- }
-
- auto output_size_xla = xla::ConstantR0(builder.get(), output_size);
- data_scalar = xla::ConvertElementType(data_scalar, xla::S64);
- auto pred = xla::Lt(data_scalar, output_size_xla);
- auto tuple = xla::Tuple(builder.get(), {data_scalar, counter, accum, accum_stack, weights});
- auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
- accum = xla::GetTupleElement(cond, 0);
- accum_stack = xla::GetTupleElement(cond, 1);
-
- }
- else {
-
- auto condition_shape = xla::ShapeUtil::MakeTupleShape(
- {counter_shape, counter_shape, counter_shape, output_shape,
- accum_shape, weights_shape});
-
- auto dim_xla = xla::ConstantR0(builder.get(), dim);
- auto idx_1 = xla::Div(counter, dim_xla);
- auto idx_2 = counter % dim_xla;
- auto data = xla::DynamicSlice(data_stack, {idx_1, idx_2}, {1, 1});
- auto data_scalar = xla::Reshape(data, {0,1}, {});
- data_scalar = xla::ConvertElementType(data_scalar, xla::S64);
- auto accum = xla::DynamicSlice(accum_stack, {idx_1, data_scalar}, {1, 1});
- accum = xla::Reshape(accum, {0,1}, {});
- xla::XlaComputation update;
- {
- std::unique_ptr true_builder =
- builder->CreateSubBuilder("update_rank2");
- auto param = Parameter(true_builder.get(), 0, condition_shape, "param");
-
- auto data_scalar = xla::GetTupleElement(param, 0);
- auto idx_1 = xla::GetTupleElement(param, 1);
- auto idx_2 = xla::GetTupleElement(param, 2);
- auto accum_stack = xla::GetTupleElement(param, 3);
- auto accum = xla::GetTupleElement(param, 4);
- auto weights = xla::GetTupleElement(param, 5);
-
- if (binary_output_){
- accum = xla::One(true_builder.get(), dtype);
- }
- else if (! has_weight) {
- accum = accum + xla::One(true_builder.get(), dtype);
- }
- else {
- auto weight = xla::DynamicSlice(weights, {idx_1, idx_2}, {1, 1});
- auto weigth_scalar = xla::Reshape(weight, {0,1}, {});
- accum = accum + weigth_scalar;
- }
- accum_stack = xla::DynamicUpdateSlice(
- accum_stack, xla::Reshape(accum, {1, 1}), {idx_1, data_scalar});
- xla::Tuple(true_builder.get(), {accum, accum_stack});
- update = true_builder->Build().ValueOrDie();
- }
-
- xla::XlaComputation no_update;
- {
- std::unique_ptr false_builder =
- builder->CreateSubBuilder("no_update_rank2");
- auto param = Parameter(false_builder.get(), 0, condition_shape, "param");
- auto accum_stack = xla::GetTupleElement(param, 3);
- auto accum = xla::GetTupleElement(param, 4);
- xla::Tuple(false_builder.get(), {accum, accum_stack});
- no_update = false_builder->Build().ValueOrDie();
- }
- auto limit = xla::ConstantR0(builder.get(), output_size);
- data_scalar = xla::ConvertElementType(data_scalar, xla::S64);
-
-
- auto pred = xla::Lt(data_scalar, limit);
- auto tuple = xla::Tuple(builder.get(), {data_scalar, idx_1, idx_2, accum_stack, accum, weights});
- auto cond = xla::Conditional(pred, tuple, update, tuple, no_update);
- accum = xla::GetTupleElement(cond, 0);
- accum_stack = xla::GetTupleElement(cond, 1);
- }
- counter = counter + xla::One(builder.get(), xla::S64);
- xla::Tuple(builder.get(), {counter, data_stack, accum_stack, weights});
- body = builder->Build().ConsumeValueOrDie();
- }
-
- // Create a While node with computations for the condition and the body.
- auto zero = xla::Zero(ctx->builder(), xla::S64);
- auto zero_out = xla::Zero(ctx->builder(), dtype);
- auto zero_broadcast = xla::Broadcast(zero_out, {output_shape.dimensions()});
- auto init = xla::Tuple(ctx->builder(), {zero, input, zero_broadcast, weights});
- auto result = xla::While(condition, body, init);
- auto output = xla::GetTupleElement(result,2);
- ctx->SetOutput(0, output);
+ ctx->SetOutput(0, input);
}
};
From 7564d671fd4027ccb693cf1981eae779a066cd3e Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 10 Jun 2022 17:10:47 +0000
Subject: [PATCH 037/259] HLO friendly TF python implementation
---
.../compiler/tf2xla/kernels/bincount_op.cc | 2 +-
tensorflow/python/ops/bincount_ops.py | 75 +++-
tensorflow/python/ops/bincount_ops_test.py | 354 +++++++++---------
3 files changed, 247 insertions(+), 184 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 54987b84f6a4a3..88ace8ff6329c2 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -78,7 +78,7 @@ class DenseBincountOp : public XlaOpKernel {
}
};
-REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"), DenseBincountOp);
+//REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"), DenseBincountOp);
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index 610f9ec94befa5..02274f7b5384a2 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -22,11 +22,67 @@
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import gen_count_ops
from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.eager import def_function
from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
+@def_function.function(jit_compile=True)
+def dense_bincount_1d(input=[],
+ size=None,
+ weights=[],
+ binary_output=False):
+
+ input = array_ops.reshape(input, [array_ops.shape(input)[0],-1])
+ output_shape = [size]
+ idx = input
+ if (binary_output):
+ updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.bool)
+ output = array_ops.zeros(output_shape, dtype=dtypes.bool)
+ elif (len(weights)):
+ updates = weights
+ output = array_ops.zeros(output_shape, dtype=weights.dtype)
+ else:
+ updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.int32)
+ output = array_ops.zeros(output_shape, dtype=dtypes.int32)
+
+ histogram_out = array_ops.tensor_scatter_add(output, idx, updates)
+
+ return histogram_out
+
+def prepare_idxs(input):
+ j_indices = array_ops.reshape(input, [-1, 1])
+ i_indices = array_ops.expand_dims(array_ops.repeat(math_ops.range(array_ops.shape(input)[0]), array_ops.shape(input)[1]), axis=-1)
+
+ new_indices = array_ops.concat([i_indices, j_indices], axis=-1)
+ return new_indices
+
+@def_function.function(jit_compile=True)
+def dense_bincount_2d(input=[],
+ size=None,
+ weights=[],
+ binary_output=False):
+
+ input = array_ops.reshape(input, [array_ops.shape(input)[0],-1])
+ idx = prepare_idxs(input)
+ output_shape = [input.shape[0], size]
+
+ if (binary_output):
+ updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.bool)
+ output = array_ops.zeros(output_shape, dtype=dtypes.bool)
+ elif (len(weights)):
+ updates = array_ops.reshape(weights, [array_ops.shape(idx)[0]])
+ output = array_ops.zeros(output_shape, dtype=weights.dtype)
+ else:
+ updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.int32)
+ output = array_ops.zeros(output_shape, dtype=dtypes.int32)
+
+ histogram_out = array_ops.tensor_scatter_add(output, idx, updates)
+
+ return histogram_out
+
@tf_export("math.bincount", v1=[])
def bincount(arr,
@@ -36,7 +92,8 @@ def bincount(arr,
dtype=dtypes.int32,
name=None,
axis=None,
- binary_output=False):
+ binary_output=False,
+ pseudo_hlo=False):
"""Counts the number of occurrences of each value in an integer array.
If `minlength` and `maxlength` are not given, returns a vector with length
@@ -208,7 +265,21 @@ def bincount(arr,
binary_output=binary_output)
else:
weights = validate_dense_weights(arr, weights, dtype)
- return gen_math_ops.dense_bincount(
+ if (pseudo_hlo == True):
+ if (len(arr.shape)==1):
+ return dense_bincount_1d(
+ input=arr,
+ size=output_size,
+ weights=weights,
+ binary_output=binary_output)
+ else:
+ return dense_bincount_2d(
+ input=arr,
+ size=output_size,
+ weights=weights,
+ binary_output=binary_output)
+ else:
+ return gen_math_ops.dense_bincount(
input=arr,
size=output_size,
weights=weights,
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 1812af0ff59d92..598ea89a637a0a 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -188,187 +188,6 @@ def test_dense_input(self,
self.assertAllEqual(expected_values, y.values)
self.assertAllEqual(expected_shape, y.dense_shape)
- @parameterized.named_parameters(
- {
- "testcase_name": "_no_maxlength",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
- }, {
- "testcase_name": "_maxlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "maxlength": 7,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
- }, {
- "testcase_name": "_minlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 9,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 2, 0, 0, 1, 0]]
- }, {
- "testcase_name": "_minlength_larger_values",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 3,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
- [1, 0, 0, 0, 2, 0, 0, 1]]
- }, {
- "testcase_name": "_no_maxlength_binary",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0, 1, 1, 1, 0, 0],
- [0, 0, 0, 0, 1, 1]],
- "binary_output": True,
- }, {
- "testcase_name": "_maxlength_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "maxlength": 7,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0],
- [1, 0, 0, 0, 1, 0, 0]],
- "binary_output": True,
- }, {
- "testcase_name": "_minlength_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 9,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 1, 0, 0, 1, 0]],
- "binary_output": True,
- }, {
- "testcase_name": "_minlength_larger_values_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 3,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
- [1, 0, 0, 0, 1, 0, 0, 1]],
- "binary_output": True,
- }, {
- "testcase_name": "_no_maxlength_weights",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0. , 2. , 1. , 0.5, 0. , 0. ],
- [0. , 0. , 0. , 0. , 9. , 3. ]],
- "weights": [[0.5, 1, 2], [3, 4, 5]]
- }, {
- "testcase_name": "_1d",
- "x": np.array([3, 2, 1, 1], dtype=np.int32),
- "expected_values": [0, 2, 1, 1]
- }, {
- "testcase_name": "_1d_binary",
- "x": np.array([3, 2, 1, 1], dtype=np.int32),
- "expected_values": [0, 1, 1, 1],
- "binary_output": True
- }, {
- "testcase_name": "_1d_no_maxlenght_weights",
- "x": np.array([3, 2, 1, 5, 4, 4], dtype=np.int32),
- "weights": [0.5, 1, 2, 3, 4, 5],
- "expected_values": [0. , 2. , 1. , 0.5, 9. , 3. ]
- }, {
- "testcase_name": "_all_axes",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [0, 1, 1, 1, 2, 1],
- "axis": 0 # With None (recursive call) -> Bincount (No registered 'Bincount'
- })
- @test_util.disable_mlir_bridge('TODO: ?')
- def test_compiled_dense(self,
- x,
- expected_values,
- minlength=None,
- maxlength=None,
- binary_output=False,
- weights=None,
- axis=-1):
- @def_function.function(jit_compile=True)
- def f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis
- )
- return y
- y = f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
- self.assertAllEqual(expected_values, y)
- @parameterized.named_parameters(
- {
- "testcase_name": "_no_maxlength_small",
- "x": np.random.randint(200, size=(200, 200), dtype=np.int32)
- }, {
- "testcase_name": "_no_maxlength_medium",
- "x": np.random.randint(500, size=(500, 500), dtype=np.int32)
- }, {
- "testcase_name": "_no_maxlength_large",
- "x": np.random.randint(100, size=(1000, 1000), dtype=np.int32)
- })
- @test_util.disable_mlir_bridge('TODO: ?')
- def test_compiled_dense_perf(self,
- x,
- minlength=None,
- maxlength=None,
- binary_output=False,
- weights=None,
- axis=-1):
-
- @def_function.function(jit_compile=True)
- def f_compiled(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis
- )
- return y
-
- @def_function.function(jit_compile=False)
- def f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis
- )
- return y
-
- lambda_f = lambda: f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
-
- lambda_fc = lambda: f_compiled(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
- # warm-up
- lambda_f(); lambda_fc()
- not_compiled = timeit.timeit(lambda_f, number=10)
- compiled = timeit.timeit(lambda_fc, number=10)
- print("XLA JIT -> compiled: %f | not compiled: %f" %
- (compiled , not_compiled))
- self.assertLess(compiled, not_compiled)
@parameterized.named_parameters(
{
@@ -729,7 +548,180 @@ def test_ragged_input(self,
self.assertAllEqual(expected_values, y.values)
self.assertAllEqual(expected_shape, y.dense_shape)
+class TestCompiledDenseBincount(test.TestCase, parameterized.TestCase):
+
+ @parameterized.named_parameters(
+ {
+ "testcase_name": "_no_maxlength_basic",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
+ }, {
+ "testcase_name": "_maxlength",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "maxlength": 7,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
+ }, {
+ "testcase_name": "_minlength",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 9,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
+ [1, 0, 0, 0, 2, 0, 0, 1, 0]]
+ }, {
+ "testcase_name": "_minlength_larger_values",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 3,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
+ [1, 0, 0, 0, 2, 0, 0, 1]]
+ }, {
+ "testcase_name": "_no_maxlength_binary",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [[0, 1, 1, 1, 0, 0],
+ [0, 0, 0, 0, 1, 1]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_maxlength_binary",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "maxlength": 7,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0],
+ [1, 0, 0, 0, 1, 0, 0]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_minlength_binary",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 9,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
+ [1, 0, 0, 0, 1, 0, 0, 1, 0]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_minlength_larger_values_binary",
+ "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
+ "minlength": 3,
+ "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
+ [1, 0, 0, 0, 1, 0, 0, 1]],
+ "binary_output": True,
+ }, {
+ "testcase_name": "_no_maxlength_weights",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [[0. , 2. , 1. , 0.5, 0. , 0. ],
+ [0. , 0. , 0. , 0. , 9. , 3. ]],
+ "weights": [[0.5, 1, 2], [3, 4, 5]]
+ }, {
+ "testcase_name": "_1d_no_maxlength_base",
+ "x": np.array([3, 2, 1, 1], dtype=np.int32),
+ "expected_values": [0, 2, 1, 1]
+ }, {
+ "testcase_name": "_1d_binary",
+ "x": np.array([3, 2, 1, 1], dtype=np.int32),
+ "expected_values": [0, 1, 1, 1],
+ "binary_output": True
+ }, {
+ "testcase_name": "_1d_no_maxlength_weights",
+ "x": np.array([3, 2, 1, 5, 4, 4], dtype=np.int32),
+ "weights": [0.5, 1, 2, 3, 4, 5],
+ "expected_values": [0. , 2. , 1. , 0.5, 9. , 3. ]
+ }, {
+ "testcase_name": "_all_axes",
+ "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
+ "expected_values": [0, 1, 1, 1, 2, 1],
+ "axis": 0 # With None (recursive call) -> Bincount (No registered 'Bincount'
+ })
+ @test_util.disable_mlir_bridge('TODO: ?')
+ def test_compiled_dense(self,
+ x,
+ expected_values,
+ minlength=None,
+ maxlength=None,
+ binary_output=False,
+ weights=None,
+ axis=-1):
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis,
+ pseudo_hlo=True
+ )
+ self.assertAllEqual(expected_values, y)
+
+ @parameterized.named_parameters(
+ {
+ "testcase_name": "_no_maxlength_small",
+ "x": np.random.randint(200, size=(200, 200), dtype=np.int32)
+ }, {
+ "testcase_name": "_no_maxlength_medium",
+ "x": np.random.randint(500, size=(500, 500), dtype=np.int32)
+ }, {
+ "testcase_name": "_no_maxlength_large",
+ "x": np.random.randint(100, size=(1000, 1000), dtype=np.int32)
+ })
+ @test_util.disable_mlir_bridge('TODO: ?')
+ def test_compiled_dense_perf(self,
+ x,
+ minlength=None,
+ maxlength=None,
+ binary_output=False,
+ weights=None,
+ axis=-1):
+
+ @def_function.function()
+ def f_compiled(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis):
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis,
+ pseudo_hlo=True
+ )
+ return y
+
+ @def_function.function()
+ def f(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis):
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis
+ )
+ return y
+
+ lambda_f = lambda: f(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis)
+
+ lambda_fc = lambda: f_compiled(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis)
+ # warm-up
+ lambda_f(); lambda_fc()
+ not_compiled = timeit.timeit(lambda_f, number=10)
+ compiled = timeit.timeit(lambda_fc, number=10)
+ print("XLA JIT -> compiled: %f | not compiled: %f" %
+ (compiled , not_compiled))
+ self.assertLess(compiled, not_compiled)
+
class TestDenseBincount(test.TestCase, parameterized.TestCase):
@parameterized.parameters([{
From 35ca59dbca7da63b972646d38b1da1e6c2aa19b2 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 10 Jun 2022 19:37:34 +0000
Subject: [PATCH 038/259] Make pylint happy
---
tensorflow/python/ops/bincount_ops.py | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index 02274f7b5384a2..89673a7c050fa4 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -24,20 +24,19 @@
from tensorflow.python.ops import gen_math_ops
from tensorflow.python.eager import def_function
from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
@def_function.function(jit_compile=True)
-def dense_bincount_1d(input=[],
+def dense_bincount_1d(input_arr=[],
size=None,
weights=[],
binary_output=False):
- input = array_ops.reshape(input, [array_ops.shape(input)[0],-1])
+ input_arr = array_ops.reshape(input_arr, [array_ops.shape(input_arr)[0],-1])
output_shape = [size]
- idx = input
+ idx = array_ops.reshape(input_arr, [array_ops.shape(input_arr)[0],-1])
if (binary_output):
updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.bool)
output = array_ops.zeros(output_shape, dtype=dtypes.bool)
@@ -52,22 +51,24 @@ def dense_bincount_1d(input=[],
return histogram_out
-def prepare_idxs(input):
- j_indices = array_ops.reshape(input, [-1, 1])
- i_indices = array_ops.expand_dims(array_ops.repeat(math_ops.range(array_ops.shape(input)[0]), array_ops.shape(input)[1]), axis=-1)
+def prepare_idxs(input_arr):
+ j_indices = array_ops.reshape(input_arr, [-1, 1])
+ dim1 = math_ops.range(array_ops.shape(input_arr)[0])
+ dim2 = array_ops.shape(input_arr)[1]
+ i_indices = array_ops.expand_dims(array_ops.repeat(dim1, dim2), axis=-1)
new_indices = array_ops.concat([i_indices, j_indices], axis=-1)
return new_indices
@def_function.function(jit_compile=True)
-def dense_bincount_2d(input=[],
+def dense_bincount_2d(input_arr=[],
size=None,
weights=[],
binary_output=False):
- input = array_ops.reshape(input, [array_ops.shape(input)[0],-1])
- idx = prepare_idxs(input)
- output_shape = [input.shape[0], size]
+ input_arr = array_ops.reshape(input_arr, [array_ops.shape(input_arr)[0],-1])
+ idx = prepare_idxs(input_arr)
+ output_shape = [input_arr.shape[0], size]
if (binary_output):
updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.bool)
@@ -268,13 +269,13 @@ def bincount(arr,
if (pseudo_hlo == True):
if (len(arr.shape)==1):
return dense_bincount_1d(
- input=arr,
+ input_arr=arr,
size=output_size,
weights=weights,
binary_output=binary_output)
else:
return dense_bincount_2d(
- input=arr,
+ input_arr=arr,
size=output_size,
weights=weights,
binary_output=binary_output)
From e6d57132b7b462ba04cf0e5020d7f9368a0b79ed Mon Sep 17 00:00:00 2001
From: bhack
Date: Tue, 14 Jun 2022 20:27:30 +0000
Subject: [PATCH 039/259] Porting TF 2.3.x GPU ir_emitter CPU
---
.../compiler/xla/service/cpu/ir_emitter.cc | 167 +++++++++++++++++-
.../compiler/xla/service/cpu/ir_emitter.h | 24 +++
2 files changed, 189 insertions(+), 2 deletions(-)
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index dad4b96647b674..222947f2a1549b 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -86,6 +86,7 @@ limitations under the License.
namespace xla {
namespace {
+using llvm_ir::IrArray;
using llvm_ir::IrName;
using llvm_ir::SetToFirstInsertPoint;
} // namespace
@@ -1926,8 +1927,170 @@ Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
return Unimplemented("Send-done is not implemented on CPU.");
}
-Status IrEmitter::HandleScatter(HloInstruction*) {
- return Unimplemented("Scatter is not implemented on CPUs.");
+Status IrEmitter::HandleScatter(HloInstruction* scatter) {
+ const HloInstruction* operand = scatter->operand(0);
+ const HloInstruction* scatter_indices = scatter->operand(1);
+ const HloInstruction* updates = scatter->operand(2);
+
+ TF_RETURN_IF_ERROR(EmitScatter(
+ scatter,
+ /*scatter_indices_gen=*/
+ [=](const IrArray::Index& index) {
+ return GetIrArray(*scatter_indices, *scatter)
+ .EmitReadArrayElement(index, &b_, "scatter_index");
+ },
+ /*updates_gen=*/
+ [=](const IrArray::Index& index) {
+ return GetIrArray(*updates, *scatter)
+ .EmitReadArrayElement(index, &b_, "update");
+ }));
+
+ return Status::OK();
+}
+Status IrEmitter::EmitScatter(
+ HloInstruction* scatter,
+ const llvm_ir::ElementGenerator& scatter_indices_gen,
+ const llvm_ir::ElementGenerator& updates_gen) {
+ const HloInstruction* operand = scatter->operand(0);
+ const HloInstruction* scatter_indices = scatter->operand(1);
+ const HloInstruction* updates = scatter->operand(2);
+ const ScatterDimensionNumbers& dim_numbers =
+ scatter->scatter_dimension_numbers();
+ CHECK(ShapeUtil::Equal(scatter->shape(), operand->shape()));
+
+ auto loop_body_emitter = [&](const IrArray::Index& index) -> Status {
+ std::vector raw_window_multidim;
+ std::vector input_scatter_multidim;
+ std::vector raw_window_bounds;
+
+ // Partition the index into window indices and scatter indices.
+ for (int64_t i = 0, e = index.size(); i != e; ++i) {
+ // For window indices also remember the window size, this comes in handy
+ // later.
+ if (absl::c_binary_search(dim_numbers.update_window_dims(), i)) {
+ raw_window_multidim.push_back(index[i]);
+ raw_window_bounds.push_back(updates->shape().dimensions(i));
+ } else {
+ input_scatter_multidim.push_back(index[i]);
+ }
+ }
+ DCHECK_EQ(raw_window_multidim.size(),
+ dim_numbers.update_window_dims_size());
+
+ // Apply inserted_window_dims to the window dimensions.
+ int64_t raw_window_multidim_idx = 0;
+ std::vector input_window_multidim;
+ std::vector input_window_bounds;
+ for (int64_t i = 0, e = operand->shape().rank(); i != e; ++i) {
+ if (absl::c_binary_search(dim_numbers.inserted_window_dims(), i)) {
+ input_window_bounds.push_back(1); // Trivial dimension.
+ input_window_multidim.push_back(index.GetConstantWithIndexType(0));
+ } else {
+ input_window_bounds.push_back(
+ raw_window_bounds[raw_window_multidim_idx]);
+ input_window_multidim.push_back(
+ raw_window_multidim[raw_window_multidim_idx]);
+ ++raw_window_multidim_idx;
+ }
+ }
+ DCHECK_EQ(input_window_multidim.size(), operand->shape().rank());
+
+ // Insert a 1 dimension at the end if index_vector_dim requests one.
+ Shape scatter_indices_shape = scatter_indices->shape();
+ if (dim_numbers.index_vector_dim() == scatter_indices_shape.rank()) {
+ scatter_indices_shape.add_dimensions(1);
+ scatter_indices_shape.mutable_layout()->add_minor_to_major(
+ dim_numbers.index_vector_dim());
+ }
+
+ // Now load the indices corresponding to the current window from
+ // scatter_indices.
+ std::vector raw_scatter_index_multidim =
+ input_scatter_multidim;
+ raw_scatter_index_multidim.insert(
+ raw_scatter_index_multidim.begin() + dim_numbers.index_vector_dim(),
+ nullptr);
+ llvm::Value* is_in_bounds = b_.getTrue();
+ for (int64_t i = 0, e = dim_numbers.scatter_dims_to_operand_dims_size();
+ i != e; ++i) {
+ // Our index is stored along index_vector_dim, insert that into the lookup
+ // index into scatter_indices.
+ raw_scatter_index_multidim[dim_numbers.index_vector_dim()] =
+ index.GetConstantWithIndexType(i);
+ llvm_ir::IrArray::Index raw_scatter_index_index(
+ raw_scatter_index_multidim, scatter_indices_shape, index.GetType());
+
+ int64_t operand_dim = dim_numbers.scatter_dims_to_operand_dims(i);
+ TF_ASSIGN_OR_RETURN(
+ llvm::Value* const loaded_scatter_index,
+ scatter_indices_gen(raw_scatter_index_index.SourceIndexOfReshape(
+ scatter_indices_shape, scatter_indices->shape(), &b_)));
+ // And add the index to our window index. This yields the output index.
+ llvm::Value* casted_scatter_index =
+ IntCast(loaded_scatter_index, index.GetType(),
+ /*isSigned=*/true);
+ llvm::Value* dim_offset =
+ Add(input_window_multidim[operand_dim], casted_scatter_index);
+ input_window_multidim[operand_dim] = dim_offset;
+
+ // Also do the bounds check now.
+ int64_t max_index = operand->shape().dimensions(operand_dim) -
+ input_window_bounds[operand_dim] + 1;
+ // is_in_bounds = index >= 0 && index < dim_size-window_size+1
+ // --> index u< dim_size-window_size+1
+ is_in_bounds =
+ And(is_in_bounds, ICmpULT(casted_scatter_index,
+ index.GetConstantWithIndexType(max_index)));
+ }
+
+ llvm_ir::LlvmIfData if_window_in_bounds_data = llvm_ir::EmitIfThenElse(
+ is_in_bounds, "scatter.in_bounds", &b_, /*emit_else=*/false);
+ llvm_ir::SetToFirstInsertPoint(if_window_in_bounds_data.true_block, &b_);
+ // All done, now just read from the calculated input from the window, and do
+ // an atomic store to the calculated location in the output.
+ HloInstruction* output_hlo =
+ scatter->IsFused() ? scatter->parent()->FusionInstruction() : scatter;
+ llvm_ir::IrArray::Index input_window_index(
+ input_window_multidim, output_hlo->shape(), index.GetType());
+ llvm::Value* output_address =
+ GetIrArray(*output_hlo, *output_hlo)
+ .EmitArrayElementAddress(input_window_index, &b_);
+ llvm::Value* input_address = Alloca(llvm_ir::PrimitiveTypeToIrType(
+ updates->shape().element_type(), module_));
+ TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, updates_gen(index));
+ Store(input_ir_value, input_address);
+
+ // (TODO) These are defined in the GPU emitter only
+ // are these GPU specific?
+
+ /*if (!scatter->unique_indices()) {
+ return EmitAtomicOperationForNestedComputation(
+ *scatter->to_apply(), output_address, input_address);
+ } else {
+ return EmitCallToNestedComputation(*scatter->to_apply(),
+ {output_address, input_address},
+ output_address);
+ }*/
+ };
+
+ // (TODO): These are defined only in the GPU emitter
+ // are these GPU specific?
+
+ // Launch a kernel that reads every element in the updates tensor. We could
+ // also do one kernel per window instead if bounds checks turn out to be a
+ // bottleneck.
+ // LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
+ // updates->shape(), ir_emitter_context_->device_description());
+ // UpdateLaunchDimensions(launch_dimensions,
+ // ir_emitter_context_->llvm_module());
+
+ // (TODO):
+ // GetIndexTypeForKernel is gpu only
+ //return ParallelLoopEmitter(loop_body_emitter, updates->shape(),
+ // launch_dimensions, &b_)
+ // .EmitLoop(IrName(scatter),
+ // GetIndexTypeForKernel(scatter, launch_dimensions.launch_bound(),
+ // &b_));
}
Status IrEmitter::HandleSlice(HloInstruction* slice) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 6fec593aae277f..f99443d64776b7 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -230,6 +230,17 @@ class IrEmitter : public DfsHloVisitorWithDefault,
// not found, this will log a fatal error.
llvm::Value* GetEmittedValueFor(const HloInstruction* hlo);
+ // Returns the IrArray which contains the output of hlo.
+ //
+ // consumer is the HLO in which this IrArray is used -- we use this to (try
+ // to) add metadata indicating that the array is invariant within consumer.
+ //
+ // To get the buffer into which hlo should write its own output, call
+ // GetIrArray(hlo, hlo).
+ llvm_ir::IrArray GetIrArray(const HloInstruction& hlo,
+ const HloInstruction& consumer,
+ const ShapeIndex& shape_index = {});
+
// Gets an IrArray representing the given hlo.
llvm_ir::IrArray GetIrArrayFor(const HloInstruction* hlo);
@@ -341,6 +352,19 @@ class IrEmitter : public DfsHloVisitorWithDefault,
Status EmitMemcpy(const HloInstruction& source,
const HloInstruction& destination);
+ // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in
+ // the process. `scatter` may be fused, scatter indices are taken from
+ // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is
+ // expected to have the operand values in it already. If unique_indices
+ // is false, we will use an atomic update. Using false for unique_indices
+ // is safe only when it is guaranteed that there are no duplicate
+ // indices.
+ // When using unique_indices=true, it is the caller's responsibility to
+ // ensure there is no overlap.
+ Status EmitScatter(HloInstruction* scatter,
+ const llvm_ir::ElementGenerator& scatter_indices_gen,
+ const llvm_ir::ElementGenerator& updates_gen);
+
// Emits IR to compute the target address of the buffer for the given op.
// After calling this function, you can get a pointer to this buffer by
// calling GetIrArrayForOp or GetEmittedValueFor.
From 759c76ed46a4cd266b1cde0b0a3c15cf112aa0b9 Mon Sep 17 00:00:00 2001
From: bhack
Date: Tue, 14 Jun 2022 20:31:19 +0000
Subject: [PATCH 040/259] Prevent crash
---
tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 222947f2a1549b..ffc8da912634fe 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1928,6 +1928,9 @@ Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
}
Status IrEmitter::HandleScatter(HloInstruction* scatter) {
+ // (TODO) this is to prevent the crash untill the porting is complete
+ return Unimplemented("Scatter is not implemented on CPUs.");
+ // ------------
const HloInstruction* operand = scatter->operand(0);
const HloInstruction* scatter_indices = scatter->operand(1);
const HloInstruction* updates = scatter->operand(2);
From 097e6c487e145808a2f790b2a4a2b747dc75beaa Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 16 Jun 2022 21:19:59 +0000
Subject: [PATCH 041/259] Add 1d dense bincount via scatter; remove WIP scatter
 CPU emitter
---
.../compiler/tf2xla/kernels/bincount_op.cc | 47 ++++-
.../compiler/xla/service/cpu/ir_emitter.cc | 167 +-----------------
.../compiler/xla/service/cpu/ir_emitter.h | 24 ---
tensorflow/python/ops/bincount_ops.py | 19 +-
tensorflow/python/ops/bincount_ops_test.py | 32 +++-
5 files changed, 65 insertions(+), 224 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 88ace8ff6329c2..764e77f0d224c7 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -45,13 +45,13 @@ class DenseBincountOp : public XlaOpKernel {
auto weights_size = weights_shape.dimensions(0);
auto input_xla_type = ctx->input_xla_type(0);
xla::PrimitiveType dtype;
- bool has_weight;
+ bool has_weights;
if (weights_size){
- has_weight = true;
+ has_weights = true;
dtype = ctx->input_xla_type(2);
}
else {
- has_weight = false;
+ has_weights = false;
dtype = input_xla_type;
}
int64_t output_size;
@@ -70,15 +70,46 @@ class DenseBincountOp : public XlaOpKernel {
output_shape = xla::ShapeUtil::MakeShape(dtype, {size, output_size});
dim = input_shape.dimensions(1);
}
+ input = xla::Reshape(input, {size, 1});
+ auto idx = xla::Reshape(input, {size, 1});
+ auto one = xla::One(ctx->builder(), input_xla_type);
+ xla::XlaOp updates, output;
+ if (has_weights) {
+ updates = weights;
+ auto zero = xla::Zero(ctx->builder(), dtype);
+ output = xla::Broadcast(zero, {output_shape.dimensions()});
+ }
+ else {
+ auto zero = xla::Zero(ctx->builder(), input_xla_type);
+ updates = xla::Broadcast(one, {output_shape.dimensions()});
+ output = xla::Broadcast(zero, {output_shape.dimensions()});
+ }
+
+ xla::XlaComputation assn_computation = [&] {
+ std::unique_ptr subb =
+ ctx->builder()->CreateSubBuilder("scatter_bincount");
+ xla::Shape param_shape = xla::ShapeUtil::MakeShape(dtype, {});
+ auto p0 = xla::Parameter(subb.get(), 0, param_shape, "p0");
+ auto p1 = xla::Parameter(subb.get(), 1, param_shape, "p1");
+ if (binary_output_) {
+ xla::One(subb.get(), xla::S32);
+ }
+ else {
+ xla::Add(p0, p1);
+ }
+ return subb->BuildAndNoteError();
+ }();
+ xla::ScatterDimensionNumbers scatter_dnums;
+ scatter_dnums.set_index_vector_dim(1);
+ scatter_dnums.add_inserted_window_dims(0);
+ scatter_dnums.add_scatter_dims_to_operand_dims(0);
+ output = xla::Scatter(output, idx, updates, assn_computation, scatter_dnums, false, false);
- auto loop_shape = xla::ShapeUtil::MakeTupleShape(
- {counter_shape, data_shape, output_shape, weights_shape});
-
- ctx->SetOutput(0, input);
+ ctx->SetOutput(0, output);
}
};
-//REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"), DenseBincountOp);
+REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"), DenseBincountOp);
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index ffc8da912634fe..a78c33e5058945 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1927,173 +1927,8 @@ Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
return Unimplemented("Send-done is not implemented on CPU.");
}
-Status IrEmitter::HandleScatter(HloInstruction* scatter) {
- // (TODO) this is to prevent the crash untill the porting is complete
+Status IrEmitter::HandleScatter(HloInstruction*) {
return Unimplemented("Scatter is not implemented on CPUs.");
- // ------------
- const HloInstruction* operand = scatter->operand(0);
- const HloInstruction* scatter_indices = scatter->operand(1);
- const HloInstruction* updates = scatter->operand(2);
-
- TF_RETURN_IF_ERROR(EmitScatter(
- scatter,
- /*scatter_indices_gen=*/
- [=](const IrArray::Index& index) {
- return GetIrArray(*scatter_indices, *scatter)
- .EmitReadArrayElement(index, &b_, "scatter_index");
- },
- /*updates_gen=*/
- [=](const IrArray::Index& index) {
- return GetIrArray(*updates, *scatter)
- .EmitReadArrayElement(index, &b_, "update");
- }));
-
- return Status::OK();
-}
-Status IrEmitter::EmitScatter(
- HloInstruction* scatter,
- const llvm_ir::ElementGenerator& scatter_indices_gen,
- const llvm_ir::ElementGenerator& updates_gen) {
- const HloInstruction* operand = scatter->operand(0);
- const HloInstruction* scatter_indices = scatter->operand(1);
- const HloInstruction* updates = scatter->operand(2);
- const ScatterDimensionNumbers& dim_numbers =
- scatter->scatter_dimension_numbers();
- CHECK(ShapeUtil::Equal(scatter->shape(), operand->shape()));
-
- auto loop_body_emitter = [&](const IrArray::Index& index) -> Status {
- std::vector raw_window_multidim;
- std::vector input_scatter_multidim;
- std::vector raw_window_bounds;
-
- // Partition the index into window indices and scatter indices.
- for (int64_t i = 0, e = index.size(); i != e; ++i) {
- // For window indices also remember the window size, this comes in handy
- // later.
- if (absl::c_binary_search(dim_numbers.update_window_dims(), i)) {
- raw_window_multidim.push_back(index[i]);
- raw_window_bounds.push_back(updates->shape().dimensions(i));
- } else {
- input_scatter_multidim.push_back(index[i]);
- }
- }
- DCHECK_EQ(raw_window_multidim.size(),
- dim_numbers.update_window_dims_size());
-
- // Apply inserted_window_dims to the window dimensions.
- int64_t raw_window_multidim_idx = 0;
- std::vector input_window_multidim;
- std::vector input_window_bounds;
- for (int64_t i = 0, e = operand->shape().rank(); i != e; ++i) {
- if (absl::c_binary_search(dim_numbers.inserted_window_dims(), i)) {
- input_window_bounds.push_back(1); // Trivial dimension.
- input_window_multidim.push_back(index.GetConstantWithIndexType(0));
- } else {
- input_window_bounds.push_back(
- raw_window_bounds[raw_window_multidim_idx]);
- input_window_multidim.push_back(
- raw_window_multidim[raw_window_multidim_idx]);
- ++raw_window_multidim_idx;
- }
- }
- DCHECK_EQ(input_window_multidim.size(), operand->shape().rank());
-
- // Insert a 1 dimension at the end if index_vector_dim requests one.
- Shape scatter_indices_shape = scatter_indices->shape();
- if (dim_numbers.index_vector_dim() == scatter_indices_shape.rank()) {
- scatter_indices_shape.add_dimensions(1);
- scatter_indices_shape.mutable_layout()->add_minor_to_major(
- dim_numbers.index_vector_dim());
- }
-
- // Now load the indices corresponding to the current window from
- // scatter_indices.
- std::vector raw_scatter_index_multidim =
- input_scatter_multidim;
- raw_scatter_index_multidim.insert(
- raw_scatter_index_multidim.begin() + dim_numbers.index_vector_dim(),
- nullptr);
- llvm::Value* is_in_bounds = b_.getTrue();
- for (int64_t i = 0, e = dim_numbers.scatter_dims_to_operand_dims_size();
- i != e; ++i) {
- // Our index is stored along index_vector_dim, insert that into the lookup
- // index into scatter_indices.
- raw_scatter_index_multidim[dim_numbers.index_vector_dim()] =
- index.GetConstantWithIndexType(i);
- llvm_ir::IrArray::Index raw_scatter_index_index(
- raw_scatter_index_multidim, scatter_indices_shape, index.GetType());
-
- int64_t operand_dim = dim_numbers.scatter_dims_to_operand_dims(i);
- TF_ASSIGN_OR_RETURN(
- llvm::Value* const loaded_scatter_index,
- scatter_indices_gen(raw_scatter_index_index.SourceIndexOfReshape(
- scatter_indices_shape, scatter_indices->shape(), &b_)));
- // And add the index to our window index. This yields the output index.
- llvm::Value* casted_scatter_index =
- IntCast(loaded_scatter_index, index.GetType(),
- /*isSigned=*/true);
- llvm::Value* dim_offset =
- Add(input_window_multidim[operand_dim], casted_scatter_index);
- input_window_multidim[operand_dim] = dim_offset;
-
- // Also do the bounds check now.
- int64_t max_index = operand->shape().dimensions(operand_dim) -
- input_window_bounds[operand_dim] + 1;
- // is_in_bounds = index >= 0 && index < dim_size-window_size+1
- // --> index u< dim_size-window_size+1
- is_in_bounds =
- And(is_in_bounds, ICmpULT(casted_scatter_index,
- index.GetConstantWithIndexType(max_index)));
- }
-
- llvm_ir::LlvmIfData if_window_in_bounds_data = llvm_ir::EmitIfThenElse(
- is_in_bounds, "scatter.in_bounds", &b_, /*emit_else=*/false);
- llvm_ir::SetToFirstInsertPoint(if_window_in_bounds_data.true_block, &b_);
- // All done, now just read from the calculated input from the window, and do
- // an atomic store to the calculated location in the output.
- HloInstruction* output_hlo =
- scatter->IsFused() ? scatter->parent()->FusionInstruction() : scatter;
- llvm_ir::IrArray::Index input_window_index(
- input_window_multidim, output_hlo->shape(), index.GetType());
- llvm::Value* output_address =
- GetIrArray(*output_hlo, *output_hlo)
- .EmitArrayElementAddress(input_window_index, &b_);
- llvm::Value* input_address = Alloca(llvm_ir::PrimitiveTypeToIrType(
- updates->shape().element_type(), module_));
- TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, updates_gen(index));
- Store(input_ir_value, input_address);
-
- // (TODO) These are defined in the GPU emitter only
- // are these GPU specific?
-
- /*if (!scatter->unique_indices()) {
- return EmitAtomicOperationForNestedComputation(
- *scatter->to_apply(), output_address, input_address);
- } else {
- return EmitCallToNestedComputation(*scatter->to_apply(),
- {output_address, input_address},
- output_address);
- }*/
- };
-
- // (TODO): These are defined only in the GPU emitter
- // are these GPU specific?
-
- // Launch a kernel that reads every element in the updates tensor. We could
- // also do one kernel per window instead if bounds checks turn out to be a
- // bottleneck.
- // LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
- // updates->shape(), ir_emitter_context_->device_description());
- // UpdateLaunchDimensions(launch_dimensions,
- // ir_emitter_context_->llvm_module());
-
- // (TODO):
- // GetIndexTypeForKernel is gpu only
- //return ParallelLoopEmitter(loop_body_emitter, updates->shape(),
- // launch_dimensions, &b_)
- // .EmitLoop(IrName(scatter),
- // GetIndexTypeForKernel(scatter, launch_dimensions.launch_bound(),
- // &b_));
}
Status IrEmitter::HandleSlice(HloInstruction* slice) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index f99443d64776b7..6fec593aae277f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -230,17 +230,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
// not found, this will log a fatal error.
llvm::Value* GetEmittedValueFor(const HloInstruction* hlo);
- // Returns the IrArray which contains the output of hlo.
- //
- // consumer is the HLO in which this IrArray is used -- we use this to (try
- // to) add metadata indicating that the array is invariant within consumer.
- //
- // To get the buffer into which hlo should write its own output, call
- // GetIrArray(hlo, hlo).
- llvm_ir::IrArray GetIrArray(const HloInstruction& hlo,
- const HloInstruction& consumer,
- const ShapeIndex& shape_index = {});
-
// Gets an IrArray representing the given hlo.
llvm_ir::IrArray GetIrArrayFor(const HloInstruction* hlo);
@@ -352,19 +341,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
Status EmitMemcpy(const HloInstruction& source,
const HloInstruction& destination);
- // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in
- // the process. `scatter` may be fused, scatter indices are taken from
- // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is
- // expected to have the operand values in it already. If unique_indices
- // is false, we will use an atomic update. Using false for unique_indices
- // is safe only when it is guaranteed that there are no duplicate
- // indices.
- // When using unique_indices=true, it is the caller's responsibility to
- // ensure there is no overlap.
- Status EmitScatter(HloInstruction* scatter,
- const llvm_ir::ElementGenerator& scatter_indices_gen,
- const llvm_ir::ElementGenerator& updates_gen);
-
// Emits IR to compute the target address of the buffer for the given op.
// After calling this function, you can get a pointer to this buffer by
// calling GetIrArrayForOp or GetEmittedValueFor.
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index 89673a7c050fa4..79bc2616c8edfb 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -93,8 +93,7 @@ def bincount(arr,
dtype=dtypes.int32,
name=None,
axis=None,
- binary_output=False,
- pseudo_hlo=False):
+ binary_output=False):
"""Counts the number of occurrences of each value in an integer array.
If `minlength` and `maxlength` are not given, returns a vector with length
@@ -266,21 +265,7 @@ def bincount(arr,
binary_output=binary_output)
else:
weights = validate_dense_weights(arr, weights, dtype)
- if (pseudo_hlo == True):
- if (len(arr.shape)==1):
- return dense_bincount_1d(
- input_arr=arr,
- size=output_size,
- weights=weights,
- binary_output=binary_output)
- else:
- return dense_bincount_2d(
- input_arr=arr,
- size=output_size,
- weights=weights,
- binary_output=binary_output)
- else:
- return gen_math_ops.dense_bincount(
+ return gen_math_ops.dense_bincount(
input=arr,
size=output_size,
weights=weights,
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 598ea89a637a0a..4b60960c4a6362 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -634,16 +634,31 @@ def test_compiled_dense(self,
binary_output=False,
weights=None,
axis=-1):
- y = bincount_ops.bincount(
- x,
+
+ @def_function.function(jit_compile=True)
+ def f (x,
weights=weights,
minlength=minlength,
maxlength=maxlength,
binary_output=binary_output,
- axis=axis,
- pseudo_hlo=True
- )
- self.assertAllEqual(expected_values, y)
+ axis=axis
+ ):
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis
+ )
+ return y
+ res = f(x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis)
+ self.assertAllEqual(expected_values, res)
@parameterized.named_parameters(
{
@@ -665,7 +680,7 @@ def test_compiled_dense_perf(self,
weights=None,
axis=-1):
- @def_function.function()
+ @def_function.function(jit_compile=True)
def f_compiled(x,
weights=weights,
minlength=minlength,
@@ -678,8 +693,7 @@ def f_compiled(x,
minlength=minlength,
maxlength=maxlength,
binary_output=binary_output,
- axis=axis,
- pseudo_hlo= True
+ axis=axis
)
return y
From 53004cc132d34698ce1c0a65d09d3709d5092cf5 Mon Sep 17 00:00:00 2001
From: shuw
Date: Thu, 16 Jun 2022 20:13:44 -0700
Subject: [PATCH 042/259] Add C++ unittest
---
.../jit/mark_for_compilation_pass_test.cc | 19 +++
.../compiler/xla/xla_disable_op_test.py | 112 ------------------
2 files changed, 19 insertions(+), 112 deletions(-)
delete mode 100644 tensorflow/python/compiler/xla/xla_disable_op_test.py
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index aeecd9a3947d20..79acf4eeb7a9d7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -196,6 +196,25 @@ TEST(XlaCompilationTest, StringUnsupported) {
EXPECT_TRUE(clusters.empty());
}
+TEST(XlaCompilationTest, WhereUnsupported) {
+ std::unique_ptr graph(new Graph(OpRegistry::Global()));
+ {
+ GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+ Node* a = ops::SourceOp(
+ "Const", builder.opts()
+ .WithName("A")
+ .WithAttr("dtype", DT_INT32)
+ .WithAttr("value", Tensor()));
+ Node* b = ops::UnaryOp("Where", a, builder.opts().WithName("B"));
+ ops::BinaryOp("Gather", b, a, builder.opts().WithName("C"));
+ TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+ }
+
+ TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+ auto clusters = GetClusters(*graph);
+ EXPECT_TRUE(!clusters.empty());
+}
+
TEST(XlaCompilationTest, HalfSupported) {
std::unique_ptr graph(new Graph(OpRegistry::Global()));
{
diff --git a/tensorflow/python/compiler/xla/xla_disable_op_test.py b/tensorflow/python/compiler/xla/xla_disable_op_test.py
deleted file mode 100644
index 0c972d328f3212..00000000000000
--- a/tensorflow/python/compiler/xla/xla_disable_op_test.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test for checking if tf.where op is excluded from XLA auto-clustering."""
-
-import functools
-import numpy as np
-import os
-import subprocess
-
-from tensorflow.compat.v1 import config
-from tensorflow.python.eager import def_function
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import sort_ops
-from tensorflow.python.ops import sort_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.platform import test
-
-
-cmds_linux = {
- "grep_where": (
- "grep 'Where' /tmp/xla_logs/* && rm -rf /tmp/xla_logs/"),
-}
-
-def run_shell_cmd(args):
- """Executes shell commands and returns output.
-
- Args:
- args: String of shell commands to run.
-
- Returns:
- Tuple output (stdoutdata, stderrdata) from running the shell commands.
- """
- proc = subprocess.Popen(
- args,
- shell=True,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT
- )
- return proc.communicate()
-
-class XlaDisableOpTest(test.TestCase):
-
- @def_function.function()
- def _runEvaluation(self, x, y, predictions):
- dummy_loss = 0.9
- predictions = array_ops.reshape(predictions, [-1])
- display_ids = x
- display_ids = array_ops.reshape(display_ids, [-1])
- labels = array_ops.reshape(y, [-1])
- sorted_ids = sort_ops.argsort(display_ids)
- display_ids = array_ops.gather(display_ids, indices=sorted_ids)
- predictions = array_ops.gather(predictions, indices=sorted_ids)
- labels = array_ops.gather(labels, indices=sorted_ids)
- _, display_ids_idx, display_ids_ads_count = array_ops.unique_with_counts(
- display_ids, out_idx=dtypes.int64)
- pad_length = 30 - math_ops.reduce_max(display_ids_ads_count)
- preds = ragged_tensor.RaggedTensor.from_value_rowids(
- predictions, display_ids_idx).to_tensor()
- labels = ragged_tensor.RaggedTensor.from_value_rowids(
- labels, display_ids_idx).to_tensor()
- labels_mask = math_ops.reduce_max(labels, 1)
- preds_masked = array_ops.boolean_mask(preds, labels_mask)
- labels_masked = array_ops.boolean_mask(labels, labels_mask)
- labels_masked = math_ops.argmax(labels_masked, axis=1, output_type=dtypes.int32)
- labels_masked = array_ops.reshape(labels_masked, [-1, 1])
-
- preds_masked = array_ops.pad(preds_masked, [(0, 0), (0, pad_length)])
- _, predictions_idx = nn_ops.top_k(preds_masked, 12)
- indices = math_ops.equal(predictions_idx, labels_masked)
- return math_ops.cast(array_ops.shape(indices)[0], dtypes.float64)
-
- def testRunEval(self):
- dim_prediction = 1024
- config.optimizer.set_jit(True)
- pre = np.random.random((dim_prediction, 1))
- y_tmp = np.zeros((dim_prediction, 1), dtype=float)
-
- num_ones = np.random.randint(1, dim_prediction+1, 1)
- id_one = np.random.randint(0, dim_prediction, num_ones)
- for i in id_one:
- y_tmp[i][0] = 1.
- x_tmp = np.random.randint(0, dim_prediction,
- (dim_prediction, 1), dtype=np.int64)
- display_id_counter = self._runEvaluation(x_tmp, y_tmp, pre)
-
- out,err = run_shell_cmd(cmds_linux['grep_where'])
- self.assertEqual(err, None)
- self.assertEqual(len(out), 0)
-
-
-if __name__ == '__main__':
- os.environ['XLA_FLAGS'] = "--xla_dump_to='/tmp/xla_logs'"
- os.environ['TF_XLA_FLAGS'] = "--tf_xla_cluster_exclude_ops=Where"
- test.main()
From dc7415615c5722901b9272f22a8cdae1765b3db0 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 13:14:55 +0000
Subject: [PATCH 043/259] Add rank 2 support
---
.../compiler/tf2xla/kernels/bincount_op.cc | 58 ++++++++++++-------
tensorflow/python/ops/bincount_ops_test.py | 15 ++++-
2 files changed, 48 insertions(+), 25 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 764e77f0d224c7..9e0dbd901605e1 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -34,7 +34,7 @@ class DenseBincountOp : public XlaOpKernel {
explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_));
}
-
+
void Compile(XlaOpKernelContext* ctx) override {
// Dumb implementation for the simplest test case
xla::XlaOp input = ctx->Input(0);
@@ -60,30 +60,48 @@ class DenseBincountOp : public XlaOpKernel {
OP_REQUIRES_OK(ctx, input_shape_or.status());
auto input_shape = input_shape_or.ValueOrDie();
auto size = input_shape.dimensions(0);
- auto dim = 1;
auto rank = input_shape.rank();
- auto counter_shape = xla::ShapeUtil::MakeShape(xla::S64, {});
- const xla::Shape data_shape = xla::ShapeUtil::MakeShape(input_xla_type, {input_shape.dimensions()});
xla::Shape output_shape = xla::ShapeUtil::MakeShape(dtype, {output_size});
+ xla::XlaOp idx, updates, output;
+ xla::ScatterDimensionNumbers scatter_dnums;
+ scatter_dnums.set_index_vector_dim(1);
+ scatter_dnums.add_inserted_window_dims(0);
+ scatter_dnums.add_scatter_dims_to_operand_dims(0);
+ auto one = xla::One(ctx->builder(), input_xla_type);
+ auto zero = xla::Zero(ctx->builder(), input_xla_type);;
+
if (rank == 2) {
output_shape = xla::ShapeUtil::MakeShape(dtype, {size, output_size});
- dim = input_shape.dimensions(1);
- }
- input = xla::Reshape(input, {size, 1});
- auto idx = xla::Reshape(input, {size, 1});
- auto one = xla::One(ctx->builder(), input_xla_type);
- xla::XlaOp updates, output;
- if (has_weights) {
- updates = weights;
- auto zero = xla::Zero(ctx->builder(), dtype);
- output = xla::Broadcast(zero, {output_shape.dimensions()});
+ scatter_dnums.add_inserted_window_dims(1);
+ scatter_dnums.add_scatter_dims_to_operand_dims(1);
+ auto i_shape = xla::ShapeUtil::MakeShape(input_xla_type, {input_shape.dimensions()});
+ auto i = xla::Iota(ctx->builder(), i_shape, 0);
+ i = xla::Reshape(i, {input_shape.dimensions(0)*input_shape.dimensions(1), 1});
+ auto j = xla::Reshape(input, {input_shape.dimensions(0)*input_shape.dimensions(1), 1});
+ std::vector iotas_to_concat;
+ iotas_to_concat.push_back(i);
+ iotas_to_concat.push_back(j);
+ idx = xla::ConcatInDim(ctx->builder(), iotas_to_concat, 1);
+ updates = xla::Broadcast(one, {input_shape.dimensions(0)*input_shape.dimensions(1)});
+ if (has_weights) {
+ weights = xla::Reshape(weights, {input_shape.dimensions(0)*input_shape.dimensions(1)});
+ zero = xla::Zero(ctx->builder(), dtype);
+ updates = weights;
+ }
}
- else {
- auto zero = xla::Zero(ctx->builder(), input_xla_type);
- updates = xla::Broadcast(one, {output_shape.dimensions()});
- output = xla::Broadcast(zero, {output_shape.dimensions()});
+
+ else {
+ input = xla::Reshape(input, {size, 1});
+ idx = xla::Reshape(input, {size, 1});
+ updates = xla::Broadcast(one, {size});
+ if (has_weights) {
+ updates = weights;
+ zero = xla::Zero(ctx->builder(), dtype);
+ }
}
+
+ output = xla::Broadcast(zero, {output_shape.dimensions()});
xla::XlaComputation assn_computation = [&] {
std::unique_ptr subb =
@@ -99,10 +117,6 @@ class DenseBincountOp : public XlaOpKernel {
}
return subb->BuildAndNoteError();
}();
- xla::ScatterDimensionNumbers scatter_dnums;
- scatter_dnums.set_index_vector_dim(1);
- scatter_dnums.add_inserted_window_dims(0);
- scatter_dnums.add_scatter_dims_to_operand_dims(0);
output = xla::Scatter(output, idx, updates, assn_computation, scatter_dnums, false, false);
ctx->SetOutput(0, output);
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 4b60960c4a6362..34bd6fb907778b 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -652,6 +652,13 @@ def f (x,
axis=axis
)
return y
+ #print("CAiooooooooooooooooooooooooooooooooooooooooooo")
+ #print(f.experimental_get_compiler_ir(f(x,
+ # weights=weights,
+ # minlength=minlength,
+ # maxlength=maxlength,
+ # binary_output=binary_output,
+ # axis=axis))('hlo'))
res = f(x,
weights=weights,
minlength=minlength,
@@ -672,6 +679,10 @@ def f (x,
"x": np.random.randint(100, size=(1000, 1000), dtype=np.int32)
})
@test_util.disable_mlir_bridge('TODO: ?')
+ # TODO: Disable performance test on CPU
+ # missing scatter emitter for CPU fallback to a serial xla::While
+ # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/service/cpu/ir_emitter.cc#L1929-L1931
+ @test_util.run_gpu_only
def test_compiled_dense_perf(self,
x,
minlength=None,
@@ -731,9 +742,7 @@ def f(x,
lambda_f(); lambda_fc()
not_compiled = timeit.timeit(lambda_f, number=10)
compiled = timeit.timeit(lambda_fc, number=10)
- print("XLA JIT -> compiled: %f | not compiled: %f" %
- (compiled , not_compiled))
- self.assertLess(compiled, not_compiled)
+ self.assertLess(compiled, not_compiled * 1.01)
class TestDenseBincount(test.TestCase, parameterized.TestCase):
From 2ce52a30fcef4d0f32fb76e931639893c99b1c26 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 13:19:35 +0000
Subject: [PATCH 044/259] Remove debug IR
---
tensorflow/python/ops/bincount_ops_test.py | 7 -------
1 file changed, 7 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 34bd6fb907778b..76d8d54a638374 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -652,13 +652,6 @@ def f (x,
axis=axis
)
return y
- #print("CAiooooooooooooooooooooooooooooooooooooooooooo")
- #print(f.experimental_get_compiler_ir(f(x,
- # weights=weights,
- # minlength=minlength,
- # maxlength=maxlength,
- # binary_output=binary_output,
- # axis=axis))('hlo'))
res = f(x,
weights=weights,
minlength=minlength,
From 9c0e941bdadd000a4541543685e3e6060fd64d64 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 13:29:28 +0000
Subject: [PATCH 045/259] Change test tolerance logic
---
tensorflow/python/ops/bincount_ops_test.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 76d8d54a638374..6b1c23440658f5 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -735,7 +735,7 @@ def f(x,
lambda_f(); lambda_fc()
not_compiled = timeit.timeit(lambda_f, number=10)
compiled = timeit.timeit(lambda_fc, number=10)
- self.assertLess(compiled, not_compiled * 1.01)
+ self.assertAlmostEqual(compiled, not_compiled, 2)
class TestDenseBincount(test.TestCase, parameterized.TestCase):
From ff718d3deee305b0df07ff543f2ffee770c7075b Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 13:34:44 +0000
Subject: [PATCH 046/259] Fix lint
---
tensorflow/python/ops/bincount_ops_test.py | 43 +++++++++++-----------
1 file changed, 21 insertions(+), 22 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 6b1c23440658f5..7f910d1d7b8262 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -627,31 +627,30 @@ class TestCompiledDenseBincount(test.TestCase, parameterized.TestCase):
})
@test_util.disable_mlir_bridge('TODO: ?')
def test_compiled_dense(self,
- x,
- expected_values,
- minlength=None,
- maxlength=None,
- binary_output=False,
- weights=None,
- axis=-1):
+ x,
+ expected_values,
+ minlength=None,
+ maxlength=None,
+ binary_output=False,
+ weights=None,
+ axis=-1):
@def_function.function(jit_compile=True)
def f (x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis
- ):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis
- )
- return y
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis):
+ y = bincount_ops.bincount(
+ x,
+ weights=weights,
+ minlength=minlength,
+ maxlength=maxlength,
+ binary_output=binary_output,
+ axis=axis)
+ return y
+
res = f(x,
weights=weights,
minlength=minlength,
From d4f0a65dc5d0e521c1e31d09028b1070aa4f70b8 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 13:58:19 +0000
Subject: [PATCH 047/259] Change perf test
---
tensorflow/python/ops/bincount_ops_test.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 7f910d1d7b8262..0a6ae826fa6006 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -734,7 +734,7 @@ def f(x,
lambda_f(); lambda_fc()
not_compiled = timeit.timeit(lambda_f, number=10)
compiled = timeit.timeit(lambda_fc, number=10)
- self.assertAlmostEqual(compiled, not_compiled, 2)
+ self.assertLess(compiled, not_compiled)
class TestDenseBincount(test.TestCase, parameterized.TestCase):
From 57463e1c97b4a411335f491716d6ece0a48d6f73 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 18:49:05 +0200
Subject: [PATCH 048/259] Update bincount_op.cc
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 9e0dbd901605e1..99887ee0b1c3cd 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -110,7 +110,7 @@ class DenseBincountOp : public XlaOpKernel {
auto p0 = xla::Parameter(subb.get(), 0, param_shape, "p0");
auto p1 = xla::Parameter(subb.get(), 1, param_shape, "p1");
if (binary_output_) {
- xla::One(subb.get(), xla::S32);
+ xla::One(subb.get(), dtype);
}
else {
xla::Add(p0, p1);
From 2c5c3ac1700f2b95c6ccb2ed96171af02e35a18b Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 18:20:02 +0000
Subject: [PATCH 049/259] Remove unused code
---
.../compiler/xla/service/cpu/ir_emitter.cc | 1 -
tensorflow/python/ops/bincount_ops.py | 57 -------------------
2 files changed, 58 deletions(-)
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index a78c33e5058945..dad4b96647b674 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -86,7 +86,6 @@ limitations under the License.
namespace xla {
namespace {
-using llvm_ir::IrArray;
using llvm_ir::IrName;
using llvm_ir::SetToFirstInsertPoint;
} // namespace
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index 79bc2616c8edfb..610f9ec94befa5 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -22,68 +22,11 @@
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import gen_count_ops
from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.eager import def_function
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
-@def_function.function(jit_compile=True)
-def dense_bincount_1d(input_arr=[],
- size=None,
- weights=[],
- binary_output=False):
-
- input_arr = array_ops.reshape(input_arr, [array_ops.shape(input_arr)[0],-1])
- output_shape = [size]
- idx = array_ops.reshape(input_arr, [array_ops.shape(input_arr)[0],-1])
- if (binary_output):
- updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.bool)
- output = array_ops.zeros(output_shape, dtype=dtypes.bool)
- elif (len(weights)):
- updates = weights
- output = array_ops.zeros(output_shape, dtype=weights.dtype)
- else:
- updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.int32)
- output = array_ops.zeros(output_shape, dtype=dtypes.int32)
-
- histogram_out = array_ops.tensor_scatter_add(output, idx, updates)
-
- return histogram_out
-
-def prepare_idxs(input_arr):
- j_indices = array_ops.reshape(input_arr, [-1, 1])
- dim1 = math_ops.range(array_ops.shape(input_arr)[0])
- dim2 = array_ops.shape(input_arr)[1]
- i_indices = array_ops.expand_dims(array_ops.repeat(dim1, dim2), axis=-1)
-
- new_indices = array_ops.concat([i_indices, j_indices], axis=-1)
- return new_indices
-
-@def_function.function(jit_compile=True)
-def dense_bincount_2d(input_arr=[],
- size=None,
- weights=[],
- binary_output=False):
-
- input_arr = array_ops.reshape(input_arr, [array_ops.shape(input_arr)[0],-1])
- idx = prepare_idxs(input_arr)
- output_shape = [input_arr.shape[0], size]
-
- if (binary_output):
- updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.bool)
- output = array_ops.zeros(output_shape, dtype=dtypes.bool)
- elif (len(weights)):
- updates = array_ops.reshape(weights, [array_ops.shape(idx)[0]])
- output = array_ops.zeros(output_shape, dtype=weights.dtype)
- else:
- updates = array_ops.ones(array_ops.shape(idx)[0], dtype=dtypes.int32)
- output = array_ops.zeros(output_shape, dtype=dtypes.int32)
-
- histogram_out = array_ops.tensor_scatter_add(output, idx, updates)
-
- return histogram_out
-
@tf_export("math.bincount", v1=[])
def bincount(arr,
From 4b32db55b229a4f74cff7254814c7797dde5453a Mon Sep 17 00:00:00 2001
From: DEKHTIARJonathan
Date: Tue, 14 Jun 2022 23:13:50 -0400
Subject: [PATCH 050/259] [TF-TRT] TraceMe instrumentation for TRTEngineOp
Instrument getDeviceMemorySize
TraceMe for output bindings
Cache device memory size
---
tensorflow/compiler/tf2tensorrt/BUILD | 2 ++
tensorflow/compiler/tf2tensorrt/common/utils.cc | 4 ++++
.../tf2tensorrt/kernels/trt_engine_op.cc | 16 +++++++++++++---
.../tf2tensorrt/utils/trt_engine_utils.cc | 16 ++++++++++++++++
.../compiler/tf2tensorrt/utils/trt_lru_cache.h | 1 -
.../utils/trt_shape_optimization_profiles.cc | 10 ++++++++++
6 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
index ef5c81dc29e021..0221198c132f8b 100644
--- a/tensorflow/compiler/tf2tensorrt/BUILD
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -206,6 +206,7 @@ cc_library(
deps = [
"//tensorflow/core:framework",
"//tensorflow/core/platform:logging",
+ "//tensorflow/core/profiler/lib:annotated_traceme",
] + if_tensorrt([":tensorrt_lib"]),
)
@@ -417,6 +418,7 @@ tf_cuda_library(
"//tensorflow/core:framework_headers_lib",
"//tensorflow/core:lib",
"//tensorflow/core/platform:status",
+ "//tensorflow/core/profiler/lib:annotated_traceme",
"//tensorflow/core:stream_executor_headers_lib",
] + if_tensorrt([":tensorrt_lib"]),
)
diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.cc b/tensorflow/compiler/tf2tensorrt/common/utils.cc
index 85546f22022c33..251ac0ce1a4ba2 100644
--- a/tensorflow/compiler/tf2tensorrt/common/utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/common/utils.cc
@@ -20,7 +20,9 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
#include "third_party/tensorrt/NvInferPlugin.h"
+
#endif
namespace tensorflow {
@@ -58,6 +60,8 @@ namespace tensorrt {
Status GetTrtBindingIndex(const char* tensor_name, int profile_index,
const nvinfer1::ICudaEngine* cuda_engine,
int* binding_index) {
+ tensorflow::profiler::TraceMe activity(
+ "GetTrtBindingIndex", tensorflow::profiler::TraceMeLevel::kInfo);
// If the engine has been built for K profiles, the first getNbBindings() / K
// bindings are used by profile number 0, the following getNbBindings() / K
// bindings are used by profile number 1 etc.
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index 3f284bab11a363..72bc9e28129f82 100644
--- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -97,8 +97,11 @@ class ContextDeviceMemory {
"Out of GPU memory for execution context");
}
}
- execution_context_->setDeviceMemory(device_memory_);
-
+ {
+ tensorflow::profiler::TraceMe activity(
+ "setDeviceMemory", tensorflow::profiler::TraceMeLevel::kInfo);
+ execution_context_->setDeviceMemory(device_memory_);
+ }
return Status::OK();
}
@@ -967,6 +970,9 @@ Status TRTEngineOp::ExecuteTrtEngine(
ContextDeviceMemory context_device_memory;
if (!has_device_memory) {
+ tensorflow::profiler::TraceMe activity(
+ "TRTEngineOp::AllocateDeviceMemory",
+ tensorflow::profiler::TraceMeLevel::kInfo);
// Allocate device memory for the TensorRT engine execution. The device
// memory will be released when context_device_memory goes out of scope.
TF_RETURN_IF_ERROR(context_device_memory.AllocateDeviceMemory(
@@ -979,6 +985,9 @@ Status TRTEngineOp::ExecuteTrtEngine(
Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx,
TRTEngineCacheResource** cache_res) {
+ tensorflow::profiler::TraceMe activity(
+ "TRTEngineOp::GetEngineCachResource",
+ tensorflow::profiler::TraceMeLevel::kInfo);
// Canonicalize the op name by removing the scopes if any. This is mainly
// because in TFv2, the function graph can be instantiated in various ways and
// it'll insert scope names to the name of the TRTEngineOps, which will result
@@ -1050,7 +1059,8 @@ StatusOr> TRTEngineOp::GetEngine(
const std::vector& input_concrete_shapes, OpKernelContext* ctx,
TRTEngineCacheResource* cache_res) {
static EngineContext empty_context;
-
+ tensorflow::profiler::TraceMe activity(
+ "TRTEngineOp::GetEngine", tensorflow::profiler::TraceMeLevel::kInfo);
mutex_lock lock(engine_mutex_);
// Using first input to get batch size is reliable - VerifyInputShapes()
// guarantees that the first input is not a scalar. As such we can always use
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
index ac35939fea832c..b362331644e263 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc
@@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
#if GOOGLE_CUDA && GOOGLE_TENSORRT
#include "third_party/tensorrt/NvInfer.h"
@@ -60,6 +61,8 @@ Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine,
const nvinfer1::IExecutionContext* execution_context,
int binding_index, bool use_implicit_batch,
int batch_size, TensorShape& shape) {
+ tensorflow::profiler::TraceMe activity(
+ "getBindingDimensions", tensorflow::profiler::TraceMeLevel::kInfo);
nvinfer1::Dims dims =
use_implicit_batch
? cuda_engine->getBindingDimensions(binding_index)
@@ -79,6 +82,8 @@ Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine,
Status SetupBindings(nvinfer1::ICudaEngine* cuda_engine, const Tensor& tensor,
std::vector& buffers, int binding_index) {
+ tensorflow::profiler::TraceMe activity(
+ "SetBindingPointers", tensorflow::profiler::TraceMeLevel::kInfo);
const auto dtype = cuda_engine->getBindingDataType(binding_index);
VLOG(2) << "<<<<<<<<< SetupBindings with dtype = " << (int)dtype;
switch (dtype) {
@@ -114,6 +119,8 @@ Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine,
int num_batch,
const TrtShapeOptimizationProfile& profiles,
OpKernelContext* ctx, const DataVec* input_vec) {
+ tensorflow::profiler::TraceMe activity(
+ "SetTrtEngineInputs", tensorflow::profiler::TraceMeLevel::kInfo);
int n_inputs = ctx ? ctx->num_inputs() : (input_vec ? input_vec->size() : 0);
// Setup engine inputs.
for (int i = 0; i < n_inputs; i++) {
@@ -150,6 +157,9 @@ Status SetTrtEngineInputs(nvinfer1::ICudaEngine* cuda_engine,
i, binding_index, cuda_engine, execution_context));
if (cuda_engine->isExecutionBinding(binding_index)) {
+ tensorflow::profiler::TraceMe activity(
+ "SetTrtEngineInputs::setBindingDimensions",
+ tensorflow::profiler::TraceMeLevel::kInfo);
nvinfer1::Dims trt_dims;
auto adap = DimsAdapter::Create(input_shape);
TRT_ENSURE_OK(adap);
@@ -187,6 +197,8 @@ Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine,
int trt_profile_idx, std::vector& buffers,
bool use_implicit_batch, int batch_size,
OpKernelContext* ctx, DataVec* outputs) {
+ tensorflow::profiler::TraceMe activity(
+ "SetTrtEngineOutputs", tensorflow::profiler::TraceMeLevel::kInfo);
// Either one of ctx or outpus should be specified
int n_outputs = ctx ? ctx->num_outputs() : (outputs ? outputs->size() : 0);
for (int i = 0; i < n_outputs; i++) {
@@ -205,6 +217,8 @@ Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine,
// Allocate output tensor of TRTEngineOp.
Tensor* output_tensor = nullptr;
if (ctx) {
+ tensorflow::profiler::TraceMe activity(
+ "AllocateOutput", tensorflow::profiler::TraceMeLevel::kInfo);
TF_RETURN_IF_ERROR(ctx->allocate_output(i, output_shape, &output_tensor));
} else {
// This path is used for unit tests. The tensor is already allocated.
@@ -231,6 +245,8 @@ Status SetTrtEngineOutputs(nvinfer1::ICudaEngine* cuda_engine,
Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context,
std::vector& buffers, cudaStream_t stream,
bool use_implicit_batch, int batch_size) {
+ tensorflow::profiler::TraceMe activity(
+ "TrtEnqueue", tensorflow::profiler::TraceMeLevel::kInfo);
bool ret = false;
if (use_implicit_batch) {
ret = execution_context->enqueue(batch_size, &buffers[0], stream, nullptr);
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
index 0ba26e67c7b8a2..5c4a6c1fdd8fed 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
@@ -185,7 +185,6 @@ struct EngineContext {
// latency. Since its value remains constant, we can cache it.
size_t device_memory_size_;
};
-
// Contains the context required to build the calibration data.
class CalibrationContext {
public:
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc
index 5f96b5f55be777..21f6be4a964561 100644
--- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
#if GOOGLE_CUDA && GOOGLE_TENSORRT
@@ -139,6 +140,9 @@ void TrtShapeOptimizationProfile::OptimalStrategy(
// Collects the values of tensors that are ShapeTensorCompatible to. The values
// are stored in the actual_shape_values_ member variable.
Status TrtShapeOptimizationProfile::CollectShapeValues(OpKernelContext* ctx) {
+ tensorflow::profiler::TraceMe activity(
+ "TrtShapeOptimizationProfile::CollectShapeValues",
+ tensorflow::profiler::TraceMeLevel::kInfo);
const cudaStream_t* stream = CHECK_NOTNULL(
reinterpret_cast(ctx->op_device_context()
->stream()
@@ -466,6 +470,9 @@ void TrtShapeOptimizationProfile::SetShapeTensorMask(
int TrtShapeOptimizationProfile::GetProfileNumber(
const std::vector& shapes) {
+ tensorflow::profiler::TraceMe activity(
+ "TrtShapeOptimizationProfile::GetProfileNumber",
+ tensorflow::profiler::TraceMeLevel::kInfo);
if (!need_profiles_) return 0;
// TODO(tfeher): Return the best profile not just the first compatible.
for (int i = 0; i < profiles_.size(); i++) {
@@ -509,6 +516,9 @@ Status TrtShapeOptimizationProfile::CreateExecutionContexts(
Status TrtShapeOptimizationProfile::SetInputShapeBinding(
int input_index, int binding_index, nvinfer1::ICudaEngine* cuda_engine,
nvinfer1::IExecutionContext* exec_context) const {
+ tensorflow::profiler::TraceMe activity(
+ "TrtShapeOptimizationProfile::SetInputShapeBinding",
+ tensorflow::profiler::TraceMeLevel::kInfo);
if (cuda_engine->isShapeBinding(binding_index)) {
// Input shape binding data has to be in host memory. That is the reason
// we can't use input_tensor.flat().data(). which contains the same
From a8e5d62eb429f0c5fad7aaa3da9d05ea0f2291f2 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 19:34:11 +0000
Subject: [PATCH 051/259] Workaround for axis=None
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 2 --
tensorflow/python/ops/bincount_ops.py | 2 +-
tensorflow/python/ops/bincount_ops_test.py | 3 +--
3 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 99887ee0b1c3cd..92e9ec3f753236 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
namespace tensorflow {
namespace {
-// TODO: This is only a dummy kernel
class DenseBincountOp : public XlaOpKernel {
private:
bool binary_output_;
@@ -36,7 +35,6 @@ class DenseBincountOp : public XlaOpKernel {
}
void Compile(XlaOpKernelContext* ctx) override {
- // Dumb implementation for the simplest test case
xla::XlaOp input = ctx->Input(0);
xla::XlaOp weights = ctx->Input(2);
StatusOr weights_shape_or = ctx->builder()->GetShape(weights);
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index 610f9ec94befa5..cb9f29d027da5e 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -134,7 +134,7 @@ def bincount(arr,
weights = ops.convert_to_tensor(weights, name="weights")
return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
weights = constant_op.constant([], dtype)
- return gen_math_ops.bincount(arr, output_size, weights)
+ #return gen_math_ops.bincount(arr, output_size, weights)
if not isinstance(arr, sparse_tensor.SparseTensor):
arr = ragged_tensor.convert_to_tensor_or_ragged_tensor(arr, name="arr")
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 0a6ae826fa6006..fb98e660b1e5a6 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -188,7 +188,6 @@ def test_dense_input(self,
self.assertAllEqual(expected_values, y.values)
self.assertAllEqual(expected_shape, y.dense_shape)
-
@parameterized.named_parameters(
{
"testcase_name":
@@ -623,7 +622,7 @@ class TestCompiledDenseBincount(test.TestCase, parameterized.TestCase):
"testcase_name": "_all_axes",
"x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
"expected_values": [0, 1, 1, 1, 2, 1],
- "axis": 0 # With None (recursive call) -> Bincount (No registered 'Bincount'
+ "axis": None
})
@test_util.disable_mlir_bridge('TODO: ?')
def test_compiled_dense(self,
From 02331b1c97b80e2501ad42b91ee7758caa9a3753 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 21:43:30 +0000
Subject: [PATCH 052/259] Register dense bincount binary_output not required
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 5 +++--
tensorflow/python/ops/bincount_ops.py | 3 ++-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 92e9ec3f753236..818bdfec301cfe 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -28,10 +28,10 @@ namespace {
class DenseBincountOp : public XlaOpKernel {
private:
- bool binary_output_;
+ bool binary_output_= false;
public:
explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
- OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_));
+ ctx->GetAttr("binary_output", &binary_output_);
}
void Compile(XlaOpKernelContext* ctx) override {
@@ -122,6 +122,7 @@ class DenseBincountOp : public XlaOpKernel {
};
REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"), DenseBincountOp);
+REGISTER_XLA_OP(Name("Bincount").CompileTimeConstantInput("size"), DenseBincountOp);
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py
index cb9f29d027da5e..8f0efeca62c6a4 100644
--- a/tensorflow/python/ops/bincount_ops.py
+++ b/tensorflow/python/ops/bincount_ops.py
@@ -134,7 +134,8 @@ def bincount(arr,
weights = ops.convert_to_tensor(weights, name="weights")
return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
weights = constant_op.constant([], dtype)
- #return gen_math_ops.bincount(arr, output_size, weights)
+ arr = array_ops.reshape(arr, [-1])
+ return gen_math_ops.bincount(arr, output_size, weights)
if not isinstance(arr, sparse_tensor.SparseTensor):
arr = ragged_tensor.convert_to_tensor_or_ragged_tensor(arr, name="arr")
From efe8d82c40b222165a49c8aa88889c08b046340b Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 17 Jun 2022 22:05:48 +0000
Subject: [PATCH 053/259] Add bincount XLAlite
---
tensorflow/compiler/jit/mark_for_compilation_pass.cc | 1 +
1 file changed, 1 insertion(+)
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 0b8e9b4815f83a..87c5a435a77f00 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -1968,6 +1968,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() {
"BesselI1e",
"Betainc",
"BiasAddV1",
+ "Bincount",
"Bucketize",
"Case",
"CheckNumerics",
From 06fd765e7a011fcd7c2a9f0fa1f2641c73ff26b8 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 20 Jun 2022 14:14:03 +0200
Subject: [PATCH 054/259] Update bincount_ops_test.py
---
tensorflow/python/ops/bincount_ops_test.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index fb98e660b1e5a6..1deac4d76bb7b6 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -670,9 +670,9 @@ def f (x,
"x": np.random.randint(100, size=(1000, 1000), dtype=np.int32)
})
@test_util.disable_mlir_bridge('TODO: ?')
- # TODO: Disable performance test on CPU
- # missing scatter emitter for CPU fallback to a serial xla::While
- # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/service/cpu/ir_emitter.cc#L1929-L1931
+ # TODO: Disable performance tests on CPU
+ # missing scatter emitter for CPU fallback to a serial HLO While
+ # https://github.com/tensorflow/tensorflow/issues/56511
@test_util.run_gpu_only
def test_compiled_dense_perf(self,
x,
From c3e748f80efaf8908e777a2a6909034891f7c857 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 20 Jun 2022 14:44:57 +0000
Subject: [PATCH 055/259] Remove perf test; minor formatting
---
.../compiler/tf2xla/kernels/bincount_op.cc | 4 +-
tensorflow/python/ops/bincount_ops_test.py | 191 +-----------------
2 files changed, 2 insertions(+), 193 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 818bdfec301cfe..d1e432608beb2b 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -87,9 +87,7 @@ class DenseBincountOp : public XlaOpKernel {
zero = xla::Zero(ctx->builder(), dtype);
updates = weights;
}
- }
-
- else {
+ } else {
input = xla::Reshape(input, {size, 1});
idx = xla::Reshape(input, {size, 1});
updates = xla::Broadcast(one, {size});
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index fb98e660b1e5a6..6fe6953b9548b0 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -15,8 +15,6 @@
"""Tests for bincount ops."""
from absl.testing import parameterized
-
-import timeit
import numpy as np
from tensorflow.python.eager import context
@@ -32,6 +30,7 @@
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.platform import test
+
class TestSparseCount(test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(
@@ -547,195 +546,7 @@ def test_ragged_input(self,
self.assertAllEqual(expected_values, y.values)
self.assertAllEqual(expected_shape, y.dense_shape)
-class TestCompiledDenseBincount(test.TestCase, parameterized.TestCase):
-
- @parameterized.named_parameters(
- {
- "testcase_name": "_no_maxlength_basic",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
- }, {
- "testcase_name": "_maxlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "maxlength": 7,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
- }, {
- "testcase_name": "_minlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 9,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 2, 0, 0, 1, 0]]
- }, {
- "testcase_name": "_minlength_larger_values",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 3,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
- [1, 0, 0, 0, 2, 0, 0, 1]]
- }, {
- "testcase_name": "_no_maxlength_binary",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0, 1, 1, 1, 0, 0],
- [0, 0, 0, 0, 1, 1]],
- "binary_output": True,
- }, {
- "testcase_name": "_maxlength_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "maxlength": 7,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0],
- [1, 0, 0, 0, 1, 0, 0]],
- "binary_output": True,
- }, {
- "testcase_name": "_minlength_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 9,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 1, 0, 0, 1, 0]],
- "binary_output": True,
- }, {
- "testcase_name": "_minlength_larger_values_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 3,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
- [1, 0, 0, 0, 1, 0, 0, 1]],
- "binary_output": True,
- }, {
- "testcase_name": "_no_maxlength_weights",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0. , 2. , 1. , 0.5, 0. , 0. ],
- [0. , 0. , 0. , 0. , 9. , 3. ]],
- "weights": [[0.5, 1, 2], [3, 4, 5]]
- }, {
- "testcase_name": "_1d_no_maxlenght_base",
- "x": np.array([3, 2, 1, 1], dtype=np.int32),
- "expected_values": [0, 2, 1, 1]
- }, {
- "testcase_name": "_1d_binary",
- "x": np.array([3, 2, 1, 1], dtype=np.int32),
- "expected_values": [0, 1, 1, 1],
- "binary_output": True
- }, {
- "testcase_name": "_1d_no_maxlenght_weights",
- "x": np.array([3, 2, 1, 5, 4, 4], dtype=np.int32),
- "weights": [0.5, 1, 2, 3, 4, 5],
- "expected_values": [0. , 2. , 1. , 0.5, 9. , 3. ]
- }, {
- "testcase_name": "_all_axes",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [0, 1, 1, 1, 2, 1],
- "axis": None
- })
- @test_util.disable_mlir_bridge('TODO: ?')
- def test_compiled_dense(self,
- x,
- expected_values,
- minlength=None,
- maxlength=None,
- binary_output=False,
- weights=None,
- axis=-1):
-
- @def_function.function(jit_compile=True)
- def f (x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
- return y
-
- res = f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
- self.assertAllEqual(expected_values, res)
-
- @parameterized.named_parameters(
- {
- "testcase_name": "_no_maxlength_small",
- "x": np.random.randint(200, size=(200, 200), dtype=np.int32)
- }, {
- "testcase_name": "_no_maxlength_medium",
- "x": np.random.randint(500, size=(500, 500), dtype=np.int32)
- }, {
- "testcase_name": "_no_maxlength_large",
- "x": np.random.randint(100, size=(1000, 1000), dtype=np.int32)
- })
- @test_util.disable_mlir_bridge('TODO: ?')
- # TODO: Disable performance test on CPU
- # missing scatter emitter for CPU fallback to a serial xla::While
- # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/service/cpu/ir_emitter.cc#L1929-L1931
- @test_util.run_gpu_only
- def test_compiled_dense_perf(self,
- x,
- minlength=None,
- maxlength=None,
- binary_output=False,
- weights=None,
- axis=-1):
-
- @def_function.function(jit_compile=True)
- def f_compiled(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis
- )
- return y
-
- @def_function.function()
- def f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis
- )
- return y
-
- lambda_f = lambda: f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
-
- lambda_fc = lambda: f_compiled(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
- # warm-up
- lambda_f(); lambda_fc()
- not_compiled = timeit.timeit(lambda_f, number=10)
- compiled = timeit.timeit(lambda_fc, number=10)
- self.assertLess(compiled, not_compiled)
-
class TestDenseBincount(test.TestCase, parameterized.TestCase):
@parameterized.parameters([{
From 50518ffb16e31edc3e7a7ebe3a239afcb7dbcb3e Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 20 Jun 2022 14:47:13 +0000
Subject: [PATCH 056/259] remove empty line
---
tensorflow/python/ops/bincount_ops_test.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 042f19d9c26812..643bbf1d7c99aa 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -656,8 +656,7 @@ def f (x,
binary_output=binary_output,
axis=axis)
self.assertAllEqual(expected_values, res)
-
-
+
class TestDenseBincount(test.TestCase, parameterized.TestCase):
@parameterized.parameters([{
From 5d947d7265cb08755a4ddc5a49d6365960419bc5 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 20 Jun 2022 15:45:59 +0000
Subject: [PATCH 057/259] Move private members. Remove mlir bridge exclusion from tests
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 11 +++++------
tensorflow/python/ops/bincount_ops_test.py | 1 -
2 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index d1e432608beb2b..5323df2c65868b 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -27,13 +27,12 @@ namespace tensorflow {
namespace {
class DenseBincountOp : public XlaOpKernel {
+ public:
+ explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+ ctx->GetAttr("binary_output", &binary_output_);
+ }
private:
- bool binary_output_= false;
- public:
- explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
- ctx->GetAttr("binary_output", &binary_output_);
- }
-
+ bool binary_output_= false;
void Compile(XlaOpKernelContext* ctx) override {
xla::XlaOp input = ctx->Input(0);
xla::XlaOp weights = ctx->Input(2);
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 643bbf1d7c99aa..feffe127a57bf4 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -623,7 +623,6 @@ class TestCompiledDenseBincount(test.TestCase, parameterized.TestCase):
"expected_values": [0, 1, 1, 1, 2, 1],
"axis": None
})
- @test_util.disable_mlir_bridge('TODO: ?')
def test_compiled_dense(self,
x,
expected_values,
From e43afd51ebde27e8e10e40955c51fe795b41d25e Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 20 Jun 2022 16:11:50 +0000
Subject: [PATCH 058/259] Add an operation as requested
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 5323df2c65868b..081612f7d0385e 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -105,7 +105,7 @@ class DenseBincountOp : public XlaOpKernel {
auto p0 = xla::Parameter(subb.get(), 0, param_shape, "p0");
auto p1 = xla::Parameter(subb.get(), 1, param_shape, "p1");
if (binary_output_) {
- xla::One(subb.get(), dtype);
+ xla::Or(p0, xla::One(subb.get(), dtype));
}
else {
xla::Add(p0, p1);
From d609ecdf200e8d02e951e351f4e1bf2e8573d2a1 Mon Sep 17 00:00:00 2001
From: bhack
Date: Tue, 21 Jun 2022 12:11:22 +0000
Subject: [PATCH 059/259] Slightly change the impl style. Format the code
---
.../compiler/tf2xla/kernels/bincount_op.cc | 67 ++++++++++---------
1 file changed, 36 insertions(+), 31 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 081612f7d0385e..d2a37556ee9095 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -13,29 +13,30 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/lib/comparators.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/shape_util.h"
namespace tensorflow {
namespace {
class DenseBincountOp : public XlaOpKernel {
- public:
- explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
- ctx->GetAttr("binary_output", &binary_output_);
- }
- private:
- bool binary_output_= false;
+ public:
+ explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+ ctx->GetAttr("binary_output", &binary_output_);
+ }
+
+ private:
+ bool binary_output_ = false;
void Compile(XlaOpKernelContext* ctx) override {
xla::XlaOp input = ctx->Input(0);
- xla::XlaOp weights = ctx->Input(2);
+ xla::XlaOp weights = ctx->Input(2);
StatusOr weights_shape_or = ctx->builder()->GetShape(weights);
OP_REQUIRES_OK(ctx, weights_shape_or.status());
auto weights_shape = weights_shape_or.ValueOrDie();
@@ -43,11 +44,10 @@ class DenseBincountOp : public XlaOpKernel {
auto input_xla_type = ctx->input_xla_type(0);
xla::PrimitiveType dtype;
bool has_weights;
- if (weights_size){
+ if (weights_size) {
has_weights = true;
dtype = ctx->input_xla_type(2);
- }
- else {
+ } else {
has_weights = false;
dtype = input_xla_type;
}
@@ -64,25 +64,30 @@ class DenseBincountOp : public XlaOpKernel {
xla::ScatterDimensionNumbers scatter_dnums;
scatter_dnums.set_index_vector_dim(1);
scatter_dnums.add_inserted_window_dims(0);
- scatter_dnums.add_scatter_dims_to_operand_dims(0);
+ scatter_dnums.add_scatter_dims_to_operand_dims(0);
auto one = xla::One(ctx->builder(), input_xla_type);
- auto zero = xla::Zero(ctx->builder(), input_xla_type);;
-
+ auto zero = xla::Zero(ctx->builder(), input_xla_type);
+
if (rank == 2) {
output_shape = xla::ShapeUtil::MakeShape(dtype, {size, output_size});
scatter_dnums.add_inserted_window_dims(1);
scatter_dnums.add_scatter_dims_to_operand_dims(1);
- auto i_shape = xla::ShapeUtil::MakeShape(input_xla_type, {input_shape.dimensions()});
+ auto i_shape =
+ xla::ShapeUtil::MakeShape(input_xla_type, {input_shape.dimensions()});
auto i = xla::Iota(ctx->builder(), i_shape, 0);
- i = xla::Reshape(i, {input_shape.dimensions(0)*input_shape.dimensions(1), 1});
- auto j = xla::Reshape(input, {input_shape.dimensions(0)*input_shape.dimensions(1), 1});
+ i = xla::Reshape(
+ i, {input_shape.dimensions(0) * input_shape.dimensions(1), 1});
+ auto j = xla::Reshape(
+ input, {input_shape.dimensions(0) * input_shape.dimensions(1), 1});
std::vector iotas_to_concat;
iotas_to_concat.push_back(i);
iotas_to_concat.push_back(j);
idx = xla::ConcatInDim(ctx->builder(), iotas_to_concat, 1);
- updates = xla::Broadcast(one, {input_shape.dimensions(0)*input_shape.dimensions(1)});
+ updates = xla::Broadcast(
+ one, {input_shape.dimensions(0) * input_shape.dimensions(1)});
if (has_weights) {
- weights = xla::Reshape(weights, {input_shape.dimensions(0)*input_shape.dimensions(1)});
+ weights = xla::Reshape(
+ weights, {input_shape.dimensions(0) * input_shape.dimensions(1)});
zero = xla::Zero(ctx->builder(), dtype);
updates = weights;
}
@@ -97,29 +102,29 @@ class DenseBincountOp : public XlaOpKernel {
}
output = xla::Broadcast(zero, {output_shape.dimensions()});
-
+
xla::XlaComputation assn_computation = [&] {
std::unique_ptr subb =
- ctx->builder()->CreateSubBuilder("scatter_bincount");
+ ctx->builder()->CreateSubBuilder("scatter_bincount");
xla::Shape param_shape = xla::ShapeUtil::MakeShape(dtype, {});
auto p0 = xla::Parameter(subb.get(), 0, param_shape, "p0");
auto p1 = xla::Parameter(subb.get(), 1, param_shape, "p1");
- if (binary_output_) {
- xla::Or(p0, xla::One(subb.get(), dtype));
- }
- else {
+ if (!binary_output_) {
xla::Add(p0, p1);
}
return subb->BuildAndNoteError();
}();
- output = xla::Scatter(output, idx, updates, assn_computation, scatter_dnums, false, false);
+ output = xla::Scatter(output, idx, updates, assn_computation, scatter_dnums,
+ false, false);
ctx->SetOutput(0, output);
}
};
-REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"), DenseBincountOp);
-REGISTER_XLA_OP(Name("Bincount").CompileTimeConstantInput("size"), DenseBincountOp);
+REGISTER_XLA_OP(Name("DenseBincount").CompileTimeConstantInput("size"),
+ DenseBincountOp);
+REGISTER_XLA_OP(Name("Bincount").CompileTimeConstantInput("size"),
+ DenseBincountOp);
} // namespace
} // namespace tensorflow
From 8aac63a827f388d8901c4b8898773df9e26e58e7 Mon Sep 17 00:00:00 2001
From: Kaixi Hou
Date: Tue, 21 Jun 2022 23:13:37 -0700
Subject: [PATCH 060/259] fix filter shape checking
---
tensorflow/core/grappler/optimizers/remapper.cc | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index b0b75e321c82a7..a260a2772f22d8 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -429,10 +429,10 @@ bool IsGpuCompatible(const RemapperContext& ctx,
// in-graph computation in micro benchmarks (see kernels/conv_ops_test.cc),
// and significantly slower in large scale benchmarks.
bool is_spatial_conv = Rank(filter_shape) == 4 && //
+ IsKnown(filter_shape.dim(0)) && //
IsKnown(filter_shape.dim(1)) && //
- IsKnown(filter_shape.dim(2)) && //
- filter_shape.dim(1).size() != 1 && //
- filter_shape.dim(2).size() != 1;
+ filter_shape.dim(0).size() != 1 && //
+ filter_shape.dim(1).size() != 1;
return is_spatial_conv && IsGpuCompatibleConv2D(ctx, &contraction_node);
} else if (IsMatMul(contraction_node)) {
From 9cf2fbfc8fbbc82d7f3b47e6f88a30fc4a0f211e Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 23 Jun 2022 19:53:05 +0200
Subject: [PATCH 061/259] Make internal compiler flags happy
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index d2a37556ee9095..b3431caf9bb723 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -29,7 +29,8 @@ namespace {
class DenseBincountOp : public XlaOpKernel {
public:
explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
- ctx->GetAttr("binary_output", &binary_output_);
+ // It is optional for Bincount and required for DenseBincount
+ (void) ctx->GetAttr("binary_output", &binary_output_);
}
private:
@@ -52,7 +53,7 @@ class DenseBincountOp : public XlaOpKernel {
dtype = input_xla_type;
}
int64_t output_size;
- ctx->ConstantInputAsIntScalar("size", &output_size);
+ OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("size", &output_size));
StatusOr input_shape_or = ctx->builder()->GetShape(input);
OP_REQUIRES_OK(ctx, input_shape_or.status());
auto input_shape = input_shape_or.ValueOrDie();
From 4439a169d2ce55c58e83b85d38c664ce2c802a32 Mon Sep 17 00:00:00 2001
From: gadagashwini <99852755+gadagashwini@users.noreply.github.com>
Date: Sat, 25 Jun 2022 07:58:26 +0530
Subject: [PATCH 062/259] Fix typo in tf.data.experimental.DistributeOptions
---
tensorflow/python/data/ops/options.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/python/data/ops/options.py b/tensorflow/python/data/ops/options.py
index c2fef8b603918f..06a7202187af09 100644
--- a/tensorflow/python/data/ops/options.py
+++ b/tensorflow/python/data/ops/options.py
@@ -267,7 +267,7 @@ class DistributeOptions(options_lib.OptionsBase):
```python
options = tf.data.Options()
- options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
+ options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
dataset = dataset.with_options(options)
```
"""
From 869dd04532d89a5aa5a6f8e5761bb26c45c2e3c4 Mon Sep 17 00:00:00 2001
From: Kaixi Hou
Date: Tue, 21 Jun 2022 15:52:51 -0700
Subject: [PATCH 063/259] support fp16 for conv-bias-relu on gpus
---
.../core/grappler/optimizers/remapper.cc | 15 +-
.../core/kernels/conv_ops_fused_half.cc | 2 +
tensorflow/core/kernels/conv_ops_fused_impl.h | 72 ++++++---
tensorflow/core/kernels/conv_ops_gpu.cc | 18 +++
tensorflow/core/ops/nn_ops.cc | 2 +-
tensorflow/python/grappler/remapper_test.py | 137 ++++++++++--------
6 files changed, 155 insertions(+), 91 deletions(-)
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index b0b75e321c82a7..8f998244aaa458 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -307,9 +307,7 @@ bool IsCpuCompatibleDataType(const NodeDef* contraction,
bool IsGpuCompatibleDataType(const NodeDef* contraction,
const string& type_attr = "T") {
DataType dtype = GetDataTypeFromAttr(*contraction, type_attr);
- if (IsConv2D(*contraction)) {
- return dtype == DT_FLOAT;
- } else if (IsMatMul(*contraction)) {
+ if (IsConv2D(*contraction) || IsMatMul(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_HALF;
} else {
return false;
@@ -3129,7 +3127,8 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) {
const auto is_relu_biasadd_conv_candidate = [&]() -> bool {
if (!IsRelu(*node_def)) return false;
- if (GetDataTypeFromAttr(*node_def, "T") != DT_FLOAT) return false;
+ DataType act_dtype = GetDataTypeFromAttr(*node_def, "T");
+ if (act_dtype != DT_FLOAT && act_dtype != DT_HALF) return false;
if (node_view->NumRegularFanins() < 1) return false;
const auto& relu_fanin_0 = node_view->GetRegularFanin(0);
@@ -3138,8 +3137,8 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) {
if (!IsBiasAdd(*relu_fanin_0_node_def) && !IsAdd(*relu_fanin_0_node_def))
return false;
- if (GetDataTypeFromAttr(*relu_fanin_0_node_def, "T") != DT_FLOAT)
- return false;
+ DataType biasadd_dtype = GetDataTypeFromAttr(*relu_fanin_0_node_def, "T");
+ if (biasadd_dtype != DT_FLOAT && biasadd_dtype != DT_HALF) return false;
if (relu_fanin_0_node_view->NumRegularFanins() < 1) return false;
@@ -3149,8 +3148,8 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index) {
if (!IsConv2D(*biasadd_fanin_0_node_def) &&
!IsConv3D(*biasadd_fanin_0_node_def))
return false;
- if (GetDataTypeFromAttr(*biasadd_fanin_0_node_def, "T") != DT_FLOAT)
- return false;
+ DataType conv_dtype = GetDataTypeFromAttr(*biasadd_fanin_0_node_def, "T");
+ if (conv_dtype != DT_FLOAT && conv_dtype != DT_HALF) return false;
return true;
};
diff --git a/tensorflow/core/kernels/conv_ops_fused_half.cc b/tensorflow/core/kernels/conv_ops_fused_half.cc
index 5086b2b6f1b908..2945fcf530ac9a 100644
--- a/tensorflow/core/kernels/conv_ops_fused_half.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_half.cc
@@ -27,6 +27,8 @@ namespace functor {
DECLARE_FUNCTOR_GPU_SPEC(Eigen::half);
} // namespace functor
+TF_CALL_half(REGISTER_FUSED_GPU_CONV2D);
+
#endif // GOOGLE_CUDA
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index 521f06e23c9b26..5189fbc06aab71 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -219,6 +219,9 @@ struct LaunchFusedConv2DOp {
OP_REQUIRES(context, params.data_format == FORMAT_NHWC,
errors::Unimplemented("Fused conv implementation only supports "
"NHWC tensor format for now."));
+ OP_REQUIRES(context, DataTypeToEnum::value != DT_HALF,
+ errors::Unimplemented("Fused conv implementation with half "
+ "precision is not supported on CPU."));
BiasAddArgs bias_add_args;
if (BiasAddArgs::IsSupported(fusion)) {
@@ -419,7 +422,10 @@ struct LaunchFusedConv2DOp {
in_cols = new_in_cols;
}
- if (params.data_format == FORMAT_NHWC) {
+ const bool compute_in_nhwc = DataTypeToEnum::value == DT_HALF &&
+ stream->GetCudaComputeCapability().IsAtLeast(
+ se::CudaComputeCapability::VOLTA);
+ if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) {
// Convert the input tensor from NHWC to NCHW.
TensorShape nchw_shape =
ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
@@ -451,23 +457,37 @@ struct LaunchFusedConv2DOp {
LOG(FATAL) << "Unsupported fusion type"; // Crash OK
}
+ const TensorFormat compute_data_format =
+ compute_in_nhwc ? FORMAT_NHWC : FORMAT_NCHW;
+ constexpr auto kComputeInNHWC =
+ std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
+ se::dnn::FilterLayout::kOutputYXInput);
+ constexpr auto kComputeInNCHW =
+ std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
+ se::dnn::FilterLayout::kOutputInputYX);
+ se::dnn::DataLayout compute_data_layout;
+ se::dnn::FilterLayout filter_layout;
+ std::tie(compute_data_layout, filter_layout) =
+ compute_in_nhwc ? kComputeInNHWC : kComputeInNCHW;
+
se::dnn::BatchDescriptor input_desc;
input_desc.set_count(in_batch)
.set_feature_map_count(in_depths)
.set_height(in_rows)
.set_width(in_cols)
- .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+ .set_layout(compute_data_layout);
se::dnn::FilterDescriptor filter_desc;
filter_desc.set_input_filter_height(patch_rows)
.set_input_filter_width(patch_cols)
.set_input_feature_map_count(patch_depths)
- .set_output_feature_map_count(filter.dim_size(3));
+ .set_output_feature_map_count(filter.dim_size(3))
+ .set_layout(filter_layout);
se::dnn::BatchDescriptor bias_desc;
bias_desc.set_count(1)
.set_height(1)
.set_width(1)
.set_feature_map_count(out_depths)
- .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+ .set_layout(compute_data_layout);
se::dnn::ConvolutionDescriptor conv_desc;
conv_desc.set_vertical_dilation_rate(dimensions.dilation_rows)
.set_horizontal_dilation_rate(dimensions.dilation_cols)
@@ -481,22 +501,38 @@ struct LaunchFusedConv2DOp {
.set_height(out_rows)
.set_width(out_cols)
.set_feature_map_count(out_depths)
- .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+ .set_layout(compute_data_layout);
Tensor transformed_filter;
- OP_REQUIRES_OK(context,
- context->allocate_temp(
- DataTypeToEnum::value,
- TensorShape({filter.dim_size(3), filter.dim_size(2),
- filter.dim_size(0), filter.dim_size(1)}),
- &transformed_filter));
- functor::TransformFilter()(
- context->eigen_device(), FORMAT_OIHW,
- To32Bit(filter.tensor()),
- To32Bit(transformed_filter.tensor()));
+ const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
+ VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
+ << " to " << ToString(dst_format);
+
+ TensorShape dst_shape =
+ dst_format == FORMAT_OIHW
+ ? TensorShape({filter.dim_size(3), filter.dim_size(2),
+ filter.dim_size(0), filter.dim_size(1)})
+ : TensorShape({filter.dim_size(3), filter.dim_size(0),
+ filter.dim_size(1), filter.dim_size(2)});
+
+ TF_RETURN_IF_ERROR(context->allocate_temp(
+ DataTypeToEnum::value, dst_shape, &transformed_filter));
+ functor::TransformFilter()(
+ context->eigen_device(), dst_format,
+ To32Bit(filter.tensor()),
+ To32Bit(transformed_filter.tensor()));
+
+ return OkStatus();
+ };
+
+ if (compute_in_nhwc) {
+ OP_REQUIRES_OK(context, transform_filter(FORMAT_OHWI));
+ } else {
+ OP_REQUIRES_OK(context, transform_filter(FORMAT_OIHW));
+ }
Tensor transformed_output;
- if (params.data_format == FORMAT_NHWC) {
+ if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) {
// Only allocate temporary memory when a layout transformation is needed.
OP_REQUIRES_OK(context,
context->allocate_temp(
@@ -532,7 +568,7 @@ struct LaunchFusedConv2DOp {
in_depths, // in_depths
{{in_rows, // in_rows
in_cols}}, // in_cols
- FORMAT_NCHW, // compute_data_format
+ compute_data_format, // compute_data_format
out_depths, // out_depths
{{patch_rows, // filter_rows
patch_cols, // filter_cols
@@ -615,7 +651,7 @@ struct LaunchFusedConv2DOp {
OP_REQUIRES_OK(context, cudnn_launch_status);
// Convert the output tensor back from NCHW to NHWC.
- if (params.data_format == FORMAT_NHWC) {
+ if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) {
functor::NCHWToNHWC()(
context->eigen_device(),
const_cast(transformed_output).tensor(),
diff --git a/tensorflow/core/kernels/conv_ops_gpu.cc b/tensorflow/core/kernels/conv_ops_gpu.cc
index aa6936e6ff8dba..ab66567ec4f091 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu.cc
@@ -271,6 +271,24 @@ template StatusOr> AutotuneFusedConv(
se::DeviceMemory bias_ptr, se::DeviceMemory side_input_ptr,
int64_t scratch_size_limit);
+template StatusOr>
+AutotuneFusedConv(
+ bool cudnn_use_autotune,
+ AutotuneMap>*
+ autotune_map,
+ const ConvParameters& params, OpKernelContext* ctx,
+ const se::dnn::BatchDescriptor& input_desc,
+ const se::dnn::FilterDescriptor& filter_desc,
+ const se::dnn::BatchDescriptor& bias_desc,
+ const se::dnn::BatchDescriptor& output_desc,
+ const se::dnn::ConvolutionDescriptor& conv_desc,
+ const se::dnn::ActivationMode activation_mode, double conv_scale,
+ double side_input_scale, se::DeviceMemory input_ptr,
+ se::DeviceMemory filter_ptr,
+ se::DeviceMemory output_ptr,
+ se::DeviceMemory bias_ptr,
+ se::DeviceMemory side_input_ptr, int64_t scratch_size_limit);
+
template
StatusOr> AutotuneUnfusedConv(
bool cudnn_use_autotune,
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 78102dc1e201f6..fb1c7bb7d3676d 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -417,7 +417,7 @@ REGISTER_OP("_FusedConv2D")
.Input("filter: T")
.Input("args: num_args * T")
.Output("output: T")
- .Attr("T: {float, double}")
+ .Attr("T: {half, float, double}")
.Attr("num_args: int >= 0")
.Attr("strides: list(int)")
.Attr(GetPaddingAttrStringWithExplicit())
diff --git a/tensorflow/python/grappler/remapper_test.py b/tensorflow/python/grappler/remapper_test.py
index 0a97ddb29ffc89..226635e123d183 100644
--- a/tensorflow/python/grappler/remapper_test.py
+++ b/tensorflow/python/grappler/remapper_test.py
@@ -27,6 +27,7 @@
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
@@ -74,6 +75,43 @@ def _maybe_skip(self, mode):
if mode == 'mkl' and not test_util.IsMklEnabled():
self.skipTest('MKL is not enabled.')
+ def _VerifyValues(self, model_fn, use_low_precision, fused_op, epilog_ops):
+ run_options = config_pb2.RunOptions(output_partition_graphs=True)
+ metadata = config_pb2.RunMetadata()
+ # Compute reference value.
+ config = _get_config(remapping_on=False)
+ with session.Session(config=config) as sess:
+ sess.run(variables.global_variables_initializer())
+ output_ref = sess.run(model_fn, options=run_options,
+ run_metadata=metadata)
+ # Compute output with fusion.
+ config = _get_config(remapping_on=True)
+ with session.Session(config=config) as sess:
+ sess.run(variables.global_variables_initializer())
+ output_val = sess.run(model_fn, options=run_options,
+ run_metadata=metadata)
+ graph = metadata.partition_graphs[0]
+
+ # Graph should contain fused op.
+ found_fused_op = False
+ for node in graph.node:
+ if node.op in fused_op:
+ fused_ops = node.attr['fused_ops'].list.s
+ ops_matched = len(fused_ops) >= 1 and len(fused_ops) == len(epilog_ops)
+ for op_a, op_b in zip(fused_ops, epilog_ops):
+ if op_a != op_b:
+ ops_matched = False
+ break
+ found_fused_op = ops_matched
+ break
+ self.assertTrue(found_fused_op)
+
+ # Computed output value should be close to reference value.
+ tol = 1e-2 if use_low_precision else 1e-5
+ self.assertAllClose(output_ref, output_val, atol=tol, rtol=tol)
+
+ return graph
+
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
@@ -81,8 +119,6 @@ def test_matmul_biasadd_gelu_fusion(self, mode):
"""Test MatMul+BiasAdd+Gelu fusion."""
self._maybe_skip(mode)
is_bf16_supported = _pywrap_utils.IsBF16SupportedByOneDNNOnThisCPU()
- run_options = config_pb2.RunOptions(output_partition_graphs=True)
- metadata = config_pb2.RunMetadata()
m, n, k = (3, 3, 4) # Matrix dimensions
for precision in ('float32', 'bfloat16'):
@@ -109,33 +145,11 @@ def test_matmul_biasadd_gelu_fusion(self, mode):
z = nn.bias_add(y, b)
out = nn.gelu(z, approximate=approximate)
- # Compute reference value.
- config = _get_config(remapping_on=False)
- with session.Session(config=config) as sess:
- sess.run(variables.global_variables_initializer())
- output_val_ref = sess.run(
- out, options=run_options, run_metadata=metadata)
- # Compute output with fusion.
- config = _get_config(remapping_on=True)
- with session.Session(config=config) as sess:
- sess.run(variables.global_variables_initializer())
- output_val = sess.run(out, options=run_options, run_metadata=metadata)
- graph = metadata.partition_graphs[0]
-
- # Graph should contain fused op.
- found_fused_op = False
gelu_type = b'GeluApproximate' if approximate else b'GeluExact'
- for node in graph.node:
- if node.op in ('_MklNativeFusedMatMul', '_MklFusedMatMul'):
- fused_ops = node.attr['fused_ops'].list.s
- found_fused_op = len(fused_ops) == 2 and \
- fused_ops[0] == b'BiasAdd' and fused_ops[1] == gelu_type
- break
- self.assertTrue(found_fused_op)
-
- # Computed output value should be close to reference value.
- tol = 1e-5 if precision == 'float32' else 1e-2
- self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
+ epilog_ops = [b'BiasAdd', gelu_type]
+ fused_op = ['_MklNativeFusedMatMul', '_MklFusedMatMul']
+ graph = self._VerifyValues(out, precision == 'bfloat16', fused_op,
+ epilog_ops)
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
@@ -143,43 +157,38 @@ def test_conv2d_biasadd_relu_fusion(self):
"""Test Conv2D+BiasAdd+Relu fusion."""
if not test_util.is_gpu_available():
self.skipTest('No GPU available')
- run_options = config_pb2.RunOptions(output_partition_graphs=True)
- metadata = config_pb2.RunMetadata()
-
- n, h, w, c = (5, 3, 3, 4)
-
- ops.reset_default_graph()
- x = _input([n, c, h, w])
- w = _weight([2, 2, c, c])
- b = _bias([c])
- y = nn_ops.conv2d(x, w, strides=(1, 1), padding='SAME', data_format='NCHW')
- z = nn.bias_add(y, b, data_format='NC..')
- out = nn.relu(z)
-
- # Compute reference value.
- config = _get_config(remapping_on=False)
- with session.Session(config=config) as sess:
- sess.run(variables.global_variables_initializer())
- output_val_ref = sess.run(out, options=run_options, run_metadata=metadata)
- # Compute output with fusion.
- config = _get_config(remapping_on=True)
- with session.Session(config=config) as sess:
- sess.run(variables.global_variables_initializer())
- output_val = sess.run(out, options=run_options, run_metadata=metadata)
- graph = metadata.partition_graphs[0]
-
- # Graph should contain fused op.
- found_fused_op = False
- for node in graph.node:
- if node.op == '_FusedConv2D':
- found_fused_op = True
- break
- self.assertTrue(found_fused_op)
-
- # Computed output value should be close to reference value.
- tol = 1e-5
- self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
+ N, H, W, C = (5, 3, 3, 4)
+
+ for precision in ('float16', 'float32'):
+ ops.reset_default_graph()
+ x_shape = [N, C, H, W]
+ x_format = 'NCHW'
+ b_format = 'NC..'
+ use_fp16 = precision == 'float16'
+ if use_fp16:
+ x_shape = [N, H, W, C]
+ x_format = 'NHWC'
+ b_format = 'N..C'
+
+ x = _input(x_shape)
+ w = _weight([2, 2, C, C])
+ b = _bias([C])
+
+ if use_fp16:
+ x = math_ops.cast(x, dtypes.float16)
+ w = math_ops.cast(w, dtypes.float16)
+ b = math_ops.cast(b, dtypes.float16)
+
+ y = nn_ops.conv2d(x, w, strides=(1, 1), padding='SAME',
+ data_format=x_format)
+ z = nn.bias_add(y, b, data_format=b_format)
+ out = nn.relu(z)
+ out = array_ops.identity(out)
+
+ epilog_ops = [b'BiasAdd', b'Relu']
+ fused_op = ['_FusedConv2D']
+ graph = self._VerifyValues(out, use_fp16, fused_op, epilog_ops)
if __name__ == '__main__':
test.main()
From 49bc536c56d4a4f96d8a92b018d2e8842a0473a3 Mon Sep 17 00:00:00 2001
From: Andrew Goodbody
Date: Wed, 29 Jun 2022 11:21:35 +0100
Subject: [PATCH 064/259] Remove references to no_oss_py2 tag
---
tensorflow/python/autograph/pyct/BUILD | 9 ---------
tensorflow/python/distribute/BUILD | 1 -
tensorflow/python/tpu/BUILD | 1 -
tensorflow/python/tpu/client/BUILD | 3 ---
tensorflow/tensorflow.bzl | 4 ----
tensorflow/tools/ci_build/linux/cpu/run_mkl.sh | 2 +-
.../tools/ci_build/presubmit/macos/py37_cc/build.sh | 2 +-
tensorflow/tools/docs/BUILD | 3 ---
8 files changed, 2 insertions(+), 23 deletions(-)
diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD
index a8888a1656cc58..0b407dfaf96350 100644
--- a/tensorflow/python/autograph/pyct/BUILD
+++ b/tensorflow/python/autograph/pyct/BUILD
@@ -65,9 +65,6 @@ py_test(
srcs = ["ast_util_test.py"],
python_version = "PY3",
srcs_version = "PY3",
- tags = [
- "no_oss_py2",
- ],
deps = [
":pyct",
"//tensorflow/python:client_testlib",
@@ -80,9 +77,6 @@ py_test(
srcs = ["cache_test.py"],
python_version = "PY3",
srcs_version = "PY3",
- tags = [
- "no_oss_py2",
- ],
deps = [
":pyct",
"//tensorflow/python:client_testlib",
@@ -95,9 +89,6 @@ py_test(
srcs = ["cfg_test.py"],
python_version = "PY3",
srcs_version = "PY3",
- tags = [
- "no_oss_py2",
- ],
deps = [
":pyct",
"//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index eebbc2ddd20d5c..7eb31a1f7cf276 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -98,7 +98,6 @@ cuda_py_test(
name = "device_util_test",
srcs = ["device_util_test.py"],
python_version = "PY3",
- tags = ["no_oss_py2"],
deps = [
":combinations",
":device_util",
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
index f41e1d2c652de1..f20a68e0a4f0b6 100644
--- a/tensorflow/python/tpu/BUILD
+++ b/tensorflow/python/tpu/BUILD
@@ -32,7 +32,6 @@ py_test(
python_version = "PY3",
srcs_version = "PY3",
tags = [
- "no_oss_py2",
"no_oss_py35",
"no_pip",
],
diff --git a/tensorflow/python/tpu/client/BUILD b/tensorflow/python/tpu/client/BUILD
index bb95d4f605798c..b7b657fcfca412 100644
--- a/tensorflow/python/tpu/client/BUILD
+++ b/tensorflow/python/tpu/client/BUILD
@@ -40,9 +40,6 @@ tf_py_test(
grpc_enabled = True,
main = "client_test.py",
python_version = "PY3",
- tags = [
- "no_oss_py2",
- ],
deps = [
":client",
"//tensorflow/python:client_testlib",
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 497bfb0dcf415b..060a4e0816aad4 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -2416,10 +2416,6 @@ def pywrap_tensorflow_macro(
# //third_party/tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
def py_test(deps = [], data = [], kernels = [], exec_properties = None, **kwargs):
- # Python version placeholder
- if kwargs.get("python_version", None) == "PY3":
- kwargs["tags"] = kwargs.get("tags", []) + ["no_oss_py2"]
-
if not exec_properties:
exec_properties = tf_exec_properties(kwargs)
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
index ca58a673f196b7..534189d32385a4 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
@@ -89,7 +89,7 @@ echo ""
# execution in an MKL primitive. This reduces the effects of an oversubscription
# of OpenMP threads caused by executing multiple tests concurrently.
bazel test \
- --test_tag_filters=-no_oss,-no_oss_py2,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only \
+ --test_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only \
--test_lang_filters=cc,py \
-k \
--jobs=${N_JOBS} \
diff --git a/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh b/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh
index 657e682ded0c88..8efa17887444cb 100644
--- a/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh
+++ b/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh
@@ -29,7 +29,7 @@ function run_build () {
export TF_NEED_CUDA=0
export PYTHON_BIN_PATH=$(which python3.7)
yes "" | $PYTHON_BIN_PATH configure.py
- tag_filters="-no_oss,-no_oss_py2,-gpu,-tpu,-benchmark-test,-nomac,-no_mac,-v1only"
+ tag_filters="-no_oss,-gpu,-tpu,-benchmark-test,-nomac,-no_mac,-v1only"
# Get the default test targets for bazel.
source tensorflow/tools/ci_build/build_scripts/DEFAULT_TEST_TARGETS.sh
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index e37580bd87f147..5b2f166f553773 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -53,7 +53,6 @@ py_test(
python_version = "PY3",
shard_count = 4,
tags = [
- "no_oss_py2",
"no_pip",
"no_rocm", # No need to rerun this test for ROCm config.
"no_windows", # numpy prints differently on windows.
@@ -105,7 +104,6 @@ py_test(
main = "tf_doctest.py",
python_version = "PY3",
tags = [
- "no_oss_py2",
"no_pip",
"no_rocm",
"no_windows", # numpy prints differently on windows.
@@ -129,7 +127,6 @@ py_test(
srcs = ["tf_doctest_test.py"],
python_version = "PY3",
tags = [
- "no_oss_py2",
"no_pip",
"noasan",
"nomsan",
From a12ea09d07081961ea891ac0d952b2bc694d9178 Mon Sep 17 00:00:00 2001
From: Amin Benarieb
Date: Thu, 30 Jun 2022 14:16:05 +0600
Subject: [PATCH 065/259] Fixes for formatting
---
tensorflow/lite/g3doc/performance/coreml_delegate.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tensorflow/lite/g3doc/performance/coreml_delegate.md b/tensorflow/lite/g3doc/performance/coreml_delegate.md
index 91ae96ecbceaae..096a8693243524 100644
--- a/tensorflow/lite/g3doc/performance/coreml_delegate.md
+++ b/tensorflow/lite/g3doc/performance/coreml_delegate.md
@@ -62,7 +62,7 @@ TensorFlow Lite 2.4.0 release, this was the only option.
} else {
interpreter = try Interpreter(modelPath: modelPath)
}
-
+
Objective-C
@@ -92,7 +92,7 @@ TensorFlow Lite 2.4.0 release, this was the only option.
if (error != nil) { /* Error handling... */ }
// Run inference ...
-
+
C (Until 2.3.0)
@@ -159,7 +159,7 @@ pass `TfLiteCoreMlDelegateAllDevices`. Following example shows how to do this:
initWithOptions:coreMLOptions];
// Initialize interpreter with delegate
-
+
C
@@ -191,7 +191,7 @@ performance benefits. Following example shows how to do this:
let interpreter = try Interpreter(modelPath: modelPath,
delegates: [delegate!])
-
+
Objective-C
From 7bb8d521535b0a6f5f744bfb5358818e61c064fc Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 30 Jun 2022 18:26:46 +0000
Subject: [PATCH 066/259] Exclude xla test with SparseTensor
---
tensorflow/python/BUILD | 1 +
tensorflow/python/ops/bincount_ops_test.py | 2 ++
2 files changed, 3 insertions(+)
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index abe744412265e0..646b454a2ce680 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2684,6 +2684,7 @@ cuda_py_test(
size = "small",
srcs = ["ops/bincount_ops_test.py"],
python_version = "PY3",
+ xla_enable_strict_auto_jit = True,
deps = [
":bincount_ops",
":platform_test",
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index feffe127a57bf4..3f024786be5fe9 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -658,6 +658,7 @@ def f (x,
class TestDenseBincount(test.TestCase, parameterized.TestCase):
+ @test_util.disable_xla("XLA does not support SparseTensor")
@parameterized.parameters([{
"dtype": np.int32,
}, {
@@ -703,6 +704,7 @@ def test_sparse_input_all_count_with_weights(self, dtype):
self.evaluate(bincount_ops.bincount(
sparse_inp, sparse_weights, axis=0)))
+ @test_util.disable_xla("XLA does not support SparseTensor")
@parameterized.parameters([{
"dtype": np.int32,
}, {
From e44a48ed552e2a45e452ed39b68ddcbe6baa2ff2 Mon Sep 17 00:00:00 2001
From: bhack
Date: Thu, 30 Jun 2022 21:01:35 +0200
Subject: [PATCH 067/259] Update bincount_ops_test.py
---
tensorflow/python/ops/bincount_ops_test.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 3f024786be5fe9..6652c581def904 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -658,12 +658,12 @@ def f (x,
class TestDenseBincount(test.TestCase, parameterized.TestCase):
- @test_util.disable_xla("XLA does not support SparseTensor")
@parameterized.parameters([{
"dtype": np.int32,
}, {
"dtype": np.int64,
}])
+ @test_util.disable_xla("XLA does not support SparseTensor")
def test_sparse_input_all_count(self, dtype):
np.random.seed(42)
num_rows = 128
@@ -704,12 +704,12 @@ def test_sparse_input_all_count_with_weights(self, dtype):
self.evaluate(bincount_ops.bincount(
sparse_inp, sparse_weights, axis=0)))
- @test_util.disable_xla("XLA does not support SparseTensor")
@parameterized.parameters([{
"dtype": np.int32,
}, {
"dtype": np.int64,
}])
+ @test_util.disable_xla("XLA does not support SparseTensor")
def test_sparse_input_all_binary(self, dtype):
np.random.seed(42)
num_rows = 128
From a3e5c230ab7b583d6e639e663472356fad8eaa12 Mon Sep 17 00:00:00 2001
From: DEKHTIARJonathan
Date: Thu, 16 Jun 2022 15:10:06 -0400
Subject: [PATCH 068/259] [TF-TRT] Fix SUPER call in TF-TRT tests
---
tensorflow/python/compiler/tensorrt/test/base_test.py | 2 +-
tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py | 2 +-
.../tensorrt/test/binary_tensor_weight_broadcast_test.py | 2 +-
.../python/compiler/tensorrt/test/dynamic_input_shapes_test.py | 2 +-
tensorflow/python/compiler/tensorrt/test/int32_test.py | 2 +-
tensorflow/python/compiler/tensorrt/test/shape_output_test.py | 2 +-
.../compiler/tensorrt/test/tf_trt_integration_test_base.py | 2 +-
tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py | 2 +-
tensorflow/python/compiler/tensorrt/test/vgg_block_test.py | 2 +-
9 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/tensorflow/python/compiler/tensorrt/test/base_test.py b/tensorflow/python/compiler/tensorrt/test/base_test.py
index 3f89bd22607f0b..997c24937ff7ea 100644
--- a/tensorflow/python/compiler/tensorrt/test/base_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/base_test.py
@@ -117,7 +117,7 @@ def ExpectedEnginesToBuild(self, run_params):
}
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
# Disable layout optimizer, since it will convert BiasAdd with NHWC
# format to NCHW format under four dimentional input.
self.DisableNonTrtOptimizers()
diff --git a/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
index baa6b98e9666ed..f87910ca8efa1e 100644
--- a/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
@@ -105,7 +105,7 @@ def GetParams(self):
[[4, 6680]])
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
# Disable layout optimizer, since it will convert BiasAdd with NHWC
# format to NCHW format under four dimentional input.
self.DisableNonTrtOptimizers()
diff --git a/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
index 676b58f7c53df8..10075f38f48b4d 100644
--- a/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -66,7 +66,7 @@ def ExpectedEnginesToBuild(self, run_params):
# TODO(b/176540862): remove this routine to disallow native segment execution
# for TensorRT 7+.
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "True"
gpus = config.list_physical_devices("GPU")
diff --git a/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
index 3112b968a78db1..f6e26ffac02f30 100644
--- a/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
@@ -77,7 +77,7 @@ def GetParams(self):
expected_output_dims=expected_output_dims)
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
# Disable layout optimizer, since it will convert BiasAdd with NHWC
# format to NCHW format under four dimentional input.
self.DisableNonTrtOptimizers()
diff --git a/tensorflow/python/compiler/tensorrt/test/int32_test.py b/tensorflow/python/compiler/tensorrt/test/int32_test.py
index 0bbe99c2658816..21517e884f08d9 100644
--- a/tensorflow/python/compiler/tensorrt/test/int32_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/int32_test.py
@@ -43,7 +43,7 @@ def GetParams(self):
return self.BuildParams(self.GraphFn, dtypes.int32, [[100, 4]], [[100, 10]])
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
# Disable layout optimizer, since it will convert BiasAdd with NHWC
# format to NCHW format under four dimentional input.
self.DisableNonTrtOptimizers()
diff --git a/tensorflow/python/compiler/tensorrt/test/shape_output_test.py b/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
index 1e298a7aa2f8ec..2966c7b409a0a6 100644
--- a/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/shape_output_test.py
@@ -32,7 +32,7 @@ class ShapeOutputTest(trt_test.TfTrtIntegrationTestBase):
"""Test shape value output with TF-TRT."""
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
self.DisableNonTrtOptimizers()
def GraphFn(self, x):
diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
index 160417f6dddfb2..7b3c372502a938 100644
--- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
@@ -150,7 +150,7 @@ def __init__(self, methodName="runTest"): # pylint: disable=invalid-name
def setUp(self):
"""Setup method."""
- super(TfTrtIntegrationTestBase, self).setUp()
+ super().setUp()
warnings.simplefilter("always")
if not is_tensorrt_enabled():
diff --git a/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
index 3a352286a336b9..60a88271a05736 100644
--- a/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
@@ -69,7 +69,7 @@ def ExpectedEnginesToBuild(self, run_params):
# TODO(b/159459919): remove this routine to disallow native segment execution.
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "True"
diff --git a/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py b/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
index d8b72f89f7c641..e4b87cf0247fa0 100644
--- a/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
@@ -60,7 +60,7 @@ def ExpectedEnginesToBuild(self, run_params):
# TODO(b/159459919): remove this routine to disallow native segment execution.
def setUp(self):
- super(trt_test.TfTrtIntegrationTestBase, self).setUp() # pylint: disable=bad-super-call
+ super().setUp()
os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "True"
From 8c40644cf2de359592f8118112b472d0bbdbcfa5 Mon Sep 17 00:00:00 2001
From: Manu Seth
Date: Thu, 30 Jun 2022 21:04:12 +0000
Subject: [PATCH 069/259] update flags and test skip list
---
tensorflow/tools/ci_build/Dockerfile.cpu.arm64 | 2 +-
.../tools/ci_build/rel/ubuntu/cpu_arm64_pip.sh | 12 +++++++++---
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.arm64 b/tensorflow/tools/ci_build/Dockerfile.cpu.arm64
index c6aed56189742f..db392b157597d1 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cpu.arm64
+++ b/tensorflow/tools/ci_build/Dockerfile.cpu.arm64
@@ -18,7 +18,7 @@ RUN yum -y check-update || true && \
COPY install/install_bazel.sh /install/
RUN /install/install_bazel.sh
-ARG py_major_minor_version
+ARG py_major_minor_version='3.10'
ENV TF_PYTHON_VERSION=python${py_major_minor_version}
ENV PYTHON_BIN_PATH=/usr/local/bin/${TF_PYTHON_VERSION}
diff --git a/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_pip.sh b/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_pip.sh
index cc3a628ba54948..a7db71b718c3a9 100644
--- a/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_pip.sh
+++ b/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_pip.sh
@@ -59,8 +59,8 @@ py_ver=$(python -c 'import sys; print(str(sys.version_info.major)+str(sys.versio
export TF_BUILD_FLAGS="--config=mkl_aarch64 --copt=-mtune=generic --copt=-march=armv8-a \
--copt=-O3 --copt=-fopenmp --copt=-flax-vector-conversions --linkopt=-lgomp"
export TF_TEST_FLAGS="${TF_BUILD_FLAGS} \
- --test_env=TF_ENABLE_ONEDNN_OPTS=1 --test_env=TF2_BEHAVIOR=1 --test_lang_filters=py \
- --define=no_tensorflow_py_deps=true --verbose_failures=true --test_keep_going"
+ --test_env=TF_ENABLE_ONEDNN_OPTS=1 --test_env=TF2_BEHAVIOR=1 --define=no_tensorflow_py_deps=true \
+ --test_lang_filters=py --flaky_test_attempts=3 --test_size_filters=small,medium --verbose_failures=true --test_keep_going"
export TF_TEST_TARGETS="${DEFAULT_BAZEL_TARGETS} \
-//tensorflow/lite/... \
-//tensorflow/compiler/mlir/lite/tests:const-fold.mlir.test \
@@ -76,6 +76,12 @@ export TF_TEST_TARGETS="${DEFAULT_BAZEL_TARGETS} \
-//tensorflow/python/eager:forwardprop_test \
-//tensorflow/python/framework:node_file_writer_test \
-//tensorflow/python/grappler:memory_optimizer_test \
+ -//tensorflow/python/kernel_tests/array_ops:array_ops_test_cpu \
+ -//tensorflow/python/kernel_tests/array_ops:concat_op_test_cpu \
+ -//tensorflow/python/kernel_tests/array_ops:pad_op_test_cpu \
+ -//tensorflow/python/kernel_tests/array_ops:slice_op_test_cpu \
+ -//tensorflow/python/kernel_tests/array_ops:split_op_test_cpu \
+ -//tensorflow/python/kernel_tests/control_flow:scan_ops_test_cpu \
-//tensorflow/python/kernel_tests/linalg:linear_operator_householder_test \
-//tensorflow/python/kernel_tests/linalg:linear_operator_inversion_test \
-//tensorflow/python/kernel_tests/linalg:linear_operator_block_diag_test \
@@ -89,7 +95,7 @@ export TF_TEST_TARGETS="${DEFAULT_BAZEL_TARGETS} \
-//tensorflow/python/ops/parallel_for:math_test \
-//tensorflow/python/training:server_lib_test"
export TF_PIP_TESTS="test_pip_virtualenv_clean"
-export TF_TEST_FILTER_TAGS="-no_oss,-oss_serial,-no_oss_py${py_ver},-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-requires-gpu"
+export TF_TEST_FILTER_TAGS="-nopip,-no_pip,-no_oss,-oss_serial,-v1only,-requires-gpu,-gpu,-tpu,-benchmark-test,-no_aarch64"
export TF_PIP_TEST_ROOT="pip_test"
export TF_AUDITWHEEL_TARGET_PLAT="manylinux2014"
From 11dc383374ea0c34ea60033160d7de5f57aa5116 Mon Sep 17 00:00:00 2001
From: bhack
Date: Fri, 1 Jul 2022 16:34:44 +0000
Subject: [PATCH 070/259] auto jit in kernel_tests
---
tensorflow/python/kernel_tests/math_ops/BUILD | 1 +
1 file changed, 1 insertion(+)
diff --git a/tensorflow/python/kernel_tests/math_ops/BUILD b/tensorflow/python/kernel_tests/math_ops/BUILD
index aa83285ccfe512..f15908e8157c54 100644
--- a/tensorflow/python/kernel_tests/math_ops/BUILD
+++ b/tensorflow/python/kernel_tests/math_ops/BUILD
@@ -106,6 +106,7 @@ cuda_py_test(
size = "small",
srcs = ["bincount_op_test.py"],
tags = ["no_windows_gpu"],
+ xla_enable_strict_auto_jit = True,
deps = [
"//tensorflow/python:bincount_ops",
"//tensorflow/python:client_testlib",
From 1f15eb520a3d6ca946dcf33726a0f33aa5fbcd48 Mon Sep 17 00:00:00 2001
From: bhack
Date: Sat, 2 Jul 2022 16:52:34 +0000
Subject: [PATCH 071/259] Add some TF checks
---
.../compiler/tf2xla/kernels/bincount_op.cc | 57 +++++++++++++------
1 file changed, 39 insertions(+), 18 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index b3431caf9bb723..38c61bfcfd32ab 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -36,39 +36,60 @@ class DenseBincountOp : public XlaOpKernel {
private:
bool binary_output_ = false;
void Compile(XlaOpKernelContext* ctx) override {
+ int64_t output_size;
+ xla::XlaOp output_size_param = ctx->Input("size");
+ StatusOr output_shape_or = ctx->builder()->GetShape(output_size_param);
+ OP_REQUIRES_OK(ctx, output_shape_or.status());
+ auto output_shape_param = output_shape_or.ValueOrDie();
+ auto output_rank = output_shape_param.rank();
+ OP_REQUIRES(
+ ctx, output_rank == 0,
+ xla::InvalidArgument("Shape must be rank 0 but is rank 1", output_rank));
+
+ OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("size", &output_size));
+ xla::XlaOp idx, updates, output;
xla::XlaOp input = ctx->Input(0);
+ auto input_xla_type = ctx->input_xla_type(0);
+ auto zero = xla::Zero(ctx->builder(), input_xla_type);
+ StatusOr input_shape_or = ctx->builder()->GetShape(input);
+ OP_REQUIRES_OK(ctx, input_shape_or.status());
+ auto input_shape = input_shape_or.ValueOrDie();
+ auto size = input_shape.dimensions(0);
+ if (! size) {
+ output = xla::Broadcast(zero, {output_size});
+ ctx->SetOutput(0, output);
+ return;
+ }
+ auto rank = input_shape.rank();
+
+ OP_REQUIRES(
+ ctx, rank <= 2,
+ xla::InvalidArgument("Shape must be at most rank 2 but is rank ", rank));
+
xla::XlaOp weights = ctx->Input(2);
StatusOr weights_shape_or = ctx->builder()->GetShape(weights);
OP_REQUIRES_OK(ctx, weights_shape_or.status());
+
auto weights_shape = weights_shape_or.ValueOrDie();
auto weights_size = weights_shape.dimensions(0);
- auto input_xla_type = ctx->input_xla_type(0);
xla::PrimitiveType dtype;
bool has_weights;
if (weights_size) {
has_weights = true;
- dtype = ctx->input_xla_type(2);
+ dtype = ctx->InputXlaType("weights");
} else {
has_weights = false;
dtype = input_xla_type;
}
- int64_t output_size;
- OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("size", &output_size));
- StatusOr input_shape_or = ctx->builder()->GetShape(input);
- OP_REQUIRES_OK(ctx, input_shape_or.status());
- auto input_shape = input_shape_or.ValueOrDie();
- auto size = input_shape.dimensions(0);
- auto rank = input_shape.rank();
xla::Shape output_shape = xla::ShapeUtil::MakeShape(dtype, {output_size});
- xla::XlaOp idx, updates, output;
xla::ScatterDimensionNumbers scatter_dnums;
scatter_dnums.set_index_vector_dim(1);
scatter_dnums.add_inserted_window_dims(0);
scatter_dnums.add_scatter_dims_to_operand_dims(0);
- auto one = xla::One(ctx->builder(), input_xla_type);
- auto zero = xla::Zero(ctx->builder(), input_xla_type);
-
+ zero = xla::Zero(ctx->builder(), dtype);
+ auto one = xla::One(ctx->builder(), dtype);
+
if (rank == 2) {
output_shape = xla::ShapeUtil::MakeShape(dtype, {size, output_size});
scatter_dnums.add_inserted_window_dims(1);
@@ -86,23 +107,23 @@ class DenseBincountOp : public XlaOpKernel {
idx = xla::ConcatInDim(ctx->builder(), iotas_to_concat, 1);
updates = xla::Broadcast(
one, {input_shape.dimensions(0) * input_shape.dimensions(1)});
- if (has_weights) {
+ output = xla::Broadcast(zero, {output_shape.dimensions(0), output_shape.dimensions(1)});
+ if (has_weights and !binary_output_) {
weights = xla::Reshape(
weights, {input_shape.dimensions(0) * input_shape.dimensions(1)});
- zero = xla::Zero(ctx->builder(), dtype);
updates = weights;
}
} else {
input = xla::Reshape(input, {size, 1});
idx = xla::Reshape(input, {size, 1});
updates = xla::Broadcast(one, {size});
- if (has_weights) {
+ output = xla::Broadcast(zero, {output_size});
+ if (has_weights and !binary_output_) {
updates = weights;
- zero = xla::Zero(ctx->builder(), dtype);
}
}
- output = xla::Broadcast(zero, {output_shape.dimensions()});
+
xla::XlaComputation assn_computation = [&] {
std::unique_ptr subb =
From d20f9572f03d4a02e693f0ff0caa0120c3e5c43d Mon Sep 17 00:00:00 2001
From: bhack
Date: Sat, 2 Jul 2022 19:13:23 +0000
Subject: [PATCH 072/259] Add some TF kernel checks in XLA
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 4 +++-
tensorflow/python/kernel_tests/math_ops/bincount_op_test.py | 2 ++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 38c61bfcfd32ab..9622a9710e79f7 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -45,8 +45,10 @@ class DenseBincountOp : public XlaOpKernel {
OP_REQUIRES(
ctx, output_rank == 0,
xla::InvalidArgument("Shape must be rank 0 but is rank 1", output_rank));
-
OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("size", &output_size));
+ OP_REQUIRES(
+ ctx, output_size >= 0,
+ errors::InvalidArgument("size (", output_size, ") must be non-negative"));
xla::XlaOp idx, updates, output;
xla::XlaOp input = ctx->Input(0);
auto input_xla_type = ctx->input_xla_type(0);
diff --git a/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py b/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py
index 9161b3b082270c..00c175867d0cde 100644
--- a/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py
@@ -105,6 +105,7 @@ def test_random_without_weights(self):
np.bincount(arr, weights))
@test_util.run_gpu_only
+ @test_util.disable_xla("XLA uses scatter")
def test_bincount_determinism_error(self):
arr = np.random.randint(0, 1000, size=1000)
with test_util.deterministic_ops(), self.assertRaisesRegex(
@@ -124,6 +125,7 @@ def test_zero_weights(self):
self.evaluate(bincount_ops.bincount(np.arange(1000), np.zeros(1000))),
np.zeros(1000))
+ @test_util.disable_xla("This is not raised in XLA")
def test_negative(self):
# unsorted_segment_sum will only report InvalidArgumentError on CPU
with self.cached_session(), ops.device("/CPU:0"):
From 897911bd0ab08c46a36074bc4e367fda7bda3cca Mon Sep 17 00:00:00 2001
From: bhack
Date: Sat, 2 Jul 2022 22:30:05 +0000
Subject: [PATCH 073/259] Fix dtype on some edge case tests
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 12 +++---------
.../python/kernel_tests/math_ops/bincount_op_test.py | 4 ++--
tensorflow/python/ops/bincount_ops_test.py | 2 --
3 files changed, 5 insertions(+), 13 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 9622a9710e79f7..72d619c4d8cb86 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -74,16 +74,11 @@ class DenseBincountOp : public XlaOpKernel {
auto weights_shape = weights_shape_or.ValueOrDie();
auto weights_size = weights_shape.dimensions(0);
- xla::PrimitiveType dtype;
- bool has_weights;
+ xla::PrimitiveType dtype = ctx->InputXlaType("weights");;
+ bool has_weights = false;
if (weights_size) {
has_weights = true;
- dtype = ctx->InputXlaType("weights");
- } else {
- has_weights = false;
- dtype = input_xla_type;
- }
-
+ }
xla::Shape output_shape = xla::ShapeUtil::MakeShape(dtype, {output_size});
xla::ScatterDimensionNumbers scatter_dnums;
scatter_dnums.set_index_vector_dim(1);
@@ -140,7 +135,6 @@ class DenseBincountOp : public XlaOpKernel {
}();
output = xla::Scatter(output, idx, updates, assn_computation, scatter_dnums,
false, false);
-
ctx->SetOutput(0, output);
}
};
diff --git a/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py b/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py
index 00c175867d0cde..746a89dcd480f0 100644
--- a/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/math_ops/bincount_op_test.py
@@ -105,7 +105,7 @@ def test_random_without_weights(self):
np.bincount(arr, weights))
@test_util.run_gpu_only
- @test_util.disable_xla("XLA uses scatter")
+ @test_util.disable_xla("XLA uses scatter and could be deterministic")
def test_bincount_determinism_error(self):
arr = np.random.randint(0, 1000, size=1000)
with test_util.deterministic_ops(), self.assertRaisesRegex(
@@ -125,7 +125,7 @@ def test_zero_weights(self):
self.evaluate(bincount_ops.bincount(np.arange(1000), np.zeros(1000))),
np.zeros(1000))
- @test_util.disable_xla("This is not raised in XLA")
+ @test_util.disable_xla("This is not raised on XLA CPU")
def test_negative(self):
# unsorted_segment_sum will only report InvalidArgumentError on CPU
with self.cached_session(), ops.device("/CPU:0"):
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 6652c581def904..feffe127a57bf4 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -663,7 +663,6 @@ class TestDenseBincount(test.TestCase, parameterized.TestCase):
}, {
"dtype": np.int64,
}])
- @test_util.disable_xla("XLA does not support SparseTensor")
def test_sparse_input_all_count(self, dtype):
np.random.seed(42)
num_rows = 128
@@ -709,7 +708,6 @@ def test_sparse_input_all_count_with_weights(self, dtype):
}, {
"dtype": np.int64,
}])
- @test_util.disable_xla("XLA does not support SparseTensor")
def test_sparse_input_all_binary(self, dtype):
np.random.seed(42)
num_rows = 128
From fc6be4def54526725645437b406d7612e7c1701f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 4 Jul 2022 06:04:30 -0700
Subject: [PATCH 074/259] Fix typo: FusionIterface -> FusionInterface.
PiperOrigin-RevId: 458897270
---
.../mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td | 2 +-
.../Dialect/gml_st/transforms/fusion_interface.td | 2 +-
.../mlir/hlo/lib/Dialect/gml_st/transforms/fusion.cc | 2 +-
.../Dialect/gml_st/transforms/fusion_interface_impl.cc | 8 ++++----
4 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td
index dd5831a9d2a5ed..4d951ee1239429 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td
@@ -21,7 +21,7 @@ include "mlir-hlo/Dialect/gml_st/IR/gml_st_ops_base.td"
include "mlir-hlo/Dialect/gml_st/transforms/fusion_interface.td"
def GMLST_DynamicBroadcastInDimOp : GMLST_Op<"dynamic_broadcast_in_dim", [
- NoSideEffect, DeclareOpInterfaceMethods<FusionIterface>]> {
+ NoSideEffect, DeclareOpInterfaceMethods<FusionInterface>]> {
let summary = [{Destination-style twin for `mhlo.dynamic_broadcast_in_dim`}];
let arguments = (ins
AnyTensor:$init,
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/fusion_interface.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/fusion_interface.td
index e5292537b2d3ce..0199f65d8bc61a 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/fusion_interface.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/fusion_interface.td
@@ -18,7 +18,7 @@ limitations under the License.
include "mlir/IR/OpBase.td"
-def FusionIterface : OpInterface<"FusionIterface"> {
+def FusionInterface : OpInterface<"FusionInterface"> {
let description = [{
TBD
}];
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion.cc
index 27d114a67e5a54..c556821e599945 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion.cc
@@ -36,7 +36,7 @@ struct FusionPattern : public OpRewritePattern<MaterializeOp> {
Operation* def = op.source().getDefiningOp();
if (!def) return failure();
- auto iface = llvm::dyn_cast<FusionIterface>(def);
+ auto iface = llvm::dyn_cast<FusionInterface>(def);
if (!iface) return failure();
Value fused = iface.fuse(op.getLoc(), op.set(), rewriter);
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion_interface_impl.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion_interface_impl.cc
index 4dc0d787dc8825..6f37fb41242db5 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion_interface_impl.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/fusion_interface_impl.cc
@@ -44,8 +44,8 @@ bool isElementwise(linalg::GenericOp genericOp) {
}
struct LingalgGenericFusionInterface
- : public FusionIterface::ExternalModel<LingalgGenericFusionInterface, linalg::GenericOp> {
+ : public FusionInterface::ExternalModel<LingalgGenericFusionInterface, linalg::GenericOp> {
Value fuse(Operation* op, Location loc, Value subset,
OpBuilder& builder) const {
auto genericOp = llvm::cast<linalg::GenericOp>(op);
@@ -76,8 +76,8 @@ struct LingalgGenericFusionInterface
template <typename OpTy>
struct ElementwiseFusionInterface
- : public FusionIterface::ExternalModel<ElementwiseFusionInterface<OpTy>, OpTy> {
+ : public FusionInterface::ExternalModel<ElementwiseFusionInterface<OpTy>, OpTy> {
Value fuse(Operation* op, Location loc, Value subset,
OpBuilder& builder) const {
// Supports tile and point subsets.
From 6b8dede318b3fb516484a636759e9ff82fd68594 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel
Date: Mon, 4 Jul 2022 06:25:21 -0700
Subject: [PATCH 075/259] Add a pass to vectorize linalg.generic ops within
gml_st loops.
PiperOrigin-RevId: 458900061
---
tensorflow/compiler/mlir/hlo/BUILD | 23 ++++++
.../Dialect/gml_st/transforms/pass_detail.h | 3 +
.../Dialect/gml_st/transforms/passes.h | 4 +
.../Dialect/gml_st/transforms/passes.td | 12 ++-
.../Dialect/gml_st/transforms/CMakeLists.txt | 5 ++
.../gml_st/transforms/vectorization.cc | 59 +++++++++++++++
.../hlo/lib/Transforms/gml_st_pipeline.cc | 1 +
.../tests/Dialect/gml_st/bufferization.mlir | 2 +-
.../hlo/tests/Dialect/gml_st/vectorize.mlir | 74 +++++++++++++++++++
.../mlir/hlo/tests/gml_st_pipeline.mlir | 5 +-
10 files changed, 185 insertions(+), 3 deletions(-)
create mode 100644 tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/vectorization.cc
create mode 100644 tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/vectorize.mlir
diff --git a/tensorflow/compiler/mlir/hlo/BUILD b/tensorflow/compiler/mlir/hlo/BUILD
index 279fc93612e38f..43a4fe814789dd 100644
--- a/tensorflow/compiler/mlir/hlo/BUILD
+++ b/tensorflow/compiler/mlir/hlo/BUILD
@@ -2193,6 +2193,7 @@ cc_library(
deps = [
":bufferize_pass",
":gml_st_tiling",
+ ":gml_st_vectorization",
":legalize_mhlo_to_gml",
":legalize_to_linalg",
"@llvm-project//mlir:BufferizationTransforms",
@@ -2595,6 +2596,28 @@ cc_library(
],
)
+cc_library(
+ name = "gml_st_vectorization",
+ srcs = [
+ "include/mlir-hlo/Dialect/gml_st/transforms/pass_detail.h",
+ "lib/Dialect/gml_st/transforms/vectorization.cc",
+ ],
+ hdrs = [
+ "include/mlir-hlo/Dialect/gml_st/transforms/passes.h",
+ ],
+ includes = ["include"],
+ deps = [
+ ":gml_st",
+ ":gml_st_passes_inc_gen",
+ "@llvm-project//mlir:FuncDialect",
+ "@llvm-project//mlir:IR",
+ "@llvm-project//mlir:LinalgDialect",
+ "@llvm-project//mlir:LinalgTransforms",
+ "@llvm-project//mlir:Pass",
+ "@llvm-project//mlir:VectorDialect",
+ ],
+)
+
cc_library(
name = "legalize_mhlo_to_gml",
srcs = [
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/pass_detail.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/pass_detail.h
index 0ba57051eff5ff..20d1d42b59b4da 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/pass_detail.h
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/pass_detail.h
@@ -27,6 +27,9 @@ class FuncOp;
namespace scf {
class SCFDialect;
} // namespace scf
+namespace vector {
+class VectorDialect;
+} // namespace vector
} // namespace mlir
#define GEN_PASS_CLASSES
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.h
index 26a4506e37d682..1de73507bff429 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.h
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.h
@@ -45,6 +45,10 @@ std::unique_ptr<OperationPass<func::FuncOp>> createGmlStToScfPass();
// its body.
std::unique_ptr> CreateTiledLoopBufferizePass();
+/// Pass to vectorize linalg.generic ops tiled to gml_st.parallel and gml_st.for
+/// loops.
+std::unique_ptr<OperationPass<func::FuncOp>> createVectorizeGmlStLoopsPass();
+
#define GEN_PASS_REGISTRATION
#include "mlir-hlo/Dialect/gml_st/transforms/passes.h.inc"
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.td
index f8def99b8a3701..745ba18970fcf7 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/transforms/passes.td
@@ -45,7 +45,17 @@ def GmlStToScf : Pass<"gml-st-to-scf", "mlir::func::FuncOp"> {
let dependentDialects = ["::mlir::scf::SCFDialect"];
}
-def TiledLoopBufferizePass : Pass<"gml-tiled-loop-bufferize", "mlir::func::FuncOp"> {
+def TiledLoopBufferizePass :
+ Pass<"gml-tiled-loop-bufferize", "mlir::func::FuncOp"> {
let summary = "Pass to bufferize linalg.tiled_loop with the ops inside it.";
let constructor = "::mlir::gml_st::CreateTiledLoopBufferizePass()";
}
+
+def VectorizeGmlStLoopsPass :
+ Pass<"vectorize-gml-st-loops", "mlir::func::FuncOp"> {
+ let summary =
+ "Pass to vectorize linalg.generic ops tiled to gml_st.parallel and " #
+ "gml_st.for loops.";
+ let constructor = "::mlir::gml_st::createVectorizeGmlStLoopsPass()";
+ let dependentDialects = ["::mlir::vector::VectorDialect"];
+}
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/CMakeLists.txt
index 1aed638b2895f2..afa6a82e141057 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/CMakeLists.txt
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/CMakeLists.txt
@@ -59,6 +59,7 @@ add_mlir_library(GmlStPasses
gml_st_to_scf.cc
legalize_mhlo_to_gml.cc
tiling.cc
+ vectorization.cc
DEPENDS
MLIRGmlStPassIncGen
@@ -69,9 +70,13 @@ add_mlir_library(GmlStPasses
LINK_LIBS PUBLIC
GmlStFusionInterface
GmlStFusionInterfaceImpl
+ MLIRFuncDialect
MLIRIR
+ MLIRLinalgDialect
+ MLIRLinalgTransforms
MLIRPass
MLIRSupport
+ MLIRVectorDialect
)
add_mlir_library(GmlStTransforms
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/vectorization.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/vectorization.cc
new file mode 100644
index 00000000000000..50d6e3084cef37
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/vectorization.cc
@@ -0,0 +1,59 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "mlir-hlo/Dialect/gml_st/IR/gml_st_ops.h"
+#include "mlir-hlo/Dialect/gml_st/transforms/pass_detail.h"
+#include "mlir-hlo/Dialect/gml_st/transforms/passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir {
+namespace gml_st {
+
+struct VectorizeGmlStLoopsPass
+ : public VectorizeGmlStLoopsPassBase<VectorizeGmlStLoopsPass> {
+ void runOnOperation() override {
+ auto funcOp = getOperation();
+ // Vectorize linalg.generic operations inside gml_st.for and gml_st.parallel
+ // loops.
+ OpPassManager dynamicPM("func.func");
+ linalg::CodegenStrategy strategy;
+ strategy.vectorize(linalg::GenericOp::getOperationName(),
+ [](mlir::Operation *op) {
+ auto generic = mlir::dyn_cast<linalg::GenericOp>(op);
+ if (!generic) return failure();
+ if (op->getParentOfType<ParallelOp>() ||
+ op->getParentOfType<ForOp>()) {
+ return success();
+ }
+ return failure();
+ });
+ strategy.configurePassPipeline(dynamicPM, funcOp.getContext());
+ if (failed(runPipeline(dynamicPM, funcOp))) return signalPassFailure();
+ }
+};
+
+std::unique_ptr<OperationPass<func::FuncOp>> createVectorizeGmlStLoopsPass() {
+ return std::make_unique();
+}
+
+} // namespace gml_st
+} // namespace mlir
diff --git a/tensorflow/compiler/mlir/hlo/lib/Transforms/gml_st_pipeline.cc b/tensorflow/compiler/mlir/hlo/lib/Transforms/gml_st_pipeline.cc
index b83c895aa4bfd3..e990ec15efd6b3 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Transforms/gml_st_pipeline.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Transforms/gml_st_pipeline.cc
@@ -48,6 +48,7 @@ void createGmlStPipeline(mlir::OpPassManager& pm,
// Convert Linalg + GmlSt to SCF loops.
pm.addNestedPass<mlir::func::FuncOp>(createConvertLinalgToLoopsPass());
+ pm.addNestedPass<mlir::func::FuncOp>(gml_st::createVectorizeGmlStLoopsPass());
pm.addNestedPass<mlir::func::FuncOp>(gml_st::createGmlStToScfPass());
}
diff --git a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/bufferization.mlir b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/bufferization.mlir
index ab652712a64383..3c765718751a9b 100644
--- a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/bufferization.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/bufferization.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-hlo-opt %s -test-gml-st-bufferization -canonicalize -cse \
-// RUN: -split-input-file | FileCheck %s --dump-input=always
+// RUN: -split-input-file | FileCheck %s
func.func @set_space(%input: tensor) -> tensor {
%c0 = arith.constant 0 : index
diff --git a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/vectorize.mlir b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/vectorize.mlir
new file mode 100644
index 00000000000000..c798d4cc18d349
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/vectorize.mlir
@@ -0,0 +1,74 @@
+// Test vectorization of gml_st.parallel and gml_st.for loops.
+// RUN: mlir-hlo-opt %s --vectorize-gml-st-loops | \
+// RUN: FileCheck %s
+
+#map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+
+// CHECK-LABEL: @parallel_with_tiles(
+func.func @parallel_with_tiles(
+ %arg0: memref, %arg1: memref, %arg2: memref)
+ -> memref {
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %0 = memref.dim %arg0, %c0 : memref
+ %1 = memref.dim %arg0, %c1 : memref
+ gml_st.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c4, %c1) {
+ %6 = memref.subview %arg2[%arg3, %arg4] [4, 1] [1, 1]
+ : memref to memref<4x1xf32, #map0>
+ %7 = memref.subview %arg1[%arg3, %arg4] [4, 1] [1, 1]
+ : memref to memref<4x1xf32, #map0>
+ %8 = memref.subview %arg0[%arg3, %arg4] [4, 1] [1, 1]
+ : memref to memref<4x1xf32, #map0>
+ linalg.generic {indexing_maps = [#map1, #map1, #map1],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%8, %7 : memref<4x1xf32, #map0>, memref<4x1xf32, #map0>)
+ outs(%6 : memref<4x1xf32, #map0>) {
+ ^bb0(%arg5: f32, %arg6: f32, %arg7: f32):
+ %9 = arith.addf %arg5, %arg6 : f32
+ linalg.yield %9 : f32
+ }
+ gml_st.set_yield
+ }
+ func.return %arg2 : memref
+}
+// CHECK-NOT: linalg.generic
+// CHECK: %[[LHS:.*]] = vector.transfer_read {{%.*}}[%c0, %c0]
+// CHECK: %[[RHS:.*]] = vector.transfer_read {{%.*}}[%c0, %c0]
+// CHECK: %[[ADD:.*]] = arith.addf %[[LHS]], %[[RHS]] : vector<4x1xf32>
+// CHECK: vector.transfer_write %[[ADD]], {{%.*}}[%c0, %c0]
+
+// CHECK-LABEL: @for_with_tiles(
+func.func @for_with_tiles(
+ %arg0: memref, %arg1: memref, %arg2: memref)
+ -> memref {
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %0 = memref.dim %arg0, %c0 : memref
+ %1 = memref.dim %arg0, %c1 : memref
+ gml_st.for (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c4, %c1) {
+ %6 = memref.subview %arg2[%arg3, %arg4] [4, 1] [1, 1]
+ : memref to memref<4x1xf32, #map0>
+ %7 = memref.subview %arg1[%arg3, %arg4] [4, 1] [1, 1]
+ : memref to memref<4x1xf32, #map0>
+ %8 = memref.subview %arg0[%arg3, %arg4] [4, 1] [1, 1]
+ : memref to memref<4x1xf32, #map0>
+ linalg.generic {indexing_maps = [#map1, #map1, #map1],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%8, %7 : memref<4x1xf32, #map0>, memref<4x1xf32, #map0>)
+ outs(%6 : memref<4x1xf32, #map0>) {
+ ^bb0(%arg5: f32, %arg6: f32, %arg7: f32):
+ %9 = arith.addf %arg5, %arg6 : f32
+ linalg.yield %9 : f32
+ }
+ gml_st.set_yield
+ }
+ func.return %arg2 : memref
+}
+// CHECK-NOT: linalg.generic
+// CHECK: %[[LHS:.*]] = vector.transfer_read {{%.*}}[%c0, %c0]
+// CHECK: %[[RHS:.*]] = vector.transfer_read {{%.*}}[%c0, %c0]
+// CHECK: %[[ADD:.*]] = arith.addf %[[LHS]], %[[RHS]] : vector<4x1xf32>
+// CHECK: vector.transfer_write %[[ADD]], {{%.*}}[%c0, %c0]
diff --git a/tensorflow/compiler/mlir/hlo/tests/gml_st_pipeline.mlir b/tensorflow/compiler/mlir/hlo/tests/gml_st_pipeline.mlir
index cebc57e8cf7596..05581109c8ce96 100644
--- a/tensorflow/compiler/mlir/hlo/tests/gml_st_pipeline.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/gml_st_pipeline.mlir
@@ -1,6 +1,9 @@
// RUN: mlir-hlo-opt --split-input-file %s \
// RUN: --gml-st-pipeline="tile-sizes=256" \
-// RUN: | FileCheck --dump-input=always %s
+// RUN: | FileCheck %s
+
+// TODO(akuegel): Also run with the option lower-to-loops. This fails currently
+// due to not having a bufferization for gml_st.dynamic_broadcast_in_dim.
// CHECK-LABEL: func @log(
// CHECK-SAME: %[[ARG0:.*]]: tensor<2048xf32>)
From 01d375389fae269dbd5c83479961a1fb88861906 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 4 Jul 2022 06:37:05 -0700
Subject: [PATCH 076/259] Make inputs and outputs explicit in
dynamic_broadcast_in_dim syntax.
PiperOrigin-RevId: 458901786
---
.../Dialect/gml_st/IR/gml_st_extension_ops.td | 13 ++++++++-----
.../mlir/hlo/lib/Dialect/gml_st/IR/gml_st_ops.cc | 2 +-
.../gml_st/transforms/legalize_mhlo_to_gml.cc | 2 +-
.../mlir/hlo/tests/Dialect/gml_st/fusion.mlir | 11 +++++++----
.../tests/Dialect/gml_st/legalize_mhlo_to_gml.mlir | 5 ++++-
.../compiler/mlir/hlo/tests/Dialect/gml_st/ops.mlir | 11 +++++++++++
6 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td
index 4d951ee1239429..27a5fa1eccbe77 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/gml_st/IR/gml_st_extension_ops.td
@@ -20,20 +20,23 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir-hlo/Dialect/gml_st/IR/gml_st_ops_base.td"
include "mlir-hlo/Dialect/gml_st/transforms/fusion_interface.td"
-def GMLST_DynamicBroadcastInDimOp : GMLST_Op<"dynamic_broadcast_in_dim", [
- NoSideEffect, DeclareOpInterfaceMethods<FusionInterface>]> {
+def GMLST_DynamicBroadcastInDimOp : GMLST_Op<"dynamic_broadcast_in_dim",
+ [NoSideEffect,
+ TypesMatchWith<"result and init types match", "init", "result", "$_self">,
+ DeclareOpInterfaceMethods<FusionInterface>]> {
let summary = [{Destination-style twin for `mhlo.dynamic_broadcast_in_dim`}];
let arguments = (ins
- AnyTensor:$init,
AnyTensor:$operand,
+ AnyTensor:$init,
I64ElementsAttr:$broadcast_dimensions,
OptionalAttr<I64ElementsAttr>:$known_expanding_dimensions,
OptionalAttr<I64ElementsAttr>:$known_nonexpanding_dimensions
);
let results = (outs AnyTensor:$result);
let assemblyFormat = [{
- $init `,` $operand `,` custom($broadcast_dimensions)
- attr-dict `:` type($init) `,` type($operand) `->` type($result)
+ `ins` `(` $operand `:` type($operand) `)`
+ `outs` `(` $init `:` type($init) `)`
+ attr-dict
}];
let hasVerifier = 0;
}
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/IR/gml_st_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/IR/gml_st_ops.cc
index 62df69fcb33798..c1cc198aa9ed0e 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/IR/gml_st_ops.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/IR/gml_st_ops.cc
@@ -1501,7 +1501,7 @@ Value DynamicBroadcastInDimOp::fuse(Location loc, Value set,
auto tiledResultTy =
RankedTensorType::get(tileTy.getShape(), resultTy.getElementType());
return builder.create<DynamicBroadcastInDimOp>(
- loc, tiledResultTy, tiledInit, tiledOperand, broadcast_dimensions(),
+ loc, tiledResultTy, tiledOperand, tiledInit, broadcast_dimensions(),
known_expanding_dimensionsAttr(), known_nonexpanding_dimensionsAttr());
}
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/legalize_mhlo_to_gml.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/legalize_mhlo_to_gml.cc
index 5fba0f6591958b..ee694c6e5fa9e5 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/legalize_mhlo_to_gml.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/gml_st/transforms/legalize_mhlo_to_gml.cc
@@ -56,7 +56,7 @@ struct DynamicBroadcastInDimOpPattern
loc, dynamicDims, staticShapeInfo, resultTy.getElementType());
rewriter.replaceOpWithNewOp<DynamicBroadcastInDimOp>(
- op, resultTy, initTensor, op.operand(), op.broadcast_dimensions(),
+ op, resultTy, op.operand(), initTensor, op.broadcast_dimensions(),
op.known_expanding_dimensionsAttr(),
op.known_nonexpanding_dimensionsAttr());
return success();
diff --git a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/fusion.mlir b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/fusion.mlir
index 8b8b1f928135a0..871defbe142427 100644
--- a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/fusion.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/fusion.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-hlo-opt %s --split-input-file --gml-fusion | FileCheck %s
+// RUN: mlir-hlo-opt %s --split-input-file --gml-fusion | FileCheck %s --dump-input=always
// CHECK-LABEL: @dynamic_broadcast_in_dim
// CHECK-SAME: %[[ARG:.*]]: tensor, %[[SHAPE:.*]]: tensor<3xindex>
@@ -48,7 +48,10 @@ func.func @dynamic_broadcast_in_dim(%arg : tensor,
// Check tiled broadcast.
// CHECK-DAG: %[[INIT_SUB:.*]] = gml_st.materialize %[[INIT]][%[[RES_TILE]]] : tensor[!gml_st.tile<3x4x5>]
// CHECK-DAG: %[[ARG_SUB:.*]] = gml_st.materialize %[[ARG]][%[[ARG_TILE]]] : tensor[!gml_st.tile]
- // CHECK-DAG: %[[RES:.*]] = gml_st.dynamic_broadcast_in_dim %[[INIT_SUB]], %[[ARG_SUB]], [0, 2] : tensor<3x4x5xf32>, tensor -> tensor<3x4x5xf32>
+ // CHECK-NEXT: %[[RES:.*]] = gml_st.dynamic_broadcast_in_dim
+ // CHECK-SAME ins(%[[ARG_SUB]] : tensor)
+ // CHECK-SAME outs(%[[INIT_SUB]] : tensor<3x4x5xf32>)
+ // CHECK-SAME {broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>}
// CHECK: return %[[RES]] : tensor<3x4x5xf32>
%c0 = arith.constant 0 : index
@@ -60,8 +63,8 @@ func.func @dynamic_broadcast_in_dim(%arg : tensor,
%d1 = tensor.extract %shape[%c1] : tensor<3xindex>
%d2 = tensor.extract %shape[%c2] : tensor<3xindex>
%dst = linalg.init_tensor [%d0, %d1, %d2] : tensor
- %bcast = gml_st.dynamic_broadcast_in_dim %dst, %arg, [0, 2]
- : tensor, tensor -> tensor
+ %bcast = gml_st.dynamic_broadcast_in_dim ins(%arg: tensor)
+ outs(%dst: tensor) { broadcast_dimensions = dense<[0, 2]> : tensor<2xi64> }
// Materialze a tile.
%space = gml_st.space [%d0, %d1, %d2] : !gml_st.tile
diff --git a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/legalize_mhlo_to_gml.mlir b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/legalize_mhlo_to_gml.mlir
index 91a66988f0c3d9..3a38aaf0cb8848 100644
--- a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/legalize_mhlo_to_gml.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/legalize_mhlo_to_gml.mlir
@@ -10,7 +10,10 @@ func.func @dynamic_broadcast_in_dim(%arg : tensor, %shape : tensor<3xin
// CHECK-DAG: %[[SHAPE_D1:.*]] = tensor.extract %[[SHAPE]][%[[C1]]]
// CHECK-DAG: %[[SHAPE_D2:.*]] = tensor.extract %[[SHAPE]][%[[C2]]]
// CHECK-DAG: %[[INIT:.*]] = linalg.init_tensor [%[[SHAPE_D0]], %[[SHAPE_D1]], %[[SHAPE_D2]]] : tensor
- // CHECK-DAG: %[[BCAST:.*]] = gml_st.dynamic_broadcast_in_dim %[[INIT]], %[[ARG]], [0, 2] : tensor, tensor -> tensor
+ // CHECK-NEXT: %[[BCAST:.*]] = gml_st.dynamic_broadcast_in_dim
+ // CHECK-SAME: ins(%[[ARG]] : tensor)
+ // CHECK-SAME: outs(%[[INIT]] : tensor)
+ // CHECK-SAME: {broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>}
// CHECK: return %[[BCAST]]
%0 = "mhlo.dynamic_broadcast_in_dim"(%arg, %shape)
{ broadcast_dimensions = dense<[0, 2]> : tensor<2xi64> }
diff --git a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/ops.mlir b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/ops.mlir
index 58319e91f6c9e8..592fc47d90fe29 100644
--- a/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/ops.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/Dialect/gml_st/ops.mlir
@@ -309,3 +309,14 @@ func.func @for_loop(%lhs: tensor<8xf32>, %rhs: tensor<8xf32>,
func.return %sum : tensor<8xf32>
}
// CHECK-LABEL: func @for_loop
+
+func.func @dynamic_broadcast_in_dim(%arg: tensor,
+ %dst: tensor) {
+ %bcast = gml_st.dynamic_broadcast_in_dim
+ ins(%arg: tensor)
+ outs(%dst: tensor) {
+ broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>
+ }
+ func.return
+}
+// CHECK-LABEL: func @dynamic_broadcast_in_dim
\ No newline at end of file
From c83f376c5392c83580653768b77997364e9b1095 Mon Sep 17 00:00:00 2001
From: bhack
Date: Sat, 2 Jul 2022 22:40:02 +0000
Subject: [PATCH 077/259] Remove compile test
---
tensorflow/python/ops/bincount_ops_test.py | 109 ---------------------
1 file changed, 109 deletions(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index feffe127a57bf4..6fe6953b9548b0 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -546,115 +546,6 @@ def test_ragged_input(self,
self.assertAllEqual(expected_values, y.values)
self.assertAllEqual(expected_shape, y.dense_shape)
-class TestCompiledDenseBincount(test.TestCase, parameterized.TestCase):
-
- @parameterized.named_parameters(
- {
- "testcase_name": "_no_maxlength_basic",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0, 1, 1, 1, 0, 0],[0, 0, 0, 0, 2, 1]]
- }, {
- "testcase_name": "_maxlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "maxlength": 7,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0],[1, 0, 0, 0, 2, 0, 0]]
- }, {
- "testcase_name": "_minlength",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 9,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 2, 0, 0, 1, 0]]
- }, {
- "testcase_name": "_minlength_larger_values",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 3,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
- [1, 0, 0, 0, 2, 0, 0, 1]]
- }, {
- "testcase_name": "_no_maxlength_binary",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0, 1, 1, 1, 0, 0],
- [0, 0, 0, 0, 1, 1]],
- "binary_output": True,
- }, {
- "testcase_name": "_maxlength_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "maxlength": 7,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0],
- [1, 0, 0, 0, 1, 0, 0]],
- "binary_output": True,
- }, {
- "testcase_name": "_minlength_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 9,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 1, 0, 0, 1, 0]],
- "binary_output": True,
- }, {
- "testcase_name": "_minlength_larger_values_binary",
- "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32),
- "minlength": 3,
- "expected_values": [[0, 1, 1, 1, 0, 0, 0, 1],
- [1, 0, 0, 0, 1, 0, 0, 1]],
- "binary_output": True,
- }, {
- "testcase_name": "_no_maxlength_weights",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [[0. , 2. , 1. , 0.5, 0. , 0. ],
- [0. , 0. , 0. , 0. , 9. , 3. ]],
- "weights": [[0.5, 1, 2], [3, 4, 5]]
- }, {
- "testcase_name": "_1d_no_maxlenght_base",
- "x": np.array([3, 2, 1, 1], dtype=np.int32),
- "expected_values": [0, 2, 1, 1]
- }, {
- "testcase_name": "_1d_binary",
- "x": np.array([3, 2, 1, 1], dtype=np.int32),
- "expected_values": [0, 1, 1, 1],
- "binary_output": True
- }, {
- "testcase_name": "_1d_no_maxlenght_weights",
- "x": np.array([3, 2, 1, 5, 4, 4], dtype=np.int32),
- "weights": [0.5, 1, 2, 3, 4, 5],
- "expected_values": [0. , 2. , 1. , 0.5, 9. , 3. ]
- }, {
- "testcase_name": "_all_axes",
- "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32),
- "expected_values": [0, 1, 1, 1, 2, 1],
- "axis": None
- })
- def test_compiled_dense(self,
- x,
- expected_values,
- minlength=None,
- maxlength=None,
- binary_output=False,
- weights=None,
- axis=-1):
-
- @def_function.function(jit_compile=True)
- def f (x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis):
- y = bincount_ops.bincount(
- x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
- return y
-
- res = f(x,
- weights=weights,
- minlength=minlength,
- maxlength=maxlength,
- binary_output=binary_output,
- axis=axis)
- self.assertAllEqual(expected_values, res)
class TestDenseBincount(test.TestCase, parameterized.TestCase):
From 4ecaf526a8c90dce08936fd56d20a1d9f3cb558d Mon Sep 17 00:00:00 2001
From: bhack
Date: Sat, 2 Jul 2022 22:41:44 +0000
Subject: [PATCH 078/259] Remove unused import
---
tensorflow/python/ops/bincount_ops_test.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/tensorflow/python/ops/bincount_ops_test.py b/tensorflow/python/ops/bincount_ops_test.py
index 6fe6953b9548b0..253f325bdb077c 100644
--- a/tensorflow/python/ops/bincount_ops_test.py
+++ b/tensorflow/python/ops/bincount_ops_test.py
@@ -18,7 +18,6 @@
import numpy as np
from tensorflow.python.eager import context
-from tensorflow.python.eager import def_function
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
From 7fc77efdeaf41a36272dc3c28ed3810d8532a87d Mon Sep 17 00:00:00 2001
From: bhack
Date: Sun, 3 Jul 2022 20:37:39 +0200
Subject: [PATCH 079/259] Fix copybara
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 72d619c4d8cb86..e6f8b45ee12c52 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -66,7 +66,8 @@ class DenseBincountOp : public XlaOpKernel {
OP_REQUIRES(
ctx, rank <= 2,
- xla::InvalidArgument("Shape must be at most rank 2 but is rank ", rank));
+ errors::InvalidArgument("Shape must be at most rank 2
+ but is rank ", rank));
xla::XlaOp weights = ctx->Input(2);
StatusOr<xla::Shape> weights_shape_or = ctx->builder()->GetShape(weights);
From f5a1221ecfe3ffd6fb440531388b89879f2cb2d2 Mon Sep 17 00:00:00 2001
From: bhack
Date: Sun, 3 Jul 2022 20:59:20 +0200
Subject: [PATCH 080/259] Update bincount_op.cc
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index e6f8b45ee12c52..9ca384c374827b 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -66,8 +66,7 @@ class DenseBincountOp : public XlaOpKernel {
OP_REQUIRES(
ctx, rank <= 2,
- errors::InvalidArgument("Shape must be at most rank 2
- but is rank ", rank));
+ errors::InvalidArgument("Shape must be at most rank 2 but is rank ", rank));
xla::XlaOp weights = ctx->Input(2);
StatusOr<xla::Shape> weights_shape_or = ctx->builder()->GetShape(weights);
From 0233380ec2aec96d7b4517005d1eeea72ddf7eec Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 4 Jul 2022 11:03:37 +0200
Subject: [PATCH 081/259] Update bincount_op.cc
---
tensorflow/compiler/tf2xla/kernels/bincount_op.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 9ca384c374827b..9975bd5a8404c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -44,7 +44,7 @@ class DenseBincountOp : public XlaOpKernel {
auto output_rank = output_shape_param.rank();
OP_REQUIRES(
ctx, output_rank == 0,
- xla::InvalidArgument("Shape must be rank 0 but is rank 1", output_rank));
+ errors::InvalidArgument("Shape must be rank 0 but is rank 1", output_rank));
OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("size", &output_size));
OP_REQUIRES(
ctx, output_size >= 0,
From 5bbc27f8fd2cf62794b38a47b854b4a44f82ce29 Mon Sep 17 00:00:00 2001
From: bhack
Date: Mon, 4 Jul 2022 14:35:04 +0000
Subject: [PATCH 082/259] Small refactoring Fix string Formatting
---
.../compiler/tf2xla/kernels/bincount_op.cc | 42 +++++++++----------
1 file changed, 21 insertions(+), 21 deletions(-)
diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
index 9975bd5a8404c1..6be66898b1e366 100644
--- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc
@@ -30,7 +30,7 @@ class DenseBincountOp : public XlaOpKernel {
public:
explicit DenseBincountOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
// It is optional for Bincount and required for DenseBincount
- (void) ctx->GetAttr("binary_output", &binary_output_);
+ (void)ctx->GetAttr("binary_output", &binary_output_);
}
private:
@@ -38,35 +38,39 @@ class DenseBincountOp : public XlaOpKernel {
void Compile(XlaOpKernelContext* ctx) override {
int64_t output_size;
xla::XlaOp output_size_param = ctx->Input("size");
- StatusOr<xla::Shape> output_shape_or = ctx->builder()->GetShape(output_size_param);
+ StatusOr<xla::Shape> output_shape_or =
+ ctx->builder()->GetShape(output_size_param);
OP_REQUIRES_OK(ctx, output_shape_or.status());
auto output_shape_param = output_shape_or.ValueOrDie();
auto output_rank = output_shape_param.rank();
- OP_REQUIRES(
- ctx, output_rank == 0,
- errors::InvalidArgument("Shape must be rank 0 but is rank 1", output_rank));
+ OP_REQUIRES(ctx, output_rank == 0,
+ errors::InvalidArgument("Shape must be rank 0 but is rank ",
+ output_rank));
OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("size", &output_size));
- OP_REQUIRES(
- ctx, output_size >= 0,
- errors::InvalidArgument("size (", output_size, ") must be non-negative"));
+ OP_REQUIRES(ctx, output_size >= 0,
+ errors::InvalidArgument("size (", output_size,
+ ") must be non-negative"));
xla::XlaOp idx, updates, output;
xla::XlaOp input = ctx->Input(0);
auto input_xla_type = ctx->input_xla_type(0);
- auto zero = xla::Zero(ctx->builder(), input_xla_type);
+ xla::PrimitiveType dtype = ctx->InputXlaType("weights");
+ auto zero = xla::Zero(ctx->builder(), dtype);
+ auto one = xla::One(ctx->builder(), dtype);
StatusOr<xla::Shape> input_shape_or = ctx->builder()->GetShape(input);
OP_REQUIRES_OK(ctx, input_shape_or.status());
auto input_shape = input_shape_or.ValueOrDie();
auto size = input_shape.dimensions(0);
- if (! size) {
+
+ if (!size) {
output = xla::Broadcast(zero, {output_size});
ctx->SetOutput(0, output);
return;
}
auto rank = input_shape.rank();
- OP_REQUIRES(
- ctx, rank <= 2,
- errors::InvalidArgument("Shape must be at most rank 2 but is rank ", rank));
+ OP_REQUIRES(ctx, rank <= 2,
+ errors::InvalidArgument(
+ "Shape must be at most rank 2 but is rank ", rank));
xla::XlaOp weights = ctx->Input(2);
StatusOr<xla::Shape> weights_shape_or = ctx->builder()->GetShape(weights);
@@ -74,19 +78,16 @@ class DenseBincountOp : public XlaOpKernel {
auto weights_shape = weights_shape_or.ValueOrDie();
auto weights_size = weights_shape.dimensions(0);
- xla::PrimitiveType dtype = ctx->InputXlaType("weights");;
bool has_weights = false;
if (weights_size) {
has_weights = true;
- }
+ }
xla::Shape output_shape = xla::ShapeUtil::MakeShape(dtype, {output_size});
xla::ScatterDimensionNumbers scatter_dnums;
scatter_dnums.set_index_vector_dim(1);
scatter_dnums.add_inserted_window_dims(0);
scatter_dnums.add_scatter_dims_to_operand_dims(0);
- zero = xla::Zero(ctx->builder(), dtype);
- auto one = xla::One(ctx->builder(), dtype);
-
+
if (rank == 2) {
output_shape = xla::ShapeUtil::MakeShape(dtype, {size, output_size});
scatter_dnums.add_inserted_window_dims(1);
@@ -104,7 +105,8 @@ class DenseBincountOp : public XlaOpKernel {
idx = xla::ConcatInDim(ctx->builder(), iotas_to_concat, 1);
updates = xla::Broadcast(
one, {input_shape.dimensions(0) * input_shape.dimensions(1)});
- output = xla::Broadcast(zero, {output_shape.dimensions(0), output_shape.dimensions(1)});
+ output = xla::Broadcast(
+ zero, {output_shape.dimensions(0), output_shape.dimensions(1)});
if (has_weights and !binary_output_) {
weights = xla::Reshape(
weights, {input_shape.dimensions(0) * input_shape.dimensions(1)});
@@ -120,8 +122,6 @@ class DenseBincountOp : public XlaOpKernel {
}
}
-
-
xla::XlaComputation assn_computation = [&] {
std::unique_ptr<xla::XlaBuilder> subb =
ctx->builder()->CreateSubBuilder("scatter_bincount");
From ee9eb61bb7a210168bd2a607c89ab849c7acbf7b Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Mon, 4 Jul 2022 07:42:47 -0700
Subject: [PATCH 083/259] Clarified SupportsZeroClampForImages for Vulkan and
OpenGL.
PiperOrigin-RevId: 458911333
---
tensorflow/lite/delegates/gpu/common/gpu_info.cc | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tensorflow/lite/delegates/gpu/common/gpu_info.cc b/tensorflow/lite/delegates/gpu/common/gpu_info.cc
index 2764f3b0d5eb4d..dbe91d7b94b6e9 100644
--- a/tensorflow/lite/delegates/gpu/common/gpu_info.cc
+++ b/tensorflow/lite/delegates/gpu/common/gpu_info.cc
@@ -774,9 +774,9 @@ bool GpuInfo::SupportsZeroClampForImages() const {
} else if (IsApiOpenCl()) {
return true;
} else if (IsApiVulkan()) {
- return true;
+ return false;
} else if (IsApiOpenGl()) {
- return opengl_info.IsApiOpenGl32OrAbove();
+ return false;
} else {
return false;
}
From ef7a770583b1dcf00749eb66e2870e15db765904 Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Mon, 4 Jul 2022 08:07:02 -0700
Subject: [PATCH 084/259] Added single pass MeanStdDevNormalization version.
PiperOrigin-RevId: 458914577
---
.../common/tasks/mean_stddev_normalization.cc | 80 ++++++++++++-------
.../common/tasks/mean_stddev_normalization.h | 9 ++-
.../mean_stddev_normalization_test_util.cc | 41 ++++++++++
3 files changed, 99 insertions(+), 31 deletions(-)
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.cc b/tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.cc
index 7e07c361bb9156..c3847f976df232 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h"
#include
+#include