From 48c2ad2041e969c8b390d339fa7ba2e8ffd76332 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 14 Aug 2023 12:14:14 -0500
Subject: [PATCH 01/15] Clean up of operator special methods

1. Removed unused usm_ndarray._clone static C-only method
2. Removed _dispatch* utilities
3. Used direct calls to unary/binary operators in implementation
   of special methods
---
 dpctl/tensor/_usmarray.pxd |   1 -
 dpctl/tensor/_usmarray.pyx | 219 +++++++------------------------------
 2 files changed, 39 insertions(+), 181 deletions(-)

diff --git a/dpctl/tensor/_usmarray.pxd b/dpctl/tensor/_usmarray.pxd
index 603b296103..120ad7a066 100644
--- a/dpctl/tensor/_usmarray.pxd
+++ b/dpctl/tensor/_usmarray.pxd
@@ -58,7 +58,6 @@ cdef api class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]:
 
     cdef void _reset(usm_ndarray self)
     cdef void _cleanup(usm_ndarray self)
-    cdef usm_ndarray _clone(usm_ndarray self)
     cdef Py_ssize_t get_offset(usm_ndarray self) except *
 
     cdef char* get_data(self)
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 98986d84d6..2b16df1060 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -44,58 +44,6 @@ include "_types.pxi"
 include "_slicing.pxi"
 
 
-def _dispatch_unary_elementwise(ary, name):
-    try:
-        mod = ary.__array_namespace__()
-    except AttributeError:
-        return NotImplemented
-    if mod is None and "dpnp" in sys.modules:
-        fn = getattr(sys.modules["dpnp"], name)
-        if callable(fn):
-            return fn(ary)
-    elif hasattr(mod, name):
-        fn = getattr(mod, name)
-        if callable(fn):
-            return fn(ary)
-
-    return NotImplemented
-
-
-def _dispatch_binary_elementwise(ary, name, other):
-    try:
-        mod = ary.__array_namespace__()
-    except AttributeError:
-        return NotImplemented
-    if mod is None and "dpnp" in sys.modules:
-        fn = getattr(sys.modules["dpnp"], name)
-        if callable(fn):
-            return fn(ary, other)
-    elif hasattr(mod, name):
-        fn = getattr(mod, name)
-        if callable(fn):
-            return fn(ary, other)
-
-    return NotImplemented
-
-
-def _dispatch_binary_elementwise2(other, name, ary):
-    try:
-        mod = ary.__array_namespace__()
-    except AttributeError:
-        return NotImplemented
-    mod = ary.__array_namespace__()
-    if mod is None and "dpnp" in sys.modules:
-        fn = getattr(sys.modules["dpnp"], name)
-        if callable(fn):
-            return fn(other, ary)
-    elif hasattr(mod, name):
-        fn = getattr(mod, name)
-        if callable(fn):
-            return fn(other, ary)
-
-    return NotImplemented
-
-
 cdef class InternalUSMArrayError(Exception):
     """
     A InternalError exception is raised when internal
@@ -200,28 +148,6 @@ cdef class usm_ndarray:
             PyMem_Free(self.strides_)
         self._reset()
 
-    cdef usm_ndarray _clone(usm_ndarray self):
-        """
-        Provides a copy of Python object pointing to the same data
-        """
-        cdef Py_ssize_t offset_elems = self.get_offset()
-        cdef usm_ndarray res = usm_ndarray.__new__(
-            usm_ndarray, _make_int_tuple(self.nd_, self.shape_),
-            dtype=_make_typestr(self.typenum_),
-            strides=(
-                _make_int_tuple(self.nd_, self.strides_) if (self.strides_)
-                else None),
-            buffer=self.base_,
-            offset=offset_elems,
-            order=('C' if (self.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F')
-        )
-        res.flags_ = self.flags_
-        res.array_namespace_ = self.array_namespace_
-        if (res.data_ != self.data_):
-            raise InternalUSMArrayError(
-                "Data pointers of cloned and original objects are different.")
-        return res
-
     def __cinit__(self, shape, dtype=None, strides=None, buffer='device',
                   Py_ssize_t offset=0, order='C',
                   buffer_ctor_kwargs=dict(),
@@ -966,32 +892,17 @@ cdef class usm_ndarray:
         raise IndexError("only integer arrays are valid indices")
 
     def __abs__(self):
-        return _dispatch_unary_elementwise(self, "abs")
+        return dpctl.tensor.abs(self)
 
     def __add__(first, other):
         """
-        Cython 0.* never calls `__radd__`, always calls `__add__`
-        but first argument need not be an instance of this class,
-        so dispatching is needed.
-
-        This changes in Cython 3.0, where first is guaranteed to
-        be `self`.
-
-        [1] http://docs.cython.org/en/latest/src/userguide/special_methods.html
+        Implementation for operator.add
         """
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "add", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "add", other)
-        return NotImplemented
+        return dpctl.tensor.add(first, other)
 
     def __and__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_and", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_and", other)
-        return NotImplemented
+        "Implementation for operator.and"
+        return dpctl.tensor.bitwise_and(first, other)
 
     def __dlpack__(self, stream=None):
         """
@@ -1037,27 +948,22 @@ cdef class usm_ndarray:
             )
 
     def __eq__(self, other):
-        return _dispatch_binary_elementwise(self, "equal", other)
+        return dpctl.tensor.equal(self, other)
 
     def __floordiv__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "floor_divide", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "floor_divide", other)
-        return NotImplemented
+        return dpctl.tensor.floor_divide(first, other)
 
     def __ge__(self, other):
-        return _dispatch_binary_elementwise(self, "greater_equal", other)
+        return dpctl.tensor.greater_equal(self, other)
 
     def __gt__(self, other):
-        return _dispatch_binary_elementwise(self, "greater", other)
+        return dpctl.tensor.greater(self, other)
 
     def __invert__(self):
-        return _dispatch_unary_elementwise(self, "bitwise_invert")
+        return dpctl.tensor.bitwise_invert(self)
 
     def __le__(self, other):
-        return _dispatch_binary_elementwise(self, "less_equal", other)
+        return dpctl.tensor.less_equal(self, other)
 
     def __len__(self):
         if (self.nd_):
@@ -1067,72 +973,40 @@ cdef class usm_ndarray:
 
     def __lshift__(first, other):
         "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_left_shift", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_left_shift", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_left_shift(first, other)
 
     def __lt__(self, other):
-        return _dispatch_binary_elementwise(self, "less", other)
+        return dpctl.tensor.less(self, other)
 
     def __matmul__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "matmul", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "matmul", other)
         return NotImplemented
 
     def __mod__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "remainder", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "remainder", other)
-        return NotImplemented
+        return dpctl.tensor.remainder(first, other)
 
     def __mul__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "multiply", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "multiply", other)
-        return NotImplemented
+        return dpctl.tensor.multiply(first, other)
 
     def __ne__(self, other):
-        return _dispatch_binary_elementwise(self, "not_equal", other)
+        return dpctl.tensor.not_equal(self, other)
 
     def __neg__(self):
-        return _dispatch_unary_elementwise(self, "negative")
+        return dpctl.tensor.negative(self)
 
     def __or__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_or", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_or", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_or(first, other)
 
     def __pos__(self):
-        return self  # _dispatch_unary_elementwise(self, "positive")
+        return dpctl.tensor.positive(self)
 
     def __pow__(first, other, mod):
-        "See comment in __add__"
         if mod is None:
-            if isinstance(first, usm_ndarray):
-                return _dispatch_binary_elementwise(first, "pow", other)
-            elif isinstance(other, usm_ndarray):
-                return _dispatch_binary_elementwise(first, "pow", other)
-        return NotImplemented
+            return dpctl.tensor.pow(first, other)
+        else:
+            return NotImplemented
 
     def __rshift__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_right_shift", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_right_shift", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_right_shift(first, other)
 
     def __setitem__(self, key, rhs):
         cdef tuple _meta
@@ -1223,67 +1097,52 @@ cdef class usm_ndarray:
 
 
     def __sub__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "subtract", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "subtract", other)
-        return NotImplemented
+        return dpctl.tensor.subtract(first, other)
 
     def __truediv__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "divide", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "divide", other)
-        return NotImplemented
+        return dpctl.tensor.divide(first, other)
 
     def __xor__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_xor", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_xor", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_xor(first, other)
 
     def __radd__(self, other):
-        return _dispatch_binary_elementwise(self, "add", other)
+        return dpctl.tensor.add(other, self)
 
     def __rand__(self, other):
-        return _dispatch_binary_elementwise(self, "bitwise_and", other)
+        return dpctl.tensor.bitwise_and(other, self)
 
     def __rfloordiv__(self, other):
-        return _dispatch_binary_elementwise2(other, "floor_divide", self)
+        return dpctl.tensor.floor_divide(other, self)
 
     def __rlshift__(self, other):
-        return _dispatch_binary_elementwise2(other, "bitwise_left_shift", self)
+        return dpctl.tensor.bitwise_left_shift(other, self)
 
     def __rmatmul__(self, other):
-        return _dispatch_binary_elementwise2(other, "matmul", self)
+        return NotImplemented
 
     def __rmod__(self, other):
-        return _dispatch_binary_elementwise2(other, "remainder", self)
+        return dpctl.tensor.remainder(other, self)
 
     def __rmul__(self, other):
-        return _dispatch_binary_elementwise(self, "multiply", other)
+        return dpctl.tensor.multiply(other, self)
 
     def __ror__(self, other):
-        return _dispatch_binary_elementwise(self, "bitwise_or", other)
+        return dpctl.tensor.bitwise_or(other, self)
 
     def __rpow__(self, other):
-        return _dispatch_binary_elementwise2(other, "pow", self)
+        return dpctl.tensor.pow(other, self)
 
     def __rrshift__(self, other):
-        return _dispatch_binary_elementwise2(other, "bitwise_right_shift", self)
+        return dpctl.tensor.bitwise_right_shift(other, self)
 
     def __rsub__(self, other):
-        return _dispatch_binary_elementwise2(other, "subtract", self)
+        return dpctl.tensor.subtract(other, self)
 
     def __rtruediv__(self, other):
-        return _dispatch_binary_elementwise2(other, "divide", self)
+        return dpctl.tensor.divide(other, self)
 
     def __rxor__(self, other):
-        return _dispatch_binary_elementwise2(other, "bitwise_xor", self)
+        return dpctl.tensor.bitwise_xor(other, self)
 
     def __iadd__(self, other):
         from ._elementwise_funcs import add

From 7d9974ef339b057950aa047e760a726630fb2d54 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 15 Aug 2023 09:22:22 -0500
Subject: [PATCH 02/15] Removed leftover include iostream

---
 .../include/kernels/elementwise_functions/negative.hpp          | 2 --
 .../libtensor/include/kernels/elementwise_functions/sign.hpp    | 2 --
 2 files changed, 4 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index 98ce88557a..830794680c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -37,8 +37,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index fb387bdcb2..8f0488746e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -37,8 +37,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor

From 5f298e6d405cec2f6701148bc03d4e5c4455ea8c Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 13 Aug 2023 05:42:09 -0500
Subject: [PATCH 03/15] Closes gh-1279

Provide cabs private method implementating abs for complex types, paying
attention to array-API mandated special values.

To work-around gh-1279, use std::hypot to compute value for finite inputs.
Compile with -DUSE_STD_ABS_FOR_COMPLEX_TYPES to use std::abs(z) instead of
std::hypot(std::real(z), std::imag(z)).
---
 .../kernels/elementwise_functions/abs.hpp     | 57 ++++++++++++++++++-
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index c734fd54ec..9c42ba1bb4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -25,8 +25,10 @@
 #pragma once
 #include <CL/sycl.hpp>
 #include <cmath>
+#include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -36,8 +38,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor
@@ -72,7 +72,58 @@ template <typename argT, typename resT> struct AbsFunctor
             return x;
         }
         else {
-            return std::abs(x);
+            if constexpr (is_complex<argT>::value) {
+                return cabs(x);
+            }
+            else if constexpr (std::is_same_v<argT, sycl::half> ||
+                               std::is_floating_point_v<argT>)
+            {
+                return (std::signbit(x) ? -x : x);
+            }
+            else {
+                return std::abs(x);
+            }
+        }
+    }
+
+private:
+    template <typename realT> realT cabs(std::complex<realT> const &z) const
+    {
+        // Special values for cabs( x + y * 1j):
+        //   * If x is either +infinity or -infinity and y is any value
+        //   (including NaN), the result is +infinity.
+        //   * If x is any value (including NaN) and y is either +infinity or
+        //   -infinity, the result is +infinity.
+        //   * If x is either +0 or -0, the result is equal to abs(y).
+        //   * If y is either +0 or -0, the result is equal to abs(x).
+        //   * If x is NaN and y is a finite number, the result is NaN.
+        //   * If x is a finite number and y is NaN, the result is NaN.
+        //   * If x is NaN and y is NaN, the result is NaN.
+
+        const realT x = std::real(z);
+        const realT y = std::imag(z);
+
+        constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
+        constexpr realT p_inf = std::numeric_limits<realT>::infinity();
+
+        if (std::isinf(x)) {
+            return p_inf;
+        }
+        else if (std::isinf(y)) {
+            return p_inf;
+        }
+        else if (std::isnan(x)) {
+            return q_nan;
+        }
+        else if (std::isnan(y)) {
+            return q_nan;
+        }
+        else {
+#ifdef USE_STD_ABS_FOR_COMPLEX_TYPES
+            return std::abs(z);
+#else
+            return std::hypot(std::real(z), std::imag(z));
+#endif
         }
     }
 };

From 4a2578fffdc147c03732f7b89df29f021d4b232e Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 15 Aug 2023 10:13:19 -0500
Subject: [PATCH 04/15] Closes gh-1279 for dpt.sqrt

This change provides private method csqrt to evaluate square-root
for complex types. It handles special values as mandated by array API.

The finite input, it provides its own implementation based on std::hypot
and std::sqrt for real types instead of calling std::sqrt on finite
input of complex type.

Compile with -DUSE_STD_SQRT_FOR_COMPLEX_TYPES to use std::sqrt instead
of custom implementation.

Cursory performance study suggests that custom implementation is at least
not worse than std::sqrt one.
---
 .../kernels/elementwise_functions/sqrt.hpp    | 94 ++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index 7853428227..f72b5d963b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -26,8 +26,10 @@
 #pragma once
 #include <CL/sycl.hpp>
 #include <cmath>
+#include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -66,7 +68,97 @@ template <typename argT, typename resT> struct SqrtFunctor
 
     resT operator()(const argT &in)
     {
-        return std::sqrt(in);
+        if constexpr (is_complex<argT>::value) {
+            // #ifdef _WINDOWS
+            //             return csqrt(in);
+            // #else
+            //             return std::sqrt(in);
+            // #endif
+            return csqrt(in);
+        }
+        else {
+            return std::sqrt(in);
+        }
+    }
+
+private:
+    template <typename T> std::complex<T> csqrt(std::complex<T> const &z) const
+    {
+        // csqrt(x + y*1j)
+        //  * csqrt(x - y * 1j) = conj(csqrt(x + y * 1j))
+        //  * If x is either +0 or -0 and y is +0, the result is +0 + 0j.
+        //  * If x is any value (including NaN) and y is +infinity, the result
+        //  is +infinity + infinity j.
+        //  * If x is a finite number and y is NaN, the result is NaN + NaN j.
+
+        //  * If x -infinity and y is a positive (i.e., greater than 0) finite
+        //  number, the result is NaN + NaN j.
+        //  * If x is +infinity and y is a positive (i.e., greater than 0)
+        //  finite number, the result is +0 + infinity j.
+        //  * If x is -infinity and y is NaN, the result is NaN + infinity j
+        //  (sign of the imaginary component is unspecified).
+        //  * If x is +infinity and y is NaN, the result is +infinity + NaN j.
+        //  * If x is NaN and y is any value, the result is NaN + NaN j.
+
+        using realT = T;
+        constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
+        constexpr realT p_inf = std::numeric_limits<realT>::infinity();
+        constexpr realT zero = realT(0);
+
+        realT x = std::real(z);
+        realT y = std::imag(z);
+
+        if (std::isinf(y)) {
+            return {p_inf, y};
+        }
+        else if (std::isnan(x)) {
+            return {x, q_nan};
+        }
+        else if (std::isinf(x)) { // x is an infinity
+            // y is either finite, or nan
+            if (std::signbit(x)) { // x == -inf
+                return {(std::isfinite(y) ? zero : y), std::copysign(p_inf, y)};
+            }
+            else {
+                return {p_inf, (std::isfinite(y) ? std::copysign(zero, y) : y)};
+            }
+        }
+        else { // x is finite
+            if (std::isfinite(y)) {
+#ifdef USE_STD_SQRT_FOR_COMPLEX_TYPES
+                return std::sqrt(z);
+#else
+                return csqrt_finite(x, y);
+#endif
+            }
+            else {
+                return {q_nan, y};
+            }
+        }
+    }
+
+    template <typename T>
+    std::complex<T> csqrt_finite(T const &x, T const &y) const
+    {
+        // csqrt(x + y*1j) =
+        //     sqrt((cabs(x, y) + x) / 2) +
+        //     1j * copysign(sqrt((cabs(x, y) - x) / 2), y)
+
+        using realT = T;
+        constexpr realT half = realT(0x1.0p-1f); // 1/2
+        constexpr realT zero = realT(0);
+
+        if (std::signbit(x)) {
+            realT m = std::hypot(x, y);
+            realT d = std::sqrt((m - x) * half);
+            return {(d == zero ? zero : std::abs(y) / d * half),
+                    std::copysign(d, y)};
+        }
+        else {
+            realT m = std::hypot(x, y);
+            realT d = std::sqrt((m + x) * half);
+            return {d, (d == zero) ? std::copysign(zero, y) : y * half / d};
+        }
     }
 };
 

From 26862b44a7cec8ca91bcd86b017be59f36482e5a Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 15 Aug 2023 10:16:49 -0500
Subject: [PATCH 05/15] Implements dpctl.tensor.allclose

This utility function is based on symmetric check, unlike numpy.allclose,
and verifies that abs(x1-x2) < atol + rtol * max(abs(x1), abs(x2))

This way allclose(x1, x2) is symmetric, and allclose(x1,x2) implies
allclose(x2, x1).
---
 dpctl/tensor/__init__.py           |   2 +
 dpctl/tensor/_testing.py           | 144 +++++++++++++++++++++++++++++
 dpctl/tests/test_tensor_testing.py |  86 +++++++++++++++++
 3 files changed, 232 insertions(+)
 create mode 100644 dpctl/tensor/_testing.py
 create mode 100644 dpctl/tests/test_tensor_testing.py

diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index f1ec439f3a..7438fb8a67 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -158,6 +158,7 @@
     trunc,
 )
 from ._reduction import sum
+from ._testing import allclose
 
 __all__ = [
     "Device",
@@ -301,4 +302,5 @@
     "tan",
     "tanh",
     "trunc",
+    "allclose",
 ]
diff --git a/dpctl/tensor/_testing.py b/dpctl/tensor/_testing.py
new file mode 100644
index 0000000000..185ed816a4
--- /dev/null
+++ b/dpctl/tensor/_testing.py
@@ -0,0 +1,144 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import dpctl.tensor as dpt
+import dpctl.utils as du
+
+from ._manipulation_functions import _broadcast_shape_impl
+from ._type_utils import _to_device_supported_dtype
+
+
+def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
+    z1r = dpt.real(z1)
+    z1i = dpt.imag(z1)
+    z2r = dpt.real(z2)
+    z2i = dpt.imag(z2)
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(z1r) == dpt.isnan(z2r)) and dpt.all(
+            dpt.isnan(z1i) == dpt.isnan(z2i)
+        )
+    else:
+        check1 = (
+            dpt.logical_not(dpt.any(dpt.isnan(z1r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z1i)))
+        ) and (
+            dpt.logical_not(dpt.any(dpt.isnan(z2r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z2i)))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(z1r)
+    mi = dpt.isinf(z1i)
+    check2 = dpt.all(mr == dpt.isinf(z2r)) and dpt.all(mi == dpt.isinf(z2i))
+    if not check2:
+        return check2
+    check3 = dpt.all(z1r[mr] == z2r[mr]) and dpt.all(z1i[mi] == z2i[mi])
+    if not check3:
+        return check3
+    mr = dpt.isfinite(z1r)
+    mi = dpt.isfinite(z1i)
+    mv1 = z1r[mr]
+    mv2 = z2r[mr]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        < atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+    )
+    if not check4:
+        return check4
+    mv1 = z1i[mi]
+    mv2 = z2i[mi]
+    check5 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        < atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+    )
+    return check5
+
+
+def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(r1) == dpt.isnan(r2))
+    else:
+        check1 = dpt.logical_not(dpt.any(dpt.isnan(r1))) and dpt.logical_not(
+            dpt.any(dpt.isnan(r2))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(r1)
+    check2 = dpt.all(mr == dpt.isinf(r2))
+    if not check2:
+        return check2
+    check3 = dpt.all(r1[mr] == r2[mr])
+    if not check3:
+        return check3
+    m = dpt.isfinite(r1)
+    mv1 = r1[m]
+    mv2 = r2[m]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        < atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+    )
+    return check4
+
+
+def _allclose_others(r1, r2):
+    return dpt.all(r1 == r2)
+
+
+def allclose(a1, a2, atol=1e-5, rtol=1e-8, equal_nan=False):
+    """allclose(a1, a2, atol=1e-5, rtol=1e-8)
+
+    Returns True if two arrays are element-wise equal within tolerance.
+    """
+    if not isinstance(a1, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray type, got {type(a1)}."
+        )
+    if not isinstance(a2, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray type, got {type(a2)}."
+        )
+    atol = float(atol)
+    rtol = float(rtol)
+    equal_nan = bool(equal_nan)
+    exec_q = du.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2)))
+    if exec_q is None:
+        raise du.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments."
+        )
+    res_sh = _broadcast_shape_impl([a1.shape, a2.shape])
+    b1 = a1
+    b2 = a2
+    if b1.dtype == b2.dtype:
+        res_dt = b1.dtype
+    else:
+        res_dt = np.promote_types(b1.dtype, b2.dtype)
+        res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device)
+        b1 = dpt.astype(b1, res_dt)
+        b2 = dpt.astype(b2, res_dt)
+
+    b1 = dpt.broadcast_to(b1, res_sh)
+    b2 = dpt.broadcast_to(b2, res_sh)
+
+    k = b1.dtype.kind
+    if k == "c":
+        return _allclose_complex_fp(b1, b2, atol, rtol, equal_nan)
+    elif k == "f":
+        return _allclose_real_fp(b1, b2, atol, rtol, equal_nan)
+    else:
+        return _allclose_others(b1, b2)
diff --git a/dpctl/tests/test_tensor_testing.py b/dpctl/tests/test_tensor_testing.py
new file mode 100644
index 0000000000..21437459d9
--- /dev/null
+++ b/dpctl/tests/test_tensor_testing.py
@@ -0,0 +1,86 @@
+import itertools
+
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_allclose(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a1 = dpt.ones(10, dtype=dtype)
+    a2 = dpt.ones(10, dtype=dtype)
+
+    assert dpt.allclose(a1, a2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_allclose_real_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+    a1 = dpt.asarray(v[2:], dtype=dtype)
+    a2 = dpt.asarray(v[2:], dtype=dtype)
+
+    tol = dpt.finfo(a1.dtype).resolution
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol)
+
+    a1 = dpt.asarray(v, dtype=dtype)
+    a2 = dpt.asarray(v, dtype=dtype)
+
+    assert not dpt.allclose(a1, a2, atol=tol, rtol=tol)
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_allclose_complex_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+
+    not_nans = [complex(*xy) for xy in itertools.product(v[2:], repeat=2)]
+    z1 = dpt.asarray(not_nans, dtype=dtype)
+    z2 = dpt.asarray(not_nans, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol)
+
+    both = [complex(*xy) for xy in itertools.product(v, repeat=2)]
+    z1 = dpt.asarray(both, dtype=dtype)
+    z2 = dpt.asarray(both, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert not dpt.allclose(z1, z2, atol=tol, rtol=tol)
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol, equal_nan=True)
+
+
+def test_allclose_validation():
+    with pytest.raises(TypeError):
+        dpt.allclose(True, False)
+
+    get_queue_or_skip()
+    x = dpt.asarray(True)
+    with pytest.raises(TypeError):
+        dpt.allclose(x, False)

From b121d67eb69a9a106d67a81fd1ea119918ca6753 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 15 Aug 2023 13:45:09 -0500
Subject: [PATCH 06/15] Adds tests for special FP values for dpt.abs and
 dpt.sqrt

---
 dpctl/tests/elementwise/test_abs.py  | 56 ++++++++++++++++++-
 dpctl/tests/elementwise/test_sqrt.py | 80 +++++++++++++++++++++++-----
 2 files changed, 122 insertions(+), 14 deletions(-)

diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py
index ab0d34d54d..9c800af812 100644
--- a/dpctl/tests/elementwise/test_abs.py
+++ b/dpctl/tests/elementwise/test_abs.py
@@ -15,6 +15,7 @@
 #  limitations under the License.
 
 import itertools
+import warnings
 
 import numpy as np
 import pytest
@@ -22,7 +23,13 @@
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
-from .utils import _all_dtypes, _no_complex_dtypes, _usm_types
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _no_complex_dtypes,
+    _real_fp_dtypes,
+    _usm_types,
+)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -135,3 +142,50 @@ def test_abs_out_overlap(dtype):
     assert Y is not X
     assert np.allclose(dpt.asnumpy(X), Xnp)
     assert np.allclose(dpt.asnumpy(Y), Ynp)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_abs_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.abs(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.abs(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_abs_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.abs(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.abs(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_sqrt.py b/dpctl/tests/elementwise/test_sqrt.py
index a15f5262a7..07b97733e3 100644
--- a/dpctl/tests/elementwise/test_sqrt.py
+++ b/dpctl/tests/elementwise/test_sqrt.py
@@ -15,6 +15,7 @@
 #  limitations under the License.
 
 import itertools
+import warnings
 
 import numpy as np
 import pytest
@@ -23,7 +24,13 @@
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _map_to_device_dtype,
+    _real_fp_dtypes,
+    _usm_types,
+)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -115,18 +122,6 @@ def test_sqrt_order(dtype):
             assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
-@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
-def test_sqrt_special_cases():
-    q = get_queue_or_skip()
-
-    X = dpt.asarray(
-        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
-    )
-    Xnp = dpt.asnumpy(X)
-
-    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
-
-
 @pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
 def test_sqrt_out_overlap(dtype):
     q = get_queue_or_skip()
@@ -149,3 +144,62 @@ def test_sqrt_out_overlap(dtype):
     assert Y is not X
     assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
     assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_sqrt_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Xnp = dpt.asnumpy(X)
+
+    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_sqrt_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.sqrt(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_sqrt_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.sqrt(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)

From d60d58ef41cc2a3756f436797f33c2cf45a1c9a5 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 15 Aug 2023 15:44:50 -0500
Subject: [PATCH 07/15] Added tests for type promotion in tensor.allclose

---
 dpctl/tests/test_tensor_testing.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/dpctl/tests/test_tensor_testing.py b/dpctl/tests/test_tensor_testing.py
index 21437459d9..426d4bb586 100644
--- a/dpctl/tests/test_tensor_testing.py
+++ b/dpctl/tests/test_tensor_testing.py
@@ -84,3 +84,12 @@ def test_allclose_validation():
     x = dpt.asarray(True)
     with pytest.raises(TypeError):
         dpt.allclose(x, False)
+
+
+def test_all_close_promotion():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10, dtype="i4")
+    x2 = dpt.ones(10, dtype="i8")
+
+    assert dpt.allclose(x1, x2)

From ea6dd27c65065f40f78a8f196db52c754953c2f8 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 16 Aug 2023 13:30:05 -0500
Subject: [PATCH 08/15] Fixes per PR review feedback

1. Aligned default values with those of np.allclose
2. Replaced less test with less_equal to align with NumPy.
---
 dpctl/tensor/_testing.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpctl/tensor/_testing.py b/dpctl/tensor/_testing.py
index 185ed816a4..1aa7d82099 100644
--- a/dpctl/tensor/_testing.py
+++ b/dpctl/tensor/_testing.py
@@ -64,7 +64,7 @@ def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
     mv2 = z2i[mi]
     check5 = dpt.all(
         dpt.abs(mv1 - mv2)
-        < atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+        <= atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
     )
     return check5
 
@@ -90,7 +90,7 @@ def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
     mv2 = r2[m]
     check4 = dpt.all(
         dpt.abs(mv1 - mv2)
-        < atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+        <= atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
     )
     return check4
 
@@ -99,10 +99,10 @@ def _allclose_others(r1, r2):
     return dpt.all(r1 == r2)
 
 
-def allclose(a1, a2, atol=1e-5, rtol=1e-8, equal_nan=False):
-    """allclose(a1, a2, atol=1e-5, rtol=1e-8)
+def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
+    """allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False)
 
-    Returns True if two arrays are element-wise equal within tolerance.
+    Returns True if two arrays are element-wise equal within tolerances.
     """
     if not isinstance(a1, dpt.usm_ndarray):
         raise TypeError(

From f36af57cc90810540a0e51338e72bb33bea09c15 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 16 Aug 2023 13:31:07 -0500
Subject: [PATCH 09/15] Adds tests for atol/rtol

Also added tests for early exits to improve coverage.
---
 dpctl/tests/test_tensor_testing.py | 56 +++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/dpctl/tests/test_tensor_testing.py b/dpctl/tests/test_tensor_testing.py
index 426d4bb586..d1cb4df4ab 100644
--- a/dpctl/tests/test_tensor_testing.py
+++ b/dpctl/tests/test_tensor_testing.py
@@ -86,10 +86,64 @@ def test_allclose_validation():
         dpt.allclose(x, False)
 
 
-def test_all_close_promotion():
+def test_allclose_type_promotion():
     get_queue_or_skip()
 
     x1 = dpt.ones(10, dtype="i4")
     x2 = dpt.ones(10, dtype="i8")
 
     assert dpt.allclose(x1, x2)
+
+
+def test_allclose_tolerance():
+    get_queue_or_skip()
+
+    x = dpt.zeros(10, dtype="f4")
+    atol = 1e-5
+    y = dpt.full_like(x, atol)
+    assert dpt.allclose(x, y, atol=atol, rtol=0)
+
+    # about 8e-6
+    tol = float.fromhex("0x1.0p-17")
+    x = dpt.ones(10, dtype="f4")
+    y = x - tol
+    assert dpt.allclose(x, y, atol=0, rtol=tol)
+
+
+def test_allclose_real_fp_early_exists():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="f4")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="f4")
+
+    # early exists, inf positions are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="f4")
+
+    # early exists, inf positions are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+
+def test_allclose_complex_fp_early_exists():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="c8")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="c8")
+
+    # early exists, inf positions of real parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="c8")
+
+    # early exists, inf positions of real parts are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+    x1 = dpt.asarray([0.0, dpt.inf * 1j, -dpt.inf * 1j], dtype="c8")
+    x2 = dpt.asarray([dpt.inf * 1j, 0.0, -dpt.inf * 1j], dtype="c8")
+
+    # early exists, inf positions of imag parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf * 1j, dpt.inf * 1j], dtype="c8")
+    assert not dpt.allclose(x1, x2)

From a75fff8f8394c4931f0f1e25414daeb695fa85e0 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 16 Aug 2023 13:41:41 -0500
Subject: [PATCH 10/15] tensor.allclose to use abs(a-b) < max(atol,
 rtol*max(abs(a), abs(b)))

---
 dpctl/tensor/_testing.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/_testing.py b/dpctl/tensor/_testing.py
index 1aa7d82099..4d4a73408c 100644
--- a/dpctl/tensor/_testing.py
+++ b/dpctl/tensor/_testing.py
@@ -56,7 +56,7 @@ def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
     mv2 = z2r[mr]
     check4 = dpt.all(
         dpt.abs(mv1 - mv2)
-        < atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+        < dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
     )
     if not check4:
         return check4
@@ -64,7 +64,7 @@ def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
     mv2 = z2i[mi]
     check5 = dpt.all(
         dpt.abs(mv1 - mv2)
-        <= atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
     )
     return check5
 
@@ -90,7 +90,7 @@ def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
     mv2 = r2[m]
     check4 = dpt.all(
         dpt.abs(mv1 - mv2)
-        <= atol + rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2))
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
     )
     return check4
 
@@ -103,6 +103,10 @@ def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
     """allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False)
 
     Returns True if two arrays are element-wise equal within tolerances.
+
+    The testing is based on the following elementwise comparison:
+
+           abs(a - b) <= max(atol, rtol * max(abs(a), abs(b)))
     """
     if not isinstance(a1, dpt.usm_ndarray):
         raise TypeError(
@@ -114,6 +118,10 @@ def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
         )
     atol = float(atol)
     rtol = float(rtol)
+    if atol < 0.0 or rtol < 0.0:
+        raise ValueError(
+            "Absolute and relative tolerances must be non-negative"
+        )
     equal_nan = bool(equal_nan)
     exec_q = du.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2)))
     if exec_q is None:

From ff3c680202551e486e44b59c7091eb5bd09158ae Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 16 Aug 2023 16:12:47 -0500
Subject: [PATCH 11/15] Completion of fix for gh-1058

This changes builds up on gh-1265 and takes into account queue
from the pre-allocated buffer, if provided.
---
 dpctl/tensor/_usmarray.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 2b16df1060..7573109c65 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -182,7 +182,10 @@ cdef class usm_ndarray:
                 raise TypeError("Argument shape must be a list or a tuple.")
         nd = len(shape)
         if dtype is None:
-            q = buffer_ctor_kwargs.get("queue")
+            if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)):
+                q = buffer.sycl_queue
+            else:
+                q = buffer_ctor_kwargs.get("queue")
             if q is not None:
                 dtype = default_device_fp_type(q)
             else:

From c4312cb08c40d2de4c88617ee0219ebe20d67172 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 16 Aug 2023 18:56:46 -0500
Subject: [PATCH 12/15] Scale down arguments and scale back the result

This improves accuracy at extremes of supported range.

Use sycl:: namespace ldexp and ilogb to prevent problem
with VS 2017 headers.
---
 .../kernels/elementwise_functions/sqrt.hpp    | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index f72b5d963b..a50859595a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -148,16 +148,26 @@ template <typename argT, typename resT> struct SqrtFunctor
         constexpr realT half = realT(0x1.0p-1f); // 1/2
         constexpr realT zero = realT(0);
 
-        if (std::signbit(x)) {
-            realT m = std::hypot(x, y);
-            realT d = std::sqrt((m - x) * half);
-            return {(d == zero ? zero : std::abs(y) / d * half),
-                    std::copysign(d, y)};
+        const int exp_x = sycl::ilogb(x);
+        const int exp_y = sycl::ilogb(y);
+
+        int sc = std::max<int>(exp_x, exp_y) / 2;
+        const realT xx = sycl::ldexp(x, -sc * 2);
+        const realT yy = sycl::ldexp(y, -sc * 2);
+
+        if (std::signbit(xx)) {
+            const realT m = std::hypot(xx, yy);
+            const realT d = std::sqrt((m - xx) * half);
+            const realT res_re = (d == zero ? zero : std::abs(yy) / d * half);
+            const realT res_im = std::copysign(d, yy);
+            return {sycl::ldexp(res_re, sc), sycl::ldexp(res_im, sc)};
         }
         else {
-            realT m = std::hypot(x, y);
-            realT d = std::sqrt((m + x) * half);
-            return {d, (d == zero) ? std::copysign(zero, y) : y * half / d};
+            const realT m = std::hypot(xx, yy);
+            const realT d = std::sqrt((m + xx) * half);
+            const realT res_im =
+                (d == zero) ? std::copysign(zero, yy) : yy * half / d;
+            return {sycl::ldexp(d, sc), sycl::ldexp(res_im, sc)};
         }
     }
 };

From bb52bb1774e0295b13de8792d41f0ec4efeb5a83 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 17 Aug 2023 11:49:04 -0500
Subject: [PATCH 13/15] Avoid using sycl::ilogb, but use own implementation

ilogb would have to pay attention to correctly computing
scale of denormal floats, while simpler code suffices.

Also use unscaled version in most cases, and scaled version
only for very large inputs.
---
 .../kernels/elementwise_functions/sqrt.hpp    | 85 ++++++++++++++++++-
 1 file changed, 82 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index a50859595a..808e82539e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -137,8 +137,42 @@ template <typename argT, typename resT> struct SqrtFunctor
         }
     }
 
+    int get_normal_scale_float(const float &v) const
+    {
+        constexpr int float_significant_bits = 23;
+        constexpr std::uint32_t exponent_mask = 0xff;
+        constexpr int exponent_bias = 127;
+        const int scale = static_cast<int>(
+            (sycl::bit_cast<std::uint32_t>(v) >> float_significant_bits) &
+            exponent_mask);
+        return scale - exponent_bias;
+    }
+
+    int get_normal_scale_double(const double &v) const
+    {
+        constexpr int float_significant_bits = 53;
+        constexpr std::uint64_t exponent_mask = 0x7ff;
+        constexpr int exponent_bias = 1023;
+        const int scale = static_cast<int>(
+            (sycl::bit_cast<std::uint64_t>(v) >> float_significant_bits) &
+            exponent_mask);
+        return scale - exponent_bias;
+    }
+
+    template <typename T> int get_normal_scale(const T &v) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>);
+
+        if constexpr (std::is_same_v<T, float>) {
+            return get_normal_scale_float(v);
+        }
+        else {
+            return get_normal_scale_double(v);
+        }
+    }
+
     template <typename T>
-    std::complex<T> csqrt_finite(T const &x, T const &y) const
+    std::complex<T> csqrt_finite_scaled(T const &x, T const &y) const
     {
         // csqrt(x + y*1j) =
         //     sqrt((cabs(x, y) + x) / 2) +
@@ -148,8 +182,8 @@ template <typename argT, typename resT> struct SqrtFunctor
         constexpr realT half = realT(0x1.0p-1f); // 1/2
         constexpr realT zero = realT(0);
 
-        const int exp_x = sycl::ilogb(x);
-        const int exp_y = sycl::ilogb(y);
+        const int exp_x = get_normal_scale<realT>(x);
+        const int exp_y = get_normal_scale<realT>(y);
 
         int sc = std::max<int>(exp_x, exp_y) / 2;
         const realT xx = sycl::ldexp(x, -sc * 2);
@@ -170,6 +204,51 @@ template <typename argT, typename resT> struct SqrtFunctor
             return {sycl::ldexp(d, sc), sycl::ldexp(res_im, sc)};
         }
     }
+
+    template <typename T>
+    std::complex<T> csqrt_finite_unscaled(T const &x, T const &y) const
+    {
+        // csqrt(x + y*1j) =
+        //     sqrt((cabs(x, y) + x) / 2) +
+        //     1j * copysign(sqrt((cabs(x, y) - x) / 2), y)
+
+        using realT = T;
+        constexpr realT half = realT(0x1.0p-1f); // 1/2
+        constexpr realT zero = realT(0);
+
+        if (std::signbit(x)) {
+            const realT m = std::hypot(x, y);
+            const realT d = std::sqrt((m - x) * half);
+            const realT res_re = (d == zero ? zero : std::abs(y) / d * half);
+            const realT res_im = std::copysign(d, y);
+            return {res_re, res_im};
+        }
+        else {
+            const realT m = std::hypot(x, y);
+            const realT d = std::sqrt((m + x) * half);
+            const realT res_im =
+                (d == zero) ? std::copysign(zero, y) : y * half / d;
+            return {d, res_im};
+        }
+    }
+
+    template <typename T> T scaling_threshold() const
+    {
+        if constexpr (std::is_same_v<T, float>) {
+            return T(0x1.0p+126f);
+        }
+        else {
+            return T(0x1.0p+1022);
+        }
+    }
+
+    template <typename T>
+    std::complex<T> csqrt_finite(T const &x, T const &y) const
+    {
+        return (std::max<T>(std::abs(x), std::abs(y)) < scaling_threshold<T>())
+                   ? csqrt_finite_unscaled(x, y)
+                   : csqrt_finite_scaled(x, y);
+    }
 };
 
 template <typename argTy,

From ba9a59509f580a22811b02c3026c2bb36e3e9895 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 17 Aug 2023 12:44:47 -0500
Subject: [PATCH 14/15] Set defines to use std::abs and std::sqrt on Linux

We work around issues with these functions when their implementation
is taken from VS 2017 headers on Windows though.
---
 dpctl/tensor/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index de45ee47bd..ca83b8350b 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -58,10 +58,15 @@ set_source_files_properties(
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
   PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math")
+if (UNIX)
+  set_source_files_properties(
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
+    PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES")
+endif()
 target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
 if(UNIX)
-    # this option is support on Linux only
+    # this option is supported on Linux only
     target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code)
 endif()
 target_include_directories(${python_module_name}

From 142190fe35f32577b526bd2a3b45db4b0ce2493b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 17 Aug 2023 12:52:47 -0500
Subject: [PATCH 15/15] Removed stray include iostream

---
 .../include/kernels/elementwise_functions/positive.hpp          | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index f75b93d0f5..d5fab29315 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -37,8 +37,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor