diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index de45ee47bd..ca83b8350b 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -58,10 +58,15 @@ set_source_files_properties(
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
   PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math")
+if (UNIX)
+  set_source_files_properties(
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
+    PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES")
+endif()
 target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
 if(UNIX)
-    # this option is support on Linux only
+    # this option is supported on Linux only
     target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code)
 endif()
 target_include_directories(${python_module_name}
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index f1ec439f3a..7438fb8a67 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -158,6 +158,7 @@
     trunc,
 )
 from ._reduction import sum
+from ._testing import allclose
 
 __all__ = [
     "Device",
@@ -301,4 +302,5 @@
     "tan",
     "tanh",
     "trunc",
+    "allclose",
 ]
diff --git a/dpctl/tensor/_testing.py b/dpctl/tensor/_testing.py
new file mode 100644
index 0000000000..4d4a73408c
--- /dev/null
+++ b/dpctl/tensor/_testing.py
@@ -0,0 +1,152 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import dpctl.tensor as dpt
+import dpctl.utils as du
+
+from ._manipulation_functions import _broadcast_shape_impl
+from ._type_utils import _to_device_supported_dtype
+
+
+def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
+    z1r = dpt.real(z1)
+    z1i = dpt.imag(z1)
+    z2r = dpt.real(z2)
+    z2i = dpt.imag(z2)
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(z1r) == dpt.isnan(z2r)) and dpt.all(
+            dpt.isnan(z1i) == dpt.isnan(z2i)
+        )
+    else:
+        check1 = (
+            dpt.logical_not(dpt.any(dpt.isnan(z1r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z1i)))
+        ) and (
+            dpt.logical_not(dpt.any(dpt.isnan(z2r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z2i)))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(z1r)
+    mi = dpt.isinf(z1i)
+    check2 = dpt.all(mr == dpt.isinf(z2r)) and dpt.all(mi == dpt.isinf(z2i))
+    if not check2:
+        return check2
+    check3 = dpt.all(z1r[mr] == z2r[mr]) and dpt.all(z1i[mi] == z2i[mi])
+    if not check3:
+        return check3
+    mr = dpt.isfinite(z1r)
+    mi = dpt.isfinite(z1i)
+    mv1 = z1r[mr]
+    mv2 = z2r[mr]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        < dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    if not check4:
+        return check4
+    mv1 = z1i[mi]
+    mv2 = z2i[mi]
+    check5 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    return check5
+
+
+def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(r1) == dpt.isnan(r2))
+    else:
+        check1 = dpt.logical_not(dpt.any(dpt.isnan(r1))) and dpt.logical_not(
+            dpt.any(dpt.isnan(r2))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(r1)
+    check2 = dpt.all(mr == dpt.isinf(r2))
+    if not check2:
+        return check2
+    check3 = dpt.all(r1[mr] == r2[mr])
+    if not check3:
+        return check3
+    m = dpt.isfinite(r1)
+    mv1 = r1[m]
+    mv2 = r2[m]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    return check4
+
+
+def _allclose_others(r1, r2):
+    return dpt.all(r1 == r2)
+
+
+def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
+    """allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False)
+
+    Returns True if two arrays are element-wise equal within tolerances.
+
+    The testing is based on the following elementwise comparison:
+
+           abs(a - b) <= max(atol, rtol * max(abs(a), abs(b)))
+    """
+    if not isinstance(a1, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray type, got {type(a1)}."
+        )
+    if not isinstance(a2, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpctl.tensor.usm_ndarray type, got {type(a2)}."
+        )
+    atol = float(atol)
+    rtol = float(rtol)
+    if atol < 0.0 or rtol < 0.0:
+        raise ValueError(
+            "Absolute and relative tolerances must be non-negative"
+        )
+    equal_nan = bool(equal_nan)
+    exec_q = du.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2)))
+    if exec_q is None:
+        raise du.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments."
+        )
+    res_sh = _broadcast_shape_impl([a1.shape, a2.shape])
+    b1 = a1
+    b2 = a2
+    if b1.dtype == b2.dtype:
+        res_dt = b1.dtype
+    else:
+        res_dt = np.promote_types(b1.dtype, b2.dtype)
+        res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device)
+        b1 = dpt.astype(b1, res_dt)
+        b2 = dpt.astype(b2, res_dt)
+
+    b1 = dpt.broadcast_to(b1, res_sh)
+    b2 = dpt.broadcast_to(b2, res_sh)
+
+    k = b1.dtype.kind
+    if k == "c":
+        return _allclose_complex_fp(b1, b2, atol, rtol, equal_nan)
+    elif k == "f":
+        return _allclose_real_fp(b1, b2, atol, rtol, equal_nan)
+    else:
+        return _allclose_others(b1, b2)
diff --git a/dpctl/tensor/_usmarray.pxd b/dpctl/tensor/_usmarray.pxd
index 603b296103..120ad7a066 100644
--- a/dpctl/tensor/_usmarray.pxd
+++ b/dpctl/tensor/_usmarray.pxd
@@ -58,7 +58,6 @@ cdef api class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]:
 
     cdef void _reset(usm_ndarray self)
     cdef void _cleanup(usm_ndarray self)
-    cdef usm_ndarray _clone(usm_ndarray self)
     cdef Py_ssize_t get_offset(usm_ndarray self) except *
 
     cdef char* get_data(self)
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 1336063323..4fc90fbf94 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -44,58 +44,6 @@ include "_types.pxi"
 include "_slicing.pxi"
 
 
-def _dispatch_unary_elementwise(ary, name):
-    try:
-        mod = ary.__array_namespace__()
-    except AttributeError:
-        return NotImplemented
-    if mod is None and "dpnp" in sys.modules:
-        fn = getattr(sys.modules["dpnp"], name)
-        if callable(fn):
-            return fn(ary)
-    elif hasattr(mod, name):
-        fn = getattr(mod, name)
-        if callable(fn):
-            return fn(ary)
-
-    return NotImplemented
-
-
-def _dispatch_binary_elementwise(ary, name, other):
-    try:
-        mod = ary.__array_namespace__()
-    except AttributeError:
-        return NotImplemented
-    if mod is None and "dpnp" in sys.modules:
-        fn = getattr(sys.modules["dpnp"], name)
-        if callable(fn):
-            return fn(ary, other)
-    elif hasattr(mod, name):
-        fn = getattr(mod, name)
-        if callable(fn):
-            return fn(ary, other)
-
-    return NotImplemented
-
-
-def _dispatch_binary_elementwise2(other, name, ary):
-    try:
-        mod = ary.__array_namespace__()
-    except AttributeError:
-        return NotImplemented
-    mod = ary.__array_namespace__()
-    if mod is None and "dpnp" in sys.modules:
-        fn = getattr(sys.modules["dpnp"], name)
-        if callable(fn):
-            return fn(other, ary)
-    elif hasattr(mod, name):
-        fn = getattr(mod, name)
-        if callable(fn):
-            return fn(other, ary)
-
-    return NotImplemented
-
-
 cdef class InternalUSMArrayError(Exception):
     """
     A InternalError exception is raised when internal
@@ -200,28 +148,6 @@ cdef class usm_ndarray:
             PyMem_Free(self.strides_)
         self._reset()
 
-    cdef usm_ndarray _clone(usm_ndarray self):
-        """
-        Provides a copy of Python object pointing to the same data
-        """
-        cdef Py_ssize_t offset_elems = self.get_offset()
-        cdef usm_ndarray res = usm_ndarray.__new__(
-            usm_ndarray, _make_int_tuple(self.nd_, self.shape_),
-            dtype=_make_typestr(self.typenum_),
-            strides=(
-                _make_int_tuple(self.nd_, self.strides_) if (self.strides_)
-                else None),
-            buffer=self.base_,
-            offset=offset_elems,
-            order=('C' if (self.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F')
-        )
-        res.flags_ = self.flags_
-        res.array_namespace_ = self.array_namespace_
-        if (res.data_ != self.data_):
-            raise InternalUSMArrayError(
-                "Data pointers of cloned and original objects are different.")
-        return res
-
     def __cinit__(self, shape, dtype=None, strides=None, buffer='device',
                   Py_ssize_t offset=0, order='C',
                   buffer_ctor_kwargs=dict(),
@@ -256,7 +182,10 @@ cdef class usm_ndarray:
                 raise TypeError("Argument shape must be a list or a tuple.")
         nd = len(shape)
         if dtype is None:
-            q = buffer_ctor_kwargs.get("queue")
+            if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)):
+                q = buffer.sycl_queue
+            else:
+                q = buffer_ctor_kwargs.get("queue")
             if q is not None:
                 dtype = default_device_fp_type(q)
             else:
@@ -966,32 +895,17 @@ cdef class usm_ndarray:
         raise IndexError("only integer arrays are valid indices")
 
     def __abs__(self):
-        return _dispatch_unary_elementwise(self, "abs")
+        return dpctl.tensor.abs(self)
 
     def __add__(first, other):
         """
-        Cython 0.* never calls `__radd__`, always calls `__add__`
-        but first argument need not be an instance of this class,
-        so dispatching is needed.
-
-        This changes in Cython 3.0, where first is guaranteed to
-        be `self`.
-
-        [1] http://docs.cython.org/en/latest/src/userguide/special_methods.html
+        Implementation for operator.add
         """
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "add", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "add", other)
-        return NotImplemented
+        return dpctl.tensor.add(first, other)
 
     def __and__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_and", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_and", other)
-        return NotImplemented
+        "Implementation for operator.and"
+        return dpctl.tensor.bitwise_and(first, other)
 
     def __dlpack__(self, stream=None):
         """
@@ -1037,27 +951,22 @@ cdef class usm_ndarray:
             )
 
     def __eq__(self, other):
-        return _dispatch_binary_elementwise(self, "equal", other)
+        return dpctl.tensor.equal(self, other)
 
     def __floordiv__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "floor_divide", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "floor_divide", other)
-        return NotImplemented
+        return dpctl.tensor.floor_divide(first, other)
 
     def __ge__(self, other):
-        return _dispatch_binary_elementwise(self, "greater_equal", other)
+        return dpctl.tensor.greater_equal(self, other)
 
     def __gt__(self, other):
-        return _dispatch_binary_elementwise(self, "greater", other)
+        return dpctl.tensor.greater(self, other)
 
     def __invert__(self):
-        return _dispatch_unary_elementwise(self, "bitwise_invert")
+        return dpctl.tensor.bitwise_invert(self)
 
     def __le__(self, other):
-        return _dispatch_binary_elementwise(self, "less_equal", other)
+        return dpctl.tensor.less_equal(self, other)
 
     def __len__(self):
         if (self.nd_):
@@ -1067,72 +976,40 @@ cdef class usm_ndarray:
 
     def __lshift__(first, other):
         "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_left_shift", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_left_shift", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_left_shift(first, other)
 
     def __lt__(self, other):
-        return _dispatch_binary_elementwise(self, "less", other)
+        return dpctl.tensor.less(self, other)
 
     def __matmul__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "matmul", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "matmul", other)
         return NotImplemented
 
     def __mod__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "remainder", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "remainder", other)
-        return NotImplemented
+        return dpctl.tensor.remainder(first, other)
 
     def __mul__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "multiply", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "multiply", other)
-        return NotImplemented
+        return dpctl.tensor.multiply(first, other)
 
     def __ne__(self, other):
-        return _dispatch_binary_elementwise(self, "not_equal", other)
+        return dpctl.tensor.not_equal(self, other)
 
     def __neg__(self):
-        return _dispatch_unary_elementwise(self, "negative")
+        return dpctl.tensor.negative(self)
 
     def __or__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_or", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_or", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_or(first, other)
 
     def __pos__(self):
-        return self  # _dispatch_unary_elementwise(self, "positive")
+        return dpctl.tensor.positive(self)
 
     def __pow__(first, other, mod):
-        "See comment in __add__"
         if mod is None:
-            if isinstance(first, usm_ndarray):
-                return _dispatch_binary_elementwise(first, "pow", other)
-            elif isinstance(other, usm_ndarray):
-                return _dispatch_binary_elementwise(first, "pow", other)
-        return NotImplemented
+            return dpctl.tensor.pow(first, other)
+        else:
+            return NotImplemented
 
     def __rshift__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_right_shift", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_right_shift", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_right_shift(first, other)
 
     def __setitem__(self, key, rhs):
         cdef tuple _meta
@@ -1223,67 +1100,52 @@ cdef class usm_ndarray:
 
 
     def __sub__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "subtract", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "subtract", other)
-        return NotImplemented
+        return dpctl.tensor.subtract(first, other)
 
     def __truediv__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "divide", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "divide", other)
-        return NotImplemented
+        return dpctl.tensor.divide(first, other)
 
     def __xor__(first, other):
-        "See comment in __add__"
-        if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "bitwise_xor", other)
-        elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "bitwise_xor", other)
-        return NotImplemented
+        return dpctl.tensor.bitwise_xor(first, other)
 
     def __radd__(self, other):
-        return _dispatch_binary_elementwise(self, "add", other)
+        return dpctl.tensor.add(other, self)
 
     def __rand__(self, other):
-        return _dispatch_binary_elementwise(self, "bitwise_and", other)
+        return dpctl.tensor.bitwise_and(other, self)
 
     def __rfloordiv__(self, other):
-        return _dispatch_binary_elementwise2(other, "floor_divide", self)
+        return dpctl.tensor.floor_divide(other, self)
 
     def __rlshift__(self, other):
-        return _dispatch_binary_elementwise2(other, "bitwise_left_shift", self)
+        return dpctl.tensor.bitwise_left_shift(other, self)
 
     def __rmatmul__(self, other):
-        return _dispatch_binary_elementwise2(other, "matmul", self)
+        return NotImplemented
 
     def __rmod__(self, other):
-        return _dispatch_binary_elementwise2(other, "remainder", self)
+        return dpctl.tensor.remainder(other, self)
 
     def __rmul__(self, other):
-        return _dispatch_binary_elementwise(self, "multiply", other)
+        return dpctl.tensor.multiply(other, self)
 
     def __ror__(self, other):
-        return _dispatch_binary_elementwise(self, "bitwise_or", other)
+        return dpctl.tensor.bitwise_or(other, self)
 
     def __rpow__(self, other):
-        return _dispatch_binary_elementwise2(other, "pow", self)
+        return dpctl.tensor.pow(other, self)
 
     def __rrshift__(self, other):
-        return _dispatch_binary_elementwise2(other, "bitwise_right_shift", self)
+        return dpctl.tensor.bitwise_right_shift(other, self)
 
     def __rsub__(self, other):
-        return _dispatch_binary_elementwise2(other, "subtract", self)
+        return dpctl.tensor.subtract(other, self)
 
     def __rtruediv__(self, other):
-        return _dispatch_binary_elementwise2(other, "divide", self)
+        return dpctl.tensor.divide(other, self)
 
     def __rxor__(self, other):
-        return _dispatch_binary_elementwise2(other, "bitwise_xor", self)
+        return dpctl.tensor.bitwise_xor(other, self)
 
     def __iadd__(self, other):
         from ._elementwise_funcs import add
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index c734fd54ec..9c42ba1bb4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -25,8 +25,10 @@
 #pragma once
 #include <CL/sycl.hpp>
 #include <cmath>
+#include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -36,8 +38,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor
@@ -72,7 +72,58 @@ template <typename argT, typename resT> struct AbsFunctor
             return x;
         }
         else {
-            return std::abs(x);
+            if constexpr (is_complex<argT>::value) {
+                return cabs(x);
+            }
+            else if constexpr (std::is_same_v<argT, sycl::half> ||
+                               std::is_floating_point_v<argT>)
+            {
+                return (std::signbit(x) ? -x : x);
+            }
+            else {
+                return std::abs(x);
+            }
+        }
+    }
+
+private:
+    template <typename realT> realT cabs(std::complex<realT> const &z) const
+    {
+        // Special values for cabs( x + y * 1j):
+        //   * If x is either +infinity or -infinity and y is any value
+        //   (including NaN), the result is +infinity.
+        //   * If x is any value (including NaN) and y is either +infinity or
+        //   -infinity, the result is +infinity.
+        //   * If x is either +0 or -0, the result is equal to abs(y).
+        //   * If y is either +0 or -0, the result is equal to abs(x).
+        //   * If x is NaN and y is a finite number, the result is NaN.
+        //   * If x is a finite number and y is NaN, the result is NaN.
+        //   * If x is NaN and y is NaN, the result is NaN.
+
+        const realT x = std::real(z);
+        const realT y = std::imag(z);
+
+        constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
+        constexpr realT p_inf = std::numeric_limits<realT>::infinity();
+
+        if (std::isinf(x)) {
+            return p_inf;
+        }
+        else if (std::isinf(y)) {
+            return p_inf;
+        }
+        else if (std::isnan(x)) {
+            return q_nan;
+        }
+        else if (std::isnan(y)) {
+            return q_nan;
+        }
+        else {
+#ifdef USE_STD_ABS_FOR_COMPLEX_TYPES
+            return std::abs(z);
+#else
+            return std::hypot(std::real(z), std::imag(z));
+#endif
         }
     }
 };
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index 98ce88557a..830794680c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -37,8 +37,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index f75b93d0f5..d5fab29315 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -37,8 +37,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index fb387bdcb2..8f0488746e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -37,8 +37,6 @@
 #include "utils/type_utils.hpp"
 #include <pybind11/pybind11.h>
 
-#include <iostream>
-
 namespace dpctl
 {
 namespace tensor
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index 7853428227..808e82539e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -26,8 +26,10 @@
 #pragma once
 #include <CL/sycl.hpp>
 #include <cmath>
+#include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -66,7 +68,186 @@ template <typename argT, typename resT> struct SqrtFunctor
 
     resT operator()(const argT &in)
     {
-        return std::sqrt(in);
+        if constexpr (is_complex<argT>::value) {
+            // #ifdef _WINDOWS
+            //             return csqrt(in);
+            // #else
+            //             return std::sqrt(in);
+            // #endif
+            return csqrt(in);
+        }
+        else {
+            return std::sqrt(in);
+        }
+    }
+
+private:
+    template <typename T> std::complex<T> csqrt(std::complex<T> const &z) const
+    {
+        // csqrt(x + y*1j)
+        //  * csqrt(x - y * 1j) = conj(csqrt(x + y * 1j))
+        //  * If x is either +0 or -0 and y is +0, the result is +0 + 0j.
+        //  * If x is any value (including NaN) and y is +infinity, the result
+        //  is +infinity + infinity j.
+        //  * If x is a finite number and y is NaN, the result is NaN + NaN j.
+
+        //  * If x -infinity and y is a positive (i.e., greater than 0) finite
+        //  number, the result is NaN + NaN j.
+        //  * If x is +infinity and y is a positive (i.e., greater than 0)
+        //  finite number, the result is +0 + infinity j.
+        //  * If x is -infinity and y is NaN, the result is NaN + infinity j
+        //  (sign of the imaginary component is unspecified).
+        //  * If x is +infinity and y is NaN, the result is +infinity + NaN j.
+        //  * If x is NaN and y is any value, the result is NaN + NaN j.
+
+        using realT = T;
+        constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
+        constexpr realT p_inf = std::numeric_limits<realT>::infinity();
+        constexpr realT zero = realT(0);
+
+        realT x = std::real(z);
+        realT y = std::imag(z);
+
+        if (std::isinf(y)) {
+            return {p_inf, y};
+        }
+        else if (std::isnan(x)) {
+            return {x, q_nan};
+        }
+        else if (std::isinf(x)) { // x is an infinity
+            // y is either finite, or nan
+            if (std::signbit(x)) { // x == -inf
+                return {(std::isfinite(y) ? zero : y), std::copysign(p_inf, y)};
+            }
+            else {
+                return {p_inf, (std::isfinite(y) ? std::copysign(zero, y) : y)};
+            }
+        }
+        else { // x is finite
+            if (std::isfinite(y)) {
+#ifdef USE_STD_SQRT_FOR_COMPLEX_TYPES
+                return std::sqrt(z);
+#else
+                return csqrt_finite(x, y);
+#endif
+            }
+            else {
+                return {q_nan, y};
+            }
+        }
+    }
+
+    int get_normal_scale_float(const float &v) const
+    {
+        constexpr int float_significant_bits = 23;
+        constexpr std::uint32_t exponent_mask = 0xff;
+        constexpr int exponent_bias = 127;
+        const int scale = static_cast<int>(
+            (sycl::bit_cast<std::uint32_t>(v) >> float_significant_bits) &
+            exponent_mask);
+        return scale - exponent_bias;
+    }
+
+    int get_normal_scale_double(const double &v) const
+    {
+        constexpr int float_significant_bits = 53;
+        constexpr std::uint64_t exponent_mask = 0x7ff;
+        constexpr int exponent_bias = 1023;
+        const int scale = static_cast<int>(
+            (sycl::bit_cast<std::uint64_t>(v) >> float_significant_bits) &
+            exponent_mask);
+        return scale - exponent_bias;
+    }
+
+    template <typename T> int get_normal_scale(const T &v) const
+    {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>);
+
+        if constexpr (std::is_same_v<T, float>) {
+            return get_normal_scale_float(v);
+        }
+        else {
+            return get_normal_scale_double(v);
+        }
+    }
+
+    template <typename T>
+    std::complex<T> csqrt_finite_scaled(T const &x, T const &y) const
+    {
+        // csqrt(x + y*1j) =
+        //     sqrt((cabs(x, y) + x) / 2) +
+        //     1j * copysign(sqrt((cabs(x, y) - x) / 2), y)
+
+        using realT = T;
+        constexpr realT half = realT(0x1.0p-1f); // 1/2
+        constexpr realT zero = realT(0);
+
+        const int exp_x = get_normal_scale<realT>(x);
+        const int exp_y = get_normal_scale<realT>(y);
+
+        int sc = std::max<int>(exp_x, exp_y) / 2;
+        const realT xx = sycl::ldexp(x, -sc * 2);
+        const realT yy = sycl::ldexp(y, -sc * 2);
+
+        if (std::signbit(xx)) {
+            const realT m = std::hypot(xx, yy);
+            const realT d = std::sqrt((m - xx) * half);
+            const realT res_re = (d == zero ? zero : std::abs(yy) / d * half);
+            const realT res_im = std::copysign(d, yy);
+            return {sycl::ldexp(res_re, sc), sycl::ldexp(res_im, sc)};
+        }
+        else {
+            const realT m = std::hypot(xx, yy);
+            const realT d = std::sqrt((m + xx) * half);
+            const realT res_im =
+                (d == zero) ? std::copysign(zero, yy) : yy * half / d;
+            return {sycl::ldexp(d, sc), sycl::ldexp(res_im, sc)};
+        }
+    }
+
+    template <typename T>
+    std::complex<T> csqrt_finite_unscaled(T const &x, T const &y) const
+    {
+        // csqrt(x + y*1j) =
+        //     sqrt((cabs(x, y) + x) / 2) +
+        //     1j * copysign(sqrt((cabs(x, y) - x) / 2), y)
+
+        using realT = T;
+        constexpr realT half = realT(0x1.0p-1f); // 1/2
+        constexpr realT zero = realT(0);
+
+        if (std::signbit(x)) {
+            const realT m = std::hypot(x, y);
+            const realT d = std::sqrt((m - x) * half);
+            const realT res_re = (d == zero ? zero : std::abs(y) / d * half);
+            const realT res_im = std::copysign(d, y);
+            return {res_re, res_im};
+        }
+        else {
+            const realT m = std::hypot(x, y);
+            const realT d = std::sqrt((m + x) * half);
+            const realT res_im =
+                (d == zero) ? std::copysign(zero, y) : y * half / d;
+            return {d, res_im};
+        }
+    }
+
+    template <typename T> T scaling_threshold() const
+    {
+        if constexpr (std::is_same_v<T, float>) {
+            return T(0x1.0p+126f);
+        }
+        else {
+            return T(0x1.0p+1022);
+        }
+    }
+
+    template <typename T>
+    std::complex<T> csqrt_finite(T const &x, T const &y) const
+    {
+        return (std::max<T>(std::abs(x), std::abs(y)) < scaling_threshold<T>())
+                   ? csqrt_finite_unscaled(x, y)
+                   : csqrt_finite_scaled(x, y);
     }
 };
 
diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py
index ab0d34d54d..9c800af812 100644
--- a/dpctl/tests/elementwise/test_abs.py
+++ b/dpctl/tests/elementwise/test_abs.py
@@ -15,6 +15,7 @@
 #  limitations under the License.
 
 import itertools
+import warnings
 
 import numpy as np
 import pytest
@@ -22,7 +23,13 @@
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
-from .utils import _all_dtypes, _no_complex_dtypes, _usm_types
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _no_complex_dtypes,
+    _real_fp_dtypes,
+    _usm_types,
+)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -135,3 +142,50 @@ def test_abs_out_overlap(dtype):
     assert Y is not X
     assert np.allclose(dpt.asnumpy(X), Xnp)
     assert np.allclose(dpt.asnumpy(Y), Ynp)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_abs_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.abs(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.abs(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_abs_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.abs(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.abs(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_sqrt.py b/dpctl/tests/elementwise/test_sqrt.py
index a15f5262a7..07b97733e3 100644
--- a/dpctl/tests/elementwise/test_sqrt.py
+++ b/dpctl/tests/elementwise/test_sqrt.py
@@ -15,6 +15,7 @@
 #  limitations under the License.
 
 import itertools
+import warnings
 
 import numpy as np
 import pytest
@@ -23,7 +24,13 @@
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _map_to_device_dtype,
+    _real_fp_dtypes,
+    _usm_types,
+)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -115,18 +122,6 @@ def test_sqrt_order(dtype):
             assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
-@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
-def test_sqrt_special_cases():
-    q = get_queue_or_skip()
-
-    X = dpt.asarray(
-        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
-    )
-    Xnp = dpt.asnumpy(X)
-
-    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
-
-
 @pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
 def test_sqrt_out_overlap(dtype):
     q = get_queue_or_skip()
@@ -149,3 +144,62 @@ def test_sqrt_out_overlap(dtype):
     assert Y is not X
     assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
     assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_sqrt_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Xnp = dpt.asnumpy(X)
+
+    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_sqrt_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.sqrt(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_sqrt_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.sqrt(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/test_tensor_testing.py b/dpctl/tests/test_tensor_testing.py
new file mode 100644
index 0000000000..d1cb4df4ab
--- /dev/null
+++ b/dpctl/tests/test_tensor_testing.py
@@ -0,0 +1,149 @@
+import itertools
+
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_allclose(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a1 = dpt.ones(10, dtype=dtype)
+    a2 = dpt.ones(10, dtype=dtype)
+
+    assert dpt.allclose(a1, a2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_allclose_real_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+    a1 = dpt.asarray(v[2:], dtype=dtype)
+    a2 = dpt.asarray(v[2:], dtype=dtype)
+
+    tol = dpt.finfo(a1.dtype).resolution
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol)
+
+    a1 = dpt.asarray(v, dtype=dtype)
+    a2 = dpt.asarray(v, dtype=dtype)
+
+    assert not dpt.allclose(a1, a2, atol=tol, rtol=tol)
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_allclose_complex_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+
+    not_nans = [complex(*xy) for xy in itertools.product(v[2:], repeat=2)]
+    z1 = dpt.asarray(not_nans, dtype=dtype)
+    z2 = dpt.asarray(not_nans, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol)
+
+    both = [complex(*xy) for xy in itertools.product(v, repeat=2)]
+    z1 = dpt.asarray(both, dtype=dtype)
+    z2 = dpt.asarray(both, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert not dpt.allclose(z1, z2, atol=tol, rtol=tol)
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol, equal_nan=True)
+
+
+def test_allclose_validation():
+    with pytest.raises(TypeError):
+        dpt.allclose(True, False)
+
+    get_queue_or_skip()
+    x = dpt.asarray(True)
+    with pytest.raises(TypeError):
+        dpt.allclose(x, False)
+
+
+def test_allclose_type_promotion():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10, dtype="i4")
+    x2 = dpt.ones(10, dtype="i8")
+
+    assert dpt.allclose(x1, x2)
+
+
+def test_allclose_tolerance():
+    get_queue_or_skip()
+
+    x = dpt.zeros(10, dtype="f4")
+    atol = 1e-5
+    y = dpt.full_like(x, atol)
+    assert dpt.allclose(x, y, atol=atol, rtol=0)
+
+    # about 8e-6
+    tol = float.fromhex("0x1.0p-17")
+    x = dpt.ones(10, dtype="f4")
+    y = x - tol
+    assert dpt.allclose(x, y, atol=0, rtol=tol)
+
+
+def test_allclose_real_fp_early_exists():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="f4")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="f4")
+
+    # early exists, inf positions are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="f4")
+
+    # early exists, inf positions are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+
+def test_allclose_complex_fp_early_exists():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="c8")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="c8")
+
+    # early exists, inf positions of real parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="c8")
+
+    # early exists, inf positions of real parts are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+    x1 = dpt.asarray([0.0, dpt.inf * 1j, -dpt.inf * 1j], dtype="c8")
+    x2 = dpt.asarray([dpt.inf * 1j, 0.0, -dpt.inf * 1j], dtype="c8")
+
+    # early exists, inf positions of imag parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf * 1j, dpt.inf * 1j], dtype="c8")
+    assert not dpt.allclose(x1, x2)