diff --git a/src/interface/set.h b/src/interface/set.h
index 956603ad..c9df4fa4 100644
--- a/src/interface/set.h
+++ b/src/interface/set.h
@@ -663,6 +663,30 @@ namespace CTF {
         std::sort((dtypePair<dtype>*)pairs,((dtypePair<dtype>*)pairs)+n);
       }
 
+      void to_s_of_a(int64_t n, char const * pairs, int64_t * keys, char * vals) const {
+        Pair<dtype> const * dpairs = (Pair<dtype> const *)pairs;
+        dtype * dvals = (dtype*)vals;
+#ifdef _OPENMP
+        #pragma omp parallel for
+#endif
+        for (int64_t i=0; i<n; i++){
+          keys[i] = dpairs[i].k;
+          dvals[i] = dpairs[i].d;
+        }
+      }
+
+      void to_a_of_s(int64_t n, int64_t const * keys, char const * vals, char * pairs) const {
+        Pair<dtype> * dpairs = (Pair<dtype>*)pairs;
+        dtype const * dvals = (dtype const*)vals;
+#ifdef _OPENMP
+        #pragma omp parallel for
+#endif
+        for (int64_t i=0; i<n; i++){
+          dpairs[i].k = keys[i];
+          dpairs[i].d = dvals[i];
+        }
+      }
+
       void copy(char * a, char const * b) const {
         ((dtype *)a)[0] = ((dtype const *)b)[0];
       }
diff --git a/src/interface/tensor.cxx b/src/interface/tensor.cxx
index 6ca9970f..b21de664 100644
--- a/src/interface/tensor.cxx
+++ b/src/interface/tensor.cxx
@@ -360,15 +360,7 @@ namespace CTF {
     *data = (dtype*)CTF_int::alloc((*npair)*sizeof(dtype));
     CTF_int::memprof_dealloc(*global_idx);
     CTF_int::memprof_dealloc(*data);
-    CTF_int::PairIterator pairs(sr, cpairs);
-
-#ifdef _OPENMP
-    #pragma omp parallel for
-#endif
-    for (i=0; i<(*npair); i++){
-      (*global_idx)[i] = pairs[i].k();
-      pairs[i].read_val((char*)((*data)+i));
-    }
+    sr->to_s_of_a(*npair, cpairs, *global_idx, (char*)*data);
     if (cpairs != NULL) sr->pair_dealloc(cpairs);
   }
 
@@ -425,13 +417,14 @@ namespace CTF {
     int64_t i;
     char * cpairs = sr->pair_alloc(npair);
     Pair< dtype > * pairs =(Pair< dtype >*)cpairs;
-#ifdef _OPENMP
-    #pragma omp parallel for
-#endif
-    for (i=0; i<npair; i++){
-      pairs[i].k = global_idx[i];
-      pairs[i].d = data[i];
-    }
+    sr->to_a_of_s(npair, global_idx, (char*)data, (char*)pairs);
+//#ifdef _OPENMP
+//    #pragma omp parallel for
+//#endif
+//    for (i=0; i<npair; i++){
+//      pairs[i].k = global_idx[i];
+//      pairs[i].d = data[i];
+//    }
     ret = CTF_int::tensor::read(npair, cpairs);
     if (ret != CTF_int::SUCCESS){ printf("CTF ERROR: failed to execute function read\n"); IASSERT(0); return; }
     for (i=0; i<npair; i++){
@@ -463,16 +456,17 @@ namespace CTF {
                             int64_t const * global_idx,
                             dtype const *   data) {
     int ret;
-    int64_t i;
+    //int64_t i;
     char * cpairs = sr->pair_alloc(npair);
     Pair< dtype > * pairs =(Pair< dtype >*)cpairs;
-#ifdef _OPENMP
-    #pragma omp parallel for
-#endif
-    for (i=0; i<npair; i++){
-      pairs[i].k = global_idx[i];
-      pairs[i].d = data[i];
-    }
+    sr->to_a_of_s(npair, global_idx, (char*)data, (char*)pairs);
+//#ifdef _OPENMP
+//    #pragma omp parallel for
+//#endif
+//    for (i=0; i<npair; i++){
+//      pairs[i].k = global_idx[i];
+//      pairs[i].d = data[i];
+//    }
     /*char * cpairs = sr->pair_alloc(npair);
     CTF_int::PairIterator pairs = CTF_int::PairIterator(sr, cpairs);
     for (i=0; i<npair; i++){
@@ -721,10 +715,11 @@ namespace CTF {
     int64_t i;
     char * cpairs = sr->pair_alloc(npair);
     Pair< dtype > * pairs =(Pair< dtype >*)cpairs;
-    for (i=0; i<npair; i++){
-      pairs[i].k = global_idx[i];
-      pairs[i].d = data[i];
-    }
+    sr->to_a_of_s(npair, global_idx, (char*)data, (char*)pairs);
+    //for (i=0; i<npair; i++){
+    //  pairs[i].k = global_idx[i];
+    //  pairs[i].d = data[i];
+    //}
     ret = CTF_int::tensor::read(npair, (char*)&alpha, (char*)&beta, cpairs);
     if (ret != CTF_int::SUCCESS){ printf("CTF ERROR: failed to execute function read\n"); IASSERT(0); return; }
     for (i=0; i<npair; i++){
diff --git a/src/tensor/algstrct.cxx b/src/tensor/algstrct.cxx
index 855615e2..ce9ee570 100644
--- a/src/tensor/algstrct.cxx
+++ b/src/tensor/algstrct.cxx
@@ -262,6 +262,48 @@ namespace CTF_int {
 
   }
 
+  void algstrct::to_s_of_a(int64_t n, char const * pairs, int64_t * keys, char * vals) const {
+    ConstPairIterator rA(this, pairs);
+#ifdef _OPENMP
+    #pragma omp parallel for
+#endif
+    for (int64_t i=0; i<n; i++){
+      keys[i] = rA[i].k();
+      rA[i].read_val(vals + i*el_size);
+    }
+  }
+
+  void algstrct::to_a_of_s(int64_t n, int64_t const * keys, char const * vals, char * pairs) const {
+    PairIterator wA(this, pairs);
+#ifdef _OPENMP
+    #pragma omp parallel for
+#endif
+    for (int64_t i=0; i<n; i++){
+      wA[i].write_key(keys[i]);
+      wA[i].write_val(vals + i*el_size);
+    }
+  }
+
+  void ConstPairIterator::permute(int64_t n, int order, int64_t const * old_lens, int64_t const * new_lda, PairIterator wA){
+    ConstPairIterator rA = * this;
+#ifdef USE_OMP
+    #pragma omp parallel for
+#endif
+    for (int64_t i=0; i<n; i++){
+      int64_t k = rA[i].k();
+      int64_t k_new = 0;
+      for (int j=0; j<order; j++){
+        k_new += (k%old_lens[j])*new_lda[j];
+        k = k/old_lens[j];
+      }
+      ((int64_t*)wA[i].ptr)[0] = k_new;
+      wA[i].write_val(rA[i].d());
+      //printf("value %lf old key %ld new key %ld\n",((double*)wA[i].d())[0], rA[i].k(), wA[i].k());
+    }
+
+
+  }
+
   void algstrct::scal(int          n,
                       char const * alpha,
                       char       * X,
@@ -918,26 +960,6 @@ namespace CTF_int {
     sr->sort(n, ptr);
   }
 
-  void ConstPairIterator::permute(int64_t n, int order, int64_t const * old_lens, int64_t const * new_lda, PairIterator wA){
-    ConstPairIterator rA = * this;
-#ifdef USE_OMP
-    #pragma omp parallel for
-#endif
-    for (int64_t i=0; i<n; i++){
-      int64_t k = rA[i].k();
-      int64_t k_new = 0;
-      for (int j=0; j<order; j++){
-        k_new += (k%old_lens[j])*new_lda[j];
-        k = k/old_lens[j];
-      }
-      ((int64_t*)wA[i].ptr)[0] = k_new;
-      wA[i].write_val(rA[i].d());
-      //printf("value %lf old key %ld new key %ld\n",((double*)wA[i].d())[0], rA[i].k(), wA[i].k());
-    }
-
-
-  }
-
   void ConstPairIterator::pin(int64_t n, int order, int64_t const * lens, int const * divisor, PairIterator pi_new){
     TAU_FSTART(pin);
     ConstPairIterator pi = *this;
diff --git a/src/tensor/algstrct.h b/src/tensor/algstrct.h
index d6a57c05..6672b5d6 100644
--- a/src/tensor/algstrct.h
+++ b/src/tensor/algstrct.h
@@ -339,6 +339,25 @@ namespace CTF_int {
        * \brief sorts n sets of pairs using std::sort 
        */
       virtual void sort(int64_t n, char * pairs) const;
+
+      /**
+       * \brief converts pair array into struct of arrays
+       * \param[in] n number of key, value pairs to extract
+       * \param[in] pairs to convert
+       * \param[in,out] keys array of keys
+       * \param[in,out] vals array of values
+       */
+      virtual void to_s_of_a(int64_t n, char const * pairs, int64_t * keys, char * vals) const;
+
+       /**
+       * \brief converts pair array into array of structs
+       * \param[in] n number of key, value pairs to extract
+       * \param[in] keys array of keys
+       * \param[in] vals array of values
+       * \param[in,out] pairs to convert
+       */
+      virtual void to_a_of_s(int64_t n, int64_t const * keys, char const * vals, char * pairs) const;
+
  
       /** estimate time in seconds necessary for CSR reduction with input of size msg_sz */
       double estimate_csr_red_time(int64_t msg_sz, CommData const * cdt) const;
diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx
index 134b01fe..de688100 100644
--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -2405,6 +2405,13 @@ namespace CTF_int {
     return ipr.ptr;
   }
 
+  void tensor::read_all_pairs(int64_t * num_pair, bool unpack, int64_t ** inds, char ** vals, bool nonzero_only) const {
+    char * pairs = read_all_pairs(num_pair, unpack, nonzero_only);
+    *inds = (int64_t*)malloc(sizeof(int64_t)*(*num_pair));
+    *vals = (char*)malloc(sr->el_size*(*num_pair));
+    sr->to_s_of_a(*num_pair, pairs, *inds, *vals);
+  }
+
   int64_t tensor::get_tot_size(bool packed=false){
     if (!packed){
       int64_t tsize = 1;
diff --git a/src/tensor/untyped_tensor.h b/src/tensor/untyped_tensor.h
index 6a24b6a5..915afdc4 100644
--- a/src/tensor/untyped_tensor.h
+++ b/src/tensor/untyped_tensor.h
@@ -508,6 +508,18 @@ namespace CTF_int {
        */
       char * read_all_pairs(int64_t * num_pair, bool unpack, bool nonzero_only=false) const;
 
+
+      /**
+       * \brief read all pairs with each processor (packed)
+       * \param[out] num_pair number of values read
+       * \param[in] unpack whether to read all or unique pairs up to symmetry
+       * \param[out] inds array of indices of nonzeros or all values
+       * \param[out] vals array of nonzeros or all values
+       * \param[in] nonzero_only whether to read only nonzeros
+       */
+      void read_all_pairs(int64_t * num_pair, bool unpack, int64_t ** inds, char ** vals, bool nonzero_only=false) const;
+
+
        /**
        * \brief accumulates out a slice (block) of this tensor = B
        *   B[offsets,ends)=beta*B[offsets,ends) + alpha*A[offsets_A,ends_A)
diff --git a/src_python/ctf/chelper.pxd b/src_python/ctf/chelper.pxd
index beba8bec..0bc865fc 100644
--- a/src_python/ctf/chelper.pxd
+++ b/src_python/ctf/chelper.pxd
@@ -1,7 +1,7 @@
 from libc.stdint cimport int64_t
 
 cdef char* char_arr_py_to_c(a)
-
 cdef int64_t* int64_t_arr_py_to_c(a)
-
 cdef int* int_arr_py_to_c(a)
+cdef _cast_carray_as_python(n, char * cdata, dtype)
+
diff --git a/src_python/ctf/chelper.pyx b/src_python/ctf/chelper.pyx
index 4431486f..93b4d5c8 100644
--- a/src_python/ctf/chelper.pyx
+++ b/src_python/ctf/chelper.pyx
@@ -1,5 +1,11 @@
 import numpy as np
 from libc.stdlib cimport malloc, free
+from libc.stdint cimport int64_t
+from libcpp cimport bool
+
+ctypedef double complex complex128_t
+ctypedef float complex complex64_t
+ctypedef long long iint64_t
 
 cdef char* char_arr_py_to_c(a):
     cdef char * ca
@@ -32,4 +38,40 @@ cdef int* int_arr_py_to_c(a):
         ca[i] = a[i]
     return ca
 
+cdef _cast_carray_as_python(n, char * cdata, dtype):
+    if dtype == np.float64:
+        return np.asarray(<double[:n]><double*>cdata)
+    elif dtype == np.float32:
+        return np.asarray(<float[:n]><float*>cdata)
+    elif dtype == np.complex64:
+        return np.asarray(<complex64_t[:n]><complex64_t*>cdata)
+    elif dtype == np.complex128:
+        return np.asarray(<complex128_t[:n]><complex128_t*>cdata)
+    elif dtype == np.int64:
+        return np.asarray(<iint64_t[:n]><iint64_t*>cdata)
+    elif dtype == np.int32:
+        return np.asarray(<int[:n]><int*>cdata)
+    elif dtype == np.bool_:
+        return np.asarray(<bool[:n]><bool*>cdata)
+
+    else:
+        print(dtype)
+        raise ValueError('CTF PYTHON ERROR: bad dtype')
+        return np.ndarray()
+
+#WARNING: copy versions below inadequate for by-reference usage of above to write into C++ arrays
+#cdef _cast_complex128_array_as_python(n, complex128_t * cdata):
+#    cdef complex128_t[:] cview = <complex128_t[:n]>cdata
+#    data = np.empty(n, dtype=np.cdouble)
+#    data[:] = cview[:]
+#    free(cdata)
+#    return data
+#
+#
+#cdef _cast_int64_array_as_python(n, int64_t * cdata):
+#    cdef int64_t[:] cview = <int64_t[:n]>cdata
+#    data = np.empty(n, dtype=np.int64)
+#    data[:] = cview[:]
+#    return data
+
 
diff --git a/src_python/ctf/linalg.pyx b/src_python/ctf/linalg.pyx
index edb14ab1..c52b05a3 100644
--- a/src_python/ctf/linalg.pyx
+++ b/src_python/ctf/linalg.pyx
@@ -1,9 +1,9 @@
 from libcpp cimport bool
 
-from tensor cimport ctensor, tensor
+from ctf.tensor cimport ctensor, tensor
 
-from helper import *
-from chelper cimport *
+from ctf.helper import *
+from ctf.chelper cimport *
 from libc.stdlib cimport malloc, free
 
 import ctf.profile
diff --git a/src_python/ctf/multilinear.pyx b/src_python/ctf/multilinear.pyx
index a7811191..2b05094d 100644
--- a/src_python/ctf/multilinear.pyx
+++ b/src_python/ctf/multilinear.pyx
@@ -1,8 +1,8 @@
 import numpy as np
-from tensor cimport Tensor, tensor, ctensor
+from ctf.tensor cimport Tensor, tensor, ctensor
 from libcpp cimport bool
 from libc.stdlib cimport malloc, free
-from profile import timer
+from ctf.profile import timer
 
 cdef extern from "ctf.hpp" namespace "CTF":
     cdef void TTTP_ "CTF::TTTP"[dtype](Tensor[dtype] * T, int num_ops, int * modes, Tensor[dtype] ** mat_list, bool aux_mode_first)
diff --git a/src_python/ctf/partition.pyx b/src_python/ctf/partition.pyx
index d126b36f..3594392a 100644
--- a/src_python/ctf/partition.pyx
+++ b/src_python/ctf/partition.pyx
@@ -1,5 +1,5 @@
 from libc.stdlib cimport malloc, free
-from chelper cimport *
+from ctf.chelper cimport *
 
 cdef class partition:
     """
diff --git a/src_python/ctf/tensor.pxd b/src_python/ctf/tensor.pxd
index aa1c76e2..12c7ecbd 100644
--- a/src_python/ctf/tensor.pxd
+++ b/src_python/ctf/tensor.pxd
@@ -1,8 +1,8 @@
 from libc.stdint cimport int64_t
 from libcpp cimport bool
 cimport numpy as cnp
-from partition cimport Idx_Partition
-from world cimport World
+from ctf.partition cimport Idx_Partition
+from ctf.world cimport World
 
 cdef extern from "ctf.hpp" namespace "CTF":
     cdef cppclass Term:
@@ -61,6 +61,7 @@ cdef extern from "ctf.hpp" namespace "CTF_int":
         void reshape(ctensor * tsr, char * alpha, char * beta)
         void allread(int64_t * num_pair, char * data, bool unpack)
         char * read_all_pairs(int64_t * num_pair, bool unpack, bool nonzeros_only)
+        void read_all_pairs(int64_t * num_pair, bool unpack, int64_t ** inds, char ** vals, bool nonzero_only) const;
         void slice(int64_t *, int64_t *, char *, ctensor *, int64_t *, int64_t *, char *)
         int64_t get_tot_size(bool packed)
         void get_raw_data(char **, int64_t * size)
diff --git a/src_python/ctf/tensor.pyx b/src_python/ctf/tensor.pyx
index cd85938f..01225bb0 100644
--- a/src_python/ctf/tensor.pyx
+++ b/src_python/ctf/tensor.pyx
@@ -24,6 +24,7 @@ from ctf.partition cimport idx_partition
 
 #from ctf.helper import ctf.helper._ord_comp, ctf.helper.type_index, ctf.helper._rev_array, ctf.helper._get_np_dtype, ctf.helper._get_num_str, ctf.helper._use_align_for_pair
 from ctf.chelper cimport *
+#cdef extern from "numpy/arrayobject.h":
 #from ctf.tensor_aux import ctf.tensor_aux.astensor, ctf.tensor_aux.transpose, ctf.tensor_aux.power, ctf.tensor_aux.dot, ctf.tensor_aux.reshape, ctf.tensor_aux.zeros, ctf.tensor_aux.conj, ctf.tensor_aux._match_tensor_types, ctf.tensor_aux._div, ctf.tensor_aux._setgetitem_helper, ctf.tensor_aux.trace, ctf.tensor_aux.diagonal, ctf.tensor_aux.take, ctf.tensor_aux.ravel tensorctf.tensor_aux.dot
 #from ctf.term import itensor
 #from ctf.world import comm
@@ -735,7 +736,9 @@ cdef class tensor:
 
         [idx_A, idx_B, idx_C, out_tsr] = tsr._ufunc_interpret(otsr)
         out_tsr.i(idx_C) << tsr.i(idx_A)
-        out_tsr.i(idx_C) << -1*otsr.i(idx_B)
+        iotsr = otsr.i(idx_B)
+        iotsr.scale(-1)
+        out_tsr.i(idx_C) << iotsr
         return out_tsr
 
     def __isub__(self, other_in):
@@ -747,9 +750,13 @@ cdef class tensor:
             raise ValueError('CTF PYTHON ERROR: invalid call to __isub__ (-=)')
         if self.dtype != other.dtype:
             [tsr, otsr] = ctf.tensor_aux._match_tensor_types(self,other) # solve the bug when np.float64 -= np.int64
-            self.i(idx_C) << -1*otsr.i(idx_A)
+            iotsr = otsr.i(idx_A)
+            iotsr.scale(-1)
+            self.i(idx_C) << iotsr
         else:
-            self.i(idx_C) << -1*other.i(idx_A)
+            iotsr = other.i(idx_A)
+            iotsr.scale(-1)
+            self.i(idx_C) << iotsr
         return self
 
     def __truediv__(self, other):
@@ -1691,17 +1698,9 @@ cdef class tensor:
         cdef int64_t * cinds
         cdef char * cdata
         cdef int64_t n
-        self.dt.read_local(&n,&cdata,unpack_sym)
-        inds = np.empty(n, dtype=np.int64)
-        vals = np.empty(n, dtype=self.dtype)
-
-        cdef cnp.ndarray buf = np.empty(len(inds), dtype=np.dtype([('a','i8'),('b',self.dtype)],align=ctf.helper._use_align_for_pair(self.dtype)))
-        d = buf.data
-        buf.data = cdata
-        vals[:] = buf['b'][:]
-        inds[:] = buf['a'][:]
-        buf.data = d
-        delete_pairs(self.dt, cdata)
+        self.dt.read_local(&n,&cinds,&cdata,unpack_sym)
+        inds = _cast_carray_as_python(n, <char*>cinds, np.int64)
+        vals = _cast_carray_as_python(n, cdata, self.dtype)
         return inds, vals
 
     def dot(self, other, out=None):
@@ -1793,16 +1792,9 @@ cdef class tensor:
         cdef int64_t * cinds
         cdef char * cdata
         cdef int64_t n
-        self.dt.read_local_nnz(&n,&cdata,unpack_sym)
-        inds = np.empty(n, dtype=np.int64)
-        vals = np.empty(n, dtype=self.dtype)
-        cdef cnp.ndarray buf = np.empty(len(inds), dtype=np.dtype([('a','i8'),('b',self.dtype)],align=ctf.helper._use_align_for_pair(self.dtype)))
-        d = buf.data
-        buf.data = cdata
-        vals[:] = buf['b'][:]
-        inds[:] = buf['a'][:]
-        buf.data = d
-        delete_arr(self.dt, cdata)
+        self.dt.read_local_nnz(&n,&cinds,&cdata,unpack_sym)
+        inds = _cast_carray_as_python(n, <char*>cinds, np.int64)
+        vals = _cast_carray_as_python(n, cdata, self.dtype)
         return inds, vals
 
     def tot_size(self, unpack=True):
@@ -1839,17 +1831,11 @@ cdef class tensor:
                 return
         cvals = <char*> malloc(sz*tB)
         self.dt.allread(&sz, cvals, unpack)
-        cdef cnp.ndarray buf = np.empty(sz, dtype=self.dtype)
-        odata = buf.data
-        buf.data = cvals
+        vals = _cast_carray_as_python(sz, cvals, self.dtype)
         if arr is None:
-            sbuf = np.asarray(buf)
-            free(odata)
-            return buf
+            return vals
         else:
-            arr[:] = buf[:]
-            free(cvals)
-            buf.data = odata
+            arr[:] = vals[:]
 
     def read_all_nnz(self, unpack=True):
         """
@@ -1869,16 +1855,9 @@ cdef class tensor:
         cdef int64_t * cinds
         cdef char * cdata
         cdef int64_t n
-        cdata = self.dt.read_all_pairs(&n, unpack, True)
-        inds = np.empty(n, dtype=np.int64)
-        vals = np.empty(n, dtype=self.dtype)
-        cdef cnp.ndarray buf = np.empty(len(inds), dtype=np.dtype([('a','i8'),('b',self.dtype)],align=ctf.helper._use_align_for_pair(self.dtype)))
-        d = buf.data
-        buf.data = cdata
-        vals[:] = buf['b'][:]
-        inds[:] = buf['a'][:]
-        buf.data = d
-        delete_arr(self.dt, cdata)
+        self.dt.read_all_pairs(&n, unpack, &cinds, &cdata, True)
+        inds = _cast_carray_as_python(n, <char*>cinds, np.int64)
+        vals = _cast_carray_as_python(n, cdata, self.dtype)
         return inds, vals
 
     def __read_all(self, arr):
@@ -1893,11 +1872,8 @@ cdef class tensor:
         sz = self.dt.get_tot_size(False)
         tB = arr.dtype.itemsize
         self.dt.get_raw_data(&cvals, &sz)
-        cdef cnp.ndarray buf = np.empty(sz, dtype=self.dtype)
-        odata = buf.data
-        buf.data = cvals
-        arr[:] = buf[:]
-        buf.data = odata
+        vals = _cast_carray_as_python(sz, cvals, self.dtype)
+        arr[:] = vals[:]
 
     def __write_all(self, arr):
         """
@@ -1911,12 +1887,9 @@ cdef class tensor:
         sz = self.dt.get_tot_size(False)
         tB = arr.dtype.itemsize
         self.dt.get_raw_data(&cvals, &sz)
-        cdef cnp.ndarray buf = np.empty(sz, dtype=self.dtype)
-        odata = buf.data
-        buf.data = cvals
+        vals = _cast_carray_as_python(sz, cvals, self.dtype)
         rarr = arr.ravel()
-        buf[:] = rarr[:]
-        buf.data = odata
+        vals[:] = rarr[:]
 
     def conj(self):
         """
diff --git a/src_python/ctf/tensor_aux.pyx b/src_python/ctf/tensor_aux.pyx
index ca9c003e..4d3b3be6 100644
--- a/src_python/ctf/tensor_aux.pyx
+++ b/src_python/ctf/tensor_aux.pyx
@@ -14,6 +14,7 @@ from ctf.chelper cimport *
 from ctf.tensor cimport tensor, ctensor
 from ctf.profile import timer
 from ctf.linalg import svd
+from ctf.term cimport itensor
 
 cdef extern from "../ctf_ext.h" namespace "CTF_int":
     cdef void pow_helper[dtype](ctensor * A, ctensor * B, ctensor * C, char * idx_A, char * idx_B, char * idx_C);
@@ -514,9 +515,6 @@ def take(init_A, indices, axis=None, out=None, mode='raise'):
         if axis > len(A.shape):
             raise IndexError("CTF PYTHON ERROR: axis out of bounds")
         if indices.shape == () or indices.shape== (1,):
-            total_size = 1
-            for i in range(len(A.shape)):
-                total_size *= A[i]
             if indices >= A.shape[axis]:
                 raise IndexError("CTF PYTHON ERROR: index out of bounds")
             ret_shape = list(A.shape)
@@ -542,9 +540,6 @@ def take(init_A, indices, axis=None, out=None, mode='raise'):
         else:
             if len(indices.shape) > 1:
                 raise ValueError("CTF PYTHON ERROR: current ctf does not support when specify axis and the len(indices.shape) > 1")
-            total_size = 1
-            for i in range(len(A.shape)):
-                total_size *= A[i]
             for i in range(len(indices)):
                 if indices[i] >= A.shape[axis]:
                     raise IndexError("index out of bounds")
@@ -2479,7 +2474,9 @@ def einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe
     operand = new_operands[0].i(inds[0])
     for i in range(1,numop):
         operand = operand * new_operands[i].i(inds[i])
-    out_scale*output.i(out_inds) << operand
+    itsr = output.i(out_inds)
+    itsr.scale(out_scale)
+    itsr << operand
     if out is None:
         if len(out_inds) == 0:
             output = output.item()
@@ -2752,5 +2749,3 @@ def _comp_all(tensor A, axis=None, out=None, keepdims=None):
     if axis is None:
         x = A._bool_sum()
         return x == A.tot_size()
-
-
diff --git a/test/python/test_sparse.py b/test/python/test_sparse.py
index c8bf9bf6..3a74c9e2 100644
--- a/test/python/test_sparse.py
+++ b/test/python/test_sparse.py
@@ -51,8 +51,8 @@ def test_scaled_expression(self):
         b_np = ctf.to_nparray(b_dn)
         c_np = ctf.to_nparray(c_dn)
 
-        c_sp.i("ijk") << 2.3*a_sp.i("ijl")*b_sp.i("kjl") + 7*c_sp.i("ijk") - a_sp.i("ijk") - 1. * a_sp.i("ijk") - 2 * b_sp.i("ijk")
-        c_dn.i("ijk") << 2.3*a_dn.i("ijl")*b_dn.i("kjl") + 7*c_dn.i("ijk") - a_dn.i("ijk") - 1. * a_dn.i("ijk") - 2 * b_dn.i("ijk")
+        c_sp.i("ijk") << a_sp.i("ijl")*b_sp.i("kjl")*2.3 + c_sp.i("ijk")*7 - a_sp.i("ijk") - a_sp.i("ijk") -  b_sp.i("ijk")*2
+        c_dn.i("ijk") << a_dn.i("ijl")*b_dn.i("kjl")*2.3 + c_dn.i("ijk")*7 - a_dn.i("ijk") -  a_dn.i("ijk") -  b_dn.i("ijk")*2
         c_np += 2.3*np.einsum("ijl,kjl->ijk",a_np,b_np) + 7*c_np - a_np - 1. * a_np - 2 * b_np
         self.assertTrue(allclose(c_np,c_dn))  
         self.assertTrue(allclose(c_np,c_sp))  
@@ -62,7 +62,11 @@ def test_complex(self):
         b0 = np.arange(27.).reshape(3,3,3)
         a1 = ctf.astensor(a0)
         b1 = ctf.astensor(b0)
-        .2*b1.i("ijk") << .7*a1.i("kij")
+        b1i = b1.i("ijk")
+        b1i.scale(.2)
+        a1i = b1.i("kij")
+        a1i.scale(.7)
+        b1i << a1i
         b0 = .2*b0 + .7*a0.transpose([1,2,0])
         self.assertTrue(allclose(b0,b1))